In [68]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, quantile_transform
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [69]:
# Location of the orders extract. The default is the path this notebook was
# originally run with; set the XYZ_ORDERS_CSV environment variable to point at
# your own copy instead of editing the code.
DEFAULT_ORDERS_CSV = '/Users/mikeredshaw/Documents/Schulich MBAN/Predictive Modelling | MBAN 5110 U /Hackathon/Updated_XYZ_Sportswear_Orders_Dataset.csv'
orders_csv_path = os.environ.get('XYZ_ORDERS_CSV', DEFAULT_ORDERS_CSV)

df = pd.read_csv(orders_csv_path)

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss

# Baseline model: multinomial logistic regression predicting 'Product Code'
# from order-level attributes (one-hot encoded categoricals, standardized numerics).

# Convert date columns to datetime (the source strings are day.month.year)
df['Order Date'] = pd.to_datetime(df['Order Date'], format='%d.%m.%Y')
df['Requested Delivery Date'] = pd.to_datetime(df['Requested Delivery Date'], format='%d.%m.%Y')

# Extract month from 'Order Date' as a feature for seasonality
df['Order Month'] = df['Order Date'].dt.month

# Features and target variable
features = ['Order Month', 'Customer Country Code', 'Route', 'Value', 'Items', 'Value per Item']
target = 'Product Code'

# Preprocessing: OneHotEncoder for categorical variables and StandardScaler for numerical variables.
# 'Order Month' is treated as a category (12 levels), not as a number.
categorical_features = ['Order Month', 'Customer Country Code', 'Route']
numerical_features = ['Value', 'Items', 'Value per Item']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that only appears in the test split
        # is encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the model.
# The multi_class argument is deprecated in scikit-learn >= 1.5; with the default
# lbfgs solver a target with more than two classes is already fitted as a
# multinomial model, so it is omitted here.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Prepare data for the model
X = df[features]
y = df[target]

# Split the data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict probabilities for the test set
y_pred_probs = model.predict_proba(X_test)

# Evaluate the model using log loss; passing the fitted class order keeps the
# probability columns aligned even if a class is missing from y_test.
log_loss_value = log_loss(y_test, y_pred_probs, labels=model.classes_)
print(f'Log Loss: {log_loss_value:.4f}')

# Evaluate accuracy
y_pred = model.predict(X_test)
accuracy_value = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_value:.4f}')



Log Loss: 1.4090
Accuracy: 0.2474


In [71]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,Order Date,Requested Delivery Date,Customer Country Code,Product Code,Description,Order Type,Customer Order Code,Value,Currency,Items,Route,Value per Item,Order Month
0,2009-01-01,2009-02-18,DE,PK003,Economy Parka,VO,COD00001,269.87,EUR,8,East,33.73375,1
1,2009-01-01,2009-02-10,FR,PK001,Advanced Parka,VO,COD00002,170.34,EUR,2,West,85.17,1
2,2009-01-01,2009-01-17,ES,PK002,Premium Parka,VO,COD00003,61.09,EUR,7,West,8.727143,1
3,2009-01-03,2009-01-28,IT,PK003,Economy Parka,VO,COD00004,251.18,EUR,6,North,41.863333,1
4,2009-01-03,2009-01-18,DE,PK004,Basic Parka,VO,COD00005,153.33,EUR,5,West,30.666,1


In [72]:
# Whole days between the order date and the requested delivery date.
df['Lead Time'] = df['Requested Delivery Date'].sub(df['Order Date']).dt.days

def get_season(month):
    """Map a calendar month number to a season code.

    Returns 1 (Winter) for months 12, 1 and 2; 2 (Spring) for 3-5;
    3 (Summer) for 6-8; and 4 (Fall) for any other value.
    """
    season_months = (
        (1, (12, 1, 2)),  # Winter
        (2, (3, 4, 5)),   # Spring
        (3, (6, 7, 8)),   # Summer
    )
    for season_code, months in season_months:
        if month in months:
            return season_code
    return 4  # Fall

# Calendar quarter of the order date.
df['Quarter'] = df['Order Date'].dt.quarter

# Season code for each order, looked up from its order month via get_season.
df['Season'] = [get_season(order_month) for order_month in df['Order Month']]

df.head()



Unnamed: 0,Order Date,Requested Delivery Date,Customer Country Code,Product Code,Description,Order Type,Customer Order Code,Value,Currency,Items,Route,Value per Item,Order Month,Lead Time,Quarter,Season
0,2009-01-01,2009-02-18,DE,PK003,Economy Parka,VO,COD00001,269.87,EUR,8,East,33.73375,1,48,1,1
1,2009-01-01,2009-02-10,FR,PK001,Advanced Parka,VO,COD00002,170.34,EUR,2,West,85.17,1,40,1,1
2,2009-01-01,2009-01-17,ES,PK002,Premium Parka,VO,COD00003,61.09,EUR,7,West,8.727143,1,16,1,1
3,2009-01-03,2009-01-28,IT,PK003,Economy Parka,VO,COD00004,251.18,EUR,6,North,41.863333,1,25,1,1
4,2009-01-03,2009-01-18,DE,PK004,Basic Parka,VO,COD00005,153.33,EUR,5,West,30.666,1,15,1,1


In [73]:
# Lag / rolling features on order quantity, computed per product in date order.
#
# NOTE(review): despite the column names, shift(1) and rolling(3) below step over
# rows (orders) of the same product, not over calendar months.
# NOTE(review): the groups are keyed on 'Product Code', which the modelling cells
# use as the prediction target, so these features can only be computed when the
# label is already known - confirm this is intended before relying on them.

# A stable sort keeps orders that share an 'Order Date' in their existing row
# order, so the "previous order" picked up by the lag does not depend on how the
# sorting algorithm happens to arrange ties.
df = df.sort_values(by='Order Date', kind='mergesort')

rolling_window = 3

# Quantity of the previous order for the same product.
df['Lag_1_Month_Items'] = df.groupby('Product Code')['Items'].shift(1)

# Mean quantity over the current order and the preceding ones of the same
# product (window of `rolling_window` rows). The grouped result is indexed by
# (product, original index); dropping the product level lets it align back
# onto df by index.
df['Rolling_Mean_3M_Items'] = df.groupby('Product Code')['Items'].rolling(window=rolling_window).mean().reset_index(0, drop=True)

# Rows without enough history have no value yet; they are filled with 0.
df['Lag_1_Month_Items'] = df['Lag_1_Month_Items'].fillna(0)
df['Rolling_Mean_3M_Items'] = df['Rolling_Mean_3M_Items'].fillna(0)

df.head()





Unnamed: 0,Order Date,Requested Delivery Date,Customer Country Code,Product Code,Description,Order Type,Customer Order Code,Value,Currency,Items,Route,Value per Item,Order Month,Lead Time,Quarter,Season,Lag_1_Month_Items,Rolling_Mean_3M_Items
0,2009-01-01,2009-02-18,DE,PK003,Economy Parka,VO,COD00001,269.87,EUR,8,East,33.73375,1,48,1,1,0.0,0.0
1,2009-01-01,2009-02-10,FR,PK001,Advanced Parka,VO,COD00002,170.34,EUR,2,West,85.17,1,40,1,1,0.0,0.0
2,2009-01-01,2009-01-17,ES,PK002,Premium Parka,VO,COD00003,61.09,EUR,7,West,8.727143,1,16,1,1,0.0,0.0
3,2009-01-03,2009-01-28,IT,PK003,Economy Parka,VO,COD00004,251.18,EUR,6,North,41.863333,1,25,1,1,8.0,0.0
4,2009-01-03,2009-01-18,DE,PK004,Basic Parka,VO,COD00005,153.33,EUR,5,West,30.666,1,15,1,1,0.0,0.0


In [75]:
# Season-only model: 'Season' is the single input used to predict 'Product Code'.
features = ['Season']
target = 'Product Code'

# Preprocessing: 'Season' is a code, so it is one-hot encoded rather than scaled.
# The numeric group is empty in this experiment and is kept only so the cell has
# the same shape as the other modelling cells.
categorical_features = ['Season']
numerical_features = []

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that only appears in the test split
        # is encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the model.
# The multi_class argument is deprecated in scikit-learn >= 1.5; with the default
# lbfgs solver a target with more than two classes is already fitted as a
# multinomial model, so it is omitted here.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Prepare data for the model
X = df[features]
y = df[target]

# Split the data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict probabilities for the test set
y_pred_probs = model.predict_proba(X_test)

# Evaluate the model using log loss; passing the fitted class order keeps the
# probability columns aligned even if a class is missing from y_test.
log_loss_value = log_loss(y_test, y_pred_probs, labels=model.classes_)
print(f'Log Loss: {log_loss_value:.4f}')

# Evaluate accuracy
y_pred = model.predict(X_test)
accuracy_value = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_value:.4f}')

Log Loss: 1.3933
Accuracy: 0.2289


In [74]:
# Model on calendar, customer and lag/rolling features, predicting 'Product Code'.
target = 'Product Code'

# Preprocessing: OneHotEncoder for categorical variables and StandardScaler for numerical variables
categorical_features = ['Order Month', 'Customer Country Code', 'Route', 'Quarter', 'Season']
numerical_features = ['Lag_1_Month_Items', 'Rolling_Mean_3M_Items']

# The feature list is derived from the two groups so that every selected column
# is actually routed to a transformer.
# NOTE(review): the earlier hand-written list also named 'Value per Item', but it
# belonged to neither group, so ColumnTransformer (remainder='drop' by default)
# discarded it and the model never saw it. It is left out here to keep the fitted
# model unchanged; add it to numerical_features if it is meant to be used.
features = categorical_features + numerical_features

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        # handle_unknown='ignore': a category that only appears in the test split
        # is encoded as all zeros instead of raising at predict time.
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define the model.
# The multi_class argument is deprecated in scikit-learn >= 1.5; with the default
# lbfgs solver a target with more than two classes is already fitted as a
# multinomial model, so it is omitted here.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Prepare data for the model
X = df[features]
y = df[target]

# Split the data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
model.fit(X_train, y_train)

# Predict probabilities for the test set
y_pred_probs = model.predict_proba(X_test)

# Evaluate the model using log loss; passing the fitted class order keeps the
# probability columns aligned even if a class is missing from y_test.
log_loss_value = log_loss(y_test, y_pred_probs, labels=model.classes_)
print(f'Log Loss: {log_loss_value:.4f}')

# Evaluate accuracy
y_pred = model.predict(X_test)
accuracy_value = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy_value:.4f}')

Log Loss: 1.4044
Accuracy: 0.2495
