In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
from google.colab import files

In [2]:
uploaded = files.upload()

Saving cleaned_store.csv to cleaned_store.csv


In [3]:
df = pd.read_csv('cleaned_store.csv')

# **Feature Engineering**

In [4]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [5]:
# Extract month from date
df['Year_Month'] = df['InvoiceDate'].dt.to_period('M')

In [6]:
# Group data by month and product
monthly_sales = df.groupby(['StockCode', 'Year_Month']).agg(
    Total_Quantity=('Quantity', 'sum'),
    Average_Price=('Price', 'mean')
).reset_index()

In [7]:
monthly_sales.sort_values(by=['StockCode', 'Year_Month'], ascending=[True, True], inplace=True)

In [8]:
monthly_sales['Log_Total_Quantity'] = monthly_sales['Total_Quantity'].apply(lambda x: np.log(x+1) if x > 0 else 0)
monthly_sales['Lagged_Quantity_1m'] = monthly_sales.groupby('StockCode')['Log_Total_Quantity'].shift(1)

In [9]:
monthly_sales['time_month'] = monthly_sales['Year_Month'].dt.month
monthly_sales = pd.get_dummies(monthly_sales, columns=['time_month'], prefix='month')
for month in range(1, 13):
    column_name = f'month_{month}'
    monthly_sales[column_name] = monthly_sales[column_name].astype(int)

In [10]:
monthly_sales.head()

Unnamed: 0,StockCode,Year_Month,Total_Quantity,Average_Price,Log_Total_Quantity,Lagged_Quantity_1m,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,10002,2009-12,212,0.85,5.361292,,0,0,0,0,0,0,0,0,0,0,0,1
1,10002,2010-01,289,0.85,5.669881,5.361292,1,0,0,0,0,0,0,0,0,0,0,0
2,10002,2010-02,255,0.85,5.545177,5.669881,0,1,0,0,0,0,0,0,0,0,0,0
3,10002,2010-03,633,0.8175,6.452049,5.545177,0,0,1,0,0,0,0,0,0,0,0,0
4,10002,2010-04,1129,0.819048,7.029973,6.452049,0,0,0,1,0,0,0,0,0,0,0,0


In [11]:
monthly_sales.dropna(inplace=True)

In [12]:
monthly_sales.drop(columns=['month_12'], axis=1, inplace=True)

In [13]:
monthly_sales.head()

Unnamed: 0,StockCode,Year_Month,Total_Quantity,Average_Price,Log_Total_Quantity,Lagged_Quantity_1m,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11
1,10002,2010-01,289,0.85,5.669881,5.361292,1,0,0,0,0,0,0,0,0,0,0
2,10002,2010-02,255,0.85,5.545177,5.669881,0,1,0,0,0,0,0,0,0,0,0
3,10002,2010-03,633,0.8175,6.452049,5.545177,0,0,1,0,0,0,0,0,0,0,0
4,10002,2010-04,1129,0.819048,7.029973,6.452049,0,0,0,1,0,0,0,0,0,0,0
5,10002,2010-05,1409,0.832069,7.251345,7.029973,0,0,0,0,1,0,0,0,0,0,0


# **Train/Test Split**

In [14]:
# Define the condition for the test set
test_set_condition = (monthly_sales['Year_Month'].dt.year == 2011) & (monthly_sales['Year_Month'].dt.month >= 7)

# Create the test set using the condition
test_set = monthly_sales[test_set_condition]

# Create the train set by negating the test set condition
train_set = monthly_sales[~test_set_condition]

In [15]:
features = ['Average_Price', 'Lagged_Quantity_1m',
            'month_1', 'month_2', 'month_3', 'month_4', 'month_5',
            'month_6', 'month_7', 'month_8', 'month_9', 'month_10', 'month_11']
target = 'Log_Total_Quantity'

X_train = train_set[features]
y_train = train_set[target]
X_test = test_set[features]
y_test = test_set[target]

### In the next part of the project I will run a series of machine learning models to perform a one step-ahead sales forecasting and choose the best model to predict sales

# **Linear Regression**

In [16]:
# Initialize the model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_percentage_error
model_lr = LinearRegression()

In [17]:
# Initialize the TimeSeriesSplit
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
tscv = TimeSeriesSplit(n_splits=5)
mse_scores = []
for train_index, test_index in tscv.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model_lr.fit(X_train_fold, y_train_fold)
    y_pred = model_lr.predict(X_test_fold)
    mse = mean_squared_error(y_test_fold, y_pred)
    mse_scores.append(mse)

print("Mean Squared Error scores:", mse_scores)
print("Average Mean Squared Error:", np.mean(mse_scores))

Mean Squared Error scores: [0.898296978780251, 0.9258984894670854, 0.9433943601809315, 0.9265679844292202, 1.0153298459335112]
Average Mean Squared Error: 0.9418975317581999


In [18]:
# Predict on test set
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

MSE: 0.9048389653075241


# **Random Forest Regressor**

In [19]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [20]:
# Initialize the model
model_rf = RandomForestRegressor(random_state=42)

In [21]:
# Time Series Cross-Validation
mse_scores = []
for train_index, test_index in tscv.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model_rf.fit(X_train_fold, y_train_fold)
    y_pred = model_rf.predict(X_test_fold)
    mse = mean_squared_error(y_test_fold, y_pred)
    mse_scores.append(mse)

print("Mean Squared Error scores:", mse_scores)
print("Average Mean Squared Error:", np.mean(mse_scores))

Mean Squared Error scores: [0.9075844123619664, 0.9639768429417055, 0.858983710582884, 0.8927400782718232, 0.8990045095513161]
Average Mean Squared Error: 0.9044579107419389


In [22]:
# Randomized Grid search
param_grid = {
    'n_estimators': [50,100,200,500],
    'max_depth': range(1,21)
}
grid_search = RandomizedSearchCV(estimator=model_rf, param_distributions=param_grid, n_iter=16, cv=tscv, scoring='neg_mean_squared_error', random_state=42) # negative because grid search expects a score to be maximized
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Negative Mean Squared Error:", best_score)

Best Parameters: {'n_estimators': 100, 'max_depth': 12}
Best Negative Mean Squared Error: 0.8725831269027831


best_params_ in scikit-learn is a naming convention used to indicate that these attributes are generated or set during the fitting process of the estimator. This convention helps differentiate these attributes from the parameters that are passed to the estimator at initialization. The underscore signifies that the attribute is a result of the fitting process and is not available until the estimator has been fitted. This helps avoid confusion with parameters set at initialization.

Strategy of choosing n-iter:
A common starting point is to use 10% of the total possible combinations in the hyperparameter space, but this can be adjusted based on available resources and specific requirements.

In [23]:
# Evaluate on test set
best_model_rf = grid_search.best_estimator_
best_model_rf.fit(X_train, y_train)
y_pred = best_model_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

In [24]:
print("MSE:", mse)

MSE: 0.9688979893883821


# **Support Vector Regression**

In [25]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
# Define the SVR model
model_svr = SVR(kernel='rbf')
mse_scores = []
mape_scores = []
for train_index, test_index in tscv.split(X_train):
    X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

    model_svr.fit(X_train_fold, y_train_fold)
    y_pred = model_svr.predict(X_test_fold)
    mse = mean_squared_error(y_test_fold, y_pred)
    mse_scores.append(mse)

print("Mean Squared Error scores:", mse_scores)
print("Average Mean Squared Error:", np.mean(mse_scores))

Mean Squared Error scores: [0.8698152687550806, 0.8977634271998568, 0.8862300467754946, 0.9127726555827897, 0.9274369068490458]
Average Mean Squared Error: 0.8988036610324535


In [26]:
param_grid = {
    'epsilon': [0.01, 0.1, 0.5, 1.0]
}
grid_search = GridSearchCV(estimator=model_svr, param_grid=param_grid, cv=tscv, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Get the best parameters and score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert back to positive because we used neg_mean_squared_error

print("Best Parameters:", best_params)
print("Best Mean Squared Error:", best_score)

Best Parameters: {'epsilon': 0.5}
Best Mean Squared Error: 0.897965133400257


In [27]:
# Evaluate on test set
best_model_svr = grid_search.best_estimator_
best_model_svr.fit(X_train, y_train)
y_pred = best_model_svr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

In [28]:
print("MSE:", mse)

MSE: 0.8977970130210812


Support Vector Regression is the best model given the lowest MSE result