In [1]:
# Load IPython's autoreload extension.
%load_ext autoreload
# Mode 2: edits to imported local .py modules are picked up live, without restarting the kernel.
%autoreload 2

In [2]:
import os

# Move the working directory one level up (presumably to the project root --
# TODO confirm) exactly once per kernel session. A bare os.chdir('..') climbs
# one more directory every time this cell is re-run, which silently breaks the
# relative data path and the package imports used further down.
if not globals().get('_CWD_MOVED_TO_PARENT', False):
    os.chdir('..')
    _CWD_MOVED_TO_PARENT = True

In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing, metrics
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sales_forecasting.utils import timeseries_split
from sales_forecasting.plot import plot_timeseries, plot_feature_importance
from sales_forecasting.features import col_name

In [4]:
# Read the input table from a parquet file; the path is relative to the
# current working directory.
df = pd.read_parquet(path=".data/df_agg_monthly_oversampled.parquet")

In [5]:
# Two splits of the same frame on 'date_block_num' via the project helper:
# one with cut-off 33 (train/valid) and one with cut-off 34 (train/test).
# NOTE(review): the exact meaning of `max_month` and `continuous` lives in
# sales_forecasting.utils.timeseries_split -- confirm there.
train_split, valid_split = timeseries_split(
    df,
    max_month=33,
    col='date_block_num',
    continuous=False,
)
train_test_split, test_split = timeseries_split(
    df,
    max_month=34,
    col='date_block_num',
    continuous=False,
)

In [6]:
target_col = 'item_cnt_month'

# Targets for both splits, clipped into the range [0, 20].
train_target = train_split[target_col].clip(0, 20)
valid_target = valid_split[target_col].clip(0, 20)

In [7]:
# Columns removed before modelling: the target itself plus three other columns.
cols_to_drop = [
    target_col,
    'date_block_num',
    'shop_id',
    'item_id',
]

X_train = train_split.drop(columns=cols_to_drop)
X_valid = valid_split.drop(columns=cols_to_drop)

# Naive model

In [None]:
class NaiveMeanModel:
    """Baseline regressor that predicts the training-target mean for every row.

    Parameters
    ----------
    X : array-like
        Training features. Stored on the instance but not used to compute
        the prediction.
    y : object exposing ``.mean()``
        Training target; its mean becomes the constant prediction.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def fit(self):
        """Store the mean of ``y`` as ``self.mean`` and return ``self``.

        Returning the instance allows chaining, e.g.
        ``NaiveMeanModel(X, y).fit().predict(X_new)``.
        """
        self.mean = self.y.mean()
        return self

    def transform(self, X):
        """Return a 1-D array holding the fitted mean once per row of ``X``.

        Raises
        ------
        AttributeError
            If called before :meth:`fit`.
        """
        if not hasattr(self, "mean"):
            raise AttributeError(
                "NaiveMeanModel is not fitted yet; call fit() before predict()/transform()."
            )
        # Prefer .shape[0] (arrays, DataFrames, sparse matrices); fall back to
        # len() so plain Python sequences are accepted as well.
        n_rows = X.shape[0] if hasattr(X, "shape") else len(X)
        return np.repeat(self.mean, n_rows)

    def predict(self, X):
        """Alias of :meth:`transform`: constant prediction for each row of ``X``."""
        return self.transform(X)
    
# Baseline score: constant mean prediction, clipped to [0, 20] like the
# target, evaluated with RMSE on the validation split.
model = NaiveMeanModel(X=X_train, y=train_target)
model.fit()
y_pred = model.predict(X_valid).clip(min=0, max=20)
metrics.root_mean_squared_error(y_true=valid_target.values, y_pred=y_pred)

np.float64(0.5088543126318588)

# Linear Regression

In [8]:
from sklearn.compose import ColumnTransformer

# Columns that get one-hot encoded.
ohe_cols = [
    'city_id',
    'item_category_id',
    'general_item_category_id',
    'date_month',
]

# Columns that get standardised: names produced by `col_name` for "lagged"
# with 1..11 and 'rolling' with 3/6/9, plus 'months_since_last_buy'.
num_cols = (
    list(col_name("lagged", list(range(1, 12))))
    + list(col_name('rolling', [3, 6, 9]))
    + ['months_since_last_buy']
)

# Scale the numeric columns and one-hot encode the categorical ones;
# categories unseen during fitting are ignored at transform time.
preprocessor = ColumnTransformer(transformers=[
    ('num', preprocessing.StandardScaler(), num_cols),
    ('cat', preprocessing.OneHotEncoder(handle_unknown='ignore'), ohe_cols),
])

# Fit on the training features only, then apply the fitted transform to validation.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_valid_preprocessed = preprocessor.transform(X_valid)

In [46]:
# Ordinary least squares on the preprocessed features; predictions are clipped
# to [0, 20] before computing validation RMSE.
model = LinearRegression().fit(X_train_preprocessed, train_target.values)
y_pred = model.predict(X_valid_preprocessed).clip(min=0, max=20)
metrics.root_mean_squared_error(y_true=valid_target.values, y_pred=y_pred)

np.float64(0.4580079547332044)

# Decision Tree

In [51]:
# Depth-limited regression tree. `random_state` is fixed because scikit-learn
# permutes features at every split, so ties between equally good splits can
# otherwise resolve differently from run to run and change the score.
model = DecisionTreeRegressor(max_depth=10, random_state=42)
model.fit(X_train_preprocessed, train_target.values)
# Clip predictions to the same [0, 20] range as the target before scoring.
y_pred = model.predict(X_valid_preprocessed).clip(0, 20)
metrics.root_mean_squared_error(valid_target.values, y_pred)

np.float64(0.4625788347191035)

# Random Forest

In [None]:
# Small random forest (10 trees). `random_state` is fixed so that bootstrap
# sampling and per-split feature selection -- and therefore the reported RMSE --
# are reproducible across kernel restarts.
model = RandomForestRegressor(n_estimators=10, random_state=42)
model.fit(X_train_preprocessed, train_target.values)
# Clip predictions to the same [0, 20] range as the target before scoring.
y_pred = model.predict(X_valid_preprocessed).clip(0, 20)
metrics.root_mean_squared_error(valid_target.values, y_pred)

# MLP

In [9]:
# Single hidden layer of 256 units, capped at 10 training iterations.
# `random_state` is fixed so that weight initialisation and batch shuffling --
# and therefore the loss trace and the reported RMSE -- are reproducible.
model = MLPRegressor(
    max_iter=10,
    hidden_layer_sizes=[256, ],
    verbose=True,
    random_state=42,
)
model.fit(X_train_preprocessed, train_target.values)
# Clip predictions to the same [0, 20] range as the target before scoring.
y_pred = model.predict(X_valid_preprocessed).clip(0, 20)
metrics.root_mean_squared_error(valid_target.values, y_pred)

Iteration 1, loss = 0.68171471
Iteration 2, loss = 0.82266553
Iteration 3, loss = 0.53446429
Iteration 4, loss = 0.19279754
Iteration 5, loss = 0.11174258
Iteration 6, loss = 0.09988425
Iteration 7, loss = 0.09336469




np.float64(0.42443140968739057)