In [110]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [99]:
df = pd.read_csv('data/df.csv', index_col=0)
df = df.sort_values(by='date')
df.shape

(226486, 5)

In [100]:
# Generate features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].apply(lambda x: x.year)
df['month'] = df['date'].apply(lambda x: x.month)
df['day'] = df['date'].apply(lambda x: x.day)

# encode categories
df_categories = pd.get_dummies(df['category_id'])
df_categories.columns = ['category_1', 'category_2', 'category_3']
df = pd.concat([df, df_categories], axis=1).drop('category_id', axis=1)

# calculate growing sums
df['growing_sum_sales_quantity'] = df.groupby('sku_id')['sales_quantity'].cumsum()
df

Unnamed: 0,date,sku_id,sales_price,sales_quantity,year,month,day,category_1,category_2,category_3,growing_sum_sales_quantity
249598,2016-11-18,415510,23.205,1.0,2016,11,18,0,1,0,1.0
249608,2016-11-18,420009,22.490,2.0,2016,11,18,0,1,0,2.0
714145,2016-11-18,567734,23.205,1.0,2016,11,18,0,1,0,1.0
699986,2016-11-18,556333,32.760,2.0,2016,11,18,0,1,0,2.0
683507,2016-11-18,566241,283.400,1.0,2016,11,18,1,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
20699585,2020-10-31,458375,349.700,1.0,2020,10,31,1,0,0,111.0
20699624,2020-10-31,487116,388.700,1.0,2020,10,31,1,0,0,164.0
20699682,2020-10-31,132689,356.200,1.0,2020,10,31,1,0,0,356.0
20688487,2020-10-31,621322,54.600,1.0,2020,10,31,0,1,0,1979.0


In [112]:
# Splitting data into train and test
number_of_dates = df.date.nunique()
dates = df.date.unique()
date_to_split = dates[int(number_of_dates*0.8)]

train = df[df.date < date_to_split].drop('date', axis=1)
test = df[df.date >= date_to_split].drop('date', axis=1)

dates_for_plot = df.loc[df.date >= date_to_split, 'date']

X_train, y_train = train.drop('sales_quantity', axis=1), train['sales_quantity']
X_test, y_test = test.drop('sales_quantity', axis=1), test['sales_quantity']
X_train.shape, y_train.shape

((171604, 9), (171604,))

In [119]:
# Define parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200],  
    'max_depth': [10, 20],       
    'min_samples_split': [5, 10]   
}
model = RandomForestRegressor()

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)

# Perform GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best MAE Score:", best_score)


Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Best MAE Score: -8.317394856519277


In [None]:
# Updated model
model_updated = RandomForestRegressor(max_depth=10, min_samples_split=5, n_estimators=100)
model_updated.fit(X_train, y_train)

# Make predictions
y_pred = model_updated.predict(X_test)

# Evaluate the model
mae_train = mean_absolute_error(y_train, model_updated.predict(X_train))
mae_test = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error Train:", mae_train)
print("Mean Absolute Error Test:", mae_test)

# Visualize predictions
plt.plot(dates_for_plot, y_test, label='Actual')
plt.plot(dates_for_plot, y_pred, label='Predicted')
plt.xlabel('Time')
plt.ylabel('Target Variable')
plt.title('Actual vs Predicted')
plt.legend()
plt.show()