In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

## Data Preprocessing

In [2]:
df = pd.read_csv("walmart_cleaned.csv") #load walmart data into dataframe

In [3]:
df = df.loc[df["Store"] == 39] #filter data to only include Store 39 cause (128 mod 45 ) + 1 

In [4]:
df = df.loc[df["Dept"].isin([1, 17, 26])]

In [5]:
df.drop(['Unnamed: 0',"Store","Next week","MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"],axis=1,inplace=True)

In [6]:
df = df.loc[df['Weekly_Sales'] > 0]
df.shape

(429, 10)

In [7]:
from datetime import datetime

df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, '%d-%m-%y'))#more usable format for time series analysis

In [8]:
df.sort_values(by='Date', ascending=True, inplace=True) #sort the df by Date in ascending order

In [9]:
# computing the range of Date
oldest_date=df['Date'].iloc[0]
latest_date=df['Date'].iloc[-1]
date_range = pd.date_range(start=oldest_date, end=latest_date, freq='Y')
three_years_before = latest_date - pd.Timedelta(days=365*3)

### Data separated by one train year, one test year and the last validation

In [10]:
train_start = oldest_date
train_end = oldest_date + pd.Timedelta(days=365)

test_start = train_end
test_end = test_start + pd.Timedelta(days=365)

val_start = test_end
val_end = latest_date

train_df = df[(df['Date'] >= train_start) & (df['Date'] <= train_end)]
test_df = df[(df['Date'] >= test_start) & (df['Date'] <= test_end)]
val_df = df[(df['Date'] >= val_start) & (df['Date'] <= val_end)]

In [11]:
train_df

Unnamed: 0,Date,IsHoliday,Dept,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Type,Size
361077,2010-02-05,0,1,21244.50,44.30,2.572,209.852966,8.554,3,184109
361113,2010-02-05,0,26,8126.40,44.30,2.572,209.852966,8.554,3,184109
361133,2010-02-05,0,17,11486.59,44.30,2.572,209.852966,8.554,3,184109
361186,2010-02-12,1,1,39584.16,44.58,2.548,209.997021,8.554,3,184109
361194,2010-02-12,1,26,7651.69,44.58,2.548,209.997021,8.554,3,184109
...,...,...,...,...,...,...,...,...,...,...
364632,2011-01-28,0,1,14960.31,47.94,3.010,210.968241,8.395,3,184109
364647,2011-01-28,0,26,6480.16,47.94,3.010,210.968241,8.395,3,184109
364729,2011-02-04,0,26,7990.14,45.96,2.989,211.333375,8.395,3,184109
364695,2011-02-04,0,1,19488.92,45.96,2.989,211.333375,8.395,3,184109


In [12]:
print(f"Train set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"Validation set shape: {val_df.shape}")

Train set shape: (159, 10)
Test set shape: (156, 10)
Validation set shape: (114, 10)


Split into features and target

In [13]:
def create_target(df):
    df['Next_Week'] = df.groupby(['Dept'])['Weekly_Sales'].shift(-1)
    df.dropna(inplace=True)
    return df

train_df = create_target(train_df)
test_df = create_target(test_df)
val_df = create_target(val_df)

X_train = train_df.drop(['Next_Week','Date'], axis=1)
y_train = train_df['Next_Week']

X_test = test_df.drop(['Next_Week','Date'], axis=1)
y_test = test_df['Next_Week']

X_val = val_df.drop(['Next_Week','Date'], axis=1)
y_val = val_df['Next_Week']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next_Week'] = df.groupby(['Dept'])['Weekly_Sales'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Next_Week'] = df.groupby(['Dept'])['Weekly_Sales'].shift(-1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: htt

## Training

### Hyperparameter tuning - Custom Grid Search

In [16]:
from sklearn.metrics import mean_absolute_error

param_grid = {'n_estimators': [100, 200, 300],
              'max_depth': [None, 10, 20]}

best_params = None
best_val_mae = float('inf')

for n_estimators in param_grid['n_estimators']:
    for max_depth in param_grid['max_depth']:
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1)
        model.fit(X_train, y_train)
        
        y_val_pred = model.predict(X_val)
        val_mae = mean_absolute_error(y_val, y_val_pred)
        
        if val_mae < best_val_mae:
            best_params = {'n_estimators': n_estimators, 'max_depth': max_depth}
            best_val_mae = val_mae

print('Best parameters:', best_params)
print('Best validation MAE:', best_val_mae)

Best parameters: {'n_estimators': 100, 'max_depth': 20}
Best validation MAE: 2540.880620720723


Apply Best Hyperparameters

In [17]:
model = RandomForestRegressor(n_estimators=200, max_depth=10, n_jobs=-1)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, n_estimators=200, n_jobs=-1)

### Prediction

Prediction for test set

In [18]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

y_pred_test = model.predict(X_test)

In [19]:
#evaluate the model on the test set
y_test_pred = model.predict(X_test)
test_mape = mean_absolute_percentage_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print(f"Test set MAPE: {test_mape}")
print(f"Test set MAE: {test_mae}")

Test set MAPE: 0.16137990871856508
Test set MAE: 2818.3162703719954


Prediction for Store 39

In [20]:
y_pred=model.predict(df.drop(['Date'],axis=1))

### Create csv

In [21]:
df["Next Week"]= y_pred

In [22]:
df.to_csv("walmart_cleaned_Store39.csv", index=False)