In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib
import os


# Business Understanding

## Statement

One of the largest retail chains in the world wants to use their vast data source to build an efficient forecasting model to predict the sales for each SKU in its portfolio at its 76 different stores using historical sales data for the past 3 years on a week-on-week basis. Sales and promotional information is also available for each week - product and store wise.

However, no other information regarding stores and products are available. Can you still forecast accurately the sales values for every such product/SKU-store combination for the next 12 weeks accurately? If yes, then dive right in!

## Data Dictionary

| Field Name        | Data Type / Format                     | Example | Description                                                                                         | Allowed / Expected Range          |
|-------------------|----------------------------------------|---------|-----------------------------------------------------------------------------------------------------|-----------------------------------|
| `record_ID`       | `int64` (sequential key)               | `1`     | Surrogate key, unique for each row. No business meaning.                                            | ≥ 1, unique                        |
| `week`            | `string` date in **YY/MM/DD** (e.g. 17/01/11) | `17/01/11` | Calendar week identifier; usually week‑ending date. Convert to `datetime` for analysis.             | Valid dates                        |
| `store_id`        | `int32`                                | `8091`  | Identifier of the retail outlet. Links to **Store** dimension.                                      | Positive integers                  |
| `sku_id`          | `int32`                                | `216418`| Identifier for the Stock‑Keeping Unit. Links to **Product/SKU** dimension.                          | Positive integers                  |
| `total_price`     | `float64` (currency)                   | `99.0375`| Actual net revenue for the SKU‑store‑week (after discounts).                                        | ≥ 0                                |
| `base_price`      | `float64` (currency)                   | `111.8625`| Regular (list) price for the SKU in that week.                                                      | ≥ 0                                |
| `is_featured_sku` | `int8` (binary flag) → `bool`          | `0`     | Was the SKU advertised in a flyer/e‑mail feature? `1` = Yes, `0` = No.                              | {0, 1}                             |
| `is_display_sku`  | `int8` (binary flag) → `bool`          | `0`     | Did the SKU have an in‑store display? `1` = Yes, `0` = No.                                          | {0, 1}                             |
| `units_sold`      | `int64`                                | `20`    | Quantity sold for the SKU‑store‑week.                                                               | ≥ 0                                |

# EDA

In [11]:
df = pd.read_csv("../data/raw/train.csv")
df.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
0,1,17/01/11,8091,216418,99.0375,111.8625,0,0,20
1,2,17/01/11,8091,216419,99.0375,99.0375,0,0,28
2,3,17/01/11,8091,216425,133.95,133.95,0,0,19
3,4,17/01/11,8091,216233,133.95,133.95,0,0,44
4,5,17/01/11,8091,217390,141.075,141.075,0,0,52


In [12]:
df.describe()

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
count,150150.0,150150.0,150150.0,150149.0,150150.0,150150.0,150150.0,150150.0
mean,106271.555504,9199.422511,254761.132468,206.626751,219.425927,0.095611,0.1332,51.674206
std,61386.037861,615.591445,85547.306447,103.308516,110.961712,0.294058,0.339792,60.207904
min,1.0,8023.0,216233.0,41.325,61.275,0.0,0.0,1.0
25%,53111.25,8562.0,217217.0,130.3875,133.2375,0.0,0.0,20.0
50%,106226.5,9371.0,222087.0,198.075,205.9125,0.0,0.0,35.0
75%,159452.75,9731.0,245338.0,233.7,234.4125,0.0,0.0,62.0
max,212644.0,9984.0,679023.0,562.1625,562.1625,1.0,1.0,2876.0


# Data Preprocessing

In [13]:
df['week'] = pd.to_datetime(df['week'], format='%y/%m/%d')

# Sort by keys to ensure correct lag computation
df = df.sort_values(['store_id', 'sku_id', 'week'])

# Create 12 lag features of units_sold for each SKU‑store combo
for lag in range(1, 13):
    df[f'units_sold_lag_{lag}'] = df.groupby(['store_id', 'sku_id'])['units_sold'].shift(lag)

# Drop rows that don't have full lag history
df_model = df.dropna().copy()

# Feature matrix and target vector
feature_cols = (
    ['store_id', 'sku_id', 'base_price', 'total_price',
     'is_featured_sku', 'is_display_sku'] +
    [f'units_sold_lag_{lag}' for lag in range(1, 13)]
)
X = df_model[feature_cols]
y = df_model['units_sold']


In [20]:
# Store data to the preprocessed folder
saved_data = pd.concat([X, y], axis=1)
saved_data.to_csv("../data/processed/train.csv", index=False)

# Modeling

In [15]:
# Use the last 12 weeks of data as a hold‑out validation set
cutoff_date = df_model['week'].max() - pd.Timedelta(weeks=12)
train_mask = df_model['week'] < cutoff_date
X_train, X_val = X[train_mask], X[~train_mask]
y_train, y_val = y[train_mask], y[~train_mask]

In [16]:
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
    oob_score=False
)

In [17]:
model.fit(X_train, y_train)

In [18]:
val_preds = model.predict(X_val)
mae = mean_absolute_error(y_val, val_preds)
print(f"\nValidation MAE (last 12 weeks): {mae:,.3f}")


Validation MAE (last 12 weeks): 10.967


# Model Exporting

In [19]:
model_path = "../models/random_forest_model.pkl"
joblib.dump(model, model_path)

print(f"\nModel saved to: {model_path}")


Model saved to: ../models/random_forest_model.pkl
