# Modeling Notebook

End-to-end notebook: feature engineering, scaling, training (RandomForest + XGBoost), evaluation, and model export.

**Paths and outputs:**
- Input: `cleaned_engineered_data.csv` in the data folder
- Outputs: `walmart_sales_predictions.csv`, `rf_model.pkl`, `xgb_model.pkl`, `scaler.pkl`

Run cells top → bottom.

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import joblib


In [41]:
df = pd.read_csv("data/cleaned_engineered_data_cleaned.csv")
print("Data shape:", df.shape)
display(df.head())


Data shape: (10000, 24)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,IsHoliday,Month,Week,Year,Day,DayOfWeek,Weekly_Sales_Rolling4,Sales_vs_StoreMean,Type_B,Type_C
0,1,1,2010-02-05,24924.5,42.31,2.572,0.0,0.0,0.0,0.0,...,0,2,5,2010,5,4,,1.338132,0,0
1,1,1,2010-02-12,46039.49,38.51,2.548,0.0,0.0,0.0,0.0,...,1,2,6,2010,12,4,,2.471742,0,0
2,1,1,2010-02-19,41595.55,39.93,2.514,0.0,0.0,0.0,0.0,...,0,2,7,2010,19,4,,2.233158,0,0
3,1,1,2010-02-26,19403.54,46.63,2.561,0.0,0.0,0.0,0.0,...,0,2,8,2010,26,4,32990.77,1.041726,0,0
4,1,1,2010-03-05,21827.9,46.5,2.625,0.0,0.0,0.0,0.0,...,0,3,9,2010,5,4,32216.62,1.171884,0,0


In [48]:
print("Columns with NaN in X_train:")
print(X_train.isna().sum()[X_train.isna().sum() > 0])


Columns with NaN in X_train:
Weekly_Sales_Rolling4    185
dtype: int64


In [49]:
numeric_cols = X_train.select_dtypes(include=np.number).columns

# Fill missing numeric values with median
for col in numeric_cols:
    median_val = X_train[col].median()
    X_train[col] = X_train[col].fillna(median_val)
    X_test[col] = X_test[col].fillna(median_val)


In [42]:
target_col = "Weekly_Sales"
X = df.drop(target_col, axis=1)
y = df[target_col]


In [43]:
# Convert Date to numeric features first
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Day'] = df['Date'].dt.day
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df = df.drop('Date', axis=1)

# Encode any remaining categorical string columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Now define X and y
target_col = "Weekly_Sales"
X = df.drop(target_col, axis=1)
y = df[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [44]:
numeric_cols = X_train.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


In [45]:
print(X_train.select_dtypes(include=['object']).columns)


Index([], dtype='object')


In [50]:
# Linear Regression
lin_model = LinearRegression()
lin_model.fit(X_train, y_train)

# Random Forest
rf_model = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)


0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [51]:
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mae, rmse, r2

print("Linear Regression:", evaluate(lin_model, X_test, y_test))
print("Random Forest:", evaluate(rf_model, X_test, y_test))


Linear Regression: (25.918898715197138, np.float64(162.75334303716983), 0.9999353341715038)
Random Forest: (24.184958325000856, np.float64(202.89634296327642), 0.9998995006046615)


In [52]:
importances = rf_model.feature_importances_
feat_imp = sorted(zip(X_train.columns, importances), key=lambda x: x[1], reverse=True)[:10]
print("\nTop 10 feature importances (Random Forest):")
for f, imp in feat_imp:
    print(f"{f}: {imp:.4f}")



Top 10 feature importances (Random Forest):
Sales_vs_StoreMean: 0.9996
Store: 0.0002
Size: 0.0001
Unemployment: 0.0001
Dept: 0.0000
Temperature: 0.0000
CPI: 0.0000
Weekly_Sales_Rolling4: 0.0000
Day: 0.0000
Fuel_Price: 0.0000


In [53]:
joblib.dump(rf_model, "rf_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(list(X.columns), "input_features.pkl")


['input_features.pkl']