In [189]:

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt


In [190]:
DATA_DIR    = "/kaggle/input/walmart-sales-forecasting"  # e.g., "/kaggle/input/walmart-recruiting-store-sales-forecasting"
TRAIN_FILE  = os.path.join(DATA_DIR, "/kaggle/input/walmart-sales-forecasting/clean_data.csv")     # must have Date, Store, Dept, Weekly_Sales


TARGET_COL = "Weekly_Sales"

In [191]:
df = pd.read_csv(TRAIN_FILE)
print("Train shape:", df.shape)
print("Columns:", df.columns.tolist())

Train shape: (420212, 24)
Columns: ['Unnamed: 0', 'Store', 'Dept', 'Date', 'Weekly_Sales', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Super_Bowl', 'Labor_Day', 'Thanksgiving', 'Christmas', 'week', 'month', 'year']


In [192]:
# Ensure Date is datetime
df["Date"] = pd.to_datetime(df["Date"])

In [193]:
# 2) Choose a single Store/Dept
# -----------------------------
pair_counts = df.groupby(["Store", "Dept"]).size().sort_values(ascending=False)
top_store, top_dept = pair_counts.index[0]
print(f"Using Store={top_store}, Dept={top_dept}")
# 2) Choose a single Store/Dept
# -----------------------------
df = df[(df["Store"] == top_store) & (df["Dept"] == top_dept)].copy()
df = df.sort_values("Date").reset_index(drop=True)
print(f"Filtered data shape: {df.shape}")

Using Store=45, Dept=97
Filtered data shape: (143, 24)


In [194]:
df = df[(df["Store"] == top_store) & (df["Dept"] == top_dept)].copy()
df = df.sort_values("Date").reset_index(drop=True)

In [195]:
df = df[(df["Store"] == top_store) & (df["Dept"] == top_dept)].copy()
df = df.sort_values("Date").reset_index(drop=True)

In [196]:
# 3) Feature Engineering
# -----------------------------
def make_time_features(frame):
    f = frame.copy()
    f["Year"] = f["Date"].dt.year
    f["Month"] = f["Date"].dt.month
    f["Week"] = f["Date"].dt.isocalendar().week.astype(int)
    f["DayOfWeek"] = f["Date"].dt.dayofweek
    f["IsMonthStart"] = f["Date"].dt.is_month_start.astype(int)
    f["IsMonthEnd"] = f["Date"].dt.is_month_end.astype(int)
    return f

In [197]:
df = make_time_features(df)
df = df.sort_values("Date")

In [198]:

# Lags - only create lags if we have enough data
for lag in [1, 2, 4, 7, 13, 26, 52]:
    if len(df) > lag:  # Only create lag if we have enough data
        df[f"lag_{lag}"] = df[TARGET_COL].shift(lag)
    else:
        print(f"Skipping lag_{lag} - not enough data")


In [199]:
# Rolling means - only create if we have enough data
if len(df) >= 4:
    df["roll_mean_4"] = df[TARGET_COL].shift(1).rolling(window=4, min_periods=1).mean()
if len(df) >= 13:
    df["roll_mean_13"] = df[TARGET_COL].shift(1).rolling(window=13, min_periods=1).mean()
if len(df) >= 52:
    df["roll_mean_52"] = df[TARGET_COL].shift(1).rolling(window=52, min_periods=1).mean()

In [200]:
# Drop NaNs
df = df.dropna().reset_index(drop=True)
print(f"Data shape after dropping NaNs: {df.shape}")

if len(df) == 0:
    raise ValueError("No data left after dropping NaNs. Try using a different Store/Dept combination.")


Data shape after dropping NaNs: (91, 40)


In [201]:
# 4) Train/Validation Split
# -----------------------------
df = df.sort_values("Date")
cutoff_index = int(len(df) * 0.8)
train_df = df.iloc[:cutoff_index].copy()
val_df   = df.iloc[cutoff_index:].copy()

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")

Train size: 72, Validation size: 19


In [202]:
# 5) Features
# -----------------------------
drop_cols = ["Date", TARGET_COL, "Store", "Dept"]
feature_cols = [c for c in df.columns if c not in drop_cols]

In [203]:
X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]
X_val   = val_df[feature_cols]
y_val   = val_df[TARGET_COL]

In [204]:
print(f"Feature columns: {feature_cols}")
print(f"X_train shape: {X_train.shape}, X_val shape: {X_val.shape}")


Feature columns: ['Unnamed: 0', 'IsHoliday', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Type', 'Size', 'Super_Bowl', 'Labor_Day', 'Thanksgiving', 'Christmas', 'week', 'month', 'year', 'Year', 'Month', 'Week', 'DayOfWeek', 'IsMonthStart', 'IsMonthEnd', 'lag_1', 'lag_2', 'lag_4', 'lag_7', 'lag_13', 'lag_26', 'lag_52', 'roll_mean_4', 'roll_mean_13', 'roll_mean_52']
X_train shape: (72, 36), X_val shape: (19, 36)


In [205]:
# Preprocessing - handle only numerical columns
num_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

In [206]:

print(f"Numerical columns: {num_cols}")
print(f"Categorical columns: {cat_cols}")


Numerical columns: ['Unnamed: 0', 'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size', 'week', 'month', 'year', 'Year', 'Month', 'Week', 'DayOfWeek', 'IsMonthStart', 'IsMonthEnd', 'lag_1', 'lag_2', 'lag_4', 'lag_7', 'lag_13', 'lag_26', 'lag_52', 'roll_mean_4', 'roll_mean_13', 'roll_mean_52']
Categorical columns: ['Type']


In [207]:
 #Only create transformers for columns that exist
transformers = []

if num_cols:
    num_tf = Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc", StandardScaler())
    ])
    transformers.append(("num", num_tf, num_cols))

if cat_cols:
    # For categorical columns, we need to encode them
    from sklearn.preprocessing import OneHotEncoder
    cat_tf = Pipeline([
        ("imp", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])
    transformers.append(("cat", cat_tf, cat_cols))

if not transformers:
    raise ValueError("No features available for modeling")

pre = ColumnTransformer(transformers)

In [208]:
# 6) Model Training
# -----------------------------
def fit_eval(name, model, Xtr, ytr, Xva, yva):
    pipe = Pipeline([("pre", pre), ("model", model)])
    pipe.fit(Xtr, ytr)
    pred = pipe.predict(Xva)
    rmse = mean_squared_error(yva, pred, squared=False)
    mae  = mean_absolute_error(yva, pred)
    mape = np.mean(np.abs((yva - pred) / np.clip(np.abs(yva), 1e-8, None))) * 100.0
    print(f"{name} | RMSE: {rmse:,.2f}  MAE: {mae:,.2f}  MAPE: {mape:,.2f}%")
    return pipe, pd.DataFrame({"Date": val_df["Date"].values, "Actual": yva.values, "Pred": pred})


In [209]:
lin_pipe, lin_pred_df = fit_eval("LinearRegression", LinearRegression(), X_train, y_train, X_val, y_val)


LinearRegression | RMSE: 944.34  MAE: 825.41  MAPE: 12.77%


In [210]:
def plot_actual_vs_pred(df_pred, title):
    dfp = df_pred.sort_values("Date")
    plt.figure(figsize=(10,4))
    plt.plot(dfp["Date"], dfp["Actual"], label="Actual")
    plt.plot(dfp["Date"], dfp["Pred"], label="Predicted")
    plt.title(title)
    plt.xlabel("Date")
    plt.ylabel("Weekly Sales")
    plt.legend()
    plt.tight_layout()
    plt.show()
    plot_actual_vs_pred(lin_pred_df, "LinearRegression: Actual vs. Predicted (Validation)")

In [211]:
def fit_eval(name, model, Xtr, ytr, Xva, yva):
    pipe = Pipeline([("pre", pre), ("model", model)])
    pipe.fit(Xtr, ytr)
    pred = pipe.predict(Xva)
    rmse = mean_squared_error(yva, pred, squared=False)
    print(f"{name} RMSE: {rmse:.4f}")
    return pipe, pred

In [212]:
out_path = "walmart_val_predictions_linear.csv"
lin_pred_df.to_csv(out_path, index=False)
print(f"Saved validation predictions -> {out_path}")

Saved validation predictions -> walmart_val_predictions_linear.csv
