In [60]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV

In [61]:
# Import and initial analysis
df = pd.read_csv('sales_predictions_2023.csv')

# Data exploration and preprocessing
df['Item_Fat_Content'].replace({'LF':'Low Fat', 'low fat':'Low Fat', 'reg':'Regular'}, inplace=True)

In [62]:
# Handling missing values using K-Means clustering for Outlet_Size
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42)



In [63]:
def fill_outlet_size(df, features):
    # Separate the rows with and without missing Outlet_Size
    train_not_missing = df[df['Outlet_Size'].notna()]
    train_missing = df[df['Outlet_Size'].isna()]
    
    # Fit KMeans on the non-missing Outlet_Size rows
    kmeans.fit(train_not_missing[features])
    
    # Predict cluster for the missing Outlet_Size rows
    train_missing.loc[:, 'cluster'] = kmeans.predict(train_missing[features])
    
    # Map clusters to Outlet_Size categories
    cluster_size_map = {0: 'Small', 1: 'Medium', 2: 'High'}
    train_missing.loc[:, 'Outlet_Size'] = train_missing['cluster'].map(cluster_size_map)
    
    # Concatenate and drop the 'cluster' column
    df_filled = pd.DataFrame(pd.concat([train_not_missing, train_missing]).drop(columns='cluster'))
    
    return df_filled


In [64]:
df_filled = fill_outlet_size(df, ['Item_Visibility'])

In [65]:
df_filled.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [66]:
df_filled.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [67]:
df_filled['Outlet_Age'] = 2024 - df_filled['Outlet_Establishment_Year']
df_filled

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380,25
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,15
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700,25
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,37
5,FDP36,10.395,Regular,0.000000,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8502,NCH43,8.420,Low Fat,0.070712,Household,216.4192,OUT045,2002,Small,Tier 2,Supermarket Type1,3020.0688,22
8508,FDW31,11.350,Regular,0.043246,Fruits and Vegetables,199.4742,OUT045,2002,Medium,Tier 2,Supermarket Type1,2587.9646,22
8509,FDG45,8.100,Low Fat,0.214306,Fruits and Vegetables,213.9902,OUT010,1998,High,Tier 3,Grocery Store,424.7804,26
8514,FDA01,15.000,Regular,0.054489,Canned,57.5904,OUT045,2002,Small,Tier 2,Supermarket Type1,468.7232,22


In [68]:
columns2drop = ['Item_Identifier','Outlet_Identifier','Item_Outlet_Sales','Outlet_Establishment_Year']

In [69]:
# Split the data
X = df_filled.drop(columns=columns2drop)
y = df_filled['Item_Outlet_Sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipelines for numerical and categorical features
num_cols = X_train.select_dtypes('number').columns
cat_cols = X_train.drop(columns='Outlet_Size').select_dtypes('object').columns

num_pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())
ord_pipe = make_pipeline(OrdinalEncoder(categories=[['Small', 'Medium', 'High']]))
cat_pipe = make_pipeline(OneHotEncoder(handle_unknown='ignore', sparse_output=False))

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, num_cols[:-1]),  # excluding 'Item_Outlet_Sales'
        ('ord', ord_pipe, ['Outlet_Size']),
        ('cat', cat_pipe, cat_cols[:-1])  # excluding 'Outlet_Size'
    ])

# Define models
xgb_model = XGBRegressor(random_state=42)
catboost_model = CatBoostRegressor(silent=True, random_state=42)
lightgbm_model = LGBMRegressor(random_state=42)
linear_model = LinearRegression()
ridge_model = Ridge()
lasso_model = Lasso()

# Stacking Regressor
stacking_reg = StackingRegressor(
    estimators=[('xgb', xgb_model), ('catboost', catboost_model), ('lightgbm', lightgbm_model), ('ridge', ridge_model)],
    final_estimator=linear_model)

# Train models
models = [xgb_model, catboost_model, lightgbm_model, linear_model, ridge_model, lasso_model, stacking_reg]
results = {}

for model in models:
    model_pipeline = make_pipeline(preprocessor, model)
    model_pipeline.fit(X_train, y_train)
    
    # Predictions
    y_train_pred = model_pipeline.predict(X_train)
    y_test_pred = model_pipeline.predict(X_test)
    
    # Metrics
    train_metrics = {
        'MAE': mean_absolute_error(y_train, y_train_pred),
        'MSE': mean_squared_error(y_train, y_train_pred),
        'RMSE': mean_squared_error(y_train, y_train_pred, squared=False),
        'R^2': r2_score(y_train, y_train_pred)
    }
    
    test_metrics = {
        'MAE': mean_absolute_error(y_test, y_test_pred),
        'MSE': mean_squared_error(y_test, y_test_pred),
        'RMSE': mean_squared_error(y_test, y_test_pred, squared=False),
        'R^2': r2_score(y_test, y_test_pred)
    }
    
    results[model.__class__.__name__] = {'Training Metrics': train_metrics, 'Test Metrics': test_metrics}

# Display the results
for model_name, metrics in results.items():
    print(f"\nModel: {model_name}")
    print(f"Training Metrics: {metrics['Training Metrics']}")
    print(f"Test Metrics: {metrics['Test Metrics']}")


Model: XGBRegressor
Training Metrics: {'MAE': 505.3809471660781, 'MSE': 478593.081167028, 'RMSE': 691.8042217036754, 'R^2': 0.8363822140082481}
Test Metrics: {'MAE': 881.6823324027464, 'MSE': 1509319.3797944104, 'RMSE': 1228.5436010961964, 'R^2': 0.47202195369581257}

Model: CatBoostRegressor
Training Metrics: {'MAE': 650.965573094335, 'MSE': 793643.6576458479, 'RMSE': 890.8668012929026, 'R^2': 0.7286751036731124}
Test Metrics: {'MAE': 836.6826456190486, 'MSE': 1354215.4019463267, 'RMSE': 1163.7076101608714, 'R^2': 0.5262791879793834}

Model: LGBMRegressor
Training Metrics: {'MAE': 697.8485322067335, 'MSE': 913312.8440260444, 'RMSE': 955.6740260287733, 'R^2': 0.6877635065409159}
Test Metrics: {'MAE': 830.7510833111508, 'MSE': 1315984.9172746004, 'RMSE': 1147.1638580754714, 'R^2': 0.5396526706739406}

Model: LinearRegression
Training Metrics: {'MAE': 1021.0161233735147, 'MSE': 1904873.8024487554, 'RMSE': 1380.1716568777797, 'R^2': 0.3487761390315992}
Test Metrics: {'MAE': 1016.84072722

In [70]:


xgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 1000, 2000],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_grid = GridSearchCV(estimator=xgb_model, param_grid=xgb_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
xgb_grid.fit(preprocessor.transform(X_train), y_train)

# Afficher les meilleurs hyperparamètres et le score
print("Best XGBRegressor Params:", xgb_grid.best_params_)
print("Best XGBRegressor Score:", -xgb_grid.best_score_)


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best XGBRegressor Params: {'colsample_bytree': 0.8, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000, 'subsample': 0.8}
Best XGBRegressor Score: 1385862.891346199


In [71]:
from catboost import CatBoostRegressor

cat_param_grid = {
    'depth': [6, 8, 10],
    'iterations': [1000, 2000],
    'learning_rate': [0.01, 0.03, 0.1],
    'l2_leaf_reg': [3, 5, 7]
}
cat_grid = GridSearchCV(estimator=catboost_model, param_grid=cat_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
cat_grid.fit(preprocessor.transform(X_train), y_train)

# Afficher les meilleurs hyperparamètres et le score
print("Best CatBoostRegressor Params:", cat_grid.best_params_)
print("Best CatBoostRegressor Score:", -cat_grid.best_score_)


Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best CatBoostRegressor Params: {'depth': 6, 'iterations': 1000, 'l2_leaf_reg': 5, 'learning_rate': 0.01}
Best CatBoostRegressor Score: 1331956.7418159358


In [73]:
from lightgbm import LGBMRegressor

lgb_param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [500, 1000, 2000],
    'max_depth': [3, 5, 7]
}
lgb_grid = GridSearchCV(estimator=lightgbm_model, param_grid=lgb_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
lgb_grid.fit(preprocessor.transform(X_train), y_train)

# Afficher les meilleurs hyperparamètres et le score
print("Best LGBMRegressor Params:", lgb_grid.best_params_)
print("Best LGBMRegressor Score:", -lgb_grid.best_score_)


Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best LGBMRegressor Params: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500, 'num_leaves': 31}
Best LGBMRegressor Score: 1367420.4340220776


In [75]:
from sklearn.linear_model import Ridge

ridge_param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}
ridge_grid = GridSearchCV(estimator=ridge_model, param_grid=ridge_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
ridge_grid.fit(preprocessor.transform(X_train), y_train)

# Afficher les meilleurs hyperparamètres et le score
print("Best Ridge Params:", ridge_grid.best_params_)
print("Best Ridge Score:", -ridge_grid.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Ridge Params: {'alpha': 100.0}
Best Ridge Score: 1917591.031322245


In [76]:
from sklearn.linear_model import Lasso

lasso_param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}
lasso_grid = GridSearchCV(estimator=lasso_model, param_grid=lasso_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
lasso_grid.fit(preprocessor.transform(X_train), y_train)

# Afficher les meilleurs hyperparamètres et le score
print("Best Lasso Params:", lasso_grid.best_params_)
print("Best Lasso Score:", -lasso_grid.best_score_)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
Best Lasso Params: {'alpha': 10.0}
Best Lasso Score: 1914038.177044272


In [77]:
from sklearn.ensemble import StackingRegressor

estimators = [
    ('xgb', XGBRegressor(learning_rate=0.1, n_estimators=1000)),
    ('catboost', CatBoostRegressor(learning_rate=0.03, iterations=2000)),
    ('lgbm', LGBMRegressor(learning_rate=0.05, n_estimators=1000))
]

stacking_model = StackingRegressor(estimators=estimators, final_estimator=Ridge())
stacking_grid = GridSearchCV(estimator=stacking_model, param_grid={'final_estimator__alpha': [0.01, 0.1, 1.0]}, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
stacking_grid.fit(preprocessor.transform(X_train), y_train)

# Afficher les meilleurs hyperparamètres et le score
print("Best StackingRegressor Params:", stacking_grid.best_params_)
print("Best StackingRegressor Score:", -stacking_grid.best_score_)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
0:	learn: 1692.2051338	total: 5.26ms	remaining: 10.5s
1:	learn: 1674.4762176	total: 11ms	remaining: 11s
2:	learn: 1658.3575786	total: 15.5ms	remaining: 10.3s
3:	learn: 1642.5709589	total: 19.2ms	remaining: 9.56s
4:	learn: 1623.6372506	total: 22.5ms	remaining: 8.99s
5:	learn: 1604.2718990	total: 26.3ms	remaining: 8.74s
6:	learn: 1590.9426009	total: 29.7ms	remaining: 8.46s
7:	learn: 1576.9350234	total: 32.4ms	remaining: 8.06s
8:	learn: 1563.8205315	total: 34.9ms	remaining: 7.73s
9:	learn: 1545.9376490	total: 37ms	remaining: 7.37s
10:	learn: 1534.9810961	total: 39.1ms	remaining: 7.07s
11:	learn: 1518.3919658	total: 41.8ms	remaining: 6.93s
12:	learn: 1507.7081000	total: 44.2ms	remaining: 6.75s
13:	learn: 1492.0055825	total: 46.5ms	remaining: 6.59s
14:	learn: 1482.1883280	total: 48.5ms	remaining: 6.42s
15:	learn: 1469.2080182	total: 50.5ms	remaining: 6.26s
16:	learn: 1459.4546863	total: 52.5ms	remaining: 6.12s
17:	learn: 1451.59941