#### Install Necessary Libraries

In [43]:
!pip install gensim
!pip install xgboost
!pip install catboost
!pip install lightgbm 
!pip install seaborn



#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm import tqdm
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
import matplotlib.style as style # for styling the graphs
# style.available (to know the available list of styles)
style.use('ggplot') # chosen style
plt.rc('xtick',labelsize=13) # to globally set the tick size
plt.rc('ytick',labelsize=13) # to globally set the tick size
# To print multiple outputs together
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')
# Change column display number during print
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)

#### Load Dataset

In [2]:
path_1 = './BaseLine-No-feature-engineering-Train.csv'
path_2 = './Baseline-No-Feature-engineering-Test.csv'

In [3]:
train_df = pd.read_csv(path_1)
val_df = pd.read_csv(path_2)

In [5]:
train_df

Unnamed: 0,sku_name,starting_inventory,sellin,sellin_channel_1,sellin_channel_2,sellin_channel_3,sellin_channel_4,sellin_channel_5,sellin_channel_6,sellin_channel_7,sellin_channel_8,sellout,onhand_inventory,leftover_inventory,sellout_channel_1,sellout_channel_2,sellout_channel_3,sellout_channel_4,sellout_channel_5,sellout_channel_6,sellout_channel_7,sellout_channel_8,sellout_channel_9,sellout_channel_10,onhand_inventory_channel_1,onhand_inventory_channel_2,onhand_inventory_channel_3,onhand_inventory_channel_4,onhand_inventory_channel_5,onhand_inventory_channel_6,onhand_inventory_channel_7,onhand_inventory_channel_8,onhand_inventory_channel_9,onhand_inventory_channel_10,price,month,year,product_lifecycle_stage,FLAG100,disc_month,cum_disc,CAT_GENDER_BOTH,CAT_GENDER_MEN,CAT_GENDER_WOMEN,Weeks,product_sku_embedding
0,YOSHTLYNYOSH,0,32416,4052,0,23299,0,0,5065,0,0,28364,572345,4052,0,0,22286,0,2026,3039,0,0,0,1013,0,0,515617,5065,37481,0,0,0,0,14182,245,1,2016,1,0.000000,0,0,0,1,0,1,0.852468
1,YOSHRTHATRAN,0,346446,15195,0,300861,4052,21273,4052,0,0,9117,337329,337329,0,0,2026,0,3039,4052,0,0,0,0,6078,0,298835,4052,16208,0,0,0,3039,9117,105,1,2016,1,0.000000,0,0,0,0,1,1,0.133018
2,YOSHOVANTERR,0,1013,0,0,0,1013,0,0,0,0,0,1013,1013,0,0,0,0,0,0,0,0,0,0,0,0,0,1013,0,0,0,0,0,0,115,1,2016,1,1.000000,0,0,0,0,1,1,0.265031
3,YOSHLROYARTI,0,1113287,13169,0,0,1072767,0,5065,0,0,145872,2194158,967415,98261,7091,15195,1013,1013,1013,8104,0,8104,6078,360628,107378,561202,936012,18234,0,16208,0,177275,17221,115,1,2016,5,0.000000,0,0,0,0,1,1,0.636286
4,YOSHLEENARMA,0,16208,1013,0,0,0,0,0,0,0,350498,1288536,-334290,99274,62806,143846,1013,11143,0,0,0,25325,7091,210704,465980,373797,20260,73949,0,0,3039,132703,8104,125,1,2016,2,0.000000,0,0,0,1,0,1,0.359307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43013,ABEWARDREYZZ,937025,148911,78001,0,46598,0,24312,0,0,0,0,42546,148911,0,0,0,0,0,0,0,0,0,0,0,0,23299,0,0,0,0,0,0,19247,149,7,2021,15,0.000000,0,1,0,1,0,0,0.907726
43014,ABEETTEABE,4861387,774945,320108,422421,0,0,0,17221,3039,0,196522,2584163,578423,146885,0,18234,3039,1013,16208,0,0,0,11143,2354212,0,168158,2026,10130,0,1013,0,0,48624,159,7,2021,7,0.000000,0,1,0,1,0,0,1.763638
43015,ABEENNEARMAZZ,212730,208678,0,138781,0,0,0,56728,0,0,203613,781023,5065,56728,0,82053,0,0,51663,0,0,0,13169,513591,0,223873,10130,14182,0,0,0,0,19247,129,7,2021,2,0.192995,1,1,0,0,1,0,0.109360
43016,ABEANNAONEIZZ,160054,50650,6078,0,0,0,0,41533,0,0,58754,176262,-8104,16208,0,0,0,0,40520,0,0,0,2026,153976,0,0,3039,12156,0,0,0,0,7091,129,7,2021,2,0.888889,1,1,0,0,1,0,0.193657


#### Drop SKU NAME column from dataset - Since it is a text column and we have already created a encoded feature it called as sku_encoded

In [48]:
train_df.drop('sku_name',1,inplace=True)
val_df.drop('sku_name',1,inplace=True)

#### Drop Rows with Negative values in Target Variable

In [49]:
rows_negative_vlaues = train_df[train_df["sellin"]<0]
train_df.drop(rows_negative_vlaues.index, axis=0, inplace = True)

rows_negative_vlaues = val_df[val_df["sellin"]<0]
val_df.drop(rows_negative_vlaues.index, axis=0, inplace = True)

#### Create Baseline Model

In [51]:
import pandas as pd  # Import pandas for DataFrame handling
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, explained_variance_score
from tqdm import tqdm
import numpy as np
from sklearn.linear_model import LinearRegression

# Training features
X = np.array(train_df.drop('sellin', axis=1))
y = np.array(np.log1p(train_df["sellin"]))

# Test features
X_test = np.array(val_df.drop('sellin', axis=1))
y_test = np.array(np.log1p(val_df["sellin"]))

# Define the models
models = [
    ("Decision Tree", DecisionTreeRegressor(random_state=9)),
    ("Linear Regression",LinearRegression()),
     ("Random Forest", RandomForestRegressor(n_estimators=100, random_state=42)),
     ("XGBoost", XGBRegressor(n_estimators = 100,objective='reg:squarederror', random_state=42)),
   ("CatBoost", CatBoostRegressor(n_estimators = 100,verbose=0, random_state=42)),
     ("LightGBM", LGBMRegressor(n_estimators = 100,random_state=42))
]

# Define the KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# DataFrame to store feature importances
feature_importances_df = pd.DataFrame(index=train_df.drop(['sellin'],1).columns)

# Train and evaluate each model
for name, model in tqdm(models, desc="Evaluating Models"):
    rmse_scores = []  # To store RMSE scores for each fold
    mae_scores = []  # To store MAE scores for each fold
    mape_scores = []  # To store MAPE scores for each fold
    evs_scores = []  # To store Explained Variance Scores for each fold
    fold_importances = []  # To store feature importances for each fold
    
    # K-fold cross-validation
    for train_index, val_index in kf.split(X):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold, y_val_fold = y[train_index], y[val_index]

        # Train the model on the fold
        model.fit(X_train_fold, y_train_fold)
        
        # Predict and reverse the log transformation on validation set
        y_pred = np.expm1(model.predict(X_val_fold))
        y_val_original = np.expm1(y_val_fold)
        
        # Calculate metrics
        rmse_scores.append(np.sqrt(mean_squared_error(y_val_original, y_pred)))
        mae_scores.append(mean_absolute_error(y_val_original, y_pred))
        mape_scores.append(mean_absolute_percentage_error(y_val_original, y_pred))
        evs_scores.append(explained_variance_score(y_val_original, y_pred))

        # Store feature importances if supported
        if hasattr(model, "feature_importances_"):
            fold_importances.append(model.feature_importances_)

    # Store average feature importances for the model
    if fold_importances:
        avg_importances = np.mean(fold_importances, axis=0)
        feature_importances_df[name] = avg_importances
    
    # Print metrics
    print(f"{name} Average RMSE: {np.mean(rmse_scores):.4f}")
    print(f"{name} Average MAE: {np.mean(mae_scores):.4f}")
    print(f"{name} Average MAPE: {np.mean(mape_scores):.4f}")
    print(f"{name} Average Explained Variance Score: {np.mean(evs_scores):.4f}")

    # Final evaluation on the test set
    y_pred_test = np.expm1(model.predict(X_test))
    y_test_original = np.expm1(y_test)

    final_rmse = np.sqrt(mean_squared_error(y_test_original, y_pred_test))
    final_mae = mean_absolute_error(y_test_original, y_pred_test)
    final_mape = mean_absolute_percentage_error(y_test_original, y_pred_test)
    final_evs = explained_variance_score(y_test_original, y_pred_test)
    
    print(f"{name} Test RMSE: {final_rmse:.4f}")
    print(f"{name} Test MAE: {final_mae:.4f}")
    print(f"{name} Test MAPE: {final_mape:.4f}")
    print(f"{name} Test Explained Variance Score: {final_evs:.4f}")

# Display the feature importances
print("\nFeature Importances:")
print(feature_importances_df)


Evaluating Models:   0%|                                                                         | 0/6 [00:00<?, ?it/s]

Evaluating Models:  17%|██████████▊                                                      | 1/6 [00:09<00:48,  9.71s/it]

Decision Tree Average RMSE: 90328.4412
Decision Tree Average MAE: 23245.6504
Decision Tree Average MAPE: 0.1340
Decision Tree Average Explained Variance Score: 0.9717
Decision Tree Test RMSE: 97824.5779
Decision Tree Test MAE: 23715.5808
Decision Tree Test MAPE: 0.1240
Decision Tree Test Explained Variance Score: 0.9664


Evaluating Models:  33%|█████████████████████▋                                           | 2/6 [00:10<00:17,  4.48s/it]

Linear Regression Average RMSE: 17463363341.1013
Linear Regression Average MAE: 204306019.3625
Linear Regression Average MAPE: 46.7686
Linear Regression Average Explained Variance Score: -3663976684.7828
Linear Regression Test RMSE: 876018729.1672
Linear Regression Test MAE: 20542313.9549
Linear Regression Test MAPE: 7.2333
Linear Regression Test Explained Variance Score: -2697914.6723


Evaluating Models:  50%|████████████████████████████████                                | 3/6 [13:35<18:29, 369.80s/it]

Random Forest Average RMSE: 70564.0184
Random Forest Average MAE: 14069.8019
Random Forest Average MAPE: 0.0815
Random Forest Average Explained Variance Score: 0.9823
Random Forest Test RMSE: 83080.2288
Random Forest Test MAE: 15548.5983
Random Forest Test MAPE: 0.0755
Random Forest Test Explained Variance Score: 0.9758


Evaluating Models:  67%|██████████████████████████████████████████▋                     | 4/6 [13:40<07:31, 225.80s/it]

XGBoost Average RMSE: 81604.5157
XGBoost Average MAE: 19850.2240
XGBoost Average MAPE: 0.1095
XGBoost Average Explained Variance Score: 0.9763
XGBoost Test RMSE: 79701.7466
XGBoost Test MAE: 19770.8912
XGBoost Test MAPE: 0.1067
XGBoost Test Explained Variance Score: 0.9777


<catboost.core.CatBoostRegressor at 0x2230a865e20>

<catboost.core.CatBoostRegressor at 0x2230a865e20>

<catboost.core.CatBoostRegressor at 0x2230a865e20>

<catboost.core.CatBoostRegressor at 0x2230a865e20>

<catboost.core.CatBoostRegressor at 0x2230a865e20>

Evaluating Models:  83%|█████████████████████████████████████████████████████▎          | 5/6 [13:56<02:30, 150.13s/it]

CatBoost Average RMSE: 100281.2289
CatBoost Average MAE: 26152.2490
CatBoost Average MAPE: 0.1397
CatBoost Average Explained Variance Score: 0.9644
CatBoost Test RMSE: 109264.8461
CatBoost Test MAE: 28178.7855
CatBoost Test MAPE: 0.1383
CatBoost Test Explained Variance Score: 0.9580
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013945 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7406
[LightGBM] [Info] Number of data points in the train set: 34413, number of used features: 43
[LightGBM] [Info] Start training from score 10.127800


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7441
[LightGBM] [Info] Number of data points in the train set: 34413, number of used features: 43
[LightGBM] [Info] Start training from score 10.136847


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013030 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7440
[LightGBM] [Info] Number of data points in the train set: 34414, number of used features: 43
[LightGBM] [Info] Start training from score 10.125371


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7445
[LightGBM] [Info] Number of data points in the train set: 34414, number of used features: 43
[LightGBM] [Info] Start training from score 10.119707


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012575 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7432
[LightGBM] [Info] Number of data points in the train set: 34414, number of used features: 43
[LightGBM] [Info] Start training from score 10.141017


Evaluating Models: 100%|████████████████████████████████████████████████████████████████| 6/6 [14:02<00:00, 140.46s/it]

LightGBM Average RMSE: 80578.2260
LightGBM Average MAE: 18003.2068
LightGBM Average MAPE: 0.1005
LightGBM Average Explained Variance Score: 0.9769
LightGBM Test RMSE: 90917.6882
LightGBM Test MAE: 17850.1188
LightGBM Test MAPE: 0.0892
LightGBM Test Explained Variance Score: 0.9709

Feature Importances:
                             Decision Tree  Random Forest   XGBoost  \
starting_inventory            8.980785e-05   8.663252e-05  0.000200   
sellin_channel_1              6.796616e-01   6.818438e-01  0.587919   
sellin_channel_2              5.256822e-03   5.056699e-03  0.014686   
sellin_channel_3              9.097115e-03   9.673117e-03  0.031233   
sellin_channel_4              3.591055e-02   3.431413e-02  0.050097   
sellin_channel_5              2.292001e-03   2.170412e-03  0.008611   
sellin_channel_6              5.795920e-02   5.382731e-02  0.081264   
sellin_channel_7              1.530637e-04   1.355204e-04  0.000746   
sellin_channel_8              1.300882e-02   1.295055e-02




#### Feature Engineering

In [52]:
feature_importances_df.columns

Index(['Decision Tree', 'Random Forest', 'XGBoost', 'CatBoost', 'LightGBM'], dtype='object')

In [53]:
columns = ['Decision Tree', 'Random Forest', 'XGBoost', 'CatBoost', 'LightGBM']
feature_importances_df[columns] = feature_importances_df[columns].applymap(lambda x: f"{x:.6f}")


In [54]:
feature_importances_df.sort_values(['Decision Tree', 'Random Forest', 'XGBoost', 'CatBoost', 'LightGBM'],ascending=False,inplace=True)

In [55]:
feature_importances_df.head(12)


Unnamed: 0,Decision Tree,Random Forest,XGBoost,CatBoost,LightGBM
sellin_channel_1,0.679662,0.681844,0.587919,20.51055,369.0
leftover_inventory,0.1642,0.163318,0.098799,34.970056,523.4
sellin_channel_6,0.057959,0.053827,0.081264,13.133283,207.6
sellin_channel_4,0.035911,0.034314,0.050097,8.259635,274.0
sellin_channel_8,0.013009,0.012951,0.030721,3.065652,187.2
sellout,0.01268,0.020087,0.023265,6.982471,337.6
onhand_inventory,0.011842,0.008261,0.01587,0.556714,39.6
sellin_channel_3,0.009097,0.009673,0.031233,4.450404,132.4
sellin_channel_2,0.005257,0.005057,0.014686,1.389268,137.6
sellout_channel_10,0.003068,0.002994,0.009114,3.020445,131.0
