In [None]:
%%time 

import warnings
warnings.filterwarnings("ignore");
import os;
import pandas as pd;
import numpy as np;
import random;
 
from sklearn.preprocessing import LabelEncoder,SplineTransformer;
from sklearn.utils import shuffle;
from sklearn.model_selection import TimeSeriesSplit;
from sklearn.pipeline import Pipeline;
from catboost import CatBoostRegressor;
from sklearn.metrics import mean_squared_error, make_scorer;

from colorama import Fore, Back, Style;
# Shorthand colour/style prefixes (colorama) for console messages;
# rstc resets the styling and is appended at the end of a coloured message.
bluc = Fore.BLUE + Style.BRIGHT;
redc = Fore.RED + Style.BRIGHT;
gldc = Fore.YELLOW + Style.BRIGHT;
rstc = Style.RESET_ALL;

from gc import collect;
from pprint import pprint;
from tqdm.notebook import tqdm;

import matplotlib.pyplot as plt;
import seaborn as sns;
%matplotlib inline

collect();
print();

# **INTRODUCTION**

In [None]:
%%time

# Run configuration:-

n_splits          = 4        # number of time-series CV folds
n_reapts          = 200      # number of repeats, one sampled random state per repeat
post_process_mthd = 'mult'   # post-processing mode; other options - add, NA
post_process_fct  = 1.71     # multiplier ('mult') or offset ('add') applied to the averaged predictions

# Draw one distinct random state per repeat from a seeded generator so the list is reproducible:-
random.seed(42)
random_state_list = random.sample(range(9999), n_reapts)
pprint(np.array(random_state_list), depth = 1, width = 100, indent = 10)

print()
collect()

In [None]:
%%time 

# Defining the competition metric:-
def smape(y_true, y_pred):
    """
    Symmetric mean absolute percentage error (SMAPE), in percent.

    Each observation contributes 200 * |pred - true| / (|pred| + |true|);
    the result is the mean of those terms.

    Input:-  y_true - array-like of observed values
             y_pred - array-like of predicted values (same length as y_true)
    Returns:- SMAPE as a float

    An observation where both the target and the prediction are zero is a
    perfect prediction and contributes 0, instead of the NaN produced by 0 / 0.
    """
    # Work on plain float arrays so lists, arrays and Series are all handled positionally
    y_true = np.asarray(y_true, dtype = float)
    y_pred = np.asarray(y_pred, dtype = float)

    abs_error = np.abs(y_pred - y_true)
    scale     = np.abs(y_pred) + np.abs(y_true)

    # Divide only where the denominator is non-zero; the remaining terms stay at 0
    ratio = np.divide(abs_error, scale, out = np.zeros_like(abs_error), where = scale != 0)

    return 1 / len(y_true) * np.sum(2 * ratio * 100)

# Scorer wrapper around smape; greater_is_better=False declares it a loss (lower is better).
smape_scorer = make_scorer(smape, greater_is_better=False);

In [None]:
%%time 

# Load every CSV found under the input directory into a global DataFrame.
# The variable name is the last "_"-separated token of the file stem,
# e.g. "sample_submission.csv" is bound to the name `submission`.
file_names = []
directory = '/kaggle/input'
for dirpath, dirnames, filenames in os.walk(directory):
    # Sorted so the load order (and the winner of any name clash) is deterministic
    for filename in sorted(filenames):
        stem, extension = os.path.splitext(filename)
        # Anything that is not a CSV cannot be parsed by read_csv - skip it
        if extension.lower() != '.csv':
            continue
        file_path = os.path.join(dirpath, filename)
        file_name = stem.split('_')[-1]
        globals()[file_name] = pd.read_csv(file_path)
        # Keep track of which global names were created
        file_names.append(file_name)
        print(file_name)

print();
collect();

# **DATA PROCESSING**

In [None]:
%%time 

# Creating date-time features:-
def MakeDateFtre(X:pd.DataFrame):
    """
    This function creates datetime and sinusoidal features from the date column
    Input:- X- pd.DataFrame with a datetime64 'date' column (X itself is not modified)
    Returns:- Copy of X with the following columns added:-
              month, day, year, dayofweek, quarter, dayofmonth, weekofyear,
              friday, saturday, sunday (0/1 flags),
              month_sin, month_cos, day_sin, day_cos
    """

    df_temp = X.copy(deep = True)
    date_parts = df_temp['date'].dt

    df_temp['month']      = date_parts.month
    df_temp['day']        = date_parts.day
    df_temp['year']       = date_parts.year
    df_temp['dayofweek']  = date_parts.dayofweek
    df_temp['quarter']    = date_parts.quarter
    df_temp['dayofmonth'] = date_parts.day

    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week gives the
    # same ISO week number. It comes back as a nullable UInt32, so it is cast to a
    # plain integer column (or float when missing dates are present).
    iso_week = date_parts.isocalendar().week
    df_temp['weekofyear'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    # Weekend-ish day flags (Monday = 0 ... Sunday = 6)
    df_temp['friday']     = date_parts.weekday.eq(4).astype(np.uint8)
    df_temp['saturday']   = date_parts.weekday.eq(5).astype(np.uint8)
    df_temp['sunday']     = date_parts.weekday.eq(6).astype(np.uint8)

    # Cyclical encodings
    df_temp['month_sin']  = np.sin(2*np.pi*df_temp.month/12)
    df_temp['month_cos']  = np.cos(2*np.pi*df_temp.month/12)
    # NOTE(review): 'day' is the day of the month, yet the period used here is 24;
    # kept unchanged so the features stay identical - confirm this is intended.
    df_temp['day_sin']    = np.sin(2*np.pi*df_temp.day/24)
    df_temp['day_cos']    = np.cos(2*np.pi*df_temp.day/24)

    return df_temp

def SplineXformer(period, n_splines=None, degree=3):
    """
    Build a periodic SplineTransformer whose knots are evenly spaced over [0, period].
    Input:-  period    - length of one cycle of the feature
             n_splines - number of splines; defaults to period when None
             degree    - polynomial degree of the splines
    Returns:- an unfitted SplineTransformer
    Reference: https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html
    """

    spline_count = period if n_splines is None else n_splines
    knot_count   = spline_count + 1

    # Explicit knot positions, as a column vector (one feature)
    knot_positions = np.linspace(0, period, knot_count).reshape(knot_count, 1)

    return SplineTransformer(
        degree=degree,
        n_knots=knot_count,
        knots=knot_positions,
        extrapolation="periodic",
        include_bias=True)

def MakeSplineFeature(hours=np.arange(1,32)):
    """
    This function creates spline features using the spline transformer
    Input:-  hours - 1-D array-like of 'day' values to build features for
                     (default: 1..31)
    Returns:- pd.DataFrame with one row per value in hours: a 'day' key column
              followed by the spline_0 ... spline_k feature columns

    The splines are evaluated at the requested values themselves, so the table
    has exactly one row per key and no row with a missing 'day'.
    The spline basis (period 32, 4 splines) comes from SplineXformer, which
    uses explicit knot positions.
    """

    day_values = np.asarray(hours).reshape(-1)
    day_df     = pd.DataFrame(day_values.reshape(-1, 1), columns=["day"])
    splines    = SplineXformer(32, n_splines=4).fit_transform(day_df)
    splines_df = pd.DataFrame(splines, columns=[f"spline_{i}" for i in range(splines.shape[1])])

    # Key column goes first, aligned row-by-row with the features
    splines_df.insert(0, "day", day_values)
    return splines_df

print();
collect();


In [None]:
%%time 

# Label-encoding the category columns (encoder fitted on train, re-used on test):-
le   = LabelEncoder()
cols = ['country', 'store', 'product']

for col in cols:
    train[col] = le.fit_transform(train[col])
    test[col]  = le.transform(test[col])

# Parsing the dates:-
train["date"] = pd.to_datetime(train["date"])
test["date"]  = pd.to_datetime(test["date"])

# Adding the date features and moving the date into the index:-
train = MakeDateFtre(train).set_index("date")
test  = MakeDateFtre(test).set_index("date")

# Stacking train and test (tagged by origin) so the spline features are joined once.
# The merge on the 'day' column yields a fresh integer index, i.e. the date index is not kept:-
df         = pd.concat([train.assign(type = "train"), test.assign(type = "test")])
splines_df = MakeSplineFeature()
df         = df.merge(splines_df, on = 'day', how = 'left')

# Splitting back into train / test and removing the helper column (and the empty target on test):-
train = df[df["type"] == "train"].drop(columns = ["type"])
test  = df[df["type"] == "test"].drop(columns = ["type", 'num_sold'], errors = 'ignore')

# Features and target:-
col = "num_sold"
X   = train.drop(columns = [col])
y   = train[col]

print()
collect()

# **MODEL DEFINITION**

In [None]:
# Pipeline: a single CatBoost regressor wrapped in a one-step scikit-learn pipeline

# CatBoost hyper-parameters
params = {
    "n_estimators"     : 200,
    "learning_rate"    : 0.0775,
    "depth"            : 7,
    "l2_leaf_reg"      : 8.125,
    "subsample"        : 0.43,
    "colsample_bylevel": 0.675,
    "random_state"     : 42,
}

# verbose=0 silences the per-iteration training log
model    = CatBoostRegressor(verbose = 0, **params)
pipeline = Pipeline(steps = [("M", model)])
display(pipeline)

# **MODEL TRAINING**

In [None]:
%%time 

scores_smape = []                      # SMAPE of every validation fold, across all repeats
scores_mse   = []                      # MSE of every validation fold, across all repeats
test_preds   = np.zeros(len(test))     # running SUM of test predictions (averaged at submission time)

print(Fore.MAGENTA + f"\n{'-' * 30} MODEL TRAINING {'-' * 30}\n" + rstc)

# The splitter does not depend on the random state, so it is built once
tscv = TimeSeriesSplit(n_splits = n_splits, test_size = 27375, max_train_size = 82200, gap = 0)

# tqdm wraps the list itself (not the enumerate generator) so the bar knows its total length
for state, random_state in enumerate(tqdm(random_state_list, desc = "Model training")):

    for fold, (train_index, val_index) in enumerate(tscv.split(X, y)):

        # KEY NOTE:- the validation rows are selected with val_index
        # (the work this notebook derives from had an error at this step)
        X_train, X_test = X.iloc[train_index], X.iloc[val_index]
        y_train, y_test = y.iloc[train_index], y.iloc[val_index]

        # Only the row order changes between repeats; features and targets are shuffled together
        X_train, y_train = shuffle(X_train, y_train, random_state = random_state)
        X_test,  y_test  = shuffle(X_test,  y_test,  random_state = random_state)

        # Pipeline fit
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        # OOF scores for this fold
        score_smape = smape(y_test, y_pred)
        score_mse   = mean_squared_error(y_test, y_pred)
        scores_smape.append(score_smape)
        scores_mse.append(score_mse)

        # Accumulate this model's test predictions
        fold_preds  = pipeline.predict(test)
        test_preds += fold_preds

    # Mean score:- running mean over every fold scored so far (scores are never reset between repeats)
    mean_smape_score = np.mean(scores_smape)
    # Fewer dashes for wider state numbers keep the printed scores aligned (5 / 4 / 3 dashes)
    print(bluc + f"State {state} {'-' * max(3, 6 - len(str(state)))} OOF Score Mean = {mean_smape_score :.5f}" + rstc)

print()
collect()

# **SUBMISSION**

In [None]:
%%time 

submission.columns     = ["id","num_sold"]

# Averaging the accumulated test predictions over every fitted model (folds x repeats):-
mean_preds = test_preds / (n_splits * n_reapts)

# Post-processing: scale ('mult'), shift ('add') or leave the averaged predictions untouched:-
if post_process_mthd == "mult":
    y_pred = mean_preds * post_process_fct
elif post_process_mthd == "add":
    y_pred = mean_preds + post_process_fct
else:
    y_pred = mean_preds

# Integer cast truncates toward zero; negative predictions are then replaced with 1
y_pred = y_pred.astype(int)
y_pred = np.where(y_pred < 0, 1, y_pred)
submission["num_sold"] = y_pred

# Plotting the train-test predictions:-
# The dates are rebuilt from the year / month / day feature columns.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([
    pd.DataFrame(pd.to_datetime(dict(year = train.year, month = train.month, day = train.day)),
                 columns = ['date']
                ).assign(num_sold = train['num_sold'].values, source = "Train"),
    pd.DataFrame(pd.to_datetime(dict(year = test.year, month = test.month, day = test.day)),
                 columns = ['date']
                ).assign(num_sold = y_pred, source = 'Test'),
])

fig, ax = plt.subplots(1,1, figsize = (18, 6))
sns.lineplot(data = df, x = df['date'], y = df['num_sold'],
             ax = ax, hue = "source", palette = ['blue', 'red'],
            )
ax.set_title(f"\nTrain-test predictions after post-processing\n",
             fontsize = 12, fontweight = 'bold', color = 'tab:blue',
            )
ax.grid(which = 'both', visible = True, linestyle = '--', linewidth = 0.5,color = 'lightgrey')
ax.legend(bbox_to_anchor = (1,0.5))
ax.set(xlabel = '', ylabel = '')

plt.tight_layout()
plt.show()

# Saving the csv file:-
submission.to_csv("submission.csv",index = False)
display(submission.head(10))
del df

collect()
print()

# **OUTRO**

**Sources**<br>
1. https://www.kaggle.com/code/yaaangzhou/top19-lb-27-994-catboost-model/notebook<br>
2. https://www.kaggle.com/code/chingiznurzhanov/timeseriessplit-catboost-trick <br>
3. https://www.kaggle.com/code/onurkoc83/timeseriessplit-catboost <br>

**My contribution**<br>
1. Corrected a mistake in the CV indices (compare the model training section of this notebook with the original work and note how **test_index** is used)<br>
2. Added comments and re-organized the code<br>
3. Added post-processing elements with plots<br>

**If you like my work, please support it with a thumbs up**<br>
**Best regards!!**