In [69]:
import pandas as pd


In [70]:
df = pd.read_csv('train.csv', index_col='row_id')
test = pd.read_csv('test.csv', index_col='row_id')
test_row_ids=test.index

df.head()

Unnamed: 0_level_0,date,state,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,01-01-2015,Kerala,ExcelMart,Mec Mug,329.0
1.0,01-01-2015,Kerala,ExcelMart,Mec Hat,520.0
2.0,01-01-2015,Kerala,ExcelMart,Mec Sticker,146.0
3.0,01-01-2015,Kerala,MecStore,Mec Mug,572.0
4.0,01-01-2015,Kerala,MecStore,Mec Hat,911.0


In [71]:
df['date'] = pd.to_datetime(df['date'], format='%d-%m-%Y')
test['date'] = pd.to_datetime(test['date'], format='%d-%m-%Y')

In [72]:
df

Unnamed: 0_level_0,date,state,store,product,num_sold
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,2015-01-01,Kerala,ExcelMart,Mec Mug,329.0
1.0,2015-01-01,Kerala,ExcelMart,Mec Hat,520.0
2.0,2015-01-01,Kerala,ExcelMart,Mec Sticker,146.0
3.0,2015-01-01,Kerala,MecStore,Mec Mug,572.0
4.0,2015-01-01,Kerala,MecStore,Mec Hat,911.0
...,...,...,...,...,...
,NaT,,,,
,NaT,,,,
,NaT,,,,
,NaT,,,,


In [73]:
import numpy as np

In [74]:
df['store'].unique()

array(['ExcelMart', 'MecStore', nan], dtype=object)

In [75]:
df['num_sold']=np.log1p(df['num_sold'])


In [76]:
df.dropna(inplace=True)

In [77]:
def feature_engg(df):
    df['year']=df['date'].dt.year
    df['month']=df['date'].dt.month
    df['week']=df['date'].dt.isocalendar().week     
    df['day']=df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    df['dayofweek']=df['date'].dt.dayofweek
    df['is_month_start'] = df['date'].dt.is_month_start
    df['is_month_end'] = df['date'].dt.is_month_end
    df['day_of_year'] = df['date'].dt.day_of_year
    df['week_of_year']=df['date'].dt.isocalendar().week

    df.drop('date', axis=1, inplace=True)
    
    return df

In [78]:
df=feature_engg(df)


In [79]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder


In [80]:
test=feature_engg(test)

In [81]:
from sklearn.preprocessing import FunctionTransformer
def sin_transformer(period):
    return FunctionTransformer(lambda x: np.sin(x / period * 2 * np.pi))
def cos_transformer(period):
    return FunctionTransformer(lambda x: np.cos(x / period * 2 * np.pi))

In [82]:
def smape(y_true, y_pred):
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

In [83]:
X=df.drop("num_sold", axis=1)
y=df['num_sold']

In [84]:
from sklearn.model_selection import TimeSeriesSplit
ts_cv = TimeSeriesSplit(
    n_splits=5,
    
    max_train_size=20000
    
)

In [85]:
all_splits = list(ts_cv.split(X, y))
train_0, test_0 = all_splits[0]

In [86]:
train_4, test_4=all_splits[4]

In [87]:
X.iloc[train_4].shape

(18581, 13)

In [88]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import  MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import cross_validate, GridSearchCV
categorical_columns = [
    "state",
    "store",
    "product"
    ]
categories = [
    ['Kerala', 'Mumbai', 'Delhi'],
    ['ExcelMart', 'MecStore'],
    ['Mec Mug', 'Mec Hat', 'Mec Sticker']
    ]

ordinal_encoder = OrdinalEncoder(categories=categories)
ohe=OneHotEncoder()
gbrt_pipeline = make_pipeline(
    ColumnTransformer(
        transformers=[
            ("categorical", ordinal_encoder, categorical_columns),
             ("month_sin", sin_transformer(12), ["month"]),
            ("month_cos", cos_transformer(12), ["month"]),("day_sin", sin_transformer(7), ["day"]),
            ("day_cos", cos_transformer(7), ["day"]),
            ("week_of_yr_sin", sin_transformer(52), ["week_of_year"]),
            ("week_of_yr_cos", cos_transformer(52), ["week_of_year"]),
            
        ],
        remainder=MinMaxScaler(),
    )
    ,
    HistGradientBoostingRegressor(random_state=101
    ),
)

In [89]:
param_grid={"histgradientboostingregressor__max_depth":[5],
            "histgradientboostingregressor__min_samples_leaf": [45],
    
}

In [90]:
search=GridSearchCV(gbrt_pipeline, param_grid, n_jobs=-1, cv=ts_cv, refit=True)


In [91]:
search.fit(X, y)


In [92]:
 search.best_score_

0.9653458474800946

In [93]:
y_pred=search.predict(X)

In [94]:
y_pred=np.exp(y_pred).astype('int')

In [95]:
smape(y_pred=y_pred, y_true=np.exp(y).astype('int'))

4.858447252591099

In [96]:
y_test=search.predict(test)
y_test=np.exp(y_test).astype('int') 

In [97]:
y_test=pd.DataFrame(y_test).rename(columns={0:'num_sold'}).astype('int')

In [98]:
submission =pd.DataFrame({'row_id': test_row_ids,
                           'num_sold': y_test['num_sold']})  
submission.to_csv('final_submission.csv', index=False)