In [None]:
import catboost as cb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import optuna
import warnings
from warnings import simplefilter
import shap

# Load (or reload) IPython's autoreload extension and render matplotlib figures inline.
# NOTE(review): only the extension is loaded here; no `%autoreload <mode>` line is visible in
# this cell, so confirm whether automatic module reloading is actually meant to be enabled.
%reload_ext autoreload
%matplotlib inline

# Silence every warning for the session. This also hides deprecation notices raised by the
# libraries imported above, so lift it temporarily when debugging or upgrading dependencies.
warnings.filterwarnings("ignore")
# RuntimeWarning-specific filter; the blanket "ignore" filter above already matches these.
warnings.simplefilter("ignore", category=RuntimeWarning)

### Load Data

In [None]:
# Force these columns to `object` dtype instead of letting pandas infer a type.
# The same mapping is used for the train and the test file.
id_column_dtypes = {
    column: object
    for column in ('Store', 'ts_promo', 'store_promo', 'dom_promo', 'dow_promo')
}

train_data = pd.read_csv("train_preprocessed.csv", dtype=dict(id_column_dtypes))
test_data = pd.read_csv("test_preprocessed.csv", dtype=dict(id_column_dtypes))

### Small Data Preparation

In [None]:
# Names of the object-dtype columns in each frame.
obj_features = [name for name, dtype in train_data.dtypes.items() if dtype == 'object']
obj_features_test = [name for name, dtype in test_data.dtypes.items() if dtype == 'object']

# Cast the object columns, and additionally 'Week', to pandas `category`.
for frame, columns in ((train_data, obj_features), (test_data, obj_features_test)):
    for feature in columns + ['Week']:
        frame[feature] = frame[feature].astype('category')

# Order rows by 'Date'; the positional splits used later depend on this row order.
train_data = train_data.sort_values(by=['Date'])
test_data = test_data.sort_values(by=['Date'])

# Features are every column except the target 'Sales'.
X = train_data.drop(columns=['Sales'])
y = train_data['Sales']

# Categorical feature names, later handed to CatBoost.
cat_feat = [name for name, dtype in X.dtypes.items() if dtype == 'category']
print(cat_feat)

## Model - CatBoost Hyperparameter Optimization with Optuna

In [None]:
print('Starting model...')

def objective(trial):
    """Optuna objective: mean RMSE of a CatBoost regressor over time-ordered folds.

    Samples `n_estimators`, `max_depth`, `learning_rate` and `l2_leaf_reg` from
    `trial`, then fits a fresh `CatBoostRegressor` on each of the 5
    `TimeSeriesSplit` folds of the notebook-level `X` / `y` and scores it on the
    fold's held-out part.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        RMSE averaged over all folds (lower is better).

    Notes
    -----
    The held-out part of each fold is used both as the early-stopping
    `eval_set` and for scoring, so the reported RMSE is optimistic.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 10000),
        'max_depth': trial.suggest_int('max_depth', 2, 16),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.5, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 20, log=True),
    }

    fold_rmse = []
    tss = TimeSeriesSplit(n_splits=5)
    for train_index, test_index in tss.split(X):
        train_x, test_x = X.iloc[train_index, :], X.iloc[test_index, :]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        # Fit and score inside the loop so every fold contributes to the result;
        # a new model per fold keeps the folds independent of each other.
        model = cb.CatBoostRegressor(**params, cat_features=cat_feat)
        model.fit(
            train_x,
            train_y,
            eval_set=[(test_x, test_y)],
            early_stopping_rounds=100,
            verbose=False,
        )

        preds = model.predict(test_x)

        # Square root taken explicitly: the `squared=False` argument of
        # mean_squared_error is not accepted by recent scikit-learn releases.
        fold_rmse.append(np.sqrt(mean_squared_error(test_y, preds)))

    return float(np.mean(fold_rmse))

# Seed the sampler so that re-running the notebook proposes the same hyperparameters.
# TPESampler is the sampler Optuna uses by default for single-objective studies, so only
# the seeding differs from the default configuration.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials=10)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
print('Best value:', study.best_value)

## Submissions

#### 1st run -


In [None]:
# NOTE: disabled submission cell, kept for reference. `params` is an empty placeholder and
# must be filled in (for example from `study.best_trial.params`) before re-enabling.
# NOTE(review): the tuning cell passes `cat_features=cat_feat` to the regressor and this
# cell does not -- confirm whether it is needed here as well.
#X_test = test_data.drop(['id'], axis=1)

#print('Starting model...')

#params =  {}

#model = cb.CatBoostRegressor(**params)

#model.fit(X,y,verbose=False)

#preds = model.predict(X_test)

#test_data['Sales'] = preds
#test_data = test_data.sort_values(by=['id'])
#test_data[['id','Sales']].to_csv('sub_CB.csv', index=False)
#print('Submission Generated.')

## Model - CatBoost

In [None]:
# NOTE: disabled cross-validation cell, kept for reference. It fits one model per
# TimeSeriesSplit fold and prints that fold's RMSE. `params` is an empty placeholder.
#print('Starting model...')

#tss = TimeSeriesSplit(n_splits=5)
#for train_index, test_index in tss.split(X):
#    train_x, test_x = X.iloc[train_index, :], X.iloc[test_index,:]
#    train_y, test_y = y.iloc[train_index], y.iloc[test_index]

#    params =  {}

#    model = cb.CatBoostRegressor(**params)
#    print('Start fitting...')
#    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=500,verbose=False)
#    print('Start predicting...')
#    preds = model.predict(test_x)

#    rmse = mean_squared_error(test_y, preds,squared=False)

#    print(rmse)

### Plotting & Analysis

In [None]:
# Objective value (the RMSE returned by `objective`) of each trial in `study`.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Hyperparameter importances for the objective value of `study`.
optuna.visualization.plot_param_importances(study)

In [None]:
# Contour plots of the objective value over the four tuned hyperparameters.
optuna.visualization.plot_contour(
    study,
    params=['learning_rate', 'max_depth', 'l2_leaf_reg', 'n_estimators'],
)


In [None]:
# Objective value plotted against each hyperparameter of `study`.
optuna.visualization.plot_slice(study)

In [None]:
# Parallel-coordinate view of the sampled hyperparameters and the resulting objective value.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# NOTE: disabled SHAP analysis cell, kept for reference.
# NOTE(review): `ml` and `X_train` are not defined in any cell visible in this notebook;
# a fitted model and its feature frame need to be substituted before re-enabling -- confirm.
#print('Started Shap Explainer')
#explainer = shap.TreeExplainer(ml)
#print('Started Shap Values Computation')
#shap_values = explainer.shap_values(X_train)
#print('Started Plotting') 
#shap.summary_plot(shap_values, X_train)