In [1]:
import numpy as np
import pandas as pd
import seaborn as sns


import matplotlib
from matplotlib import pyplot as plt
from textwrap import wrap


pd.options.display.float_format = '{:,.2f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.set_option('display.max_colwidth', None)

import pickle
import time

import lightgbm as lgb
import xgboost as xgb

from itertools import product

#import googletrans
#from googletrans import Translator
#to avoid instantiate too many translator, we created one instance for the whole notebook
#translator = Translator()

random_key=42

import lightgbm as lgb

from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay

#from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
#from sklearn.dummy import DummyClassifier

from imblearn.pipeline import Pipeline 

from sklearn.ensemble import RandomForestClassifier



In [2]:
# The function will:
# 1. run the input GridSearchCV
# 2. compute and compare the roc_auc_score, F1 score for training data and testing data
# 3. get and print the confusion matrix
# 4. plot the confusion matrix and P-R curve
# 5. plot feature importance list

def run_model(gs, model_name, X_train, y_train, X_test, y_test):
    
    # run the input GridSearchCV
    gs.fit(X_train, y_train)
    
    # print(gs.best_params_)
    if isinstance(gs, GridSearchCV):
        print("best_params: ")
        for (item, value) in gs.best_params_.items():
            print('   ',item, ' : ', value)
    
    #compute and compare the auc_score for training data and testing data
    train_auc_score = gs.score(X_train, y_train)
    test_auc_score = gs.score(X_test, y_test)
    auc_perc_diff = np.abs(test_auc_score/train_auc_score*100-100)
    
    print(f'\nprecision_recall_auc_score on training set: {train_auc_score:.3f}')
    print(f'precision_recall_auc_score on testing set: {test_auc_score:.3f}')
    print(f'perc_diff: {auc_perc_diff:.1f} %\n')
    
    # Get predictions
    y_train_preds = gs.predict(X_train) 
    y_test_preds = gs.predict(X_test)
    
    # calculate the F1_score
    train_f1_score =  f1_score(y_train, y_train_preds)
    test_f1_score = f1_score(y_test, y_test_preds)
    f1_perc_diff = np.abs(test_f1_score/train_f1_score*100-100)
    print(f'f1_score on training set: {train_f1_score:.3f}')
    print(f'f1_score on testing set: {test_f1_score:.3f}')
    print(f'perc_diff: {f1_perc_diff:.1f} %\n')
    
    # Save confusion matrix values
    print("Confusion Matrix: ")
    cm = confusion_matrix(y_test, y_test_preds)
    cm_df = pd.DataFrame(cm, columns=['pred No virus', 'pred WnvPresent'], 
                         index=['actual No virus', 'actual WnvPresent'])
    print(cm_df.head())                 

    print("\n Plot Confusion Matrix and PR-AUC curve")
    fig, (ax1, ax2) = plt.subplots(1, 2, constrained_layout=True, figsize=[15,6])
    
    font = {'weight' : 'bold',
            'size'   : 16}
    plt.rc('font', **font)
    
    # View confusion matrix
    ConfusionMatrixDisplay.from_estimator(gs, X_test, y_test, ax=ax1)
    ax1.set_title("Confusion Matrix")
    
    # plot Precision-Recall curve
    PrecisionRecallDisplay.from_estimator(gs, X_test, y_test, ax=ax2)
    ax2.set_xticks(np.arange(0, 1, 20))
    ax2.set_title("Precision-Recall Curve")
    ax2.legend(loc='upper right', fontsize=14)

    # only plot the important feature list if the model is instance of (gridsearchcv)
    if isinstance(gs,GridSearchCV):
        # plot featureimportances
        model = gs.best_estimator_.named_steps["classifier"]
        importances = model.feature_importances_
        indices = np.argsort(importances)
        fig,ax=plt.subplots(1,figsize=(10,24))
        features = X_train.columns
        plt.title('Feature Importance')
        ax.barh(range(len(indices)), importances[indices], color='b', align='center')
        plt.yticks(range(len(indices)), [features[i] for i in indices])
        plt.xlabel('Relative Importance')
        plt.show()
    
    return pd.DataFrame({'Precision Recall AUC Train': train_auc_score,
                         'Precision Recall AUC Test': test_auc_score,
                         'Precision Recall AUC Generalization %': auc_perc_diff,
                         'F1 Score Train': train_f1_score,
                         'F1 Score Test': test_f1_score,
                         'F1 Score Generalization %' : f1_perc_diff
                        }, index=[model_name])

# import data from pickle file

In [3]:
#read pickle_dict, which store everything

pickle_dict1 = pickle.load(open('../temp/data1.pkl', 'rb'))
df_all = pickle_dict1['df_all']   #concat df_train_m and df_test
df_train = pickle_dict1['df_train']
df_test = pickle_dict1['df_test']
df_items = pickle_dict1['df_items']
df_shops = pickle_dict1['df_shops']
df_cat = pickle_dict1['df_cat']
new_arrival_item_list = pickle_dict1['new_arrival_item_list']


pickle_dict2 = pickle.load(open('../temp/data2.pkl', 'rb'))
df_basegrid = pickle_dict2['df_basegrid']   #concat df_train_m and df_test


In [4]:
df_basegrid.shape

(2946018, 10)

In [5]:
df_basegrid.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_price,item_cnt_month,is_new_arrival,month,year,year_month,item_category_id
0,2,5572,0,1533.0,9,0,1,2013,2013-01-01,2
1,2,5643,0,2388.0,0,0,1,2013,2013-01-01,2
2,2,5583,0,594.0,2,0,1,2013,2013-01-01,5
3,2,7893,0,1990.0,3,0,1,2013,2013-01-01,6
4,2,7894,0,1490.0,1,0,1,2013,2013-01-01,6


In [6]:
df_items.head()

Unnamed: 0,item_name,item_id,item_category_id
0,! IN THE POWER OF GLAMOR (PLAST.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full [PC Digital],1,76
2,*** IN THE RAYS OF GLORY (UNV) D,2,40
3,***BLUE WAVE (Univ) D,3,40
4,***BOX (GLASS) D,4,40


# Feature Engineering

In [7]:
df_temp = df_basegrid

In [8]:
#feature engineerinf flag, it controls which featurs to be added to final feature list
fe_flag=[ 0, #0, item name
          0, #1, item category name
          0, #2, shop name
          0, #3, lagged item_cnt_month
          0, #4, price, and lagged price
          
]

#hold the features to be added to final selection
dict_final_selected_features={}

In [9]:
############################ 0 item name feature engineering
##################################################################
if fe_flag[0] == 1:
    list_feature_item_name =[]
    df_temp = df_basegrid.merge(df_items, on=['item_id', 'item_category_id'])
    list_feature_item_name.append('item_name')
    print(list_feature_item_name)

    dict_final_selected_features[0]=list_feature_item_name


In [10]:
############################## 1 item category feature engineering
#####################################################################
if fe_flag[1] == 1:
    list_feature_cat_anme =[]
    df_temp =df_temp.merge(df_cat, on='item_category_id')
    list_feature_cat_anme =['item_category_name']
    dict_final_selected_features[1]=list_feature_cat_anme

In [11]:
########################## 2 shop feature engineering
###################################################################
if fe_flag[2] == 1:
    list_feature_shop_name=[]
    df_temp =df_temp.merge(df_shops, on='shop_id')
    list_feature_shop_name=['shop_name']
    dict_final_selected_features[2]=list_feature_shop_name

In [12]:
############################# 3 lagged item_cnt_month
#####################################################################
list_feature_icm=[]


In [13]:
############################### 4 price feature engineering
######################################################################
list_feature_item_price=[]


In [14]:
############################# time feature engineering
######################################################################
list_feature_time=[]

#month?

# data pre-preparation

In [15]:
df_temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2946018 entries, 0 to 2946017
Data columns (total 10 columns):
 #   Column            Dtype         
---  ------            -----         
 0   shop_id           int8          
 1   item_id           int16         
 2   date_block_num    int8          
 3   item_price        float16       
 4   item_cnt_month    int16         
 5   is_new_arrival    int8          
 6   month             int8          
 7   year              int16         
 8   year_month        datetime64[ns]
 9   item_category_id  int8          
dtypes: datetime64[ns](1), float16(1), int16(3), int8(5)
memory usage: 81.5 MB


In [16]:
#list_feature_base = ['item_id','shop_id', 'date_block_num',
#                     'month','year','is_new_arrival', 'item_category_id'
#                    ]

list_feature_base = ['shop_id','item_id', 'item_category_id','date_block_num',
                     #'month','year','is_new_arrival', 
                     'item_cnt_month'
                 ]

In [17]:
list_final_selected_features = []

for i in range(len(fe_flag)):
    if fe_flag[i]==1:
        list_final_selected_features +=  dict_final_selected_features[i]
        

print(list_final_selected_features)


[]


In [18]:
list_feature_final = list_feature_base + list_final_selected_features
print(list_feature_final)

['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_cnt_month']


In [19]:
df_final = df_temp[list_feature_final]
#y_final = df_temp['item_cnt_month']

In [20]:
validation_date_block_num= 33
test_date_block_num = 34

In [21]:
index_train = (df_final['date_block_num'] < validation_date_block_num)
df_train_final = df_final[index_train]

index_validation = (df_final['date_block_num'] == validation_date_block_num)
df_validation_final = df_final[index_validation]


index_test = (df_final['date_block_num'] == test_date_block_num)
df_test_final = df_final[index_test]



X_train =  df_train_final.iloc[:, 0:-1]
y_train =  df_train_final.iloc[:, -1]


X_validation = df_validation_final.iloc[:, 0:-1]
y_validation =  df_validation_final.iloc[:, -1]

X_test =  df_test_final.iloc[:, 0:-1]
y_test =  df_test_final.iloc[:, -1]


### Baseline Model : Previous Month Sale
- score: 1.16777

In [47]:
################# Baseline benchmark: Previous Value 
####################################################

df_valid_test = df_test.merge(df_validation_final,on=["shop_id", "item_id"], how="left")[["ID", "item_cnt_month"]]

#score: 1.16777
df_valid_test['item_cnt_month'] = df_valid_test.item_cnt_month.fillna(0).clip(0,20)
submission = df_valid_test.set_index('ID')
submission.to_csv('benchmark.csv')


#### Model: XGB + ['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_cnt_month']

- score: 1.16655

In [48]:
%%time
model_xgb = xgb.XGBRegressor(max_depth = 10, min_child_weight=0.5, 
                         subsample = 1, eta = 0.3, seed = 1,eval_metric='rmse')
model_xgb.fit(X_train, y_train)


y_validation_preds = model_xgb.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_preds, squared=False)
print("rmse = ", rmse)



rmse =  2.056363
Wall time: 2min 8s


#### Model: LGBM + ['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_cnt_month']

- Score: 1.12633

In [22]:
%%time
model_lgb =lgb.LGBMRegressor(random_state=random_key)

model_lgb.fit(X_train, y_train)


y_validation_preds = model_lgb.predict(X_validation)
rmse = mean_squared_error(y_validation, y_validation_preds, squared=False)
print("rmse = ", rmse)



rmse =  2.0748607706487507
Wall time: 4.13 s


In [56]:
features = ['shop_id', 'item_id', 'item_category_id', 'date_block_num']
df_feature_importance=pd.DataFrame({'Value':model_lgb.feature_importances_,'Feature':features}).sort_values(by="Value",ascending=False)

In [58]:
df_feature_importance.head(100)

Unnamed: 0,Value,Feature
0,885,shop_id
3,779,date_block_num
2,694,item_category_id
1,642,item_id


# production model and make predictions

In [26]:
production_model = model_lgb




# pickle test_preds for model reconciliation

In [29]:
df_test_preds = df_test_final
df_test_preds['item_cnt_month'] = production_model.predict(X_test)

pickle_dict_preds = dict()
pickle_dict_preds['df_test_preds']=df_test_preds
pickle.dump(pickle_dict_preds, open(f'../temp/df_test_preds.pkl', 'wb'))

pickle_dict_preds['df_test_preds'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_preds['item_cnt_month'] = production_model.predict(X_test)


Unnamed: 0,shop_id,item_id,item_category_id,date_block_num,item_cnt_month
2731818,2,5441,0,34,0.36
2731819,2,5643,2,34,0.41
2731820,2,13411,2,34,0.29
2731821,2,5634,2,34,0.41
2731822,2,5638,2,34,0.41


# Submission

In [53]:

#calculate y_test and submit
df_test_final['item_cnt_month'] = model_lgb.predict(X_test).clip(0,20)






df_submission = df_test_final.merge(df_test, on=['shop_id', 'item_id'])[["ID", "item_cnt_month"]]
df_submission.set_index('ID',drop=False, inplace=True)
df_submission.sort_index(inplace=True)

df_submission.to_csv("../datasets/submission.csv", index=False)
df_submission.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_final['item_cnt_month'] = model_lgb.predict(X_test).clip(0,20)


Unnamed: 0_level_0,ID,item_cnt_month
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0.51
1,1,0.09
2,2,0.51
3,3,0.35
4,4,0.58


# define pipeline
model = RandomForestClassifier(random_state=random_key)



steps = [('classifier', model)]
pipe = Pipeline(steps=steps)


# pipeline parameters
pipe_params = { 'classifier__num_leaves' : [40],   # The number of trees in the forest.
                'classifier__learning_rate' : [.01],   # The number of trees in the forest.
                'classifier__max_depth' : [4],              # The maximum depth of the tree.
               # 'classifier__is_unbalance': ['True','False'],
               # 'classifier__metric' : ['auc', 'binary_logloss','average_precision'] # Different metrics
                }

# Instantiate GridSearchCV.
# tscv = TimeSeriesSplit(n_splits=3)
gs_smlgb = GridSearchCV(pipe, # what object are we optimizing?
                    param_grid = pipe_params, # what parameters values are we searching?
                    cv=1, # 3-fold cross-validation.
                    n_jobs=-1, 
                    scoring='neg_root_mean_squared_error'
                    )

#finding the best hypermeter list for the model
result = run_model(gs_smlgb, "LightGBM",  X_train, y_train, X_validation, y_validation
                  )
#result_list.append(result)

In [None]:
# define pipeline
model = lgb.LGBMClassifier(objective='regression', random_state=random_key)



steps = [('classifier', model)]
pipe = Pipeline(steps=steps)


# pipeline parameters
pipe_params = { 'classifier__num_leaves' : [40],   # The number of trees in the forest.
                'classifier__learning_rate' : [.01],   # The number of trees in the forest.
                'classifier__max_depth' : [4],              # The maximum depth of the tree.
               # 'classifier__is_unbalance': ['True','False'],
               # 'classifier__metric' : ['auc', 'binary_logloss','average_precision'] # Different metrics
                }

# Instantiate GridSearchCV.
# tscv = TimeSeriesSplit(n_splits=3)
gs_smlgb = GridSearchCV(pipe, # what object are we optimizing?
                    param_grid = pipe_params, # what parameters values are we searching?
                    cv=2, # 3-fold cross-validation.
                    n_jobs=-1, 
                    scoring='neg_root_mean_squared_error'
                    )

#finding the best hypermeter list for the model
result = run_model(gs_smlgb, "LightGBM",  X_train, y_train, X_validation, y_validation
                  )
#result_list.append(result)

print('Fitting...')
model = xgb.XGBRegressor(max_depth = 11, min_child_weight=0.5, subsample = 1, eta = 0.3, num_round = 1000, seed = 1, nthread = 16)
model.fit( X_train, y_train, eval_metric='rmse')
