## Action Plan
* Import train data
* Apply Hyper-parameter tuning
* Start with `RandomizedSearchCV`

In [None]:
import pandas as pd
import numpy as np
import pickle
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split

## Random Search CV
* Import data
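* Split the data into K folds (the plan calls for a stratified K-fold based on outliers; the code below passes an integer `cv`, which for a regressor yields a plain, unstratified K-fold)
* Select the set of hyper-parameters to tune
* Train the model on each set of parameters selected by `RandomizedSearchCV`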
* Test the model against Validation set
* Check for RMSE in each fold
* Select the best model based on the best performance on validation data

In [None]:
## Importing the data
from google.colab import drive
drive.mount('/content/gdrive')

# Every pickle below lives under this Drive folder. The path is relative, so it
# presumably relies on the working directory being /content -- TODO confirm.
_DRIVE_ROOT = 'gdrive/My Drive/ColabNotebooks/'


def _load_pickle(rel_path):
  """Unpickle the file at ``_DRIVE_ROOT + rel_path`` and return the object.

  The file is opened in a ``with`` block so the handle is closed as soon as
  the object has been read (the original ``pickle.load(open(...))`` calls
  never closed their handles).
  """
  # SECURITY: pickle.load can execute arbitrary code embedded in the file;
  # only load pickles that were produced by trusted notebooks.
  with open(_DRIVE_ROOT + rel_path, 'rb') as fh:
    return pickle.load(fh)


# Historical-transactions artefacts.
hist_train = _load_pickle('feature_importance_files/X_train_feat_imp.pkl')
hist_target = _load_pickle('feature_importance_files/y_target_feat_imp.pkl')
hist_outliers = _load_pickle('feature_importance_files/y_outliers_feat_imp.pkl')
hist_train_data = _load_pickle('feature_importance_files/train_data_with_all_feat.pkl')
hist_imp_feat = _load_pickle('feature_importance_files/boruta_imp_feat.pkl')


# New-transactions artefacts.
new_train = _load_pickle('feature_set_4.pkl')
new_imp_feat = _load_pickle('new_trans_boruta_imp_feat.pkl')


# Merged (historical + new) artefacts.
merge_train = _load_pickle('feature_selected_merge_df/merge_df.pkl')
merge_imp_feat = _load_pickle('feature_selected_merge_df/merge_imp_feat.pkl')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
## parameters to hypertune
# Search space passed to RandomizedSearchCV by performHyperParamTuning.
# Plain lists are discrete candidate sets; the scipy.stats objects are
# distributions the search draws from.
#
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
# when `subsample_freq` is non-zero, and 'rf' boosting needs bagging enabled.
# Neither is set here -- confirm these candidates behave as intended.
params = dict(
    boosting_type=['gbdt', 'goss', 'rf'],
    max_depth=list(range(1, 16, 2)),                        # odd depths 1..15
    learning_rate=[10**exponent for exponent in range(-4, 1)],  # 1e-4 .. 1
    n_estimators=list(range(100, 600, 100)),                # 100..500 step 100
    num_leaves=sp_randint(6, 50),                           # integers in [6, 50)
    min_child_samples=sp_randint(100, 500),                 # integers in [100, 500)
    min_child_weight=[1e-5, 1e-3, 1e-2, 1e-1, 1, 10.0, 100.0, 1000.0, 10000.0],
    subsample=sp_uniform(loc=0.2, scale=0.8),               # uniform on [0.2, 1.0]
    colsample_bytree=sp_uniform(loc=0.4, scale=0.6),        # uniform on [0.4, 1.0]
    reg_alpha=[0, 0.1, 1, 2, 5, 7, 10, 50, 100],
    reg_lambda=[0, 0.1, 1, 5, 10, 20, 50, 100],
)

In [None]:
def checkRmse(model,X,y_true):
  """Score ``model`` on ``(X, y_true)`` as the negated root-mean-squared error.

  The function has the ``(estimator, X, y)`` signature expected of a callable
  passed as ``scoring`` to RandomizedSearchCV. The RMSE is negated on return
  so that a larger score means a better model.

  Parameters
  ----------
  model : fitted estimator exposing ``predict(X)``.
  X : features to predict on.
  y_true : ground-truth targets matching ``X``.

  Returns
  -------
  The negated RMSE (a value <= 0).
  """
  y_pred = model.predict(X)
  rmse = np.sqrt(mse(y_true, y_pred))
  # Print the real (non-negative) error: previously the negated score was
  # printed under the "RMSE" label, which made the log misleading.
  print("RMSE : ", rmse)
  return -1 * rmse

In [None]:
def performHyperParamTuning(X,y,n_folds,n_iter=10,param_distributions=None,random_state=42):
  """Run a randomized hyper-parameter search for an LGBMRegressor.

  Parameters
  ----------
  X : training features, passed unchanged to ``RandomizedSearchCV.fit``.
  y : training target, passed unchanged to ``RandomizedSearchCV.fit``.
  n_folds : forwarded as ``cv`` to RandomizedSearchCV.
  n_iter : number of parameter settings to sample. The default of 10 matches
      the previous behaviour, where ``n_iter`` was left at the scikit-learn
      default.
  param_distributions : search space to sample from. When ``None`` the
      notebook-level ``params`` dictionary is used, as before.
  random_state : seed for the parameter sampling (previously hard-coded to 42).

  Returns
  -------
  The fitted RandomizedSearchCV object. Its ``best_score_`` is on the
  ``checkRmse`` scale, i.e. a negated RMSE.
  """
  if param_distributions is None:
    # Fall back to the notebook-level search space for existing callers.
    param_distributions = params

  print("Creating a LGBM Regressor...")
  # NOTE(review): both the regressor and the search below request n_jobs=-1;
  # consider limiting one of them to avoid oversubscribing the CPU -- confirm.
  lgbm_regressor = lgb.LGBMRegressor(n_jobs=-1,silent=False)
  print("Creating a Randomized Search CV object...")
  random_search_cv = RandomizedSearchCV(lgbm_regressor,
                                        param_distributions,
                                        n_iter = n_iter,
                                        scoring = checkRmse,
                                        n_jobs = -1,
                                        cv = n_folds,
                                        verbose = 20,
                                        random_state = random_state)
  print("Fitting with train data...")
  random_search_cv.fit(X,y)
  print("Best Score : ",random_search_cv.best_score_)
  print("Best parameters : ",random_search_cv.best_params_)
  return random_search_cv

## Performing Hyper-parameter tuning on historical transactions data

In [None]:
# Features: every column of hist_imp_feat except the card identifier.
X_hist = hist_imp_feat.drop(columns=['card_id'])
# Target loaded alongside the historical features.
y_hist = hist_target
# Randomized search with cv=5 on the historical-transactions data.
hist_model = performHyperParamTuning(X=X_hist, y=y_hist, n_folds=5)

Creating a LGBM Regressor...
Creating a Randomized Search CV object...
Fitting with train data...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   25.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   27.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   49.8s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   56.7s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

Best Score :  -3.7556381253736206
Best parameters :  {'boosting_type': 'gbdt', 'colsample_bytree': 0.7606690070459252, 'learning_rate': 0.01, 'max_depth': 11, 'min_child_samples': 408, 'min_child_weight': 0.001, 'n_estimators': 400, 'num_leaves': 35, 'reg_alpha': 7, 'reg_lambda': 0.1, 'subsample': 0.3454599737656805}


## Performing Hyper-parameter tuning on new transaction data

In [None]:
# Build the prefixed frame under its own name so this cell can be re-run.
# The original renamed and re-bound `new_train` itself, so a second execution
# double-prefixed the columns and failed when dropping 'new_trans_card_id'.
new_trans_train = (
    new_train
    .add_prefix('new_trans_')
    .drop(columns=['new_trans_card_id'])
)
# Keep only the selected feature columns; the target is the prefixed 'target'.
X_new = new_trans_train[new_imp_feat]
y_new = new_trans_train['new_trans_target']
new_trans_model = performHyperParamTuning(X_new,y_new,5)

Creating a LGBM Regressor...
Creating a Randomized Search CV object...
Fitting with train data...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   47.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   51.9s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:   56.9s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

Best Score :  -3.440357722164735
Best parameters :  {'boosting_type': 'gbdt', 'colsample_bytree': 0.7606690070459252, 'learning_rate': 0.01, 'max_depth': 11, 'min_child_samples': 408, 'min_child_weight': 0.001, 'n_estimators': 400, 'num_leaves': 35, 'reg_alpha': 7, 'reg_lambda': 0.1, 'subsample': 0.3454599737656805}


## Performing Hyper-parameter tuning on Merged data

In [None]:
#merge_train = pd.merge(new_train[['card_id','target']],merge_train,on='card_id',how='inner')
# Columns that are identifiers or labels, not model inputs.
_non_feature_cols = ['card_id', 'target', 'outliers']
X_merge = merge_train.drop(columns=_non_feature_cols)
# Regression target for the merged data.
y_merge = merge_train['target']
# Randomized search with cv=5 on the merged data.
merge_best_model = performHyperParamTuning(X=X_merge, y=y_merge, n_folds=5)

Creating a LGBM Regressor...
Creating a Randomized Search CV object...
Fitting with train data...
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   48.8s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   51.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  2

Best Score :  -3.397597884747855
Best parameters :  {'boosting_type': 'gbdt', 'colsample_bytree': 0.7606690070459252, 'learning_rate': 0.01, 'max_depth': 11, 'min_child_samples': 408, 'min_child_weight': 0.001, 'n_estimators': 400, 'num_leaves': 35, 'reg_alpha': 7, 'reg_lambda': 0.1, 'subsample': 0.3454599737656805}


In [None]:
# Persist the feature matrices, targets and fitted search objects to Drive.
# Each file is written inside a `with` block so the handle is flushed and
# closed; the original `pickle.dump(obj, open(...))` calls never closed them.
_OUTPUT_DIR = 'gdrive/My Drive/ColabNotebooks/Hyper_parameter_selected_models/'

# (file stem, object) pairs, in the same order as the original dumps.
_artifacts = [
    ('X_hist', X_hist),
    ('X_new', X_new),
    ('X_merge', X_merge),
    ('y_hist', y_hist),
    ('y_new', y_new),
    ('y_merge', y_merge),
    ('hist_model', hist_model),
    ('new_trans_model', new_trans_model),
    ('merge_best_model', merge_best_model),
]

for _stem, _obj in _artifacts:
  with open(_OUTPUT_DIR + _stem + '.pkl', 'wb') as _fh:
    pickle.dump(_obj, _fh)