# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import time
import machine_learning_helper as machine_learning_helper
import metrics_helper as metrics_helper
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes
#from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold, train_test_split, ShuffleSplit
from sklearn import model_selection
from sklearn import ensemble
from xgboost.sklearn import XGBClassifier
import scipy as sp


## Read .csv files

In [2]:
df_train_users = pd.read_csv("cleaned_train_user.csv")
df_test_users = pd.read_csv("cleaned_test_user.csv")
df_time_mean_user_id = pd.read_csv("time_mean_user_id.csv")
df_time_total_user_id = pd.read_csv("time_total_user_id.csv")
df_total_action_user_id = pd.read_csv("total_action_user_id.csv")

## Construct sessions data frame

In [3]:
df_total_action_user_id.columns = ['id','action']
df_sessions = pd.merge(df_time_mean_user_id, df_time_total_user_id, on='id', how='outer')
df_sessions = pd.merge(df_sessions, df_total_action_user_id, on='id', how='outer')
df_sessions.columns = ['id','time_mean_user','time_total_user','action']
df_sessions.head()

print("X_train has dimension:",df_train_users.shape)
print("X_test has dimension:",df_test_users.shape)

X_train has dimension: (213451, 16)
X_test has dimension: (62096, 15)


## 1. From data frame to matrix : Construct y_train

What we want now for the training is 2 matrices X_train (matrix of relevant features) and y_train (booking dest)

In [4]:
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)

## 2. From data frame to matrix : Construct X_train & X_test

### Feature engineering.
Add 3 features : 
- time_mean_user
- time_total_user
- total_action_user


In [5]:
df_train_len = df_train_users.shape[0]
df_train = df_train_users.drop(['country_destination'],axis=1)
df_all = pd.concat((df_train_users, df_test_users), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_sessions, on='id', how='left', left_index=True)

In [6]:
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)

## Cross validation
5 folds cross validation, using ndcg as scoring metric.


In [7]:
#X_train = X_train[100000:100100]
#y_labels = y_labels[100000:100100]
#X_test = X_test[10000:60000]

# Split train dataset into 5 folds 
cv = model_selection.KFold(n_splits=5, random_state=None)

## Machine Learning 
First several algorithms are tried, and optimized through Cross validation and Grid Search. The code is here optimized to run on 3 processors at the same time, as it is very long. See here examples on MacBook Pro 4 cpu, RAM 16GB
Computational time
GridsearchCrossValidation:

- 5000 data
    Random Forest : 27 fits 10,1s
    XGB : 36 fits 60s
    
- 10000 data
    Random Forest : 27 fits 22 s
    XGB : 36 fits 140s
    
- 50000 data
    Random Forest : 27 fits 342
    XGB : 36 fits 1100s

Our final model is composed of a voting classifier composed of the previous models optimized.


Models that were tried:
- **Random Forest** with the following parameters:

    - 'max_depth': [ 4, 6, 8]
    - 'n_estimators': [ 50, 100, 150]


- **eXtreme Gradient Boosting XCGB**:
    - 'max_depth': [6,8,10],
    - 'learning_rate': [0.3],
    - 'n_estimators': [10,15,20,25],
    - 'objective': ['multi:softprob'],
    - 'gamma': [0],
    - 'subsample': [0.5],
    - 'colsample_bytree': [0.5],
    - 'seed': [0]

- Voting classifer:
    - Soft 
    
The metric used is the nDCG.

## Model 1 : RandomForest

Grid Search to find best parameter.

In [8]:
X_train_sparse = sp.sparse.csr_matrix(X_train.values)

In [9]:
number_trees = [ 50, 100, 125, 150, 200, 300, 400, 500, 600, 700]
max_depth = [6, 8, 10, 12, 14, 16, 20, 22]


rf_score_trees = []
rf_score_depth = []
rf_param_trees = []
rf_param_depth = []

#Loop for 1st hyperparameter n_estimators
for number_trees_idx, number_trees_value in enumerate(number_trees):
    
    print('number_trees_idx: ',number_trees_idx+1,'/',len(number_trees),', value: ', number_trees_value)

    # Random forest
    rand_forest_model = ensemble.RandomForestClassifier(n_estimators=number_trees_value, max_depth=14)

    #Scores
    scores = model_selection.cross_val_score(rand_forest_model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_trees.append(scores.mean())
    rf_param_trees.append(number_trees_value)
    print('Mean NDCG for this number_trees = ', scores.mean())

# best number of trees from above
print() 
print('best NDCG:')
print(np.max(rf_score_trees))
print('best parameter num_trees:')
idx_best = np.argmax(rf_score_trees)
best_num_trees = rf_param_trees[idx_best]
print(best_num_trees)

number_trees_idx:  1 / 10 , value:  50
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.777909, total= 1.4min
[CV] ................................. , score=0.831268, total= 1.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  1.5min remaining:  2.2min


[CV] ................................. , score=0.842008, total= 1.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  1.6min remaining:  1.1min


[CV] ................................. , score=0.833026, total= 1.7min
[CV] ................................. , score=0.811310, total= 1.7min
Mean NDCG for this number_trees =  0.819104173364
number_trees_idx:  2 / 10 , value:  100


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  1.7min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  1.7min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.778842, total= 2.4min
[CV] ................................. , score=0.832232, total= 2.4min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  2.4min remaining:  3.7min


[CV] ................................. , score=0.841025, total= 2.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  2.6min remaining:  1.7min


[CV] ................................. , score=0.831988, total= 2.7min
[CV] ................................. , score=0.811423, total= 2.7min
Mean NDCG for this number_trees =  0.819102124937
number_trees_idx:  3 / 10 , value:  125


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.7min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.779809, total= 2.9min
[CV] ................................. , score=0.832256, total= 3.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  3.0min remaining:  4.5min


[CV] ................................. , score=0.841894, total= 3.1min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  3.1min remaining:  2.1min


[CV] ................................. , score=0.832281, total= 3.2min
[CV] ................................. , score=0.811400, total= 3.2min
Mean NDCG for this number_trees =  0.819528040116
number_trees_idx:  4 / 10 , value:  150


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  3.2min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.779745, total= 3.4min
[CV] ................................. , score=0.832238, total= 3.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  3.5min remaining:  5.3min


[CV] ................................. , score=0.842500, total= 3.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  3.6min remaining:  2.4min


[CV] ................................. , score=0.831737, total= 3.7min
[CV] ................................. , score=0.811626, total= 3.8min
Mean NDCG for this number_trees =  0.819569226263
number_trees_idx:  5 / 10 , value:  200


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  3.8min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  3.8min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.779984, total= 4.4min
[CV] ................................. , score=0.832073, total= 4.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  4.5min remaining:  6.8min


[CV] ................................. , score=0.843092, total= 4.7min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  4.7min remaining:  3.1min


[CV] ................................. , score=0.831667, total= 4.8min
[CV] ................................. , score=0.812168, total= 4.9min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.9min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.9min finished


Mean NDCG for this number_trees =  0.819796880111
number_trees_idx:  6 / 10 , value:  300
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.832798, total= 6.5min
[CV] ................................. , score=0.779705, total= 6.6min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  6.6min remaining:  9.9min


[CV] ................................. , score=0.843440, total= 6.9min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  6.9min remaining:  4.6min


[CV] ................................. , score=0.831769, total= 7.0min
[CV] ................................. , score=0.812716, total= 7.2min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.2min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.2min finished


Mean NDCG for this number_trees =  0.820085744256
number_trees_idx:  7 / 10 , value:  400
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.779883, total= 8.4min
[CV] ................................. , score=0.832839, total= 8.7min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  8.7min remaining: 13.1min


[CV] ................................. , score=0.844207, total= 8.8min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  8.8min remaining:  5.9min


[CV] ................................. , score=0.832152, total= 9.2min
[CV] ................................. , score=0.813151, total= 9.3min
Mean NDCG for this number_trees =  0.82044653311
number_trees_idx:  8 / 10 , value:  500


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  9.3min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  9.3min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.779809, total=11.0min
[CV] ................................. , score=0.833054, total=11.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 11.1min remaining: 16.6min


[CV] ................................. , score=0.844278, total=11.5min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 11.5min remaining:  7.7min


[CV] ................................. , score=0.832415, total=12.3min
[CV] ................................. , score=0.813185, total=12.3min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 12.3min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 12.3min finished


Mean NDCG for this number_trees =  0.82054805219
number_trees_idx:  9 / 10 , value:  600
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.780157, total=12.7min
[CV] ................................. , score=0.833097, total=12.9min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 13.0min remaining: 19.4min


[CV] ................................. , score=0.844201, total=13.3min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 13.3min remaining:  8.9min


[CV] ................................. , score=0.832254, total=13.6min
[CV] ................................. , score=0.813263, total=13.7min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 13.7min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 13.7min finished


Mean NDCG for this number_trees =  0.820594408655
number_trees_idx:  10 / 10 , value:  700
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.780104, total=14.3min
[CV] ................................. , score=0.833215, total=15.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 15.0min remaining: 22.6min


[CV] ................................. , score=0.844377, total=15.1min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 15.1min remaining: 10.1min


[CV] ................................. , score=0.813303, total=15.6min
[CV] ................................. , score=0.832347, total=15.8min
Mean NDCG for this number_trees =  0.82066913368

best NDCG:
0.82066913368
best parameter num_trees:
700


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 15.8min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 15.8min finished


In [10]:
#Loop for 2nd hyperparameter max_depth
for max_depth_idx, max_depth_value in enumerate(max_depth):
    
    print('max_depth_idx: ',max_depth_idx+1,'/',len(max_depth),', value: ', max_depth_value)

    # Random forest
    rand_forest_model = ensemble.RandomForestClassifier(n_estimators=best_num_trees, max_depth=max_depth_value)

    #Scores
    scores = model_selection.cross_val_score(rand_forest_model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_depth.append(scores.mean())
    rf_param_depth.append(max_depth_value)
    print('Mean NDCG for this max:_depth = ', scores.mean())
    
# best max_depth from above
print() 
print('best NDCG:')
print(np.max(rf_score_depth))
print('best parameter max_depth:')
idx_best = np.argmax(rf_score_depth)
best_max_depth = rf_param_depth[idx_best]
print(best_max_depth)

max_depth_idx:  1 / 7 , value:  6
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.828698, total= 2.2min
[CV] ................................. , score=0.821918, total= 2.1min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  2.4min remaining:  3.6min


[CV] ................................. , score=0.818011, total= 2.8min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  2.9min remaining:  1.9min


[CV] ................................. , score=0.766861, total= 3.0min
[CV] ................................. , score=0.798396, total= 3.2min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  3.2min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  3.2min finished


Mean NDCG for this max:_depth =  0.806776580659
max_depth_idx:  2 / 7 , value:  8
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826189, total= 3.8min
[CV] ................................. , score=0.833133, total= 3.8min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  3.9min remaining:  5.9min


[CV] ................................. , score=0.818392, total= 4.3min
[CV] ................................. , score=0.798421, total= 4.3min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  4.4min remaining:  2.9min


[CV] ................................. , score=0.767152, total= 4.8min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.8min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.8min finished


Mean NDCG for this max:_depth =  0.808657412526
max_depth_idx:  3 / 7 , value:  10
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.837733, total= 5.4min
[CV] ................................. , score=0.830226, total= 5.4min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.6min remaining:  8.5min


[CV] ................................. , score=0.822767, total= 6.4min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  6.4min remaining:  4.3min


[CV] ................................. , score=0.774418, total= 6.5min
[CV] ................................. , score=0.805268, total= 7.2min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.2min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.2min finished


Mean NDCG for this max:_depth =  0.814082406942
max_depth_idx:  4 / 7 , value:  12
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.831468, total= 9.0min
[CV] ................................. , score=0.841676, total= 9.1min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  9.2min remaining: 13.8min


[CV] ................................. , score=0.778407, total= 9.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  9.6min remaining:  6.4min


[CV] ................................. , score=0.828714, total=10.2min
[CV] ................................. , score=0.808415, total=10.2min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 10.2min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 10.2min finished


Mean NDCG for this max:_depth =  0.817736152901
max_depth_idx:  5 / 7 , value:  14
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.780104, total=14.8min
[CV] ................................. , score=0.844377, total=15.8min
[CV] ................................. , score=0.833215, total=15.8min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 16.0min remaining: 24.0min
[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 16.0min remaining: 10.7min


[CV] ................................. , score=0.832347, total=16.5min
[CV] ................................. , score=0.813303, total=17.3min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 17.3min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 17.3min finished


Mean NDCG for this max:_depth =  0.82066913368
max_depth_idx:  6 / 7 , value:  16
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.781131, total=24.1min
[CV] ................................. , score=0.832784, total=24.7min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 24.7min remaining: 37.1min


[CV] ................................. , score=0.845441, total=25.3min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 25.3min remaining: 16.9min


[CV] ................................. , score=0.833810, total=25.9min
[CV] ................................. , score=0.814198, total=26.5min
Mean NDCG for this max:_depth =  0.821472784776
max_depth_idx:  7 / 7 , value:  20


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 26.5min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 26.5min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.779178, total=55.3min
[CV] ................................. , score=0.831379, total=57.8min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 57.8min remaining: 86.6min


[CV] ................................. , score=0.844476, total=59.1min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 59.1min remaining: 39.4min


[CV] ................................. , score=0.833236, total=60.2min
[CV] ................................. , score=0.813526, total=61.5min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 61.5min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 61.5min finished


Mean NDCG for this max:_depth =  0.820359044301

best NDCG:
0.821472784776
best parameter max_depth:
16


## Model 2 : eXtreme Gradient Boosting XCGB

5 folds cross validation, using ndcg as scoring metric.

Grid Search to find best parameter.

In [None]:
learning_rates = [ 0.1,0.2,0.3,0.4]
max_depth = [6, 8, 10, 12, 14, 16, 20]
n_estimators = [10,50,75,100,150,200,300,600]

rf_score_rates = []
rf_score_depth = []
rf_score_estimators = []
rf_param_rates = []
rf_param_depth = []
rf_param_estimators = []

#Loop for 1st hyperparameter n_estimators
for n_estimators_idx, n_estimators_value in enumerate(n_estimators):
    
    print('n_estimators_idx: ',n_estimators_idx+1,'/',len(n_estimators),', value: ', n_estimators_value)

    # XCGB
    model = XGBClassifier(max_depth=10, learning_rate=0.2, n_estimators=n_estimators_value,objective='multi:softprob',
                      subsample=0.5, colsample_bytree=0.5, seed=0)

    #Scores
    scores = model_selection.cross_val_score(model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_estimators.append(scores.mean())
    rf_param_estimators.append(n_estimators_value)
    print('Mean NDCG for this n_estimators = ', scores.mean())

# best number of estimators from above
print() 
print('best NDCG:')
print(np.max(rf_score_estimators))
print('best parameter num_estimators:')
idx_best = np.argmax(rf_score_estimators)
best_num_estimators = rf_param_estimators[idx_best]
print(best_num_estimators)

n_estimators_idx:  1 / 8 , value:  10
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.845056, total= 1.7min
[CV] ................................. , score=0.833197, total= 1.7min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  1.7min remaining:  2.6min


[CV] ................................. , score=0.780059, total= 1.9min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  2.0min remaining:  1.3min


[CV] ................................. , score=0.835081, total= 2.0min
[CV] ................................. , score=0.814447, total= 2.0min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.0min finished


Mean NDCG for this n_estimators =  0.821568051733
n_estimators_idx:  2 / 8 , value:  50
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.844250, total= 6.8min
[CV] ................................. , score=0.832724, total= 6.9min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  6.9min remaining: 10.3min
Process ForkPoolWorker-221:
Process ForkPoolWorker-228:
Process ForkPoolWorker-226:
Process ForkPoolWorker-227:
Process ForkPoolWorker-223:
Process ForkPoolWorker-220:
Process ForkPoolWorker-222:
Process ForkPoolWorker-225:
Process ForkPoolWorker-224:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiproces

Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 682, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/pool.py", line 602, in get
    self.wait(timeout)
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/pool.py", line 599, in wait
    self._event.wait(timeout)
  File "/home/mint/anaconda3/lib/python3.5/threading.py", line 549, in wait
    signaled = self._cond.wait(timeout)
  File "/home/mint/anaconda3/lib/python3.5/threading.py", line 293, in wait
    waiter.acquire()
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 606, in terminate
    super(MemmapingPool, self).terminate()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/pool.py", lin

In [None]:
#Loop for  hyperparameter max_depth
for max_depth_idx, max_depth_value in enumerate(max_depth):
    
    print('max_depth_idx: ',max_depth_idx+1,'/',len(max_depth),', value: ', max_depth_value)

    # XCGB
    model = XGBClassifier(max_depth=max_depth_value, learning_rate=0.2, n_estimators=10,objective='multi:softprob',
                      subsample=0.5, colsample_bytree=0.5, seed=0)

    #Scores
    scores = model_selection.cross_val_score(model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_depth.append(scores.mean())
    rf_param_depth.append(max_depth_value)
    print('Mean NDCG for this max_depth = ', scores.mean())

# best number of estimators from above
print() 
print('best NDCG:')
print(np.max(rf_score_depth))
print('best parameter max_depth:')
idx_best = np.argmax(rf_score_depth)
best_num_depth = rf_param_depth[idx_best]
print(best_num_depth)

max_depth_idx:  1 / 7 , value:  6
ERROR! Session/line number was not unique in database. History logging moved to new session 26
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................


Process ForkPoolWorker-238:
Process ForkPoolWorker-240:
Process ForkPoolWorker-239:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process ForkPoolWorker-236:
Process ForkPoolWorker-235:
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process ForkPoolWorker-237:
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
   

KeyboardInterrupt: 

In [None]:
#Loop for  hyperparameter learning rate
for learning_rates_idx, learning_rates_value in enumerate(learning_rates):
    
    print('learning_rates_idx: ',learning_rates_idx+1,'/',len(learning_rates),', value: ', learning_rates_value)

    # Random forest
    model = XGBClassifier(max_depth=6, learning_rate=learning_rates_value, n_estimators=10)

    #Scores
    scores = model_selection.cross_val_score(model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_rates.append(scores.mean())
    rf_param_rates.append(learning_rates_value)
    print('Mean NDCG for this learning rate = ', scores.mean())

# best number of trees from above
print() 
print('best NDCG:')
print(np.max(rf_score_rates))
print('best parameter learning rates:')
idx_best = np.argmax(rf_score_rates)
best_learning_rate = rf_param_rates[idx_best]
print(best_learning_rate)

learning_rates_idx:  1 / 4 , value:  0.1
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................


## Model 3 : SVM


## Voting
Now we are going to vote between the 2 models optimized with their best parameters

# Create the sub models
estimators = []
model1 = ensemble.RandomForestClassifier(max_depth=gridSearchRandomForest.best_estimator_.max_depth, 
                                         n_estimators=gridSearchRandomForest.best_estimator_.n_estimators)
estimators.append(('random_forest', model1))

model2 = XGBClassifier(max_depth=gridSearchXGB.best_estimator_.max_depth, 
                       learning_rate=gridSearchXGB.best_estimator_.learning_rate,
                      n_estimators= gridSearchXGB.best_estimator_.n_estimators,
                      objective='multi:softprob',
                      subsample=0.5, colsample_bytree=0.5, seed=0)
estimators.append(('xgb', model2))
# Create Voting classifier
finalModel1 = ensemble.VotingClassifier(estimators,voting='soft')
results = model_selection.cross_val_score(finalModel, X_train, y_labels, cv=cv, scoring = metrics_helper.ndcg_scorer, verbose = 10)
print("Voting Classifier Cross Validation Score found:")
print(results.mean())

## Predict countries from model

In [None]:
finalModel1.fit(X_train,y_labels)
y_pred1 = finalModel1.predict_proba(X_test)  
id_test = df_test_users['id']
cts1,idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)

ctsSubmission1 = label_enc.inverse_transform(cts1)

In [None]:
finalModel2 = model2
finalModel2.fit(X_train,y_labels)
y_pred2 = finalModel.predict_proba(X_test)  
cts2,idsubmission2 = machine_learning_helper.get5likelycountries(y_pred2, id_test)

ctsSubmission2 = label_enc.inverse_transform(cts2)

## Convert to csv for submission

In [None]:
df_submission1 = pd.DataFrame(np.column_stack((idsubmission1, ctsSubmission1)), columns=['id', 'country'])
df_submission1.to_csv('submission_country_dest1.csv',index=False)

In [None]:
df_submission2 = pd.DataFrame(np.column_stack((idsubmission2, ctsSubmission2)), columns=['id', 'country'])
df_submission2.to_csv('submission_country_dest2.csv',index=False)

Results with :
    
    - **Voting (model1, model2) : 0.86735**
    Random Forest:
        Random Forest Best Score found: 0.795119516323
        Random Forest Best parameters set found:
        {'n_estimators': 50, 'max_depth': 8} 
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
    Voting Classifier:
        Voting Classifier Cross Validation Score found: 0.802676551693

    
    - **model2 : 0.86735**
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
