# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import time
import machine_learning_helper as machine_learning_helper
import metrics_helper as metrics_helper
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes
from sklearn.model_selection import KFold, train_test_split, ShuffleSplit
from sklearn import model_selection
from sklearn import ensemble
from xgboost.sklearn import XGBClassifier
import scipy as sp
import xgboost


## Read .csv files

In [2]:
# Cleaned user tables: the training users and the test users.
df_train_users, df_test_users = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_train_user.csv", "cleaned_test_user.csv")
)

# Per-user session aggregates; the three frames are joined on 'id' in the next cell.
(
    df_time_mean_user_id,
    df_time_total_user_id,
    df_total_action_user_id,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "time_mean_user_id.csv",
        "time_total_user_id.csv",
        "total_action_user_id.csv",
    )
)

## Construct sessions data frame

In [3]:
# Give the action-count table explicit column names before joining.
df_total_action_user_id.columns = ['id', 'action']

# Outer-join the three per-user tables on 'id', so a user present in any of them is kept.
df_sessions = (
    df_time_mean_user_id
    .merge(df_time_total_user_id, on='id', how='outer')
    .merge(df_total_action_user_id, on='id', how='outer')
)
df_sessions.columns = ['id', 'time_mean_user', 'time_total_user', 'action']
# Not the last expression of the cell, so this preview is evaluated but not displayed.
df_sessions.head()

print("X_train has dimension:", df_train_users.shape)
print("X_test has dimension:", df_test_users.shape)

X_train has dimension: (213451, 16)
X_test has dimension: (62096, 15)


## 1. From data frame to matrix : Construct y_train

For training we now need two matrices: X_train (the matrix of relevant features) and y_train (the booking destination).

In [4]:
# Build the training target from the training users. `label_enc` is kept because the
# submission cells call `label_enc.inverse_transform` on the predicted classes.
# NOTE(review): presumably y_labels holds the encoded `country_destination` column —
# confirm in machine_learning_helper.buildTargetMat.
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)

## 2. From data frame to matrix : Construct X_train & X_test

### Feature engineering.
Add 3 features : 
- time_mean_user
- time_total_user
- total_action_user


In [5]:
# Number of training rows; the concat below puts these rows first in the stacked frame.
df_train_len = df_train_users.shape[0]
# Training users without the target column.
# NOTE(review): `df_train` is not used by the concat below, which stacks `df_train_users`
# (target column included) — confirm which frame was intended.
df_train = df_train_users.drop(['country_destination'], axis=1)
# Stack train and test users under a fresh 0..n-1 index.
df_all = pd.concat((df_train_users, df_test_users), axis=0, ignore_index=True)
# Attach the per-user session aggregates with a left join on the 'id' column.
# `left_index=True` was dropped: asking pandas to join on a column (`on=`) and on the
# left index at the same time is contradictory, and pandas raises a MergeError for it.
df_all = pd.merge(df_all, df_sessions, on='id', how='left')

In [6]:
# Build the train/test feature matrices from the raw user frames and the session
# aggregates. The result is used as a DataFrame later on (`X_train.values`).
# NOTE(review): the feature construction itself lives in machine_learning_helper.buildFeatsMat
# and is not visible here — verify there how `df_sessions` is joined in.
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)

## Cross validation
5-fold cross-validation, using NDCG as the scoring metric.


In [53]:
#X_train = X_train[100000:110000]
#y_labels = y_labels[100000:110000]

# Split the training set into 5 shuffled folds. The shuffle is seeded because this same
# `cv` object is passed to every cross_val_score call below: with random_state=None each
# call would draw a different partition, so the mean NDCG of two hyper-parameter values
# would differ by fold noise as well as by the parameter, and reruns would not reproduce.
cv = model_selection.KFold(n_splits=5, shuffle=True, random_state=0)
# Show the (train indices, validation indices) pair of every fold as a sanity check.
for train_id, test_id in cv.split(X_train):
    print(train_id, test_id)

[     0      1      2 ..., 213448 213449 213450] [     3      7      9 ..., 213443 213444 213445]
[     0      3      5 ..., 213446 213449 213450] [     1      2      4 ..., 213441 213447 213448]
[     0      1      2 ..., 213447 213448 213450] [     6     17     24 ..., 213426 213446 213449]
[     1      2      3 ..., 213447 213448 213449] [     0      5      8 ..., 213433 213437 213450]
[     0      1      2 ..., 213448 213449 213450] [    11     13     16 ..., 213434 213438 213440]


## Machine Learning 
First, several algorithms are tried and tuned through cross-validation and grid search. Because these searches take a very long time, the code is set up to run in parallel on several processors (see `n_jobs` in the code). The example timings below were measured on a MacBook Pro (4 CPUs, 16 GB RAM).
Computational time
GridsearchCrossValidation:

- 5000 data
    Random Forest : 27 fits 10,1s
    XGB : 36 fits 60s
    
- 10000 data
    Random Forest : 27 fits 22 s
    XGB : 36 fits 140s
    
- 50000 data
    Random Forest : 27 fits 342 s
    XGB : 36 fits 1100s

Our final model is a voting classifier that combines the optimized versions of the previous models.


Models that were tried:
- **Random Forest** with the following parameters:

    - 'max_depth': [ 4, 6, 8]
    - 'n_estimators': [ 50, 100, 150]


- **eXtreme Gradient Boosting (XGBoost)**:
    - 'max_depth': [6,8,10],
    - 'learning_rate': [0.3],
    - 'n_estimators': [10,15,20,25],
    - 'objective': ['multi:softprob'],
    - 'gamma': [0],
    - 'subsample': [0.5],
    - 'colsample_bytree': [0.5],
    - 'seed': [0]

- Voting classifier:
    - Soft 
    
The metric used is the nDCG.

## Model 1 : RandomForest

Grid Search to find best parameter.

In [8]:
# CSR copy of the training features; this is the matrix passed to every
# cross_val_score call below and to the random-forest fit.
X_train_sparse = sp.sparse.csr_matrix(X_train.values)

In [None]:
# Candidate values for the two random-forest hyper-parameters.
number_trees = [125, 300, 500, 600]
max_depth = [5, 8, 12, 16, 20]

# Mean cross-validated NDCG and the value that produced it, one entry per candidate.
# The *_depth lists are filled by the max_depth search in the next cell.
rf_score_trees, rf_param_trees = [], []
rf_score_depth, rf_param_depth = [], []

# Search over n_estimators with max_depth held at 14.
for trial, n_trees in enumerate(number_trees, start=1):

    print('number_trees_idx: ', trial, '/', len(number_trees), ', value: ', n_trees)

    rand_forest_model = ensemble.RandomForestClassifier(max_depth=14, n_estimators=n_trees)

    # One NDCG score per fold of `cv`.
    scores = model_selection.cross_val_score(
        rand_forest_model,
        X_train_sparse,
        y_labels,
        scoring=metrics_helper.ndcg_scorer,
        cv=cv,
        n_jobs=12,
        verbose=10,
    )
    mean_ndcg = scores.mean()
    rf_score_trees.append(mean_ndcg)
    rf_param_trees.append(n_trees)
    print('Mean NDCG for this number_trees = ', mean_ndcg)

# Report the best mean NDCG and keep the tree count that achieved it.
print('', 'best NDCG:', np.max(rf_score_trees), 'best parameter num_trees:', sep='\n')
idx_best = np.argmax(rf_score_trees)
best_num_trees_RF = rf_param_trees[idx_best]
print(best_num_trees_RF)

In [None]:
# Search over max_depth, with n_estimators fixed to the best value found by the previous cell.
for trial, depth in enumerate(max_depth, start=1):

    print('max_depth_idx: ', trial, '/', len(max_depth), ', value: ', depth)

    rand_forest_model = ensemble.RandomForestClassifier(
        max_depth=depth,
        n_estimators=best_num_trees_RF,
    )

    # One NDCG score per fold of `cv`.
    scores = model_selection.cross_val_score(
        rand_forest_model,
        X_train_sparse,
        y_labels,
        scoring=metrics_helper.ndcg_scorer,
        cv=cv,
        n_jobs=12,
        verbose=10,
    )
    mean_ndcg = scores.mean()
    rf_score_depth.append(mean_ndcg)
    rf_param_depth.append(depth)
    print('Mean NDCG for this max:_depth = ', mean_ndcg)

# Report the best mean NDCG and keep the depth that achieved it.
print('', 'best NDCG:', np.max(rf_score_depth), 'best parameter max_depth:', sep='\n')
idx_best = np.argmax(rf_score_depth)
best_max_depth_RF = rf_param_depth[idx_best]
print(best_max_depth_RF)

Random forest 600 trees, 16 depth NDCG = 0.821472784776

# Predict Countries and convert to CSV for submission

In [9]:
# Hyper-parameters pinned to the values reported by the search above
# (600 trees, depth 16), so this cell can run without repeating the search.
best_num_trees_RF = 600
best_max_depth_RF = 16

# Fit the final random forest on the full training matrix.
rand_forest_model = ensemble.RandomForestClassifier(
    max_depth=best_max_depth_RF,
    n_estimators=best_num_trees_RF,
)
rand_forest_model.fit(X_train_sparse, y_labels)

# Class probabilities for the test users, then the helper's country selection per user.
# NOTE(review): presumably get5likelycountries keeps the 5 most probable classes per id —
# confirm in machine_learning_helper.
id_test = df_test_users['id']
y_pred1 = rand_forest_model.predict_proba(X_test)
cts1, idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)

# Map the encoded classes back to country labels.
ctsSubmission1 = label_enc.inverse_transform(cts1)

# Two-column (id, country) table written as the submission file.
df_submission1 = pd.DataFrame(
    np.column_stack((idsubmission1, ctsSubmission1)),
    columns=['id', 'country'],
)
df_submission1.to_csv('submission_country_dest_RF.csv', index=False)

## Model 2 : eXtreme Gradient Boosting (XGBoost)

5-fold cross-validation, using NDCG as the scoring metric.

Grid Search to find best parameter.

In [54]:
# Candidate values for each XGBoost hyper-parameter; the cells below search them one at a time.
learning_rates = [0.001, 0.01, 0.05, 0.1, 0.2]
max_depth = [3, 5, 7, 9, 12]
n_estimators = [20, 30, 50, 75, 100]
gamma = [0, 0.3, 0.5, 0.7, 1]

# Per-parameter accumulators: mean cross-validated NDCG and the value that produced it.
# (The `rf_` prefix is carried over from the random-forest section; these hold XGBoost results.)
rf_score_rates, rf_param_rates = [], []
rf_score_depth, rf_param_depth = [], []
rf_score_estimators, rf_param_estimators = [], []
rf_score_gamma, rf_param_gamma = [], []

In [55]:
# Search over max_depth; the other XGBoost settings stay at their starting values
# (learning_rate=0.1, n_estimators=100, gamma=0.5).
for trial, depth in enumerate(max_depth, start=1):

    print('max_depth_idx: ', trial, '/', len(max_depth), ', value: ', depth)

    model = XGBClassifier(
        objective='multi:softprob',
        max_depth=depth,
        learning_rate=0.1,
        n_estimators=100,
        gamma=0.5,
        subsample=0.5,
        colsample_bytree=0.5,
    )

    # One NDCG score per fold of `cv`.
    scores = model_selection.cross_val_score(
        model,
        X_train_sparse,
        y_labels,
        scoring=metrics_helper.ndcg_scorer,
        cv=cv,
        n_jobs=12,
        verbose=10,
    )
    mean_ndcg = scores.mean()
    rf_score_depth.append(mean_ndcg)
    rf_param_depth.append(depth)
    print('Mean NDCG for this max_depth = ', mean_ndcg)

# Report the best mean NDCG and keep the depth that achieved it.
print('', 'best NDCG:', np.max(rf_score_depth), 'best parameter max_depth:', sep='\n')
idx_best = np.argmax(rf_score_depth)
best_num_depth_XCG = rf_param_depth[idx_best]
print(best_num_depth_XCG)

max_depth_idx:  1 / 5 , value:  3
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.825536, total= 4.3min
[CV] ................................. , score=0.824440, total= 4.4min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  4.4min remaining:  6.6min


[CV] ................................. , score=0.825843, total= 4.5min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  4.5min remaining:  3.0min


[CV] ................................. , score=0.827577, total= 4.5min
[CV] ................................. , score=0.825573, total= 4.5min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.5min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.5min finished


Mean NDCG for this max_depth =  0.825793832774
max_depth_idx:  2 / 5 , value:  5
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826692, total= 5.9min
[CV] ................................. , score=0.826686, total= 6.1min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  6.1min remaining:  9.1min


[CV] ................................. , score=0.825197, total= 6.1min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  6.1min remaining:  4.1min


[CV] ................................. , score=0.825876, total= 6.1min
[CV] ................................. , score=0.826545, total= 6.3min
Mean NDCG for this max_depth =  0.826199066592
max_depth_idx:  3 / 5 , value:  7


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  6.3min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  6.3min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826934, total= 7.3min
[CV] ................................. , score=0.825241, total= 7.3min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  7.3min remaining: 11.0min


[CV] ................................. , score=0.825647, total= 7.4min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  7.4min remaining:  4.9min


[CV] ................................. , score=0.826031, total= 7.4min
[CV] ................................. , score=0.827572, total= 7.4min
Mean NDCG for this max_depth =  0.826284851913
max_depth_idx:  4 / 5 , value:  9


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.4min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.4min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826261, total= 8.7min
[CV] ................................. , score=0.825134, total= 9.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  9.0min remaining: 13.4min


[CV] ................................. , score=0.827389, total= 9.0min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  9.0min remaining:  6.0min


[CV] ................................. , score=0.824617, total= 9.0min
[CV] ................................. , score=0.826377, total= 9.0min
Mean NDCG for this max_depth =  0.825955572991
max_depth_idx:  5 / 5 , value:  12


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  9.0min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  9.0min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.825927, total=10.8min
[CV] ................................. , score=0.823658, total=10.9min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 10.9min remaining: 16.4min


[CV] ................................. , score=0.823432, total=11.0min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 11.0min remaining:  7.3min


[CV] ................................. , score=0.821969, total=11.0min
[CV] ................................. , score=0.826961, total=11.1min
Mean NDCG for this max_depth =  0.824389547542

best NDCG:
0.826284851913
best parameter max_depth:
7


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 11.1min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 11.1min finished


In [57]:
# Search over n_estimators, using the max_depth selected by the previous cell
# (learning_rate=0.1 and gamma=0.5 are still at their starting values).
for trial, n_rounds in enumerate(n_estimators, start=1):

    print('n_estimators_idx: ', trial, '/', len(n_estimators), ', value: ', n_rounds)

    model = XGBClassifier(
        objective='multi:softprob',
        max_depth=best_num_depth_XCG,
        learning_rate=0.1,
        n_estimators=n_rounds,
        gamma=0.5,
        subsample=0.5,
        colsample_bytree=0.5,
    )

    # One NDCG score per fold of `cv`.
    scores = model_selection.cross_val_score(
        model,
        X_train_sparse,
        y_labels,
        scoring=metrics_helper.ndcg_scorer,
        cv=cv,
        n_jobs=12,
        verbose=10,
    )
    mean_ndcg = scores.mean()
    rf_score_estimators.append(mean_ndcg)
    rf_param_estimators.append(n_rounds)
    print('Mean NDCG for this n_estimators = ', mean_ndcg)

# Report the best mean NDCG and keep the number of estimators that achieved it.
print('', 'best NDCG:', np.max(rf_score_estimators), 'best parameter num_estimators:', sep='\n')
idx_best = np.argmax(rf_score_estimators)
best_num_estimators_XCG = rf_param_estimators[idx_best]
print(best_num_estimators_XCG)

n_estimators_idx:  1 / 5 , value:  20
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.822629, total= 1.9min
[CV] ................................. , score=0.821990, total= 2.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  2.0min remaining:  3.0min


[CV] ................................. , score=0.825919, total= 2.0min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  2.0min remaining:  1.4min


[CV] ................................. , score=0.824131, total= 2.1min
[CV] ................................. , score=0.825549, total= 2.1min
Mean NDCG for this n_estimators =  0.824043783229
n_estimators_idx:  2 / 5 , value:  30


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.1min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.825394, total= 2.5min
[CV] ................................. , score=0.824396, total= 2.6min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  2.6min remaining:  4.0min


[CV] ................................. , score=0.826880, total= 2.7min
[CV] ................................. , score=0.825361, total= 2.7min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  2.7min remaining:  1.8min


[CV] ................................. , score=0.825667, total= 2.7min
Mean NDCG for this n_estimators =  0.825539657253
n_estimators_idx:  3 / 5 , value:  50


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  2.7min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.825074, total= 3.8min
[CV] ................................. , score=0.825888, total= 4.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  4.0min remaining:  5.9min


[CV] ................................. , score=0.827658, total= 4.0min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  4.0min remaining:  2.7min


[CV] ................................. , score=0.826367, total= 4.0min
[CV] ................................. , score=0.825776, total= 4.0min
Mean NDCG for this n_estimators =  0.826152648014
n_estimators_idx:  4 / 5 , value:  75


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.1min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  4.1min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.828479, total= 5.7min
[CV] ................................. , score=0.827377, total= 5.8min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.8min remaining:  8.7min


[CV] ................................. , score=0.826013, total= 5.8min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.8min remaining:  3.9min


[CV] ................................. , score=0.827077, total= 5.9min
[CV] ................................. , score=0.822821, total= 5.9min
Mean NDCG for this n_estimators =  0.826353492278
n_estimators_idx:  5 / 5 , value:  100


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.9min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.9min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.827061, total= 7.4min
[CV] ................................. , score=0.827790, total= 7.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  7.5min remaining: 11.3min


[CV] ................................. , score=0.823881, total= 7.6min
[CV] ................................. , score=0.826231, total= 7.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  7.6min remaining:  5.0min


[CV] ................................. , score=0.825589, total= 7.7min
Mean NDCG for this n_estimators =  0.826110503387

best NDCG:
0.826353492278
best parameter num_estimators:
75


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.7min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  7.7min finished


In [58]:
# Search over gamma, using the max_depth and n_estimators selected by the previous cells
# (learning_rate is still at its starting value of 0.1).
for trial, gamma_candidate in enumerate(gamma, start=1):

    print('gamma_idx: ', trial, '/', len(gamma), ', value: ', gamma_candidate)

    model = XGBClassifier(
        objective='multi:softprob',
        max_depth=best_num_depth_XCG,
        learning_rate=0.1,
        n_estimators=best_num_estimators_XCG,
        gamma=gamma_candidate,
        subsample=0.5,
        colsample_bytree=0.5,
    )

    # One NDCG score per fold of `cv`.
    scores = model_selection.cross_val_score(
        model,
        X_train_sparse,
        y_labels,
        scoring=metrics_helper.ndcg_scorer,
        cv=cv,
        n_jobs=12,
        verbose=10,
    )
    mean_ndcg = scores.mean()
    rf_score_gamma.append(mean_ndcg)
    rf_param_gamma.append(gamma_candidate)
    print('Mean NDCG for this gamma = ', mean_ndcg)

# Report the best mean NDCG and keep the gamma that achieved it.
print('', 'best NDCG:', np.max(rf_score_gamma), 'best parameter gamma:', sep='\n')
idx_best = np.argmax(rf_score_gamma)
best_gamma_XCG = rf_param_gamma[idx_best]
print(best_gamma_XCG)

gamma_idx:  1 / 5 , value:  0
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.827068, total= 5.4min
[CV] ................................. , score=0.826770, total= 5.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.5min remaining:  8.3min


[CV] ................................. , score=0.824673, total= 5.5min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.6min remaining:  3.7min


[CV] ................................. , score=0.827431, total= 5.6min
[CV] ................................. , score=0.825960, total= 5.6min
Mean NDCG for this gamma =  0.826380308232
gamma_idx:  2 / 5 , value:  0.3


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.825921, total= 5.4min
[CV] ................................. , score=0.826705, total= 5.6min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.6min remaining:  8.4min


[CV] ................................. , score=0.824183, total= 5.6min
[CV] ................................. , score=0.826090, total= 5.6min
[CV] ................................. , score=0.827912, total= 5.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.6min remaining:  3.8min
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min finished


Mean NDCG for this gamma =  0.826162073693
gamma_idx:  3 / 5 , value:  0.5
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826908, total= 5.4min
[CV] ................................. , score=0.825086, total= 5.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.5min remaining:  8.3min


[CV] ................................. , score=0.827914, total= 5.5min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.5min remaining:  3.7min


[CV] ................................. , score=0.825519, total= 5.6min
[CV] ................................. , score=0.826324, total= 5.6min
Mean NDCG for this gamma =  0.826350165748
gamma_idx:  4 / 5 , value:  0.7


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.828562, total= 5.5min
[CV] ................................. , score=0.826191, total= 5.7min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.7min remaining:  8.6min


[CV] ................................. , score=0.826148, total= 5.7min
[CV] ................................. , score=0.823935, total= 5.7min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.7min remaining:  3.8min


[CV] ................................. , score=0.827004, total= 5.8min
Mean NDCG for this gamma =  0.82636790693
gamma_idx:  5 / 5 , value:  1


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.8min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.8min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826725, total= 5.5min
[CV] ................................. , score=0.826944, total= 5.7min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.7min remaining:  8.5min


[CV] ................................. , score=0.827410, total= 5.7min
[CV] ................................. , score=0.826190, total= 5.7min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.7min remaining:  3.8min


[CV] ................................. , score=0.824719, total= 5.7min
Mean NDCG for this gamma =  0.826397519287

best NDCG:
0.826397519287
best parameter gamma:
1


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.7min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.7min finished


In [59]:
# Search over the learning rate, using the max_depth, n_estimators and gamma
# selected by the previous cells.
for trial, rate in enumerate(learning_rates, start=1):

    print('learning_rates_idx: ', trial, '/', len(learning_rates), ', value: ', rate)

    model = XGBClassifier(
        objective='multi:softprob',
        max_depth=best_num_depth_XCG,
        learning_rate=rate,
        n_estimators=best_num_estimators_XCG,
        gamma=best_gamma_XCG,
        subsample=0.5,
        colsample_bytree=0.5,
    )

    # One NDCG score per fold of `cv`.
    scores = model_selection.cross_val_score(
        model,
        X_train_sparse,
        y_labels,
        scoring=metrics_helper.ndcg_scorer,
        cv=cv,
        n_jobs=12,
        verbose=10,
    )
    mean_ndcg = scores.mean()
    rf_score_rates.append(mean_ndcg)
    rf_param_rates.append(rate)
    print('Mean NDCG for this learning rate = ', mean_ndcg)

# Report the best mean NDCG and keep the learning rate that achieved it.
print('', 'best NDCG:', np.max(rf_score_rates), 'best parameter learning rates:', sep='\n')
idx_best = np.argmax(rf_score_rates)
best_learning_rate_XCG = rf_param_rates[idx_best]
print(best_learning_rate_XCG)

learning_rates_idx:  1 / 5 , value:  0.001
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.819375, total= 4.9min
[CV] ................................. , score=0.819680, total= 5.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.0min remaining:  7.4min


[CV] ................................. , score=0.820594, total= 5.0min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.0min remaining:  3.4min


[CV] ................................. , score=0.818348, total= 5.1min
[CV] ................................. , score=0.818750, total= 5.1min
Mean NDCG for this learning rate =  0.819349504012
learning_rates_idx:  2 / 5 , value:  0.01


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.1min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.1min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.823754, total= 5.3min
[CV] ................................. , score=0.822457, total= 5.4min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.4min remaining:  8.0min


[CV] ................................. , score=0.823466, total= 5.4min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.4min remaining:  3.6min


[CV] ................................. , score=0.823187, total= 5.5min
[CV] ................................. , score=0.824220, total= 5.5min
Mean NDCG for this learning rate =  0.823416694986
learning_rates_idx:  3 / 5 , value:  0.05


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.5min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.5min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.827183, total= 5.5min
[CV] ................................. , score=0.826882, total= 5.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.5min remaining:  8.3min


[CV] ................................. , score=0.825618, total= 5.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.6min remaining:  3.7min


[CV] ................................. , score=0.825527, total= 5.6min
[CV] ................................. , score=0.824709, total= 5.6min
Mean NDCG for this learning rate =  0.825983764444
learning_rates_idx:  4 / 5 , value:  0.1


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.6min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826467, total= 5.8min
[CV] ................................. , score=0.826080, total= 6.0min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  6.0min remaining:  9.0min


[CV] ................................. , score=0.826904, total= 6.0min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  6.0min remaining:  4.0min


[CV] ................................. , score=0.825833, total= 6.0min
[CV] ................................. , score=0.825826, total= 6.0min
Mean NDCG for this learning rate =  0.82622187548
learning_rates_idx:  5 / 5 , value:  0.2


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  6.1min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  6.1min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.824451, total= 5.6min
[CV] ................................. , score=0.826745, total= 5.6min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  5.6min remaining:  8.5min


[CV] ................................. , score=0.826576, total= 5.7min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  5.7min remaining:  3.8min


[CV] ................................. , score=0.825596, total= 5.7min
[CV] ................................. , score=0.824270, total= 5.8min
Mean NDCG for this learning rate =  0.825527656512

best NDCG:
0.82622187548
best parameter learning rates:
0.1


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.8min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  5.8min finished


# Predict Countries and convert to CSV for submission

In [60]:
# Final XGBoost model built from the four hyper-parameters selected by the searches above.
XGB_model = XGBClassifier(
    objective='multi:softprob',
    max_depth=best_num_depth_XCG,
    learning_rate=best_learning_rate_XCG,
    n_estimators=best_num_estimators_XCG,
    gamma=best_gamma_XCG,
    subsample=0.5,
    colsample_bytree=0.5,
)
# NOTE(review): the searches above scored models on X_train_sparse, while this fit uses
# X_train directly; xgboost may treat entries absent from a sparse matrix as missing
# values, so confirm both inputs are interpreted the same way.
XGB_model.fit(X_train, y_labels)

# Class probabilities for the test users, then the helper's country selection per user.
# NOTE(review): presumably get5likelycountries keeps the 5 most probable classes per id —
# confirm in machine_learning_helper.
id_test = df_test_users['id']
y_pred2 = XGB_model.predict_proba(X_test)
cts2, idsubmission2 = machine_learning_helper.get5likelycountries(y_pred2, id_test)

# Map the encoded classes back to country labels.
ctsSubmission2 = label_enc.inverse_transform(cts2)

# Two-column (id, country) table written as the submission file.
df_submission2 = pd.DataFrame(
    np.column_stack((idsubmission2, ctsSubmission2)),
    columns=['id', 'country'],
)
df_submission2.to_csv('submission_country_dest_XGB.csv', index=False)

## Model 3 : SVM


In [63]:
# Candidate values for the SVM regularisation parameter C.
C = [1, 1e1, 1e2, 1e3]

# Mean cross-validated NDCG and the C value that produced it, one entry per candidate.
SVM_score_C, SVM_param_C = [], []

# Search over C.
for trial, c_candidate in enumerate(C, start=1):

    print('C_idx: ', trial, '/', len(C), ', value: ', c_candidate)

    # probability=True so the classifier exposes class probabilities.
    # NOTE(review): `sklearn.svm` is not among the explicit imports in the first cell; this
    # attribute lookup works only if another sklearn import has already loaded that submodule.
    model = sklearn.svm.SVC(C=c_candidate, probability=True)

    # One NDCG score per fold of `cv`.
    scores = model_selection.cross_val_score(
        model,
        X_train_sparse,
        y_labels,
        scoring=metrics_helper.ndcg_scorer,
        cv=cv,
        n_jobs=12,
        verbose=10,
    )
    mean_ndcg = scores.mean()
    SVM_score_C.append(mean_ndcg)
    SVM_param_C.append(c_candidate)
    print('Mean NDCG for this C = ', mean_ndcg)

# Report the best mean NDCG and keep the C that achieved it.
print('', 'best NDCG:', np.max(SVM_score_C), 'best parameter C:', sep='\n')
idx_best = np.argmax(SVM_score_C)
best_C_SVM = SVM_param_C[idx_best]
print(best_C_SVM)

C_idx:  1 / 5 , value:  0.1
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................


Process ForkPoolWorker-332:
Process ForkPoolWorker-330:
Process ForkPoolWorker-334:
Process ForkPoolWorker-331:
Process ForkPoolWorker-336:
Process ForkPoolWorker-335:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-333:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootst

TypeError: catching classes that do not inherit from BaseException is not allowed

# Predict Countries and convert to CSV for submission

In [62]:
# Refit the SVM on the whole training set using the C value picked by the
# search cell (best_C_SVM must already exist in the kernel).
# probability=True is needed for the predict_proba call below.
SVM_model = sklearn.svm.SVC(probability=True, C=best_C_SVM)
SVM_model.fit(X_train_sparse, y_labels)

# NOTE(review): the model is fitted on X_train_sparse but applied to X_test --
# confirm both hold the same feature columns in the same order.
id_test = df_test_users['id']
y_pred3 = SVM_model.predict_proba(X_test)

# Project helper; presumably keeps the five most probable classes per user
# (inferred from its name -- see machine_learning_helper for the exact contract).
cts3, idsubmission3 = machine_learning_helper.get5likelycountries(y_pred3, id_test)

# Turn the encoded class indices back into the original labels.
ctsSubmission3 = label_enc.inverse_transform(cts3)

# Assemble the (id, country) table and write it out without the index column.
df_submission3 = pd.DataFrame(
    np.column_stack((idsubmission3, ctsSubmission3)),
    columns=['id', 'country'],
)
df_submission3.to_csv('submission_country_dest_SVM.csv', index=False)

NameError: name 'best_C_SVM' is not defined

# Stand-alone SVM check: C is left at its default, the stopping tolerance is
# 0.0001 and the solver is capped at 1000 iterations.
model = sklearn.svm.SVC(
    probability=True,
    tol=0.0001,
    max_iter=1000,
    verbose=0,
    random_state=None,
)

# Cross-validated NDCG on the sparse training matrix, using 3 parallel jobs.
# The per-fold scores are stored in `scores`; this cell does not print them.
scores = model_selection.cross_val_score(
    model,
    X_train_sparse,
    y_labels,
    scoring=metrics_helper.ndcg_scorer,
    cv=cv,
    n_jobs=3,
    verbose=10,
)


## Voting
Now we are going to vote between the models optimized with their best parameters (Random Forest and XGBoost; the SVM is left out of the ensemble in the cell below).

In [67]:
# Sub-model 1: random forest built with the tuned depth and number of trees.
model1 = ensemble.RandomForestClassifier(
    n_estimators=best_num_trees_RF,
    max_depth=best_max_depth_RF,
)

# Sub-model 2: XGBoost built with the tuned hyperparameters.
model2 = XGBClassifier(
    objective='multi:softprob',
    n_estimators=best_num_estimators_XCG,
    max_depth=best_num_depth_XCG,
    learning_rate=best_learning_rate_XCG,
    gamma=best_gamma_XCG,
    subsample=0.5,
    colsample_bytree=0.5,
)

# The SVM is currently not part of the ensemble. To add it as a third voter:
#   model3 = sklearn.svm.SVC(C = best_C_SVM, probability=True)
#   estimators.append(('svm', model3))
estimators = [
    ('random_forest', model1),
    ('xgb', model2),
]

# Soft voting: the ensemble combines the predicted class probabilities of its
# sub-models rather than their hard class votes.
finalModel = ensemble.VotingClassifier(estimators, voting='soft')

# Cross-validated NDCG of the ensemble; report the mean over the folds.
results = model_selection.cross_val_score(
    finalModel,
    X_train,
    y_labels,
    scoring=metrics_helper.ndcg_scorer,
    cv=cv,
    n_jobs=12,
    verbose=10,
)
print("Voting Classifier Cross Validation Score found:")
print(results.mean())

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.826263, total=23.7min
[CV] ................................. , score=0.827403, total=23.8min
[CV] ................................. , score=0.825203, total=23.8min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed: 23.8min remaining: 35.7min
[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed: 23.8min remaining: 15.9min


[CV] ................................. , score=0.826588, total=23.8min
[CV] ................................. , score=0.824282, total=23.9min


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 23.9min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed: 23.9min finished


Voting Classifier Cross Validation Score found:
0.82594782673


## Predict countries from model

In [68]:
# Train the voting ensemble on the complete training set.
finalModel.fit(X_train, y_labels)

# Class-probability matrix for the test users, paired with their ids.
id_test = df_test_users['id']
y_pred1 = finalModel.predict_proba(X_test)

# Project helper; presumably keeps the five most probable classes per user
# (inferred from its name -- see machine_learning_helper for the exact contract).
cts1, idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)

# Turn the encoded class indices back into the original labels.
ctsSubmission1 = label_enc.inverse_transform(cts1)

## Convert to csv for submission

In [69]:
# Put ids and decoded labels side by side as the two submission columns,
# then write the table to disk without the DataFrame index.
df_submission1 = pd.DataFrame(
    data=np.column_stack((idsubmission1, ctsSubmission1)),
    columns=['id', 'country'],
)
df_submission1.to_csv('submission_country_dest_Voting.csv', index=False)

Results with :
    
    - **Voting (model1, model2) : 0.86735**
    Random Forest:
        Random Forest Best Score found: 0.795119516323
        Random Forest Best parameters set found:
        {'n_estimators': 50, 'max_depth': 8} 
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
    Voting Classifier:
        Voting Classifier Cross Validation Score found: 0.802676551693

    
    - **model2 : 0.86735**
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
