# Machine Learning 

In [1]:
import time

import numpy as np
import pandas as pd
import scipy as sp
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes
import sklearn.svm
#from sklearn.model_selection import ShuffleSplit
from sklearn import ensemble
from sklearn import model_selection
from sklearn.model_selection import KFold, train_test_split, ShuffleSplit
from xgboost.sklearn import XGBClassifier

import machine_learning_helper as machine_learning_helper
import metrics_helper as metrics_helper


## Read .csv files

In [2]:
# Load the five CSV inputs from the working directory.
# Cleaned user tables for the train and test sets.
df_train_users = pd.read_csv("cleaned_train_user.csv")
df_test_users = pd.read_csv("cleaned_test_user.csv")
# Three per-user session tables; they are joined on 'id' in the next cell.
# NOTE(review): presumably produced by an earlier preprocessing notebook - confirm.
df_time_mean_user_id = pd.read_csv("time_mean_user_id.csv")
df_time_total_user_id = pd.read_csv("time_total_user_id.csv")
df_total_action_user_id = pd.read_csv("total_action_user_id.csv")

## Construct sessions data frame

In [3]:
# Rename the action table's two columns so it exposes the shared 'id' join key.
df_total_action_user_id.columns = ['id', 'action']

# Outer-join the three per-user session tables on 'id' so that a user present
# in any one of them is kept, then give the result its final column names.
df_sessions = (
    df_time_mean_user_id
    .merge(df_time_total_user_id, on='id', how='outer')
    .merge(df_total_action_user_id, on='id', how='outer')
)
df_sessions.columns = ['id', 'time_mean_user', 'time_total_user', 'action']
# Not the last expression of the cell, so this preview is not displayed.
df_sessions.head()

# Report the shapes of the raw user tables.
print("X_train has dimension:",df_train_users.shape)
print("X_test has dimension:",df_test_users.shape)

X_train has dimension: (213451, 16)
X_test has dimension: (62096, 15)


## 1. From data frame to matrix : Construct y_train

What we want now for the training is 2 matrices X_train (matrix of relevant features) and y_train (booking dest)

In [4]:
# Build the training target from the raw training frame.
# `y_labels` is passed as `y` to every fit / cross-validation call below;
# `label_enc` is used later through `inverse_transform` to turn predicted
# class indices back into country labels for the submission files.
# NOTE(review): the encoding itself lives in machine_learning_helper - confirm there.
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)

## 2. From data frame to matrix : Construct X_train & X_test

### Feature engineering.
Add 3 features : 
- time_mean_user
- time_total_user
- total_action_user


In [5]:
# Number of training rows (the first `df_train_len` rows of `df_all` are the training users).
df_train_len = df_train_users.shape[0]
# Training features without the target column.
# NOTE(review): `df_train` is not used by the concat below, which stacks the
# raw `df_train_users` (target column included) - confirm this is intended.
df_train = df_train_users.drop(['country_destination'],axis=1)
# Stack train and test users into one frame with a fresh 0..n-1 index.
df_all = pd.concat((df_train_users, df_test_users), axis=0, ignore_index=True)
# Attach the session features to every user. `on` and `left_index` are
# mutually exclusive in pandas.merge (combining them raises MergeError),
# so the join is done on the 'id' column only.
df_all = pd.merge(df_all, df_sessions, on='id', how='left')

In [6]:
# Build the train and test feature matrices from the user tables and the
# per-user session features. `X_train` exposes `.values` (it is converted to
# a sparse matrix further down), so it is presumably a DataFrame - confirm in
# machine_learning_helper.buildFeatsMat.
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)

## Cross validation
5 folds cross validation, using ndcg as scoring metric.


In [7]:
# Optional down-sampling for quick experiments (disabled by default):
#X_train = X_train[100000:100100]
#y_labels = y_labels[100000:100100]
#X_test = X_test[10000:60000]

# 5-fold splitter shared by every cross-validation run in this notebook.
cv = KFold(n_splits=5, random_state=None)

## Machine Learning 
First, several algorithms are tried and tuned through cross-validation and grid search. Because the search takes a long time, the code is set up to run on 3 processors in parallel. The timings below were measured on a MacBook Pro with 4 CPUs and 16 GB of RAM.
Computational time
GridsearchCrossValidation:

- 5000 data
    Random Forest : 27 fits 10,1s
    XGB : 36 fits 60s
    
- 10000 data
    Random Forest : 27 fits 22 s
    XGB : 36 fits 140s
    
- 50000 data
    Random Forest : 27 fits 342 s
    XGB : 36 fits 1100s

Our final model is composed of a voting classifier composed of the previous models optimized.


Models that were tried:
- **Random Forest** with the following parameters:

    - 'max_depth': [ 4, 6, 8]
    - 'n_estimators': [ 50, 100, 150]


- **eXtreme Gradient Boosting (XGBoost)**:
    - 'max_depth': [6,8,10],
    - 'learning_rate': [0.3],
    - 'n_estimators': [10,15,20,25],
    - 'objective': ['multi:softprob'],
    - 'gamma': [0],
    - 'subsample': [0.5],
    - 'colsample_bytree': [0.5],
    - 'seed': [0]

- Voting classifier:
    - Soft 
    
The metric used is the nDCG.

## Model 1 : RandomForest

Grid Search to find best parameter.

In [8]:
# Sparse (CSR) copy of the training features; every cross-validation run
# below is fed this matrix.
X_train_sparse = sp.sparse.csr_matrix(X_train.values)

# Candidate values for the two hyperparameters, which are tuned one at a time.
number_trees = [ 50, 100, 125, 150, 200, 300, 400, 500, 600, 700]
max_depth = [6, 8, 10, 12, 14, 16, 20, 22]

# Mean NDCG per candidate (`*_score_*`) and the candidate value stored at the
# same position (`*_param_*`).
rf_score_trees = []
rf_score_depth = []
rf_param_trees = []
rf_param_depth = []

# Sweep 1: vary n_estimators while max_depth is held at 14.
for number_trees_idx, number_trees_value in enumerate(number_trees):

    print('number_trees_idx: ',number_trees_idx+1,'/',len(number_trees),', value: ', number_trees_value)

    # Random forest
    rand_forest_model = ensemble.RandomForestClassifier(n_estimators=number_trees_value, max_depth=14)

    # Cross-validated NDCG for this candidate
    scores = model_selection.cross_val_score(rand_forest_model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_trees.append(scores.mean())
    rf_param_trees.append(number_trees_value)
    print('Mean NDCG for this number_trees = ', scores.mean())

# Best number of trees from the sweep above
print()
print('best NDCG:')
print(np.max(rf_score_trees))
print('best parameter num_trees:')
idx_best = np.argmax(rf_score_trees)
best_num_trees_RF = rf_param_trees[idx_best]
print(best_num_trees_RF)

# Sweep 2: vary max_depth using the best n_estimators found above.
for max_depth_idx, max_depth_value in enumerate(max_depth):

    print('max_depth_idx: ',max_depth_idx+1,'/',len(max_depth),', value: ', max_depth_value)

    # Random forest. The original referenced the undefined name
    # `best_num_trees` here (NameError); the value selected by sweep 1 is
    # stored in `best_num_trees_RF`.
    rand_forest_model = ensemble.RandomForestClassifier(n_estimators=best_num_trees_RF, max_depth=max_depth_value)

    # Cross-validated NDCG for this candidate
    scores = model_selection.cross_val_score(rand_forest_model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_depth.append(scores.mean())
    rf_param_depth.append(max_depth_value)
    print('Mean NDCG for this max:_depth = ', scores.mean())

# Best max_depth from the sweep above
print()
print('best NDCG:')
print(np.max(rf_score_depth))
print('best parameter max_depth:')
idx_best = np.argmax(rf_score_depth)
best_max_depth_RF = rf_param_depth[idx_best]
print(best_max_depth_RF)

Random forest 600 trees, 16 depth NDCG = 0.821472784776

# Predict countries and convert to CSV for submission

# Fit the random forest on the full training set with fixed hyperparameters
# (700 trees, depth 16).
rand_forest_model = ensemble.RandomForestClassifier(n_estimators=700, max_depth=16)
rand_forest_model.fit(X_train_sparse, y_labels)

# Class probabilities for every test user, reduced by the helper to the
# most likely countries per user together with the matching ids.
id_test = df_test_users['id']
y_pred1 = rand_forest_model.predict_proba(X_test)
cts1, idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)

# Map encoded classes back to country labels.
ctsSubmission1 = label_enc.inverse_transform(cts1)

# Two-column submission table written without the index.
df_submission1 = pd.DataFrame(
    np.column_stack((idsubmission1, ctsSubmission1)),
    columns=['id', 'country'],
)
df_submission1.to_csv('submission_country_dest_RF.csv', index=False)

## Model 2 : eXtreme Gradient Boosting (XGBoost)

5 folds cross validation, using ndcg as scoring metric.

Grid Search to find best parameter.

In [9]:
# Candidate grids for the three XGBoost hyperparameters (tuned one at a time,
# each in its own cell).
learning_rates = [ 0.1,0.2,0.3,0.4]
max_depth = [6, 8, 10, 12, 14, 16, 20]
n_estimators = [10,30,50,75,100]

# Mean NDCG per candidate and the candidate stored at the same position.
# The `rf_` prefix is inherited from the random-forest cell; these lists
# hold XGBoost results from here on.
rf_score_rates, rf_param_rates = [], []
rf_score_depth, rf_param_depth = [], []
rf_score_estimators, rf_param_estimators = [], []

# Sweep over n_estimators with max_depth=10 and learning_rate=0.2 held fixed.
for n_estimators_idx, n_estimators_value in enumerate(n_estimators):

    print('n_estimators_idx: ',n_estimators_idx+1,'/',len(n_estimators),', value: ', n_estimators_value)

    # XGBoost classifier for this candidate
    model = XGBClassifier(
        max_depth=10,
        learning_rate=0.2,
        n_estimators=n_estimators_value,
        objective='multi:softprob',
        subsample=0.5,
        colsample_bytree=0.5,
        seed=0,
    )

    # Cross-validated NDCG, one score per fold
    scores = model_selection.cross_val_score(
        model, X_train_sparse, y_labels,
        cv=cv, verbose=10, n_jobs=12, scoring=metrics_helper.ndcg_scorer,
    )
    rf_param_estimators.append(n_estimators_value)
    rf_score_estimators.append(scores.mean())
    print('Mean NDCG for this n_estimators = ', scores.mean())

# Report and keep the candidate with the highest mean NDCG.
idx_best = np.argmax(rf_score_estimators)
best_num_estimators_XCG = rf_param_estimators[idx_best]
print()
print('best NDCG:')
print(np.max(rf_score_estimators))
print('best parameter num_estimators:')
print(best_num_estimators_XCG)

n_estimators_idx:  1 / 5 , value:  10
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ................................. , score=0.842489, total= 1.4min
[CV] ................................. , score=0.832185, total= 1.5min


[Parallel(n_jobs=12)]: Done   2 out of   5 | elapsed:  1.5min remaining:  2.3min


[CV] ................................. , score=0.831306, total= 1.6min


[Parallel(n_jobs=12)]: Done   3 out of   5 | elapsed:  1.6min remaining:  1.0min


[CV] ................................. , score=0.784253, total= 1.6min
[CV] ................................. , score=0.813171, total= 1.6min
Mean NDCG for this n_estimators =  0.820680718775
n_estimators_idx:  2 / 5 , value:  30


[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  1.6min remaining:    0.0s
[Parallel(n_jobs=12)]: Done   5 out of   5 | elapsed:  1.6min finished


[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................


Process ForkPoolWorker-20:
Process ForkPoolWorker-21:
Process ForkPoolWorker-23:
Process ForkPoolWorker-18:
Process ForkPoolWorker-24:
Process ForkPoolWorker-22:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-10c5c787c41d>", line 22, in <module>
    scores = model_selection.cross_val_score(model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
  File "/home/mint/anaconda3/lib/python3.5/site-packages/sklearn/model_selection/_validation.py", line 140, in cross_val_score
    for train, test in cv_iter)
  File "/home/mint/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 768, in __call__
    self.retrieve()
  File "/home/mint/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 719, in retrieve
    raise exception
  File "/home/mint/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 682, in retrieve
    self._output.extend(job.get(t

KeyboardInterrupt: 

In [None]:
# Sweep over max_depth with learning_rate=0.2 and n_estimators=10 held fixed.
# Results go into the `rf_score_depth` / `rf_param_depth` lists created in
# the previous cell.
for max_depth_idx, max_depth_value in enumerate(max_depth):

    print('max_depth_idx: ',max_depth_idx+1,'/',len(max_depth),', value: ', max_depth_value)

    # XGBoost classifier for this candidate
    model = XGBClassifier(
        max_depth=max_depth_value,
        learning_rate=0.2,
        n_estimators=10,
        objective='multi:softprob',
        subsample=0.5,
        colsample_bytree=0.5,
        seed=0,
    )

    # Cross-validated NDCG, one score per fold
    scores = model_selection.cross_val_score(
        model, X_train_sparse, y_labels,
        cv=cv, verbose=10, n_jobs=12, scoring=metrics_helper.ndcg_scorer,
    )
    rf_param_depth.append(max_depth_value)
    rf_score_depth.append(scores.mean())
    print('Mean NDCG for this max_depth = ', scores.mean())

# Report and keep the max_depth with the highest mean NDCG.
idx_best = np.argmax(rf_score_depth)
best_num_depth_XCG = rf_param_depth[idx_best]
print()
print('best NDCG:')
print(np.max(rf_score_depth))
print('best parameter max_depth:')
print(best_num_depth_XCG)

max_depth_idx:  1 / 7 , value:  6


Process ForkPoolWorker-28:
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Process ForkPoolWorker-27:
Process ForkPoolWorker-25:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
 

KeyboardInterrupt: 

Process ForkPoolWorker-29:
Process ForkPoolWorker-26:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/mint/anaconda3/lib/python3.5/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/home/mint/anaconda3/lib/python3.5/site-packages/sklearn/externals/joblib/pool.py", line 360, in get
    racquire()
  File "/home/mint/anaconda3/lib/python3.5/site-pa

In [None]:
# Sweep over learning_rate with max_depth=6 and n_estimators=10 held fixed.
# Results go into the `rf_score_rates` / `rf_param_rates` lists created in
# the n_estimators cell.
for learning_rates_idx, learning_rates_value in enumerate(learning_rates):

    print('learning_rates_idx: ',learning_rates_idx+1,'/',len(learning_rates),', value: ', learning_rates_value)

    # XGB
    XGB_model = XGBClassifier(max_depth=6, learning_rate=learning_rates_value, n_estimators=10,objective='multi:softprob',
                      subsample=0.5, colsample_bytree=0.5, seed=0)

    # Score the classifier built for THIS learning rate. The original passed
    # the stale `model` left over from the max_depth cell, so every iteration
    # evaluated the same estimator and the learning rate never varied.
    scores = model_selection.cross_val_score(XGB_model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    rf_score_rates.append(scores.mean())
    rf_param_rates.append(learning_rates_value)
    print('Mean NDCG for this learning rate = ', scores.mean())

# Best learning rate from the sweep above
print()
print('best NDCG:')
print(np.max(rf_score_rates))
print('best parameter learning rates:')
idx_best = np.argmax(rf_score_rates)
best_learning_rate_XCG = rf_param_rates[idx_best]
print(best_learning_rate_XCG)

learning_rates_idx:  1 / 4 , value:  0.1


ImportError: [joblib] Attempting to do parallel computing without protecting your import on a system that does not support forking. To use parallel-computing in a script, you must protect your main loop using "if __name__ == '__main__'". Please see the joblib documentation on Parallel for more information

ERROR:tornado.general:Uncaught exception, closing connection.
Traceback (most recent call last):
  File "/home/mint/anaconda3/lib/python3.5/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/mint/anaconda3/lib/python3.5/site-packages/tornado/stack_context.py", line 275, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/mint/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 276, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/mint/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 228, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/mint/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 412, in execute_request
    self._abort_queues()
  File "/home/mint/anaconda3/lib/python3.5/site-packages/ipykernel/kernelbase.py", line 628, in _abort_queues
    self._abort_queue(stream)
  File "/home/mint/anaconda3/lib/python3.5/site

# Predict countries and convert to CSV for submission

In [None]:
# Fit XGBoost on the full training set using the three values selected by
# the sweeps above.
XGB_model = XGBClassifier(
    max_depth=best_num_depth_XCG,
    learning_rate=best_learning_rate_XCG,
    n_estimators=best_num_estimators_XCG,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
XGB_model.fit(X_train, y_labels)

# Class probabilities for every test user, reduced by the helper to the
# most likely countries per user together with the matching ids.
id_test = df_test_users['id']
y_pred2 = XGB_model.predict_proba(X_test)
cts2, idsubmission2 = machine_learning_helper.get5likelycountries(y_pred2, id_test)

# Map encoded classes back to country labels.
ctsSubmission2 = label_enc.inverse_transform(cts2)

# Two-column submission table written without the index.
df_submission2 = pd.DataFrame(
    np.column_stack((idsubmission2, ctsSubmission2)),
    columns=['id', 'country'],
)
df_submission2.to_csv('submission_country_dest_XGB.csv', index=False)

## Model 3 : SVM


In [None]:
# Candidate values for the SVM regularisation parameter C.
C = [1e-2,1e-1,1,1e1,1e2]

# Mean NDCG per candidate and the candidate stored at the same position.
SVM_score_C = []
SVM_param_C = []

# Sweep over C
for C_idx, C_value in enumerate(C):

    print('C_idx: ',C_idx+1,'/',len(C),', value: ', C_value)

    # SVM with probability estimates enabled
    SVM_model = sklearn.svm.SVC(C = C_value, probability=True)

    # Score the SVM built for THIS value of C. The original passed the stale
    # `model` from an earlier cell, so no SVM was ever evaluated here.
    scores = model_selection.cross_val_score(SVM_model, X_train_sparse, y_labels, cv=cv, verbose = 10, n_jobs = 12, scoring=metrics_helper.ndcg_scorer)
    SVM_score_C.append(scores.mean())
    SVM_param_C.append(C_value)
    print('Mean NDCG for this C = ', scores.mean())

# Best C from the sweep above
print()
print('best NDCG:')
print(np.max(SVM_score_C))
# The original label said 'num_estimators', which is not an SVM parameter.
print('best parameter C:')
idx_best = np.argmax(SVM_score_C)
best_C_SVM = SVM_param_C[idx_best]
print(best_C_SVM)

# Predict countries and convert to CSV for submission

In [None]:
# Fit the SVM on the full training set using the C selected by the sweep.
SVM_model = sklearn.svm.SVC(C=best_C_SVM, probability=True)
SVM_model.fit(X_train_sparse, y_labels)

# Class probabilities for every test user, reduced by the helper to the
# most likely countries per user together with the matching ids.
id_test = df_test_users['id']
y_pred3 = SVM_model.predict_proba(X_test)
cts3, idsubmission3 = machine_learning_helper.get5likelycountries(y_pred3, id_test)

# Map encoded classes back to country labels.
ctsSubmission3 = label_enc.inverse_transform(cts3)

# Two-column submission table written without the index.
df_submission3 = pd.DataFrame(
    np.column_stack((idsubmission3, ctsSubmission3)),
    columns=['id', 'country'],
)
df_submission3.to_csv('submission_country_dest_SVM.csv', index=False)

# Stand-alone SVM run with an iteration cap (max_iter=1000) and default C.
model = sklearn.svm.SVC(
    tol=0.0001,
    verbose=0,
    random_state=None,
    probability=True,
    max_iter=1000,
)
#model.fit(X_train_sparse,y_labels)

# Cross-validated NDCG, one score per fold; this run uses 3 workers.
scores = model_selection.cross_val_score(
    model, X_train_sparse, y_labels,
    cv=cv, verbose=10, n_jobs=3, scoring=metrics_helper.ndcg_scorer,
)


## Voting
Now we are going to vote between the 3 models optimized with their best parameters

In [None]:
# Random-forest hyperparameters are pinned here rather than taken from the
# sweep results.
best_max_depth_RF =16
best_num_trees_RF = 700

# The three tuned base models.
model1 = ensemble.RandomForestClassifier(
    max_depth=best_max_depth_RF,
    n_estimators=best_num_trees_RF,
)
model2 = XGBClassifier(
    max_depth=best_num_depth_XCG,
    learning_rate=best_learning_rate_XCG,
    n_estimators=best_num_estimators_XCG,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
model3 = sklearn.svm.SVC(
    C=best_C_SVM,
    tol=0.0001,
    verbose=0,
    random_state=None,
    probability=True,
    max_iter=1000,
)

# (name, estimator) pairs in the order they are handed to the ensemble.
estimators = [
    ('random_forest', model1),
    ('xgb', model2),
    ('svm', model3),
]

# Soft-voting ensemble over the three base models.
finalModel = ensemble.VotingClassifier(estimators, voting='soft')

# Cross-validated NDCG of the ensemble, one score per fold.
results = model_selection.cross_val_score(
    finalModel, X_train, y_labels,
    cv=cv, scoring=metrics_helper.ndcg_scorer, verbose=10,
)
print("Voting Classifier Cross Validation Score found:")
print(results.mean())

## Predict countries from model

In [None]:
# --- Submission 1: the voting ensemble ---
finalModel.fit(X_train,y_labels)
y_pred1 = finalModel.predict_proba(X_test)
id_test = df_test_users['id']
cts1,idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)

# Map encoded classes back to country labels.
ctsSubmission1 = label_enc.inverse_transform(cts1)

# --- Submission 2: the XGBoost model (model2) on its own ---
finalModel2 = model2
finalModel2.fit(X_train,y_labels)
# Predict with finalModel2. The original called finalModel.predict_proba
# here, which made the second submission a copy of the voting one.
y_pred2 = finalModel2.predict_proba(X_test)
cts2,idsubmission2 = machine_learning_helper.get5likelycountries(y_pred2, id_test)

# Map encoded classes back to country labels.
ctsSubmission2 = label_enc.inverse_transform(cts2)

## Convert to csv for submission

In [None]:
# Voting-ensemble submission: one (id, country) row per prediction, no index column.
df_submission1 = pd.DataFrame(
    np.column_stack((idsubmission1, ctsSubmission1)),
    columns=['id', 'country'],
)
df_submission1.to_csv('submission_country_dest_Voting.csv', index=False)

# Second submission, built from the `idsubmission2` / `ctsSubmission2` arrays.
df_submission2 = pd.DataFrame(
    np.column_stack((idsubmission2, ctsSubmission2)),
    columns=['id', 'country'],
)
df_submission2.to_csv('submission_country_dest2.csv', index=False)

Results with :
    
    - **Voting (model1, model2) : 0.86735**
    Random Forest:
        Random Forest Best Score found: 0.795119516323
        Random Forest Best parameters set found:
        {'n_estimators': 50, 'max_depth': 8} 
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
    Voting Classifier:
        Voting Classifier Cross Validation Score found: 0.802676551693

    
    - **model2 : 0.86735**
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
