# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import time
import machine_learning_helper as machine_learning_helper
import metrics_helper as metrics_helper
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold, train_test_split, ShuffleSplit
from sklearn import model_selection
from sklearn import ensemble
from xgboost.sklearn import XGBClassifier


## Read .csv files

In [2]:
df_train_users = pd.read_csv("cleaned_train_user.csv")
df_test_users = pd.read_csv("cleaned_test_user.csv")
df_time_mean_user_id = pd.read_csv("time_mean_user_id.csv")
df_time_total_user_id = pd.read_csv("time_total_user_id.csv")
df_total_action_user_id = pd.read_csv("total_action_user_id.csv")

## Construct sessions data frame

In [3]:
df_total_action_user_id.columns = ['id','action']
df_sessions = pd.merge(df_time_mean_user_id, df_time_total_user_id, on='id', how='outer')
df_sessions = pd.merge(df_sessions, df_total_action_user_id, on='id', how='outer')
df_sessions.columns = ['id','time_mean_user','time_total_user','action']
df_sessions.head()

print("X_train has dimension:",df_train_users.shape)
print("X_test has dimension:",df_test_users.shape)

X_train has dimension: (213451, 16)
X_test has dimension: (62096, 15)


## 1. From data frame to matrix : Construct y_train

What we want now for the training is 2 matrices X_train (matrix of relevant features) and y_train (booking dest)

In [16]:
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)

## 2. From data frame to matrix : Construct X_train & X_test

### Feature engineering.
Add 3 features : 
- time_mean_user
- time_total_user
- total_action_user


In [5]:
df_train_len = df_train_users.shape[0]
df_train = df_train_users.drop(['country_destination'],axis=1)
df_all = pd.concat((df_train_users, df_test_users), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_sessions, on='id', how='left', left_index=True)

In [14]:
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)

## Cross validation
5 folds cross validation, using ndcg as scoring metric.


In [7]:
#X_train = X_train[10000:60000]
#y_labels = y_labels[10000:60000]
#X_test = X_test[10000:60000]

# Split train dataset into 3 folds 
cv = model_selection.KFold(n_splits=3, random_state=None)

## Machine Learning 
First several algorithms are tried, and optimized through Cross validation and Grid Search. The code is here optimized to run on 3 processors at the same time, as it is very long. See here examples on MacBook Pro 4 cpu, RAM 16GB
Computational time
GridsearchCrossValidation:

- 5000 data
    Random Forest : 27 fits 10,1s
    XGB : 36 fits 60s
    
- 10000 data
    Random Forest : 27 fits 22 s
    XGB : 36 fits 140s
    
- 50000 data
    Random Forest : 27 fits 342
    XGB : 36 fits 1100s

Our final model is composed of a voting classifier composed of the previous models optimized.


Models that were tried:
- Random Forest with the following parameters:

    - 'max_depth': [ 4, 6, 8]
    - 'n_estimators': [ 50, 100, 150]


- eXtreme Gradient Boosting XCGB:
    - 'max_depth': [6,8,10],
    - 'learning_rate': [0.3],
    - 'n_estimators': [10,15,20,25],
    - 'objective': ['multi:softprob'],
    - 'gamma': [0],
    - 'subsample': [0.5],
    - 'colsample_bytree': [0.5],
    - 'seed': [0]

- Voting classifer:
    - Soft 
    
The metric used is the nDCG.

## Model 1 : RandomForest

Grid Search to find best parameter.

In [8]:
# Define parameters to tune the model
tune_parameters = {'max_depth': [ 4, 6, 8, 10], 'n_estimators': [ 50, 100, 150]}

# Define random forest model
model = ensemble.RandomForestClassifier(warm_start=True)

# Define GridSearch with crossValidation
gridSearchRandomForest = model_selection.GridSearchCV(model, tune_parameters, cv=cv,scoring=metrics_helper.ndcg_scorer, n_jobs=3, verbose = 10)

# Fit model to data
gridSearchRandomForest.fit(X_train, y_labels)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] n_estimators=50, max_depth=4 ....................................
[CV] n_estimators=50, max_depth=4 ....................................
[CV] n_estimators=50, max_depth=4 ....................................
[CV] ........... n_estimators=50, max_depth=4, score=0.758837 -   6.7s
[CV] n_estimators=100, max_depth=4 ...................................
[CV] ........... n_estimators=50, max_depth=4, score=0.807243 -   6.6s
[CV] n_estimators=100, max_depth=4 ...................................
[CV] ........... n_estimators=50, max_depth=4, score=0.802874 -   6.7s
[CV] n_estimators=100, max_depth=4 ...................................


[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   33.8s


[CV] .......... n_estimators=100, max_depth=4, score=0.758747 -   6.5s
[CV] n_estimators=150, max_depth=4 ...................................
[CV] .......... n_estimators=100, max_depth=4, score=0.800109 -   6.3s
[CV] n_estimators=150, max_depth=4 ...................................
[CV] .......... n_estimators=100, max_depth=4, score=0.807313 -   6.4s
[CV] n_estimators=150, max_depth=4 ...................................
[CV] .......... n_estimators=150, max_depth=4, score=0.758608 -   6.3s
[CV] n_estimators=50, max_depth=6 ....................................


[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  1.8min


[CV] .......... n_estimators=150, max_depth=4, score=0.807313 -   6.3s
[CV] .......... n_estimators=150, max_depth=4, score=0.802008 -   6.2s
[CV] n_estimators=50, max_depth=6 ....................................
[CV] n_estimators=50, max_depth=6 ....................................
[CV] ........... n_estimators=50, max_depth=6, score=0.805435 -   7.0s
[CV] ........... n_estimators=50, max_depth=6, score=0.760681 -   6.8s
[CV] ........... n_estimators=50, max_depth=6, score=0.807127 -   6.8s
[CV] n_estimators=100, max_depth=6 ...................................
[CV] n_estimators=100, max_depth=6 ...................................
[CV] n_estimators=100, max_depth=6 ...................................


[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  2.4min


[CV] .......... n_estimators=100, max_depth=6, score=0.805567 -   8.7s
[CV] n_estimators=150, max_depth=6 ...................................
[CV] .......... n_estimators=100, max_depth=6, score=0.807363 -   8.8s
[CV] .......... n_estimators=100, max_depth=6, score=0.761121 -   8.4s
[CV] n_estimators=150, max_depth=6 ...................................
[CV] n_estimators=150, max_depth=6 ...................................
[CV] .......... n_estimators=150, max_depth=6, score=0.804919 -   6.6s
[CV] n_estimators=50, max_depth=8 ....................................
[CV] .......... n_estimators=150, max_depth=6, score=0.807175 -   6.6s
[CV] n_estimators=50, max_depth=8 ....................................
[CV] .......... n_estimators=150, max_depth=6, score=0.761374 -   6.7s
[CV] n_estimators=50, max_depth=8 ....................................
[CV] ........... n_estimators=50, max_depth=8, score=0.807776 -   5.7s
[CV] n_estimators=100, max_depth=8 ...................................


[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:  4.3min


[CV] ........... n_estimators=50, max_depth=8, score=0.812423 -   6.1s
[CV] n_estimators=100, max_depth=8 ...................................
[CV] ........... n_estimators=50, max_depth=8, score=0.765160 -   6.1s
[CV] n_estimators=100, max_depth=8 ...................................
[CV] .......... n_estimators=100, max_depth=8, score=0.766011 -   6.3s
[CV] n_estimators=150, max_depth=8 ...................................
[CV] .......... n_estimators=100, max_depth=8, score=0.808071 -   6.0s
[CV] n_estimators=150, max_depth=8 ...................................
[CV] .......... n_estimators=100, max_depth=8, score=0.809802 -   6.1s
[CV] n_estimators=150, max_depth=8 ...................................
[CV] .......... n_estimators=150, max_depth=8, score=0.764560 -   6.6s


[Parallel(n_jobs=3)]: Done  25 out of  27 | elapsed:  5.7min remaining:   27.4s


[CV] .......... n_estimators=150, max_depth=8, score=0.810400 -   6.4s
[CV] .......... n_estimators=150, max_depth=8, score=0.807945 -   6.6s


[Parallel(n_jobs=3)]: Done  27 out of  27 | elapsed:  5.7min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=True),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_estimators': [50, 100, 150], 'max_depth': [4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(ndcg_score, needs_proba=True, k=5), verbose=10)

In [9]:
print("Random Forest Best Score found:")
print(gridSearchRandomForest.best_score_)
print("Random Forest Best parameters set found:")
print(gridSearchRandomForest.best_params_)

Random Forest Best Score found:
0.795119516323
Random Forest Best parameters set found:
{'n_estimators': 50, 'max_depth': 8}


## Model 2 : eXtreme Gradient Boosting XCGB

5 folds cross validation, using ndcg as scoring metric.

Grid Search to find best parameter.

In [10]:
# Define parameters to tune the model
tune_parameters = {
    'max_depth': [6,8,10],
    'learning_rate': [0.3],
    'n_estimators': [10,15,20,25],
    'objective': ['multi:softprob'],
    'gamma': [0],
    'subsample': [0.5],
    'colsample_bytree': [0.5],
    'seed': [0]
}

# Define eXtreme Gradient Boosting model
model = XGBClassifier()

# Define GridSearch with crossValidation
gridSearchXGB = model_selection.GridSearchCV(model, tune_parameters, cv=cv,scoring=metrics_helper.ndcg_scorer, n_jobs=3, verbose = 10)

# Fit model to data
gridSearchXGB.fit(X_train, y_labels)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=6, seed=0, subsample=0.5 
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=6, seed=0, subsample=0.5 
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=6, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=6, seed=0, subsample=0.5, score=0.808574 -   7.2s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=6, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=6, seed=0, subsample=0.5, score=0.771561 -   7.3s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_es

[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  1.1min


[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=6, seed=0, subsample=0.5, score=0.772696 -   7.4s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=6, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=6, seed=0, subsample=0.5, score=0.811706 -   7.3s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=6, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=6, seed=0, subsample=0.5, score=0.820233 -   7.5s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=6, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=6, seed=0, s

[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  4.2min


[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=6, seed=0, subsample=0.5, score=0.812808 -   7.8s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=6, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=6, seed=0, subsample=0.5, score=0.819907 -   7.6s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=6, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=6, seed=0, subsample=0.5, score=0.812784 -   7.1s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=8, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=6, seed=0, s

[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  6.0min


[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=8, seed=0, subsample=0.5, score=0.772323 -   6.8s
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=8, seed=0, subsample=0.5, score=0.807176 -   6.8s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=8, seed=0, subsample=0.5 
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=8, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=8, seed=0, subsample=0.5, score=0.818599 -   6.6s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=8, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=8, seed=0, s

[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed: 10.2min


[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=8, seed=0, subsample=0.5, score=0.809100 -   6.0s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=8, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=8, seed=0, subsample=0.5, score=0.817307 -   6.1s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=8, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=8, seed=0, subsample=0.5, score=0.770932 -   6.3s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=10, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=25, max_depth=8, seed=0, 

[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed: 13.0min


[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=10, max_depth=10, seed=0, subsample=0.5, score=0.817222 -   6.0s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=10, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=10, seed=0, subsample=0.5, score=0.807429 -   7.2s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=10, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=10, seed=0, subsample=0.5, score=0.769169 -   7.0s
[CV] learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=20, max_depth=10, seed=0, subsample=0.5 
[CV]  learning_rate=0.3, colsample_bytree=0.5, objective=multi:softprob, gamma=0, n_estimators=15, max_depth=10, se

[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed: 18.5min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'learning_rate': [0.3], 'colsample_bytree': [0.5], 'objective': ['multi:softprob'], 'gamma': [0], 'n_estimators': [10, 15, 20, 25], 'subsample': [0.5], 'seed': [0], 'max_depth': [6, 8, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(ndcg_score, needs_proba=True, k=5), verbose=10)

In [11]:
print("eXtreme Gradient Boosting Best Score found:")
print(gridSearchXGB.best_score_)
print("eXtreme Gradient Boosting Best set found:")
print(gridSearchXGB.best_params_)

eXtreme Gradient Boosting Best Score found:
0.802158514839
eXtreme Gradient Boosting Best set found:
{'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}


## Model 3 : SVM


In [None]:
# Define parameters to tune the model
tune_parameters = {
    'max_depth': [4, 6,8],
    'learning_rate': [0.1, 0.2, 0.3],
    'n_estimators': [20, 30, 40, 60, 80, 100],
    'objective': ['multi:softprob'],
    'gamma': [0],
    'subsample': [0.5],
    'colsample_bytree': [0.5],
    'seed': [0]
}

# Define eXtreme Gradient Boosting model
model = XGBClassifier()

# Define GridSearch with crossValidation
gridSearchXGB = model_selection.GridSearchCV(model, tune_parameters, cv=cv,scoring=metrics_helper.ndcg_scorer, n_jobs=3, verbose = 10)

# Fit model to data
gridSearchXGB.fit(X_train, y_labels)

## Voting
Now we are going to vote between the 2 models optimized with their best parameters

In [12]:
# Create the sub models
estimators = []
model1 = ensemble.RandomForestClassifier(max_depth=gridSearchRandomForest.best_estimator_.max_depth, 
                                         n_estimators=gridSearchRandomForest.best_estimator_.n_estimators)
estimators.append(('random_forest', model1))

model2 = XGBClassifier(max_depth=gridSearchXGB.best_estimator_.max_depth, 
                       learning_rate=gridSearchXGB.best_estimator_.learning_rate,
                      n_estimators= gridSearchXGB.best_estimator_.n_estimators,
                      objective='multi:softprob',
                      subsample=0.5, colsample_bytree=0.5, seed=0)
estimators.append(('xgb', model2))
# Create Voting classifier
finalModel1 = ensemble.VotingClassifier(estimators,voting='soft')
results = model_selection.cross_val_score(finalModel, X_train, y_labels, cv=cv, scoring = metrics_helper.ndcg_scorer, verbose = 10)
print("Voting Classifier Cross Validation Score found:")
print(results.mean())

0.802676551693


## Predict countries from model

In [17]:
finalModel1.fit(X_train,y_labels)
y_pred1 = finalModel1.predict_proba(X_test)  
id_test = df_test_users['id']
cts1,idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)

ctsSubmission1 = label_enc.inverse_transform(cts1)

In [20]:
finalModel2 = model2
finalModel2.fit(X_train,y_labels)
y_pred2 = finalModel.predict_proba(X_test)  
cts2,idsubmission2 = machine_learning_helper.get5likelycountries(y_pred2, id_test)

ctsSubmission2 = label_enc.inverse_transform(cts2)

## Convert to csv for submission

In [18]:
df_submission1 = pd.DataFrame(np.column_stack((idsubmission1, ctsSubmission1)), columns=['id', 'country'])
df_submission1.to_csv('submission_country_dest1.csv',index=False)

In [21]:
df_submission2 = pd.DataFrame(np.column_stack((idsubmission2, ctsSubmission2)), columns=['id', 'country'])
df_submission2.to_csv('submission_country_dest2.csv',index=False)

Results with :
    
    - **Voting (model1, model2) : 0.86735**
    Random Forest:
        Random Forest Best Score found: 0.795119516323
        Random Forest Best parameters set found:
        {'n_estimators': 50, 'max_depth': 8} 
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
    Voting Classifier:
        Voting Classifier Cross Validation Score found: 0.802676551693

    
    - **model2 : 0.86735**
    XGB:
        eXtreme Gradient Boosting Best Score found: 0.802158514839
        {'learning_rate': 0.3, 'colsample_bytree': 0.5, 'objective': 'multi:softprob', 'gamma': 0, 'n_estimators': 20, 'subsample': 0.5, 'seed': 0, 'max_depth': 6}
