# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import time
import machine_learning_helper as machine_learning_helper
import metrics_helper as metrics_helper
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold, train_test_split, ShuffleSplit
from sklearn import model_selection
from sklearn import ensemble
from xgboost.sklearn import XGBClassifier


## Read .csv files

In [2]:
# Load the cleaned user tables and the three per-user session aggregates.
# The files are read in the order listed; each lands in its own data frame.
(
    df_train_users,
    df_test_users,
    df_time_mean_user_id,
    df_time_total_user_id,
    df_total_action_user_id,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        "cleaned_train_user.csv",
        "cleaned_test_user.csv",
        "time_mean_user_id.csv",
        "time_total_user_id.csv",
        "total_action_user_id.csv",
    )
)

## Construct sessions data frame

In [3]:
# Give the action-count table the same 'id' key as the two timing tables.
df_total_action_user_id.columns = ['id', 'action']

# Outer-join the three per-user aggregates on 'id' so that a user present in
# any one of them is kept, then give the result explicit column names.
df_sessions = (
    df_time_mean_user_id
    .merge(df_time_total_user_id, on='id', how='outer')
    .merge(df_total_action_user_id, on='id', how='outer')
)
df_sessions.columns = ['id', 'time_mean_user', 'time_total_user', 'action']
# Not the last expression of the cell, so this preview is not rendered.
df_sessions.head()

# Report the shapes of the raw user tables.
print("X_train has dimension:", df_train_users.shape)
print("X_test has dimension:", df_test_users.shape)

X_train has dimension: (213451, 16)
X_test has dimension: (62096, 15)


## 1. From data frame to matrix : Construct y_train

For training we need two matrices: X_train (the matrix of relevant features) and y_train (the booking destination, stored as `y_labels` in the code).

In [4]:
# Build the training target from the training users frame.
# The helper returns two values: the labels used to fit the models and an
# encoder object (label_enc) that is later used via inverse_transform.
# NOTE(review): presumably the labels are the encoded 'country_destination'
# column - confirm against machine_learning_helper.buildTargetMat.
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)

## 2. From data frame to matrix : Construct X_train & X_test

### Feature engineering.
Add 3 features : 
- time_mean_user
- time_total_user
- total_action_user


In [5]:
# Number of training rows, i.e. where the test rows start inside df_all.
df_train_len = df_train_users.shape[0]

# Training users without the target column.
# NOTE(review): df_train is not used in the concat below, so df_all still
# carries 'country_destination' for the training rows - confirm the intent.
df_train = df_train_users.drop(['country_destination'],axis=1)

# Stack train and test users, then attach the session aggregates per user.
df_all = pd.concat((df_train_users, df_test_users), axis=0, ignore_index=True)
# Join on the 'id' column only: combining on='id' with left_index=True is
# rejected by pandas (MergeError), since the key must be either a column or
# the index, not both.
df_all = pd.merge(df_all, df_sessions, on='id', how='left')

In [6]:
# Build the train and test feature matrices from the two user frames and the
# per-user session aggregates.
# NOTE(review): the exact features and encoding are decided inside
# machine_learning_helper.buildFeatsMat - confirm there.
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)

## Cross validation
5-fold cross-validation, using NDCG as the scoring metric.


In [None]:
# Subsample the training data to keep the grid searches tractable.
# X_train and y_labels are sliced with the same bounds so rows stay aligned.
# NOTE: this cell is not idempotent - re-running it slices the already
# sliced arrays again; re-run the feature-construction cells first.
X_train = X_train[1000:80000]
y_labels = y_labels[1000:80000]

# X_test is intentionally NOT subsampled: the prediction cell pairs each row
# of predict_proba(X_test) with the full df_test_users['id'] column, so a
# truncated X_test would no longer line up with the ids and the submission
# would not cover every test user.

# Split train dataset into 5 folds 
cv = model_selection.KFold(n_splits=5, random_state=None)

## Model 1 : RandomForest

Grid search to find the best parameters.

In [None]:
# Hyper-parameter grid explored for the random forest (5 depths x 4 sizes).
tune_parameters = {
    'max_depth': [2, 4, 6, 8, 10],
    'n_estimators': [10, 50, 100, 150],
}

# Base estimator; every setting not listed in the grid keeps its default.
model = ensemble.RandomForestClassifier()

# Exhaustive search over the grid, scored with the project's NDCG scorer on
# the shared cross-validation splitter.
gridSearchRandomForest = model_selection.GridSearchCV(
    estimator=model,
    param_grid=tune_parameters,
    cv=cv,
    scoring=metrics_helper.ndcg_scorer,
)

# Run the search on the training data.
gridSearchRandomForest.fit(X_train, y_labels)

In [None]:
# Report the best cross-validated score and the parameters that achieved it,
# each on its own line below its heading.
print("Random Forest Best Score found:", gridSearchRandomForest.best_score_, sep="\n")
print("Random Forest Best parameters set found:", gridSearchRandomForest.best_params_, sep="\n")

## Model 2 : eXtreme Gradient Boosting (XGBoost)

5-fold cross-validation, using NDCG as the scoring metric.

Grid search to find the best parameters.

In [None]:
# Hyper-parameter grid for XGBoost. Only max_depth and n_estimators actually
# vary; the remaining entries are single-valued and therefore fixed.
tune_parameters = {
    'max_depth': [6, 8, 10],
    'learning_rate': [0.3],
    'n_estimators': [1, 15, 20, 25],
    'objective': ['multi:softprob'],
    'gamma': [0],
    'subsample': [0.5],
    'colsample_bytree': [0.5],
    'seed': [0],
}

# Base estimator; the grid above overrides its defaults during the search.
model = XGBClassifier()

# Exhaustive search over the grid, scored with the project's NDCG scorer on
# the shared cross-validation splitter.
gridSearchXGB = model_selection.GridSearchCV(
    estimator=model,
    param_grid=tune_parameters,
    cv=cv,
    scoring=metrics_helper.ndcg_scorer,
)

# Run the search on the training data.
gridSearchXGB.fit(X_train, y_labels)

In [None]:
# Report the best cross-validated score and the parameters that achieved it,
# each on its own line below its heading.
print("eXtreme Gradient Boosting Best Score found:", gridSearchXGB.best_score_, sep="\n")
print("eXtreme Gradient Boosting Best set found:", gridSearchXGB.best_params_, sep="\n")

## Voting
We now combine the two models, each configured with its best parameters, in a soft-voting ensemble.

In [None]:
# Create the sub models.
# Each one is rebuilt directly from the hyper-parameters selected by its grid
# search. Unpacking best_params_ (rather than re-typing objective, subsample,
# colsample_bytree, seed, ...) keeps the final models in sync with the grids:
# every tuned setting, including gamma, is carried over, and editing a grid
# cannot silently leave a stale hard-coded value here.
estimators = []
model1 = ensemble.RandomForestClassifier(**gridSearchRandomForest.best_params_)
estimators.append(('random_forest', model1))

model2 = XGBClassifier(**gridSearchXGB.best_params_)
estimators.append(('xgb', model2))

# Create Voting classifier: 'soft' voting combines the two models' predicted
# class probabilities rather than their hard labels.
finalModel = ensemble.VotingClassifier(estimators,voting='soft')

# Score the ensemble with the same CV splitter and NDCG scorer used for the
# individual grid searches, and report the mean over the folds.
results = model_selection.cross_val_score(finalModel, X_train, y_labels, cv=cv, scoring = metrics_helper.ndcg_scorer)
print(results.mean())

## Predict countries from model

In [None]:
# Fit the voting ensemble on the training data and get, for every row of
# X_test, the predicted probability of each class.
y_pred = finalModel.fit(X_train, y_labels).predict_proba(X_test)

# Ids that accompany the predictions in the submission.
# NOTE(review): id_test is the full 'id' column of df_test_users, so it must
# have exactly one entry per row of X_test - confirm they are still aligned.
id_test = df_test_users['id']

# The helper returns the selected class codes plus the matching ids.
# NOTE(review): presumably the 5 most probable classes per user - confirm in
# machine_learning_helper.get5likelycountries.
cts, idsubmission = machine_learning_helper.get5likelycountries(y_pred, id_test)

# Map the class codes back through the encoder built with the target.
ctsSubmission = label_enc.inverse_transform(cts)

## Convert to csv for submission

In [None]:
# Pair every submitted id with its predicted country and write the result as
# a two-column CSV ('id', 'country') without the index.
# The former second copy of these two lines referenced the undefined name
# `ids` (NameError) and only rewrote the same file, so it has been removed.
df_submission = pd.DataFrame(np.column_stack((idsubmission, ctsSubmission)), columns=['id', 'country'])
df_submission.to_csv('submission_country_dest.csv',index=False)