In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
%matplotlib inline


from sklearn.preprocessing import MinMaxScaler
from keras.utils import np_utils
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import *
from sklearn.grid_search import GridSearchCV

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#Load Processed data
train_users = pd.read_csv('../tempData/Processed_Train_User.csv')
session = pd.read_csv('../tempData/Processed_Session.csv')

In [3]:
# Join session and train data
Final = train_users.join(session.set_index('user_id'), on='id')
Final = Final[Final.id.isnull()==False]
Final.update(Final.fillna(-1))
display(Final.head())

Unnamed: 0,id,age,signup_flow,language,affiliate_provider,first_browser,country_destination,dac_year,dac_month,dac_day,...,first_device_type_Desktop (Other),first_device_type_Mac Desktop,first_device_type_Other/Unknown,first_device_type_SmartPhone (Other),first_device_type_Windows Desktop,first_device_type_iPad,first_device_type_iPhone,count,secs_elapsed,secs_elapsed50
0,gxn3p5htnn,-1.0,0,5,4,8,NDF,2010,6,28,...,0,1,0,0,0,0,0,-1.0,-1.0,-1.0
1,820tgsjxq7,38.0,0,5,8,8,NDF,2011,5,25,...,0,1,0,0,0,0,0,-1.0,-1.0,-1.0
2,4ft3gnwmtx,56.0,3,5,4,21,US,2010,9,28,...,0,0,0,0,1,0,0,-1.0,-1.0,-1.0
3,bjjt8pjhuk,42.0,0,5,4,17,other,2011,12,5,...,0,1,0,0,0,0,0,-1.0,-1.0,-1.0
4,87mebub9p4,41.0,0,5,4,8,US,2010,9,14,...,0,1,0,0,0,0,0,-1.0,-1.0,-1.0


In [4]:
"""Metrics to compute the model performance."""

import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import make_scorer
from keras.utils.np_utils import to_categorical


def dcg_score(y_true, y_score, k=5):
    """Discounted cumulative gain (DCG) at rank K.
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(ground_truth,predictions, k=5):
    """Normalized discounted cumulative gain (NDCG) at rank K.
    """
    #lb = LabelBinarizer()
    #lb.fit(range(len(predictions) + 1))
    #T = lb.transform(ground_truth)
    T=(np.array(to_categorical(ground_truth)))
    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

In [5]:

trainlabel = Final['country_destination']
titleMap = {'NDF':0, 'US':1, 'other':2, 'FR':3, 'CA':4, 'GB':5 ,'ES':6, 'IT':7, 'PT':8, 'NL':9, 'DE':10, 'AU':11}

trainlabel= trainlabel.map(titleMap)
trainFeatures = (Final.drop('country_destination',axis = 1))

In [6]:
#Normalize the data
scaler = MinMaxScaler()
numericCol = trainFeatures.columns[1:]
trainFeatures[numericCol] = scaler.fit_transform(trainFeatures[numericCol] )
trainFeatures.head()

Unnamed: 0,id,age,signup_flow,language,affiliate_provider,first_browser,dac_year,dac_month,dac_day,tfa_hour,...,first_device_type_Desktop (Other),first_device_type_Mac Desktop,first_device_type_Other/Unknown,first_device_type_SmartPhone (Other),first_device_type_Windows Desktop,first_device_type_iPad,first_device_type_iPhone,count,secs_elapsed,secs_elapsed50
0,gxn3p5htnn,0.0,0.0,0.208333,0.235294,0.156863,0.0,0.454545,0.9,0.173913,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,820tgsjxq7,0.386139,0.0,0.208333,0.470588,0.156863,0.25,0.363636,0.8,0.73913,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4ft3gnwmtx,0.564356,0.12,0.208333,0.235294,0.411765,0.0,0.727273,0.9,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,bjjt8pjhuk,0.425743,0.0,0.208333,0.235294,0.333333,0.25,1.0,0.133333,0.26087,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,87mebub9p4,0.415842,0.0,0.208333,0.235294,0.156863,0.0,0.727273,0.433333,0.26087,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:

X, X_test, Y, y_test = train_test_split(trainFeatures, 
                                                    trainlabel, 
                                                    test_size = 0.1, 
                                                    random_state = 0)

X_train, X_valid, y_train, y_valid = train_test_split(X, 
                                                    Y, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

clf = RandomForestClassifier(random_state=100,n_estimators = 10,min_samples_split=50,criterion = 'entropy')
clf = clf.fit(X_train[X_train.columns[1:]], y_train)


pred = clf.predict_proba(X_valid[X_valid.columns[1:]])
print('Valid : ' ,ndcg_score(y_valid,pred,5))



Valid :  0.8230682027437065


In [8]:
pred = clf.predict_proba(X_test[X_test.columns[1:]])
print('Test : ' ,ndcg_score(y_test,pred,5))

Test :  0.8244925060328597


In [9]:
# XGBClassfier with gridsearchcv(ndcg_scorer)

X, X_test, Y, y_test = train_test_split(trainFeatures, 
                                                    trainlabel, 
                                                    test_size = 0.1, 
                                                    random_state = 0)

X_train, X_valid, y_train, y_valid = train_test_split(X, 
                                                    Y, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

xgb_model  = XGBClassifier()
parameters = {'max_depth':[8], 
              'learning_rate':[0.1], 
              'n_estimators':[100],
              'objective': ['rank:pairwise'], 
              'subsample':[0.8],
              'colsample_bytree': [0.8],
               'missing' : [None],
               'silent':[True],  
               'seed':[0]
             } 

clf = GridSearchCV(xgb_model, parameters, n_jobs=2, 
                   cv=StratifiedKFold(y_train, n_folds=10, shuffle=True), 
                   scoring=ndcg_scorer,
                   verbose=2, refit=True)         

        
clf = clf.fit(X_train[X_train.columns[1:]], y_train)

pred = clf.predict_proba(X_valid[X_valid.columns[1:]])
ndcg_score(y_valid,pred,5)

pred = clf.predict_proba(X_valid[X_valid.columns[1:]])
print('Valid : ' ,ndcg_score(y_valid,pred,5))


################USE ONLY FOR FINAL RUN################################
pred = clf.predict_proba(X_test[X_test.columns[1:]])
print('Test : ' ,ndcg_score(y_test,pred,5))


Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=rank:pairwise, seed=0, silent=True, subsample=0.8 
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=rank:pairwise, seed=0, silent=True, subsample=0.8 
[CV]  colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=rank:pairwise, seed=0, silent=True, subsample=0.8 -10.2min
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=rank:pairwise, seed=0, silent=True, subsample=0.8 
[CV]  colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=rank:pairwise, seed=0, silent=True, subsample=0.8 -10.3min
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=rank:pairwise, seed=0, silent=True, subsample=0.8 
[CV]  col

[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed: 51.4min finished


Valid :  0.8274657737461266
Test :  0.8286871702696362


In [10]:
X, X_test, Y, y_test = train_test_split(trainFeatures, 
                                                    trainlabel, 
                                                    test_size = 0.1, 
                                                    random_state = 0)

X_train, X_valid, y_train, y_valid = train_test_split(X, 
                                                    Y, 
                                                    test_size = 0.2, 
                                                    random_state = 0)

xgb_model  = XGBClassifier()
parameters = {'max_depth':[8], 
              'learning_rate':[0.1], 
              'n_estimators':[100],
              'objective': ['ndcg@5'], 
              'subsample':[0.8],
              'colsample_bytree': [0.8],
               'missing' : [None],
               'silent':[True],  
               'seed':[0]
             } 

clf = GridSearchCV(xgb_model, parameters, n_jobs=2, 
                   cv=StratifiedKFold(y_train, n_folds=10, shuffle=True), 
                   scoring=ndcg_scorer,
                   verbose=2, refit=True)         

        
clf = clf.fit(X_train[X_train.columns[1:]], y_train)

pred = clf.predict_proba(X_valid[X_valid.columns[1:]])
ndcg_score(y_valid,pred,5)

pred = clf.predict_proba(X_valid[X_valid.columns[1:]])
print('Valid : ' ,ndcg_score(y_valid,pred,5))


################USE ONLY FOR FINAL RUN################################


pred = clf.predict_proba(X_test[X_test.columns[1:]])
print('Test : ' ,ndcg_score(y_test,pred,5))

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=ndcg@5, seed=0, silent=True, subsample=0.8 
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=ndcg@5, seed=0, silent=True, subsample=0.8 
[CV]  colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=ndcg@5, seed=0, silent=True, subsample=0.8 -10.2min
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=ndcg@5, seed=0, silent=True, subsample=0.8 
[CV]  colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=ndcg@5, seed=0, silent=True, subsample=0.8 -10.3min
[CV] colsample_bytree=0.8, learning_rate=0.1, max_depth=8, missing=None, n_estimators=100, objective=ndcg@5, seed=0, silent=True, subsample=0.8 
[CV]  colsample_bytree=0.8, learning_rate=0.1, max_

[Parallel(n_jobs=2)]: Done  10 out of  10 | elapsed: 51.3min finished


Valid :  0.8274657737461266
Test :  0.8286871702696362
