# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import machine_learning_helper as machine_learning_helper
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes # Baseline classification techniques

## Read .csv files

In [2]:
df_train_users = pd.read_csv("cleaned_train_user.csv")
df_test_users = pd.read_csv("cleaned_test_user.csv")
df_time_mean_user_id = pd.read_csv("time_mean_user_id.csv")
df_time_total_user_id = pd.read_csv("time_total_user_id.csv")
df_total_action_user_id = pd.read_csv("total_action_user_id.csv")

## Construct sessions data frame

In [3]:
df_total_action_user_id.columns = ['id','action']
df_sessions = pd.merge(df_time_mean_user_id, df_time_total_user_id, on='id', how='outer')
df_sessions = pd.merge(df_sessions, df_total_action_user_id, on='id', how='outer')
df_sessions.columns = ['id','time_mean_user','time_total_user','action']
df_sessions.head()

print("X_train has dimension:",df_train_users.shape)
print("X_test has dimension:",df_test_users.shape)

X_train has dimension: (213451, 16)
X_test has dimension: (62096, 15)


## 1. From data frame to matrix : Construct y_train

What we want now for the training is 2 matrices X_train (matrix of relevant features) and y_train (booking dest)

In [4]:
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)

## 2. From data frame to matrix : Construct X_train & X_test

In [5]:
df_train_len = df_train_users.shape[0]
df_train = df_train_users.drop(['country_destination'],axis=1)
df_all = pd.concat((df_train_users, df_test_users), axis=0, ignore_index=True)
df_all = pd.merge(df_all, df_sessions, on='id', how='left', left_index=True)

print(df_all['action'].isnull().values.sum())
print(len(df_train['id'].unique()))
print(len(df_train['id']))

141490
213451
213451


In [6]:
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)

In [7]:
print("X_train has dimension:",X_train.shape)
print("X_test has dimension:",X_test.shape)

#X_train['time_mean_user'][137000:137100]

X_train has dimension: (213451, 156)
X_test has dimension: (62096, 156)


## Learn model from X_train & y_labels

In [8]:
# split train set into folds
batch_size = int(X_train.shape[0]/5)

print(batch_size/X_train.shape[0])
fold1 = X_train[:batch_size] # used for testing
fold2 = X_train[batch_size:] # used for training

model = sklearn.ensemble.RandomForestClassifier(n_estimators=100,max_depth=6)
model.fit(fold2,y_labels[batch_size:])

0.19999906301680478


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=6, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [9]:
#model = machine_learning_helper.trainRandForest(X_train, y_labels)

## Predict countries from model

In [10]:
y_pred = model.predict_proba(fold1)  
id_test = df_train_users['id'][:batch_size]
#Taking the 5 classes with highest probabilities
ids = []  #list of ids
cts = []  #list of countries
for i in range(len(id_test)):
    idx = id_test[i]
    ids += [idx] * 5
    cts += (np.argsort(y_pred[i])[::-1])[:5].tolist()

In [11]:
#test_users_len = len(df_test_users['id'])
#idc,cts = machine_learning_helper.predictCountries(model, X_test, test_users_len)

In [12]:
#cts = label_enc.inverse_transform(cts)

In [24]:
y_true = y_labels[:batch_size]
k = 5
predictions = np.zeros((len(y_true), k))

for i in range(len(y_true)):
    for j in range(k):
        if y_true[i] == cts[i+j]:
            predictions[i,j] = 1
            
score_array = []
for array in predictions:
    score = machine_learning_helper.ndcg_at_k(array, 5 , method = 1)
    score_array.append(score)

score_final =np.mean(score_array)

print(score_final)
score = machine_learning_helper.ndcg_at_k(predictions[-1], 5 , method = 1)
print(score)


[[ 1.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  1.]
 ..., 
 [ 0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.]
 [ 0.  1.  0.  0.  0.]]
0.559237568064
0.630929753571


## Convert to csv for submission

In [None]:
#df_submission = pd.DataFrame(np.column_stack((df_test_users['id'], y_pred)), columns=['id', 'country'])
#df_submission.to_csv('submission_country_dest.csv',index=False)

In [None]:
df_submission = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
df_submission.to_csv('submission_country_dest.csv',index=False)