# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import machine_learning_helper as machine_learning_helper

## Read .csv files

In [2]:
# Cleaned user tables: training set and test set.
df_train_users, df_test_users = (
    pd.read_csv(csv_name)
    for csv_name in ("cleaned_train_user.csv", "cleaned_test_user.csv")
)

# Per-user tables that are merged into the sessions frame further down.
df_time_mean_user_id, df_time_total_user_id, df_total_action_user_id = (
    pd.read_csv(csv_name)
    for csv_name in (
        "time_mean_user_id.csv",
        "time_total_user_id.csv",
        "total_action_user_id.csv",
    )
)

## Construct sessions data frame

In [3]:
# Give the action table explicit column names so it can be joined on 'id'.
df_total_action_user_id.columns = ['id', 'action']

# Outer joins keep every id that appears in at least one of the three tables.
df_sessions = (
    df_time_mean_user_id
    .merge(df_time_total_user_id, on='id', how='outer')
    .merge(df_total_action_user_id, on='id', how='outer')
)
# Positional rename: assumes the merged frame has exactly these four columns,
# in this order (id, then one value column from each input table).
df_sessions.columns = ['id', 'time_mean_user', 'time_total_user', 'action']

# Shapes of the raw user frames. The printed labels are kept exactly as before
# so the recorded output still matches.
print("X_train has dimension:", df_train_users.shape)
print("X_test has dimension:", df_test_users.shape)

X_train has dimension: (213451, 16)
X_test has dimension: (62096, 15)


## 1. From data frame to matrix : Construct y_train

For training we need two matrices: `X_train` (the matrix of relevant features) and `y_train` (the booking destination of each training user).

In [4]:
# Build the training target from the training users frame. The helper returns
# a pair: the target values and an encoder object whose inverse_transform is
# used later in this notebook to map predictions back.
# NOTE(review): presumably y_labels are the encoded 'country_destination'
# values -- confirm in machine_learning_helper.buildTargetMat.
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)

## 2. From data frame to matrix : Construct X_train & X_test

In [5]:
# Diagnostic cell: check how many users have no session data and whether the
# training ids are unique.

# Number of training rows.
df_train_len = df_train_users.shape[0]

# Training features only: the target column is dropped so that the train and
# test frames share the same columns.
df_train = df_train_users.drop(['country_destination'], axis=1)

# Stack the label-free training frame on top of the test frame. The original
# concatenated df_train_users here, which left df_train unused and carried the
# target column into df_all.
df_all = pd.concat((df_train, df_test_users), axis=0, ignore_index=True)

# Left join keeps every user and attaches session aggregates where available.
# `on` and `left_index=True` are mutually exclusive in pandas (current
# versions raise MergeError when both are given), so only `on='id'` is used.
df_all = pd.merge(df_all, df_sessions, on='id', how='left')

# Users without any session record end up with a missing 'action' value.
print(df_all['action'].isnull().values.sum())
# Distinct ids vs. total rows in the training frame: equal means no duplicates.
print(len(df_train['id'].unique()))
print(len(df_train['id']))

141490
213451
213451


In [6]:
# Build the train and test feature matrices from the two user frames and the
# merged sessions frame. The feature engineering itself lives in
# machine_learning_helper.buildFeatsMat and is not visible in this notebook.
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)

In [7]:
# Sanity check: both matrices should report the same number of feature columns.
print("X_train has dimension:", X_train.shape)
print("X_test has dimension:", X_test.shape)

# Peek at a positional window of the session-time feature (rows 137000-137099).
X_train['time_mean_user'].iloc[137000:137100]

X_train has dimension: (213451, 156)
X_test has dimension: (62096, 156)


134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
134056        -1.000000
48691      27202.611111
129061     25980.250000
16658      75696.266667
86983      24868.211921
17861        365.000000
45649     690144.000000
126226      8278.833333
20245      13453.622222
103174     30580.976190
              ...      
70655       9548.368421
59382       7243.062500
84881       9254.119048
77609      24784.531250
108420     29231.666667
107423      7472.051282
49239      10734.560976
98629      20718.333948
8407       16296.666667
10142      72741.470588
134056        -1

## Learn model from X_train & y_labels

In [8]:
# Fit the model on the training features and target via the project helper.
# NOTE(review): the helper name suggests a random forest; confirm the estimator
# and whether a random seed is fixed inside
# machine_learning_helper.trainRandForest so results are reproducible.
model = machine_learning_helper.trainRandForest(X_train, y_labels)

## Predict countries from model

In [9]:
# Run the fitted model on the test feature matrix via the project helper.
# NOTE(review): presumably one prediction per test row, in the encoded label
# space of label_enc -- confirm in machine_learning_helper.predictCountries.
y_pred = machine_learning_helper.predictCountries(model,X_test)

In [10]:
# Map the predictions back through the encoder returned by buildTargetMat.
# NOTE(review): this rebinds y_pred to the transformed result, so the cell is
# not idempotent -- re-running it alone would feed the already-transformed
# values back into inverse_transform. Re-run the prediction cell first.
y_pred = label_enc.inverse_transform(y_pred)

## Convert to csv for submission

In [11]:
# Pair each test user id with its predicted value, one row per user.
submission_rows = np.column_stack((df_test_users['id'], y_pred))
df_submission = pd.DataFrame(submission_rows, columns=['id', 'country'])

# Write the submission file without the DataFrame index column.
df_submission.to_csv('submission_country_dest.csv', index=False)