# Machine Learning 

In [1]:
import pandas as pd
import numpy as np
import os
from IPython.display import Image
from IPython.core.display import HTML 
import matplotlib.pyplot as plt  
% matplotlib inline
import random
from datetime import datetime
import seaborn as sns
from pandas.tools.plotting import scatter_matrix
from scipy.sparse import coo_matrix
import copy
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes # Baseline classification techniques
import scipy.io # Import data
import time

In [2]:
df_train_users = pd.read_csv("train_users2_cleaned.csv")
#df_sessions = pd.read_csv("session_cleaned.csv")
df_test_users = pd.read_csv("test_users_cleaned.csv")
df_train_users.head()

Unnamed: 0.1,Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
1,3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
2,4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,UNKNOWN,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US
3,6,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,FEMALE,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US
4,7,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,FEMALE,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US


## 1. From data frame to matrix : Construct y_train

What we want now for the training is 2 matrices X_train (matrix of relevant features) and y_train (booking dest)

In [3]:
# extract y_train
y_train = df_train_users['country_destination']

# gen var
country_dest = df_train_users['country_destination'].unique()
nb_dest = len(country_dest)
data_len = len(y_train)

# mapping 
country_to_ix = { co:i for i,co in enumerate(country_dest) }
ix_to_country = { i:co for i,co in enumerate(country_dest) }

y_train_ix = [country_to_ix[co] for co in y_train[:data_len]]
y_train = np.array(y_train_ix)

print('\nMapping country to numbers:',country_to_ix)
print('\nMapping numbers to country:',ix_to_country)


Mapping country to numbers: {'FR': 3, 'DE': 8, 'ES': 5, 'PT': 10, 'GB': 6, 'CA': 2, 'US': 0, 'AU': 9, 'NL': 7, 'other': 1, 'IT': 4}

Mapping numbers to country: {0: 'US', 1: 'other', 2: 'CA', 3: 'FR', 4: 'IT', 5: 'ES', 6: 'GB', 7: 'NL', 8: 'DE', 9: 'AU', 10: 'PT'}


In [4]:
# Convert label values to one_hot vector 
one_hot_enable = 0
def convert_to_one_hot(a,max_val=None):
    N = a.size
    data = np.ones(N,dtype=int)
    sparse_out = coo_matrix((data,(np.arange(N),a.ravel())), shape=(N,max_val))
    return np.array(sparse_out.todense())

if one_hot_enable:
    y_train = convert_to_one_hot(y_train,nb_dest)

print(np.size(y_train))
print(y_train)

67163
[0 1 0 ..., 5 0 0]


## 2. From data frame to matrix : Construct X_train

From the cleaned files sessions and train_users we need to choose (relevant) features. 
Ex :
    - number of sessions per id 
    - type of actions 
    - number of actions (number of clicks in total)
    - device 
    - average time per session 
    - etc...

In [5]:
#



#



#

## Run machine learning with X_train and y_train

In [36]:
# Train model example with random forest 
#rand_forest_model = sklearn.ensemble.RandomForestClassifier(n_estimators=100,max_depth=15)
#rand_forest_model.fit(X_train,y_train)

# Predict from model neigh
#y_pred = rand_forest_model.predict(X_test)

# random country for testing 
y_test_len = len(df_test_users)
y = np.random.randint(nb_dest, size=y_test_len)
# ix to country 
y_pred = []
for ix in list(y):
    y_pred.append(ix_to_country[ix])
 


AttributeError: 'list' object has no attribute 'size'

## Convert to csv for submission

In [38]:
df_y_pred = pd.DataFrame(y_pred,columns = ["country"])
print(df_y_pred)
df_id = df_test_users['id']

df_submission =  pd.concat([df_id, df_y_pred] , axis=1)
print(df_submission)

df_submission.to_csv('submission_country_dest.csv')


      country
0          GB
1          US
2          AU
3          DE
4          IT
5          NL
6       other
7          DE
8          GB
9          NL
10      other
11         CA
12      other
13      other
14         AU
15         GB
16         DE
17         FR
18         DE
19         US
20         CA
21      other
22         US
23         US
24         FR
25         DE
26         NL
27         PT
28         CA
29         ES
...       ...
62066      ES
62067      DE
62068      CA
62069   other
62070      GB
62071      PT
62072   other
62073      DE
62074      FR
62075      DE
62076      PT
62077      CA
62078      PT
62079   other
62080      PT
62081      FR
62082      PT
62083      CA
62084      PT
62085      DE
62086      CA
62087      NL
62088      IT
62089      IT
62090      DE
62091      IT
62092      US
62093      FR
62094      ES
62095      IT

[62096 rows x 1 columns]
                0      1
0      5uwns89zht     GB
1      jtl0dijy2j     US
2      xx0ulgorjt     AU
3     