In [1]:
import os
import zipfile
import pandas as pd
import numpy as np
from datetime import date


In [2]:
# will including session data improve results?
# read session data

def zip_to_df(filepath):
    """Read a CSV stored inside a zip archive into a DataFrame.

    Parameters
    ----------
    filepath : str
        Path to the zip archive.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the last file member of the archive. When the
        archive holds several files only the last one is read, which is the
        frame the previous implementation ended up returning.

    Raises
    ------
    ValueError
        If the archive contains no file members.
    """
    # Context managers make sure both the archive and the member stream are
    # closed even when parsing fails.
    with zipfile.ZipFile(filepath) as zfile:
        # Directory entries end with '/' and cannot be parsed as CSV.
        members = [finfo for finfo in zfile.infolist()
                   if not finfo.filename.endswith('/')]
        if not members:
            raise ValueError(
                'no files found in zip archive: {}'.format(filepath))
        with zfile.open(members[-1]) as ifile:
            return pd.read_csv(ifile)

# Archive locations, relative to the notebook's working directory.
train_file, session_file = './train_users_2.csv.zip', './sessions.csv.zip'

# Load the user table (df) and the session log (ses); the two reads are
# independent of each other.
df, ses = (zip_to_df(archive) for archive in (train_file, session_file))


In [3]:
# Build one row of session features per user:
#   - 'action': number of non-null actions
#   - '<column>-<value>': how often each action_type / action_detail value
#     occurred for that user
#   - 'secs_elapsed': summed secs_elapsed
ses_grouped = ses.groupby('user_id')

act = ses_grouped['action'].count().to_frame()
sec = ses_grouped['secs_elapsed'].sum().to_frame()


def value_count_columns(column):
    """Return per-user counts of every distinct value in ``ses[column]``.

    The result is indexed by user_id and has one flat column per value,
    named '<column>-<value>'. Missing values of ``column`` are not counted
    (groupby drops NaN keys), and user/value pairs that never occur are NaN
    until the caller fills them.
    """
    counts = ses.groupby(['user_id', column]).size().unstack()
    # Flatten the names before joining: joining a single-level frame with
    # MultiIndex-column frames raises MergeError on recent pandas, and the
    # name of the Series returned by value_counts() differs across versions.
    counts.columns = [column + '-' + str(value) for value in counts.columns]
    return counts


act_type = value_count_columns('action_type')
act_det = value_count_columns('action_detail')

# join grouped data together, replace nulls with 0 and turn the user_id
# index into an 'id' column so it can be merged with the user table
ses_counts = (
    act.join(act_type)
       .join(act_det)
       .join(sec)
       .fillna(0)
       .rename_axis('id')
       .reset_index()
)




In [4]:
# Clean up user data 

# Collapse the '-unknown-' placeholder and missing values into one category.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection does not update the parent frame under pandas
# Copy-on-Write, and the np.NaN alias no longer exists in NumPy 2.0.
df['gender'] = df['gender'].replace('-unknown-', 'UNKNOWN').fillna('UNKNOWN')

# convert timestamp_first_active (YYYYMMDDHHMMSS) to datetime
df['timestamp_first_active'] = pd.to_datetime(df['timestamp_first_active'], format='%Y%m%d%H%M%S')

# set datetime values to datetime types
df['date_account_created'] = pd.to_datetime(df['date_account_created'])
df['date_first_booking'] = pd.to_datetime(df['date_first_booking'])


# the 'age' column has some extreme outliers and lots of missing values
# create a function for correcting values in age column, then apply
def check_age(age, reference_year=None):
    """Return a corrected age value.

    Parameters
    ----------
    age : float
        Raw value from the 'age' column; may be NaN.
    reference_year : int, optional
        Year that birth years are subtracted from. Defaults to the current
        calendar year, which makes the result depend on when the notebook is
        run; pass a fixed year for reproducible output.

    Returns
    -------
    float
        NaN for a missing age, ``reference_year - age`` when ``age`` is 1000
        or more (such values are assumed to be a birth year), otherwise
        ``age`` unchanged.
    """
    # pd.isna detects every NaN; a membership test against np.nan only
    # matches that exact object, and np.NaN was removed in NumPy 2.0.
    if pd.isna(age):
        return np.nan
    if age >= 1000:
        # if age >= 1000, assume that the value is actually a birth year
        if reference_year is None:
            reference_year = date.today().year
        return reference_year - age
    return age
    
# Run every raw age through check_age, element by element.
df['age'] = df['age'].map(check_age)

# also add an age_group column

def age_group(age):
    """Map a numeric age to its age-bracket label.

    Parameters
    ----------
    age : float
        Age in years; may be NaN.

    Returns
    -------
    str or None
        One of '0-14', '14-18', '19-25', '26-35', '36-50', '51-65', '66+'.
        None is returned for missing or negative ages so the caller can
        fill in its own placeholder. Note that the '0-14' label covers ages
        below 14; age 14 itself falls in '14-18'.
    """
    # NaN fails every comparison, so this guard also catches missing ages.
    if age is None or not (age >= 0):
        return None

    # (exclusive upper bound, label) in ascending order. Plain comparisons
    # also bucket fractional ages, which membership in range(...) missed.
    brackets = (
        (14, '0-14'),
        (19, '14-18'),
        (26, '19-25'),
        (36, '26-35'),
        (51, '36-50'),
        (66, '51-65'),
    )
    for upper_bound, label in brackets:
        if age < upper_bound:
            return label
    return '66+'
    
# age_group returns None for ages it cannot bucket; label those 'Unknown'.
# The filled Series is assigned back because an in-place fillna on a column
# selection does not update the parent frame under pandas Copy-on-Write.
df['age_group'] = df['age'].apply(age_group).fillna('Unknown')

# calculate number of days to book
# Only rows whose destination is not 'NDF' get a value; the assignment aligns
# on the index, so every 'NDF' row is left as NaN.
booked = df['country_destination'] != 'NDF'
df['days_to_book'] = (
    df.loc[booked, 'date_first_booking'] - df.loc[booked, 'date_account_created']
).dt.days


In [5]:
# join user data with session data
# Inner join on 'id': only users present in both tables are kept.
merged = pd.merge(df, ses_counts, on='id', how='inner')


In [6]:
# create target and feature sets
# Columns that are not used as model inputs: the user key, the target itself
# and the raw datetime columns.
non_feature_cols = [
    'id',
    'country_destination',
    'date_account_created',
    'timestamp_first_active',
    'date_first_booking',
]

# Target: the destination country of each user.
y_ses = merged['country_destination']

# Features: everything else, with non-numeric columns one-hot encoded.
X_session = merged.drop(non_feature_cols, axis='columns')
X_ses_onehot = pd.get_dummies(X_session)


In [7]:
# fill in NAs in data
# Missing ages are imputed with the mean of the non-missing ages. Columns are
# assigned with bracket indexing; attribute-style assignment silently creates
# a plain attribute instead of a column if the name does not already exist.
X_ses_onehot['age'] = X_ses_onehot['age'].fillna(X_ses_onehot['age'].mean())

# Sentinel for users that have no days_to_book value.
# NOTE(review): days_to_book is derived from date_first_booking and is only
# computed for rows whose destination is not 'NDF', so this sentinel appears
# to mark exactly the 'NDF' rows. That looks like target leakage - confirm
# whether this feature should be available to the model at all.
NO_BOOKING_DAYS = 100000
X_ses_onehot['days_to_book'] = X_ses_onehot['days_to_book'].fillna(NO_BOOKING_DAYS)

In [8]:
# create a test / train split. This is temporary, as our dataset includes a test set to test against
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from scipy import stats
from sklearn.metrics import classification_report

# Single seed shared by the split and the classifier.
random_state = 1

# Hold out 10% of the merged users for evaluation.
split = train_test_split(
    X_ses_onehot, y_ses, test_size=0.1, random_state=random_state)
X_ses_train, X_ses_test, y_ses_train, y_ses_test = split

# Train the network (fit returns the estimator itself) and predict the
# held-out users.
mlp = MLPClassifier(random_state=random_state).fit(X_ses_train, y_ses_train)
y_ses_pred = mlp.predict(X_ses_test)

# Mean accuracy on the held-out users, followed by the per-class report.
print('Score:\n', mlp.score(X_ses_test, y_ses_test))
print(classification_report(y_ses_test, y_ses_pred))

# when including session data, our accuracy actually decreases slightly
# session data is only available for 74k of the 230k total user IDs

Score:
 0.809672175562
             precision    recall  f1-score   support

         AU       0.00      0.00      0.00        13
         CA       0.00      0.00      0.00        48
         DE       0.00      0.00      0.00        19
         ES       0.00      0.00      0.00        72
         FR       0.00      0.00      0.00       145
         GB       0.00      0.00      0.00        65
         IT       0.00      0.00      0.00        88
        NDF       0.99      1.00      0.99      4480
         NL       0.00      0.00      0.00        23
         PT       0.00      0.00      0.00         7
         US       0.74      0.66      0.70      2055
      other       0.14      0.40      0.21       367

avg / total       0.81      0.81      0.81      7382



  'precision', 'predicted', average, warn_for)


In [9]:
# if we reconfigure as a 3-category classification problem, do we get better results?
def country_group(cn):
    """Collapse a destination label into 'NDF', 'US' or 'other'.

    'NDF' and 'US' are returned unchanged; every other value maps to
    'other'.
    """
    kept_labels = ('NDF', 'US')
    return cn if cn in kept_labels else 'other'

# 3-category target
y_ses3 = y_ses.apply(country_group)

# create train/test split
X_ses3_train, X_ses3_test, y_ses3_train, y_ses3_test = train_test_split(
    X_ses_onehot, y_ses3, test_size=0.1, random_state=random_state)

# using MLP
# Seed the network with the same random_state as the split; without it the
# weight initialisation, and therefore the reported score, changes on every
# run.
mlp_3 = MLPClassifier(random_state=random_state)
# fit the training data
mlp_3.fit(X_ses3_train, y_ses3_train)
# scoring
y_ses3_pred = mlp_3.predict(X_ses3_test)

# results
print('score:', mlp_3.score(X_ses3_test, y_ses3_test))
print('classification report (MLP):\n', 
      classification_report(y_ses3_test, y_ses3_pred))

# again, results are actually less accurate than the 3-category classification
# without the Session data included

score: 0.881197507451
classification report (MLP):
              precision    recall  f1-score   support

        NDF       0.99      1.00      1.00      4480
         US       0.71      0.98      0.82      2055
      other       0.23      0.00      0.01       847

avg / total       0.83      0.88      0.83      7382



In [20]:
# Preview the first rows only instead of rendering the whole per-user table.
ses_counts.head(10)

Unnamed: 0,id,action,action_type--unknown-,action_type-booking_request,action_type-booking_response,action_type-click,action_type-data,action_type-message_post,action_type-modify,action_type-partner_callback,...,action_detail-view_search_results,action_detail-view_security_checks,action_detail-view_user_real_names,action_detail-wishlist,action_detail-wishlist_content_update,action_detail-wishlist_note,action_detail-your_listings,action_detail-your_reservations,action_detail-your_trips,secs_elapsed
0,00023iyk9l,39,0.0,1.0,0.0,4.0,9.0,1.0,0.0,1.0,...,5.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,867896.0
1,0010k6l0om,63,5.0,0.0,0.0,16.0,9.0,0.0,0.0,1.0,...,10.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,586543.0
2,001wyh0pz8,90,6.0,0.0,0.0,66.0,2.0,0.0,0.0,0.0,...,66.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,282965.0
3,0028jgx1x1,31,1.0,0.0,0.0,9.0,5.0,0.0,0.0,0.0,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,297010.0
4,002qnbzfs5,782,184.0,1.0,0.0,140.0,140.0,16.0,0.0,0.0,...,125.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6487080.0
5,0031awlkjq,8,4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39574.0
6,0035hobuyj,486,9.0,0.0,0.0,206.0,41.0,3.0,0.0,0.0,...,200.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,5724670.0
7,00378ocvlh,74,20.0,0.0,0.0,5.0,4.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1827164.0
8,00389675gq,175,23.0,0.0,0.0,22.0,34.0,2.0,0.0,0.0,...,14.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,2605711.0
9,003iamz20l,163,7.0,0.0,0.0,47.0,43.0,0.0,0.0,0.0,...,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1590236.0


In [11]:
# Does Random Forest do a better job when the session data is included?
from sklearn.ensemble import RandomForestClassifier

# fit the training data
# Seed the forest with the shared random_state so the bootstrap samples and
# feature subsets, and therefore the reported score, are the same on every
# run.
rf = RandomForestClassifier(n_estimators=500, random_state=random_state)
rf.fit(X_ses_train, y_ses_train)
y_ses_pred_rf = rf.predict(X_ses_test)
# scoring
print('Score:\n', rf.score(X_ses_test, y_ses_test))
# classification report
print(classification_report(y_ses_test, y_ses_pred_rf))


Score:
 0.884584123544
             precision    recall  f1-score   support

         AU       0.00      0.00      0.00        13
         CA       0.00      0.00      0.00        48
         DE       0.00      0.00      0.00        19
         ES       0.00      0.00      0.00        72
         FR       0.00      0.00      0.00       145
         GB       0.00      0.00      0.00        65
         IT       0.00      0.00      0.00        88
        NDF       1.00      1.00      1.00      4480
         NL       0.00      0.00      0.00        23
         PT       0.00      0.00      0.00         7
         US       0.71      1.00      0.83      2055
      other       0.00      0.00      0.00       367

avg / total       0.80      0.88      0.84      7382



  'precision', 'predicted', average, warn_for)


In [14]:
# Preview the first rows only instead of rendering the whole session log.
ses.head(10)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0
5,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,7703.0
6,d1mm9tcy42,lookup,,,Windows Desktop,115.0
7,d1mm9tcy42,personalize,data,wishlist_content_update,Windows Desktop,831.0
8,d1mm9tcy42,index,view,view_search_results,Windows Desktop,20842.0
9,d1mm9tcy42,lookup,,,Windows Desktop,683.0
