In [18]:
# import modules

import pandas as pd
import numpy as np
import xgboost as xgb
import datetime as dt
import matplotlib
import matplotlib.pyplot as plt
import graphviz

In [3]:
# Load the Kaggle train and test user tables.
# df_all stacks the train rows on top of the test rows (with a fresh 0..n-1 index)
# so that feature engineering is applied to both sets at once.
# `country` keeps the training labels and `id_test` the test ids for the submission.

df_train = pd.read_csv('../kaggle/train_users_2.csv')
df_test = pd.read_csv('../kaggle/test_users.csv')
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)
country = df_train['country_destination'].values
id_test = df_test['id']
# print() with a single argument behaves the same on Python 2 and Python 3.
print(country)

['NDF' 'NDF' 'US' ..., 'NDF' 'NDF' 'NDF']


In [4]:
# Count the missing values in every column to see which ones need imputing.
pd.isnull(df_all).sum()

affiliate_channel               0
affiliate_provider              0
age                        116866
country_destination         62096
date_account_created            0
date_first_booking         186639
first_affiliate_tracked      6085
first_browser                   0
first_device_type               0
gender                          0
id                              0
language                        0
signup_app                      0
signup_flow                     0
signup_method                   0
timestamp_first_active          0
dtype: int64

In [5]:
# Fill missing ages with the mean age (computed over train and test combined).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the `df_all.age` accessor: the in-place form only
# updates df_all when the accessor happens to be a view of the frame, which
# pandas does not guarantee (chained assignment / copy-on-write).
df_all['age'] = df_all['age'].fillna(df_all['age'].mean())

In [6]:
# Summarise the gender column: count, number of distinct values, most frequent value.
df_all['gender'].describe()

count        275547
unique            4
top       -unknown-
freq         129480
Name: gender, dtype: object

In [7]:
# Remove the columns that are not used as features: the user id, the
# first-booking date and the account-creation date.
df_all = df_all.drop(
    labels=['id', 'date_first_booking', 'date_account_created'],
    axis=1,
)

In [8]:
# Split timestamp_first_active into year / month / day / hour features.
# The slices below assume the value, read as a string, is laid out as
# YYYYMMDDHH followed by further digits.

df_timestring = df_all.timestamp_first_active.astype(str)

# Cast every slice to int so the features are numeric rather than strings;
# string columns would turn the downstream feature matrix into an object array.
df_timestring_year = df_timestring.str[:4].astype(int)
df_timestring_month = df_timestring.str[4:6].astype(int)
df_timestring_day = df_timestring.str[6:8].astype(int)
df_timestring_hour = df_timestring.str[8:10].astype(int)

df_all['time_year'] = df_timestring_year
df_all['time_month'] = df_timestring_month
df_all['time_day'] = df_timestring_day
df_all['time_hour'] = df_timestring_hour

# The raw timestamp is fully represented by the four columns above.
df_all = df_all.drop(['timestamp_first_active'], axis=1)
df_all.head(10)

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,first_affiliate_tracked,first_browser,first_device_type,gender,language,signup_app,signup_flow,signup_method,time_year,time_month,time_day,time_hour
0,direct,direct,47.14531,NDF,untracked,Chrome,Mac Desktop,-unknown-,en,Web,0,facebook,2009,3,19,4
1,seo,google,38.0,NDF,untracked,Chrome,Mac Desktop,MALE,en,Web,0,facebook,2009,5,23,17
2,direct,direct,56.0,US,untracked,IE,Windows Desktop,FEMALE,en,Web,3,basic,2009,6,9,23
3,direct,direct,42.0,other,untracked,Firefox,Mac Desktop,FEMALE,en,Web,0,facebook,2009,10,31,6
4,direct,direct,41.0,US,untracked,Chrome,Mac Desktop,-unknown-,en,Web,0,basic,2009,12,8,6
5,other,other,47.14531,US,omg,Chrome,Mac Desktop,-unknown-,en,Web,0,basic,2010,1,1,21
6,other,craigslist,46.0,US,untracked,Safari,Mac Desktop,FEMALE,en,Web,0,basic,2010,1,2,1
7,direct,direct,47.0,US,omg,Safari,Mac Desktop,FEMALE,en,Web,0,basic,2010,1,3,19
8,other,craigslist,50.0,US,untracked,Safari,Mac Desktop,FEMALE,en,Web,0,basic,2010,1,4,0
9,other,craigslist,46.0,US,omg,Firefox,Mac Desktop,-unknown-,en,Web,0,basic,2010,1,4,2


In [9]:
# Build the feature frame X from the selected features (more are added as we go).
# Accuracy log while adding features:
#   affiliate_channel_dummies, affiliate_provider_dummies, age, gender_dummies = 85.7
#   Added first browser 86.282
#   Added first device 86.270
#   Added Time 86.440
#   Added language, app, flow, methods 87.004

# One-hot encode every categorical column; the prefix keeps the source column
# recognisable in the resulting column names.
affiliate_channel_dummies = pd.get_dummies(df_all.affiliate_channel, prefix='af_channel')
affiliate_provider_dummies = pd.get_dummies(df_all.affiliate_provider, prefix='af_provider')
first_browser_dummies = pd.get_dummies(df_all.first_browser, prefix='first_browser')
first_device_type_dummies = pd.get_dummies(df_all.first_device_type, prefix='first_device_type')
language_dummies = pd.get_dummies(df_all.language, prefix='language')
signup_app_dummies = pd.get_dummies(df_all.signup_app, prefix='signup_app')
signup_flow_dummies = pd.get_dummies(df_all.signup_flow, prefix='signup_flow')
signup_method_dummies = pd.get_dummies(df_all.signup_method, prefix='signup_method')
gender_dummies = pd.get_dummies(df_all.gender, prefix='gender')
age = df_all.age

# Take the time features from their named df_all columns. The intermediate
# df_timestring_* Series hold the same values but all share the column name
# 'timestamp_first_active', which would leave four duplicate columns in X.
time_features = df_all[['time_year', 'time_month', 'time_day', 'time_hour']]

# The column order is kept exactly as before, so positional use of X.values is unaffected.
X = pd.concat(
    [
        affiliate_channel_dummies,
        affiliate_provider_dummies,
        age,
        gender_dummies,
        first_browser_dummies,
        first_device_type_dummies,
        time_features,
        signup_method_dummies,
        signup_flow_dummies,
        signup_app_dummies,
        language_dummies,
    ],
    axis=1,
)
X.head(10)

Unnamed: 0,af_channel_api,af_channel_content,af_channel_direct,af_channel_other,af_channel_remarketing,af_channel_sem-brand,af_channel_sem-non-brand,af_channel_seo,af_provider_baidu,af_provider_bing,...,language_ko,language_nl,language_no,language_pl,language_pt,language_ru,language_sv,language_th,language_tr,language_zh
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Build the target vector Y: the label encoder maps every country_destination
# string of the training set to an integer class id. `le` is kept so the
# predictions can be mapped back to country strings later.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
Y = le.fit_transform(df_train.country_destination)

# print() with a single argument behaves the same on Python 2 and Python 3.
print(Y.shape)

(213451,)


In [11]:
# Split the combined feature matrix back into its train and test parts.
# df_all was built as the train rows followed by the test rows, so the first
# len(df_train) rows are the training set. Deriving the boundary from df_train
# avoids a hard-coded row count that silently goes stale when the input changes.
# NOTE: X is rebound from a DataFrame to an ndarray here, so this cell can only
# be re-run after re-running the cell that builds the X frame.
n_train = len(df_train)
Vals = X.values
X = Vals[:n_train]
X_test = Vals[n_train:]
X.shape

(213451, 151)

In [12]:
# Fit the XGBoost classifier on the training matrix and predict the class
# probabilities of every test row.
# Parameter tuning log (accuracy in %):
#   max depth 7        - 86.967
#   max depth 5        - 87.01
#   n_estimators 25    - 87.01
#   n_estimators 30    - 87.033
#   learning_rate .3   - 87.033 - best !!!
#   learning rate .15  - 86.926
#   learning rate .4   - 87.002
xgbc = xgb.XGBClassifier(
    max_depth=5,
    learning_rate=0.3,
    n_estimators=30,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
xgbc.fit(X, Y)
y_pred = xgbc.predict_proba(X_test)

In [None]:
# For every test id keep the 5 most probable countries, most probable first.
# Vectorised: a single argsort over the whole probability matrix and a single
# inverse_transform call replace one of each per test user.

# Sort the class indices of each row by ascending probability, reverse each row
# to get descending order, then keep the first 5 columns.
top_classes = np.argsort(y_pred, axis=1)[:, ::-1][:, :5]

# Repeat every id 5 times so it lines up with its 5 countries. np.asarray makes
# this positional, so it does not depend on the index labels of id_test.
ids = np.repeat(np.asarray(id_test), 5).tolist()  #list of ids
cts = le.inverse_transform(top_classes.ravel()).tolist()  #list of countries

assert len(ids) == len(cts), "every id must be paired with exactly one country per row"

In [None]:
# Write the submission file: one (id, country) pair per row, without the index column.
submission_rows = np.column_stack((ids, cts))
sub = pd.DataFrame(submission_rows, columns=['id', 'country'])
sub.to_csv('sub.csv', index=False)
