In [6]:
# set imports
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date


In [11]:
# this dataset provides a pre-split test and training set. 

# read in data

train_file = './train_users_2.csv.zip'
test_file = './test_users.csv.zip'

def zip_to_df(filepath):
    zfile = zipfile.ZipFile(filepath)
    for finfo in zfile.infolist():
        ifile = zfile.open(finfo)
        df = pd.read_csv(ifile)
    return df

df_train = zip_to_df(train_file)
df_test = zip_to_df(test_file)


In [10]:
# Declare functions 

# fix bad age values (convert birth years to ages)
def check_age(age):
    if age in (np.nan, np.NaN):
        return np.NaN
    elif age >= 1000:
        # if age > 1000, I assume that the age value is actually birth year
        # subtract birth year from current year to calculate age
        return date.today().year - age
    else:
        return age

# create an age group column
def age_group(age):
    if age in range(14):
        return '0-14'
    elif age in range(14,19):
        return '14-18'
    elif age in range(19, 26):
        return '19-25'
    elif age in range(26,36):
        return '26-35'
    elif age in range(36,51):
        return '36-50'
    elif age in range(51, 66):
        return '51-65'
    elif age > 65:
        return '66+'

# add a device class columns (desktop, phone, tablet, or unknown)

def get_device_class(val):
    if str(val).lower().find('desktop') > 0:
        return 'Desktop'
    elif str(val).lower().find('phone') > 0:
        return 'Phone'
    elif val in ('iPad', 'Android Tablet'):
        return 'Tablet'
    else:
        return 'Unknown'

In [12]:
# clean up the data in both the training and test sets
# Data clean up. 

df_train['gender'].replace('-unknown-', 'UNKNOWN', inplace=True)
df_train['gender'].replace(np.NaN, 'UNKNOWN', inplace=True)

df_test['gender'].replace('-unknown-', 'UNKNOWN', inplace=True)
df_test['gender'].replace(np.NaN, 'UNKNOWN', inplace=True)

# convert timestamp_first_active from string to datetime
df_train['timestamp_first_active'] = pd.to_datetime(df_train['timestamp_first_active'], format='%Y%m%d%H%M%S')
df_test['timestamp_first_active'] = pd.to_datetime(df_test['timestamp_first_active'], format='%Y%m%d%H%M%S')

# set datetime values to datetime types
df_train['date_account_created'] = pd.to_datetime(df_train['date_account_created'])
df_train['date_first_booking'] = pd.to_datetime(df_train['date_first_booking'])

df_test['date_account_created'] = pd.to_datetime(df_train['date_account_created'])
df_test['date_first_booking'] = pd.to_datetime(df_train['date_first_booking'])


# fix errant age values
df_train['age'] = df_train['age'].apply(check_age)
df_test['age'] = df_test['age'].apply(check_age)

# set age group values
df_train['age_group'] = df_train.age.apply(age_group)
df_train['age_group'].fillna('Unknown', inplace=True)

df_test['age_group'] = df_train.age.apply(age_group)
df_test['age_group'].fillna('Unknown', inplace=True)
    
# calculate number of days to book 
df_train['days_to_book'] = df_train[df_train.country_destination != 'NDF'].date_first_booking - df_train[df_train.country_destination != 'NDF'].date_account_created
df_train['days_to_book'] = df_train.days_to_book.dt.days

df_test['days_to_book'] = df_test[df_test.date_first_booking.notnull()].date_first_booking - df_test[df_test.date_first_booking.notnull()].date_account_created
df_test['days_to_book'] = df_test.days_to_book.dt.days

# add in device category column
df_train['first_device_class'] = df_train['first_device_type'].apply(get_device_class)
df_test['first_device_class'] = df_test['first_device_type'].apply(get_device_class)



In [14]:
# Separate into target and features datasets

# target variable is the destination country
y_train = df_train.country_destination
# the test set does not include the target values

# predictors are every variable except  country_desintation and the date values. 
# (days between date_account_created and date_first_booking is stored as calculated column)
X_train = df_train.drop(['id', 'country_destination', 'date_account_created', 'timestamp_first_active', 
            'date_first_booking'], axis=1)


X_test = df_test.drop(['id', 'date_account_created', 'timestamp_first_active', 
            'date_first_booking'], axis=1)

X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# fill in NAs
X_train.age.fillna(X_train.age.median(), inplace=True)
X_test.age.fillna(X_test.age.median(), inplace=True)

X_train.days_to_book = X_train.days_to_book.fillna(100000)
X_test.days_to_book = X_test.days_to_book.fillna(100000)

# the following columns are not in the X_test set because no values matched
# during one-hot encoding. Dropping from X_test. This should affect the model
# as only a small number of rows fall into these categories
for c in X_test.columns:
    if c not in X_train.columns:
        try:
            X_test = X_test.drop(c, axis=1)
        except:
            pass

for c in X_train.columns:
    if c not in X_test.columns:
        try:
            X_train = X_train.drop(c, axis=1)
        except:
            pass

In [18]:
# okay, both the train and test sets now have the same number of columns
# fit the model 

from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

mlp = MLPClassifier()
# fit the training data
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [26]:
# get predictions and output to a file
y_pred = mlp.predict(X_test)
df_results = pd.DataFrame(
    {
    'id': df_test.id,
    'country': y_pred
    }
)
df_results = df_results[['id', 'country']]
df_results.to_csv('./submission.csv', encoding='utf-8', index=False)