# Airbnb New User Bookings

## Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

## Loading The DataSet

In [2]:
# Read the raw train and test user tables (paths are relative to the notebook).
train_users, test_users = (
    pd.read_csv(csv_path)
    for csv_path in ('train_users_2.csv', 'test_users.csv')
)

In [3]:
# Report how many rows (users) each split contains.
n_train_rows, n_test_rows = len(train_users), len(test_users)
print("Number of users in training set =", n_train_rows)
print("Number of users in test set =", n_test_rows)

Number of users in training set = 213451
Number of users in test set = 62096


In [5]:
# Summary statistics for every training column, numeric and non-numeric alike.
train_users.describe(include="all")

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
count,213451,213451,213451.0,88908,213451,125461.0,213451,213451.0,213451,213451,213451,207386,213451,213451,213451,213451
unique,213451,1634,,1976,4,,3,,25,8,18,7,4,9,52,12
top,2gywhhthwq,2014-05-13,,2014-05-22,-unknown-,,basic,,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
freq,1,674,,248,95688,,152897,,206314,137727,137426,109232,182717,89600,63845,124543
mean,,,20130850000000.0,,,49.668335,,3.267387,,,,,,,,
std,,,9253717000.0,,,155.666612,,7.637707,,,,,,,,
min,,,20090320000000.0,,,1.0,,0.0,,,,,,,,
25%,,,20121230000000.0,,,28.0,,0.0,,,,,,,,
50%,,,20130910000000.0,,,34.0,,0.0,,,,,,,,
75%,,,20140310000000.0,,,43.0,,0.0,,,,,,,,


In [6]:
# Peek at the first five test rows.
test_users.head(5)

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,-unknown-,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,-unknown-,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,-unknown-,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [7]:
# Summary statistics for every test column, numeric and non-numeric alike.
test_users.describe(include="all")

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
count,62096,62096,62096.0,0.0,62096,33220.0,62096,62096.0,62096,62096,62096,62076,62096,62096,62096
unique,62096,92,,,4,,4,,24,7,17,7,4,9,31
top,1nig9awllr,2014-07-23,,,-unknown-,,basic,,en,direct,direct,untracked,Web,iPhone,-unknown-
freq,1,1105,,,33792,,45325,,59224,43844,43844,33949,37201,19055,17128
mean,,,20140810000000.0,,,37.616677,,7.813885,,,,,,,
std,,,80245850.0,,,74.440647,,11.254291,,,,,,,
min,,,20140700000000.0,,,1.0,,0.0,,,,,,,
25%,,,20140720000000.0,,,26.0,,0.0,,,,,,,
50%,,,20140810000000.0,,,31.0,,0.0,,,,,,,
75%,,,20140910000000.0,,,40.0,,23.0,,,,,,,


From the summaries above, we can see that the `date_first_booking` feature is always NaN in the test dataset (its count is 0), so I will remove it from both the training and the test data.

## Data Cleaning

In [4]:
# Set the target and the test ids aside before the two frames are merged.
labels = train_users['country_destination'].values
id_test = test_users['id']

# date_first_booking is dropped from both splits; the target only exists in train.
train_users = train_users.drop(columns=['country_destination', 'date_first_booking'])
test_users = test_users.drop(columns=['date_first_booking'])

# Stack train on top of test so both receive identical preprocessing.
# The index is rebuilt, so the first len(train_users) rows are the training rows.
all_users = pd.concat([train_users, test_users], axis=0, ignore_index=True)

# The id column is not used as a feature; the test ids remain available in id_test.
all_users = all_users.drop(columns='id')

all_users.head()

Unnamed: 0,date_account_created,timestamp_first_active,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,2010-06-28,20090319043255,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome
1,2011-05-25,20090523174809,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome
2,2010-09-28,20090609231247,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE
3,2011-12-05,20091031060129,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox
4,2010-09-14,20091208061105,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome


In [5]:
from datetime import datetime
# Parse both date columns. timestamp_first_active is stored as an integer of the
# form YYYYMMDDHHMMSS; integer division by 10**6 keeps only the YYYYMMDD part, so
# this feature has day granularity.
account_created = pd.to_datetime(all_users['date_account_created'])
first_active = pd.to_datetime(all_users['timestamp_first_active'] // 1000000, format='%Y%m%d')

# Convert to float seconds since the Unix epoch using vectorised arithmetic.
# Calling datetime.timestamp() on each naive value interprets it in the local
# timezone of the machine running the notebook, which makes the feature values
# environment-dependent; subtracting a fixed epoch does not have that problem
# and avoids a Python-level loop over every row.
unix_epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)
all_users['date_account_created'] = (account_created - unix_epoch) / one_second
all_users['timestamp_first_active'] = (first_active - unix_epoch) / one_second

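The common age range for travelling is between 14 and 70, so I will smooth the age distribution by clipping it to that range: values below 14 are set to 14 and values above 70 are set to 70. Missing ages are then filled with the mean of the clipped ages.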

In [6]:
# Clamp ages into [14, 70]; missing ages (NaN) pass through the clamp unchanged.
age_clamped = all_users['age'].clip(lower=14, upper=70)

# Fill the missing ages with the mean of the known (already clamped) ages.
mean_known_age = age_clamped.dropna().values.mean()
all_users['age'] = age_clamped.fillna(mean_known_age)

## Feature Engineering

In [7]:
# Columns holding categorical values that are expanded into indicator columns.
categorical_features = [
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'first_browser',
    'first_device_type',
    'gender',
    'language',
    'signup_app',
    'signup_method'
]

# One-hot encode every categorical column in a single call. Each original column
# is removed and its indicator columns ("<column>_<value>") are appended after
# the remaining columns, in the order listed above.
#
# dtype is pinned to uint8: pandas >= 2.0 returns bool indicators by default, and
# a frame mixing float and bool columns yields an object array from `.values`,
# which is not a usable numeric feature matrix.
all_users = pd.get_dummies(all_users, columns=categorical_features, dtype=np.uint8)

In [8]:
from sklearn.preprocessing import LabelEncoder

# all_users was built as the train rows followed by the test rows, so the first
# train_users_n rows of the feature matrix are the training samples.
train_users_n = len(train_users)
feature_matrix = all_users.values
X_train, X_test = feature_matrix[:train_users_n], feature_matrix[train_users_n:]

# Encode the destination labels as integers; `le` is kept so that predicted
# class indices can be mapped back to country codes later.
le = LabelEncoder()
y_train = le.fit_transform(labels)

In [None]:
def generate_answer(y_pred, classifer_name, ids=None, encoder=None, top_k=5):
    """Build the submission table of the most probable countries per user and save it.

    For every row of ``y_pred`` the ``top_k`` classes with the highest predicted
    probability are selected (most probable first), decoded back to their labels
    and paired with the corresponding user id.

    Parameters
    ----------
    y_pred : array-like of shape (n_users, n_classes)
        Predicted class probabilities, one row per user.
    classifer_name : str
        Base name of the output file; the table is written to
        ``classifer_name + '.csv'``.
    ids : array-like of length n_users, optional
        User ids in the same order as the rows of ``y_pred``. Defaults to the
        notebook-level ``id_test``.
    encoder : object with ``inverse_transform``, optional
        Maps class indices back to labels. Defaults to the notebook-level ``le``.
    top_k : int, optional
        Number of classes to keep per user (default 5). If fewer classes are
        available, all of them are kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` and ``country`` with ``top_k`` consecutive rows per user.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1, ``y_pred`` is not 2-D, or the number of
        ids does not match the number of prediction rows.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if ids is None:
        ids = id_test
    if encoder is None:
        encoder = le

    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    proba = np.asarray(y_pred)
    # Positional values: a label-based lookup such as ids[i] would fail (or pick
    # the wrong row) whenever the ids Series does not carry a default 0..n-1 index.
    id_values = np.asarray(ids)

    if proba.ndim != 2:
        raise ValueError("y_pred must be a 2-D array of shape (n_users, n_classes)")
    if proba.shape[0] != id_values.shape[0]:
        raise ValueError("y_pred has %d rows but %d ids were given"
                         % (proba.shape[0], id_values.shape[0]))

    # Never ask for more classes than the model predicts.
    k = min(top_k, proba.shape[1])

    # Sort each row ascending, reverse it, and keep the k best class indices.
    ranked = np.argsort(proba, axis=1)[:, ::-1][:, :k]

    # Decode all indices in one call instead of once per user.
    countries = np.asarray(encoder.inverse_transform(ranked.ravel()))

    sub = pd.DataFrame(
        {'id': np.repeat(id_values, k), 'country': countries},
        columns=['id', 'country'],
    )
    sub.to_csv(classifer_name+'.csv',index=False)
    return sub

## Classification

### XGBClassifier

In [20]:
from xgboost.sklearn import XGBClassifier

# Hyper-parameters of the boosted-tree model; 'multi:softprob' makes the model
# output one probability per class, which generate_answer ranks per user.
xgb_params = dict(
    max_depth=6,
    learning_rate=0.3,
    n_estimators=25,
    objective='multi:softprob',
    subsample=0.5,
    colsample_bytree=0.5,
    seed=0,
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

# Class probabilities for the test users, written out as 'XGB.csv'.
y_pred_xgb = xgb.predict_proba(X_test)
generate_answer(y_pred_xgb, 'XGB')

Unnamed: 0,id,country
0,5uwns89zht,NDF
1,5uwns89zht,US
2,5uwns89zht,other
3,5uwns89zht,FR
4,5uwns89zht,IT
5,jtl0dijy2j,NDF
6,jtl0dijy2j,US
7,jtl0dijy2j,other
8,jtl0dijy2j,FR
9,jtl0dijy2j,ES
