#### Feature Engineering

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression

In [2]:
train_users = pd.read_csv('Data/train_users_2.csv')

In [3]:
train_users['gender'] = train_users['gender'].replace('-unknown-', np.nan)
train_users['first_browser'] = train_users['first_browser'].replace('-unknown-', np.nan)
train_users['age'] = train_users['age'].apply(lambda x: np.nan if x > 120 else x)

In [4]:
df_train = train_users

In [5]:
session = pd.read_csv('data/sessions.csv')
session.head(15)

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0
5,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,7703.0
6,d1mm9tcy42,lookup,,,Windows Desktop,115.0
7,d1mm9tcy42,personalize,data,wishlist_content_update,Windows Desktop,831.0
8,d1mm9tcy42,index,view,view_search_results,Windows Desktop,20842.0
9,d1mm9tcy42,lookup,,,Windows Desktop,683.0


#### Session Features
The first feature I want to explore is the number of devices a user accesses the app through. My hunch is that if the user uses a lot of devices, it implies s/he travels very often and hence and would be likely to book an Airbnb. An additional hypothesis is that if the person travels a lot, s/he must probably be a business traveler and hence would likely be inclined to book Airbnbs within the United States.

The second feature I want is the total number of seconds the user has spent on Airbnb Sessions. It will be interesting to see how this correlates with out classes.

The third feature is average seconds per session. My assumption is that if someone spends more time per sesion that implies the user is more likly to book.

The fourth feature is total number of sessions. As with total seconds, this might indicate greater interest.

The final feature I want to look at is number of short sessions. These are sessions less than 300 seconds long, it may be an indicator that the user didn't like the search results. 

In [6]:
def session_features(df):
    df['total_seconds'] = df['id'].apply(lambda x: total_seconds[x] if x in total_seconds else 0)
    df['average_seconds'] = df['id'].apply(lambda x: average_seconds[x] if x in average_seconds else 0)
    df['total_sessions'] = df['id'].apply(lambda x: total_sessions[x] if x in total_sessions else 0)
    df['distinct_sessions'] = df['id'].apply(lambda x: distinct_sessions[x] if x in distinct_sessions else 0)
    df['num_short_sessions'] = df['id'].apply(lambda x: num_short_sessions[x] if x in num_short_sessions else 0)
    df['num_long_sessions'] = df['id'].apply(lambda x: num_long_sessions[x] if x in num_long_sessions else 0)
    df['num_devices'] = df['id'].apply(lambda x: num_devices[x] if x in num_devices else 0)
    return df

In [7]:
# This feature is creating flag for Mobile_Safari category in the feature 'first_browser'

def browsers(df):
    df['first_browser'] = df['first_browser'].apply(lambda x: "Mobile_Safari" if x == "Mobile Safari" else x)
    major_browsers = ['Chrome', 'Safari', 'Firefox', 'IE', 'Mobile_Safari']
    df['first_browser'] = df['first_browser'].apply(lambda x: 'Other' if x not in major_browsers else x)
    return df

In [8]:
# # This feature is creating flag for device type

def classify_device(x):
    if x.find('Desktop') != -1:
        return 'Desktop'
    elif x.find('Tablet') != -1 or x.find('iPad') != -1:
        return 'Tablet'
    elif x.find('Phone') != -1:
        return 'Phone'
    else:
        return 'Unknown'

In [9]:
def devices(df):
    df['first_device_type'] = df['first_device_type'].apply(classify_device)
    return df

In [10]:
def affiliate_tracked(df):
    df['first_affiliate_tracked'] = df['first_affiliate_tracked'].fillna('Unknown')
    df['first_affiliate_tracked'] = df['first_affiliate_tracked'].apply(lambda x: 'Other' if x != 'Unknown' and x != 'untracked' else x)
    return df

In [11]:

def affiliate_provider(df):
    df['affiliate_provider'] = df['affiliate_provider'].apply(lambda x: 'rest' if x not in ['direct', 'google', 'other'] else x)
    return df

In [12]:

def affiliate_channel(df):
    df['affiliate_channel'] = df['affiliate_channel'].apply(lambda x: 'other' if x  not in ['direct', 'content'] else x)
    return df

In [13]:

def languages(df):
    df['language'] = df['language'].apply(lambda x: 'foreign' if x != 'en' else x)
    return df

In [14]:
def first_booking(df):
    df = df.drop('date_first_booking', axis=1)
    return df

In [15]:
def account_created(df):
    df = df.drop('date_account_created', axis=1)
    return df

In [16]:
def feature_engineering(df):
    df = session_features(df)
    df = df.drop('age', axis=1)
    df = browsers(df)
    df = devices(df)
    df = affiliate_tracked(df)
    df = affiliate_provider(df)
    df = affiliate_channel(df)
    df = languages(df)
    df['is_3'] = df['signup_flow'].apply(lambda x: 1 if x==3 else 0)
    df = first_booking(df)
    df = df.drop('timestamp_first_active', axis=1)
    df = account_created(df)
    df = df.set_index('id')
    df = pd.get_dummies(df, prefix='is')
    return df

In [17]:
# Explore 'secs_elapsed'
session['secs_elapsed'].describe()

count    1.043171e+07
mean     1.940581e+04
std      8.888424e+04
min      0.000000e+00
25%      2.290000e+02
50%      1.147000e+03
75%      8.444000e+03
max      1.799977e+06
Name: secs_elapsed, dtype: float64

In [18]:
len(session[session['secs_elapsed'].isnull()])

136031

In [19]:
median_secs = session['secs_elapsed'].median()
session['secs_elapsed'] = session['secs_elapsed'].fillna(median_secs)

In [20]:
session['secs_elapsed'].describe()

count    1.056774e+07
mean     1.917078e+04
std      8.833430e+04
min      0.000000e+00
25%      2.370000e+02
50%      1.147000e+03
75%      8.193000e+03
max      1.799977e+06
Name: secs_elapsed, dtype: float64

In [21]:
null_action = session[(session['action_type'].isnull()) | (session['action_detail'].isnull()) | (session['action'].isnull()) ]
null_action.head()

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0
6,d1mm9tcy42,lookup,,,Windows Desktop,115.0
9,d1mm9tcy42,lookup,,,Windows Desktop,683.0


In [22]:
null_action.shape

(1205830, 6)

In [23]:
len(null_action['action'].drop_duplicates())

19

In [24]:
session['device_type'] = session['device_type'].replace('-unknown-', np.nan)

In [25]:
# Total time spent by user
total_seconds = session.groupby('user_id')['secs_elapsed'].sum()

In [26]:
# Average time spent by user
average_seconds = session.groupby('user_id')['secs_elapsed'].mean()

In [27]:
# Total session for each user
total_sessions = session.groupby('user_id')['action'].count()

In [28]:
# Count of distinct session action taken by each user
distinct_sessions = session.groupby('user_id')['action'].nunique()

In [29]:
num_short_sessions = session[session['secs_elapsed'] <= 300].groupby('user_id')['action'].count()
num_long_sessions = session[session['secs_elapsed'] >= 2000].groupby('user_id')['action'].count()

In [30]:
# Number of different devices used by each user
num_devices = session.groupby('user_id')['device_type'].nunique()

Aligning session data with user informations.

In [31]:
df_train = session_features(df_train)

In [32]:
df_train.isnull().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                      95688
age                         88771
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser               27266
country_destination             0
total_seconds                   0
average_seconds                 0
total_sessions                  0
distinct_sessions               0
num_short_sessions              0
num_long_sessions               0
num_devices                     0
dtype: int64

In [33]:
df_train = df_train.drop('age', axis=1)


In [34]:
df_train.isnull().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                      95688
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked      6065
signup_app                      0
first_device_type               0
first_browser               27266
country_destination             0
total_seconds                   0
average_seconds                 0
total_sessions                  0
distinct_sessions               0
num_short_sessions              0
num_long_sessions               0
num_devices                     0
dtype: int64

In [35]:
df_train = browsers(df_train)
df_train = devices(df_train)
df_train = affiliate_tracked(df_train)
df_train = affiliate_provider(df_train)
df_train = affiliate_channel(df_train)
df_train = languages(df_train)

In [36]:
df_train.isnull().sum()

id                              0
date_account_created            0
timestamp_first_active          0
date_first_booking         124543
gender                      95688
signup_method                   0
signup_flow                     0
language                        0
affiliate_channel               0
affiliate_provider              0
first_affiliate_tracked         0
signup_app                      0
first_device_type               0
first_browser                   0
country_destination             0
total_seconds                   0
average_seconds                 0
total_sessions                  0
distinct_sessions               0
num_short_sessions              0
num_long_sessions               0
num_devices                     0
dtype: int64

In [37]:
# In our EDA section, we noticed that people with signup flow 3 had a disproportionate number of conversions. 
# Therefore, we will define an additional feature that identifies these users. We will also revert back our gender 
# feture by filling in the NaNs with Unknown since we deduced that it will lead to an improvement in prediction.

df_train['is_3'] = df_train['signup_flow'].apply(lambda x: 1 if x==3 else 0)
df_train['gender'] = df_train['gender'].fillna('Unknown')

In [38]:
# We will drop the first_bookings and the timsestamp features. This is because they do not add any value when 
# finally testing the model with the test set. All our test users have NaNs as first booking and the timestamp is of no 
# significance.

df_train = first_booking(df_train)
df_train = df_train.drop('timestamp_first_active', axis=1)


In [39]:
# Finally we will drop the accounts created feature as our training and test sets were separated in the middle of 2014. As a result, all test users registered only in 2014.
df_train = account_created(df_train)
df_train = df_train.set_index('id')



In [40]:
df_train.isnull().sum()

gender                     0
signup_method              0
signup_flow                0
language                   0
affiliate_channel          0
affiliate_provider         0
first_affiliate_tracked    0
signup_app                 0
first_device_type          0
first_browser              0
country_destination        0
total_seconds              0
average_seconds            0
total_sessions             0
distinct_sessions          0
num_short_sessions         0
num_long_sessions          0
num_devices                0
is_3                       0
dtype: int64

In [41]:
# Our dataset is now in a position to have one hot encoding performed on it. Let us now separate our X and y data.
class_dict = {
    'NDF': 0,
    'US': 1,
    'other': 2,
    'FR': 3,
    'CA': 4,
    'GB': 5,
    'ES': 6,
    'IT': 7,
    'PT': 8,
    'NL': 9,
    'DE': 10,
    'AU': 11
}

In [42]:
X, y = df_train.drop('country_destination', axis=1), df_train['country_destination'].apply(lambda x: class_dict[x])

In [43]:
X = pd.get_dummies(X, prefix='is')

In [44]:
train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.75, stratify=y)


#### Machine Learning
The next step is to build a classifier to train our data on and then test its performance against the test data. With all the feature engineering already done in the previous step, applying machine learning should be fairly concise.

#### Model Selection
We need to however come up with a classifier that performs the best, given the features. In such competitions, Ensemble Methods give the best results. We will train our model using two classifiers: Logistic Regression and Random Forest and 
choose the one with the best accuracy.

In [45]:
# model = LogisticRegression( dual=False, class_weight= 'balanced' , max_iter=1000, multi_class='ovr', 
#                            verbose=1)
# model.fit(train_X,train_y)
# print("Score: " + str(model.score(test_X, test_y)))


In [46]:

classifiers = [RandomForestClassifier( verbose=1), 
               LogisticRegression(verbose=1)]

for classifier in classifiers:
    classifier.fit(train_X, train_y)
    print("Score: " + str(classifier.score(test_X, test_y)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   42.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Score: 0.6030395592451698
Score: 0.5834754417855068


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   18.1s finished


With the default hyperparameters RandomForest Classifier performs better than Logistic Classifier. And the magin of differene between these two is very less. However, we will go with the Random Forest model for our final submission.

#### Hyperparameter Tuning using Grid Search Cross Validation

In [54]:
parameters = {
    'n_estimators': [100,200],
    'max_features': ['auto', 'log2'],
    'max_depth': [3,5]
}

In [55]:
clf = GridSearchCV(RandomForestClassifier(), parameters, verbose=100)
clf.fit(train_X, train_y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] max_depth=3, max_features=auto, n_estimators=100 ................
[CV]  max_depth=3, max_features=auto, n_estimators=100, score=0.583, total=   7.7s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.6s remaining:    0.0s
[CV] max_depth=3, max_features=auto, n_estimators=100 ................
[CV]  max_depth=3, max_features=auto, n_estimators=100, score=0.583, total=   7.6s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   15.2s remaining:    0.0s
[CV] max_depth=3, max_features=auto, n_estimators=100 ................
[CV]  max_depth=3, max_features=auto, n_estimators=100, score=0.583, total=   7.6s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   22.8s remaining:    0.0s
[CV] max_depth=3, max_features=auto, n_estimators=100 ................
[CV]  max_depth=3, max_features=auto, n_estimators=100, score=0.584, total=   8.1s
[Pa

[CV]  max_depth=5, max_features=log2, n_estimators=200, score=0.588, total=  23.0s
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  7.7min remaining:    0.0s
[CV] max_depth=5, max_features=log2, n_estimators=200 ................
[CV]  max_depth=5, max_features=log2, n_estimators=200, score=0.588, total=  19.0s
[Parallel(n_jobs=1)]: Done  37 out of  37 | elapsed:  8.0min remaining:    0.0s
[CV] max_depth=5, max_features=log2, n_estimators=200 ................
[CV]  max_depth=5, max_features=log2, n_estimators=200, score=0.588, total=  18.9s
[Parallel(n_jobs=1)]: Done  38 out of  38 | elapsed:  8.3min remaining:    0.0s
[CV] max_depth=5, max_features=log2, n_estimators=200 ................
[CV]  max_depth=5, max_features=log2, n_estimators=200, score=0.588, total=  21.5s
[Parallel(n_jobs=1)]: Done  39 out of  39 | elapsed:  8.7min remaining:    0.0s
[CV] max_depth=5, max_features=log2, n_estimators=200 ................
[CV]  max_depth=5, max_features=log2, n_estimators=200, score=0.

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 5], 'max_features': ['auto', 'log2'],
                         'n_estimators': [100, 200]},
             verbose=100)

In [56]:
clf.best_estimator_

RandomForestClassifier(max_depth=5)

In [60]:
classifier = RandomForestClassifier( verbose=1,max_features= 'auto' )

classifier.fit(train_X, train_y)
print("Score: " + str(classifier.score(test_X, test_y)))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   52.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Score: 0.6033956111912748


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.5s finished


#### Testing and Final Submission

In [71]:
df_test = pd.read_csv('data/test_users.csv')
df_test['gender'] = df_test['gender'].replace('-unknown-', 'Unknown')
#df_test=df_test.drop('age',axis=1)
id_test = df_test.id
df_test.head()


Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser
0,5uwns89zht,2014-07-01,20140701000006,,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
1,jtl0dijy2j,2014-07-01,20140701000051,,Unknown,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari
2,xx0ulgorjt,2014-07-01,20140701000148,,Unknown,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome
3,6c6puo6ix0,2014-07-01,20140701000215,,Unknown,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE
4,czqhjk3yfe,2014-07-01,20140701000305,,Unknown,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari


In [72]:
df_test = feature_engineering(df_test)
df_test = df_test.drop('is_weibo', axis=1)

In [134]:
pred_prob = classifier.predict_proba(df_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.6s finished


In [135]:
pred_prob = pd.DataFrame(pred_prob, index=df_test.index)
pred_prob.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
5uwns89zht,0.7,0.14,0.01,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0
jtl0dijy2j,0.77,0.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xx0ulgorjt,0.62,0.3,0.04,0.01,0.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0
6c6puo6ix0,0.88,0.05,0.04,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0
czqhjk3yfe,0.43,0.13,0.21,0.11,0.03,0.0,0.01,0.07,0.0,0.0,0.01,0.0


In [136]:

inv_classes = {v: k for k, v in class_dict.items()}
inv_classes

{0: 'NDF',
 1: 'US',
 2: 'other',
 3: 'FR',
 4: 'CA',
 5: 'GB',
 6: 'ES',
 7: 'IT',
 8: 'PT',
 9: 'NL',
 10: 'DE',
 11: 'AU'}

In [137]:
def get_top(s):
    indexes = [i for i in range(0,12)]
    lst = list(zip(indexes, s))
    top_five = sorted(lst, key=lambda x: x[1], reverse=True)[:2]
    top_five = [inv_classes[i[0]] for i in top_five]
    return str(top_five)

In [138]:
pred_prob['get_top'] = pred_prob.apply(get_top, axis=1)
pred_prob.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,get_top
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
5uwns89zht,0.7,0.14,0.01,0.0,0.0,0.0,0.0,0.15,0.0,0.0,0.0,0.0,"['NDF', 'IT']"
jtl0dijy2j,0.77,0.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"['NDF', 'US']"
xx0ulgorjt,0.62,0.3,0.04,0.01,0.0,0.0,0.01,0.02,0.0,0.0,0.0,0.0,"['NDF', 'US']"
6c6puo6ix0,0.88,0.05,0.04,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,"['NDF', 'US']"
czqhjk3yfe,0.43,0.13,0.21,0.11,0.03,0.0,0.01,0.07,0.0,0.0,0.01,0.0,"['NDF', 'other']"


In [139]:
import ast
pred_prob['get_top'] = pred_prob['get_top'].apply(lambda x: ast.literal_eval(x))

In [140]:
s = pred_prob.apply(lambda x: pd.Series(x['get_top']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'country'

In [141]:
submission = pred_prob.drop([i for i in range(0,12)] + ['get_top'], axis=1).join(s)
submission.head()

Unnamed: 0_level_0,country
id,Unnamed: 1_level_1
0010k6l0om,NDF
0010k6l0om,US
0031awlkjq,NDF
0031awlkjq,FR
00378ocvlh,NDF


In [142]:
submission.to_csv('submission.csv')

Use SHAP library to explain the used model.
Ref : https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d