### Imports

In [92]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, confusion_matrix
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

In [93]:
#read in data that we cleaned
df = pd.read_csv('cleaned_dataframe.csv')

In [94]:
#sanity check
df

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,lat_destination,lng_destination,distance_km,destination_km2,destination_language,language_levenshtein_distance
0,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,1,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US,36.966427,-95.84403,0.000,9826675.0,eng,0.0
1,87mebub9p4,2010-09-14,20091208061105,2010-02-18,3,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US,36.966427,-95.84403,0.000,9826675.0,eng,0.0
2,osr2jwljor,2010-01-01,20100101215619,2010-01-02,3,33.0,basic,0,en,other,other,omg,Web,Mac Desktop,Chrome,US,36.966427,-95.84403,0.000,9826675.0,eng,0.0
3,lsw9q7uk0j,2010-01-02,20100102012558,2010-01-05,1,46.0,basic,0,en,other,craigslist,untracked,Web,Mac Desktop,Safari,US,36.966427,-95.84403,0.000,9826675.0,eng,0.0
4,0d01nltbrs,2010-01-03,20100103191905,2010-01-13,1,47.0,basic,0,en,direct,direct,omg,Web,Mac Desktop,Safari,US,36.966427,-95.84403,0.000,9826675.0,eng,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77300,egikxlfmh8,2014-06-26,20140626022743,2014-06-26,1,32.0,basic,0,en,seo,google,linked,Web,Windows Desktop,Chrome,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
77301,oitccptcw6,2014-06-26,20140626065256,2014-07-28,0,48.0,basic,0,en,sem-brand,google,untracked,Web,Mac Desktop,Safari,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
77302,86s1bo34bg,2014-06-30,20140630023309,2015-06-29,1,40.0,facebook,0,en,sem-brand,google,omg,Web,Mac Desktop,Safari,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0
77303,ms2453k9vj,2014-06-30,20140630052141,2014-08-22,1,63.0,basic,0,en,seo,google,omg,Web,Windows Desktop,Chrome,AU,-26.853388,133.27516,15297.744,7741220.0,eng,0.0


### Feature Engineering

We chose the below features, and decided to binarize the country_destination column. The country_destination column was skewed to the US, so we created a binary version of this column of 1 = US, 0= any other country within the list, or NDF/Other.

In [95]:
# creating feature list
feat_list = ['id', 'age', 'gender', 'signup_method', 'affiliate_provider', 'distance_km', 'destination_km2', 'destination_language ', 'language_levenshtein_distance', 'country_destination']

In [96]:
# new dataframe
df_mf = df[feat_list].copy()

In [97]:
df_mf

Unnamed: 0,id,age,gender,signup_method,affiliate_provider,distance_km,destination_km2,destination_language,language_levenshtein_distance,country_destination
0,4ft3gnwmtx,56.0,1,basic,direct,0.000,9826675.0,eng,0.0,US
1,87mebub9p4,41.0,3,basic,direct,0.000,9826675.0,eng,0.0,US
2,osr2jwljor,33.0,3,basic,other,0.000,9826675.0,eng,0.0,US
3,lsw9q7uk0j,46.0,1,basic,craigslist,0.000,9826675.0,eng,0.0,US
4,0d01nltbrs,47.0,1,basic,direct,0.000,9826675.0,eng,0.0,US
...,...,...,...,...,...,...,...,...,...,...
77300,egikxlfmh8,32.0,1,basic,google,15297.744,7741220.0,eng,0.0,AU
77301,oitccptcw6,48.0,0,basic,google,15297.744,7741220.0,eng,0.0,AU
77302,86s1bo34bg,40.0,1,facebook,google,15297.744,7741220.0,eng,0.0,AU
77303,ms2453k9vj,63.0,1,basic,google,15297.744,7741220.0,eng,0.0,AU


In [98]:
df_mf.shape

(77305, 10)

In [99]:
df_mf

Unnamed: 0,id,age,gender,signup_method,affiliate_provider,distance_km,destination_km2,destination_language,language_levenshtein_distance,country_destination
0,4ft3gnwmtx,56.0,1,basic,direct,0.000,9826675.0,eng,0.0,US
1,87mebub9p4,41.0,3,basic,direct,0.000,9826675.0,eng,0.0,US
2,osr2jwljor,33.0,3,basic,other,0.000,9826675.0,eng,0.0,US
3,lsw9q7uk0j,46.0,1,basic,craigslist,0.000,9826675.0,eng,0.0,US
4,0d01nltbrs,47.0,1,basic,direct,0.000,9826675.0,eng,0.0,US
...,...,...,...,...,...,...,...,...,...,...
77300,egikxlfmh8,32.0,1,basic,google,15297.744,7741220.0,eng,0.0,AU
77301,oitccptcw6,48.0,0,basic,google,15297.744,7741220.0,eng,0.0,AU
77302,86s1bo34bg,40.0,1,facebook,google,15297.744,7741220.0,eng,0.0,AU
77303,ms2453k9vj,63.0,1,basic,google,15297.744,7741220.0,eng,0.0,AU


### Dummify & Binarize US vs other data

In [100]:
#creating dummies
df_mf['country_destination'] = df_mf['country_destination'].map(
    {'AU': 0, 'CA': 0, 'DE': 0, 'ES': 0,
     'FR': 0, 'GB': 0, 'IT': 0, 'NL': 0,
    'PT': 0, 'US': 1})

In [101]:
df_mf['country_destination'].value_counts()

1    61196
0    16109
Name: country_destination, dtype: int64

In [102]:
df_mf.shape

(77305, 10)

In [103]:
dummy_list = ['gender','signup_method', 'affiliate_provider', 'destination_language ']

In [104]:
df_mfd = pd.get_dummies(df_mf, columns=dummy_list
                        , drop_first = True)

In [105]:
df_mfd.shape

(77305, 32)

### Modelling

# Model 1: Logistic Regression

This model is not overfit, with training/testing scores at about .98 each and strong accuracy score. 

This beats the baseline of .79 as well. 

In [106]:
# Define features X and target y
X = df_mfd.drop(columns=['country_destination','id'])
y = df_mfd['country_destination']
# Train test split our data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [107]:
# Instantiate and fit model
logr = LogisticRegression(solver='liblinear')
logr.fit(X_train, y_train)
# Get our scores
print('Cross val score:', cross_val_score(logr, X_train, y_train, cv=5).mean())
print('Training score:', logr.score(X_train, y_train))
print('Testing score:', logr.score(X_test, y_test))

Cross val score: 0.981510220249626
Training score: 0.9815102280175239
Testing score: 0.9832876287059554


In [108]:
from sklearn.metrics import recall_score, precision_score
preds = logr.predict(X)
print('accuracy:', logr.score(X, y))
print('recall:', recall_score(y, preds))
print('precision:', precision_score(y, preds))

accuracy: 0.9819545954336718
recall: 1.0
precision: 0.9777124506718218


## Baseline score 
- if we were to predict US for every single classification, we would be correct 79% of the time. This means we are looking to beat this score so the model is actually useful to us!

In [109]:
y_train.value_counts(normalize=True)

1    0.79218
0    0.20782
Name: country_destination, dtype: float64

In [110]:
# creating new data frame
y_preds_test = logr.predict(X_test)
cm = confusion_matrix(y_test, y_preds_test)
tn, fp, fn, tp = cm.ravel()
df_pred = pd.DataFrame(cm, columns=['Predicted 0', 'Predicted 1'], index=['Actual 0', 'Actual 1'])

In [111]:
df_pred

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3737,323
Actual 1,0,15267


# Model 2: Random Forest

As we can see there is again, little evidence of overfitting with a train test score of 1/1 aka perfect accuracy overall. This significantly beats the baseline accuracy of .79.

Not a lot of downfalls of this model given the high accuracy. And given the fit and accuracy, this is selected as our production model.

In [112]:
# Define features X and target y
X = df_mfd.drop(columns=['country_destination', 'id'])
y = df_mfd['country_destination']


In [113]:
# Train test split our data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
ss = StandardScaler()
Z_train = ss.fit_transform(X_train)
Z_test = ss.transform(X_test)


In [114]:
# Instantiate and fit model
rf = RandomForestClassifier()
rf.fit(Z_train, y_train)


RandomForestClassifier()

In [115]:
# Get our scores
print('Cross val score:', cross_val_score(rf, Z_train, y_train, cv=5).mean())
print('Training score:', rf.score(Z_train, y_train))
print('Testing score:', rf.score(Z_test, y_test))
from sklearn.metrics import recall_score, precision_score
rf_preds = rf.predict(X)
print('accuracy:', rf.score(X, y))
print('recall:', recall_score(y, rf_preds))
print('precision:', precision_score(y, rf_preds))
y_train.value_counts(normalize=True)

Cross val score: 1.0
Training score: 1.0
Testing score: 1.0
accuracy: 0.2083823814759718
recall: 0.0
precision: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


1    0.79218
0    0.20782
Name: country_destination, dtype: float64

# Conclusion
Problem Statement: Can we accurately predict where people will make their AirBnB booking: The US or not?

We chose our features based on what we know about the travel industry and our best educated guess on what demographics would impact that decision. 

As we cleaned and explored the data, we chose 2 classifier models: Logistic Regression and Random Forest Classifier. The logistic regression was highly successful with a testing and training score of .98. However, the Random Forest model was much more successful with a 100% accuracy score. The method by which the random forest samples, called boostrapping, may help with the bias/variance trade-off and have helped given us our high accuracy scores.


# Further Steps 

With additional time we would:

-- look into further features 

-- spend more time understanding and cleaning the session dataframe for inclusion atleast in EDA

-- include the NDF results as a classifier


This could allow us to make predictions as to why certain groups did not select a destination.