# DATA ANALYTICS PROJECT - Airbnb Destinations Revealed

Team Members - Ayushee Nigam, Namha Adukia, Param Awasthi, Pearly Ang, Somanath Tripathy

# Importing data files from local drive

In [71]:
#importing useful libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [72]:
datafile1 = 'age_gender_bkts.csv'
datafile2 = 'countries.csv'
datafile3 = 'sessions.csv'
datafile4 = 'train_users_2.csv'

In [73]:
#user data
user_data = pd.read_csv(datafile1)
#country data
country = pd.read_csv(datafile2)
#session data
session = pd.read_csv(datafile3)
#training data
training_data = pd.read_csv(datafile4)

# Data Preprocessing

In [74]:
#Processing session dataset for device_type with maximum time elapsed
df1=pd.DataFrame(session.groupby(['user_id', 'device_type'])['secs_elapsed'].sum().reset_index())
df3=df1.iloc[df1.groupby('user_id')['secs_elapsed'].agg(pd.Series.idxmax)]


In [75]:
#Merging Session Dataset for device_type and secs_elapsed
training_data=training_data.merge(df3,left_on='id', right_on='user_id', how='left')

In [76]:
#Converting columns to date-time objects
training_data['date_first_booking'] = pd.to_datetime(training_data['date_first_booking'])
training_data['date_account_created'] = pd.to_datetime(training_data['date_account_created'])

In [77]:
#Checking the quantile of secs_elapsed to classify into groups
training_data['secs_elapsed'].quantile([0.25,0.5,0.75])

0.25     249972.5
0.50     835625.0
0.75    1901525.0
Name: secs_elapsed, dtype: float64

In [78]:
# cleaning the data for irrelevant gender and age:
training_data.gender[training_data.gender == 'OTHER'] = '-unknown-'
training_data.age[training_data.age<18] = np.nan
training_data.age[training_data.age>100] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [79]:
# age encoding:
training_data['Age_Over40'] = (training_data.age >39).map({True:1,False:0})
training_data['Age_31-39'] = ((training_data.age<40) & (training_data.age>30)).map({True:1,False:0})
training_data['Age_Under31'] = (training_data.age <31).map({True:1,False:0})
training_data['Age_unknown'] = (training_data.age.isnull()).map({True:1,False:0})

In [80]:
#Encoding secs_elapsed
training_data['secs_elapsed_Over2mn'] = (training_data.secs_elapsed >2000000).map({True:1,False:0})
training_data['secs_elapsed_835k-2mn'] = ((training_data.secs_elapsed<2000000) & (training_data.secs_elapsed>835000)).map({True:1,False:0})
training_data['secs_elapsed_250k-835k'] = ((training_data.secs_elapsed<835000) & (training_data.secs_elapsed>250000)).map({True:1,False:0})
training_data['secs_elapsed_less250k'] = (training_data.secs_elapsed < 250000).map({True:1,False:0})
training_data['secs_elapsed_unknown'] = (training_data.secs_elapsed.isnull()).map({True:1,False:0})

In [81]:
#Checking the ordering of the columns
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213451 entries, 0 to 213450
Data columns (total 28 columns):
id                         213451 non-null object
date_account_created       213451 non-null datetime64[ns]
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null datetime64[ns]
gender                     213451 non-null object
age                        122958 non-null float64
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser              213451 non-null object
country_destination        213451 non-null object
user_id                    73815 non-null object
device_type               

In [82]:
#rearranging the columns to apply one-hot encoding
c = training_data.columns.tolist()
c.insert(0,c.pop(15))#bringing country_destination to first place        
c.insert(0,c.pop(6))#bringing age to first place
c.insert(0,c.pop(16))#bringing user_id to first place
c.insert(0,c.pop(18))#bringing secs_elapsed to first place
training_data1 = training_data[c]

In [83]:
#checking the ordering for new dataset created
training_data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213451 entries, 0 to 213450
Data columns (total 28 columns):
secs_elapsed               73815 non-null float64
user_id                    73815 non-null object
age                        122958 non-null float64
country_destination        213451 non-null object
id                         213451 non-null object
date_account_created       213451 non-null datetime64[ns]
timestamp_first_active     213451 non-null int64
date_first_booking         88908 non-null datetime64[ns]
gender                     213451 non-null object
signup_method              213451 non-null object
signup_flow                213451 non-null int64
language                   213451 non-null object
affiliate_channel          213451 non-null object
affiliate_provider         213451 non-null object
first_affiliate_tracked    207386 non-null object
signup_app                 213451 non-null object
first_device_type          213451 non-null object
first_browser             

# One Hot Encoding - Making the data model ready

In [84]:
#one hot encoding to prepare for modelling
en = pd.get_dummies(training_data1.iloc[:,8:19],columns =training_data1.iloc[:,8:19].columns, prefix=list(training_data1.columns[8:19]))

## merging the encoding data set inbetween the 2 train data dsets.
one_zero = pd.concat([training_data1.iloc[:,:8],en,training_data1.iloc[:,19:28]],axis=1)

In [85]:
#checking encoded dataset
en.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213451 entries, 0 to 213450
Columns: 159 entries, gender_-unknown- to device_type_iPodtouch
dtypes: uint8(159)
memory usage: 34.0 MB


In [86]:
#checking encoded dataset
en.head()

Unnamed: 0,gender_-unknown-,gender_FEMALE,gender_MALE,signup_method_basic,signup_method_facebook,signup_method_google,signup_flow_0,signup_flow_1,signup_flow_2,signup_flow_3,...,device_type_Blackberry,device_type_Chromebook,device_type_Linux Desktop,device_type_Mac Desktop,device_type_Tablet,device_type_Windows Desktop,device_type_Windows Phone,device_type_iPad Tablet,device_type_iPhone,device_type_iPodtouch
0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [87]:
#checking hot encoded final dataset
one_zero.head()

Unnamed: 0,secs_elapsed,user_id,age,country_destination,id,date_account_created,timestamp_first_active,date_first_booking,gender_-unknown-,gender_FEMALE,...,device_type_iPodtouch,Age_Over40,Age_31-39,Age_Under31,Age_unknown,secs_elapsed_Over2mn,secs_elapsed_835k-2mn,secs_elapsed_250k-835k,secs_elapsed_less250k,secs_elapsed_unknown
0,,,,NDF,gxn3p5htnn,2010-06-28,20090319043255,NaT,1,0,...,0,0,0,0,1,0,0,0,0,1
1,,,38.0,NDF,820tgsjxq7,2011-05-25,20090523174809,NaT,0,0,...,0,0,1,0,0,0,0,0,0,1
2,,,56.0,US,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,0,1,...,0,1,0,0,0,0,0,0,0,1
3,,,42.0,other,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,0,1,...,0,1,0,0,0,0,0,0,0,1
4,,,41.0,US,87mebub9p4,2010-09-14,20091208061105,2010-02-18,1,0,...,0,1,0,0,0,0,0,0,0,1


In [88]:
#checking hot encoded final dataset
one_zero.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 213451 entries, 0 to 213450
Columns: 176 entries, secs_elapsed to secs_elapsed_unknown
dtypes: datetime64[ns](2), float64(2), int64(10), object(3), uint8(159)
memory usage: 61.7+ MB


# Final Dataset Obtained - Modelling Begins!

Training dataset split into two, so as to evaluate efficiency if the model created by the first half on the second half 

In [89]:
# test_size = spliting 25% of the data into a test.
# random_state = 1 means everytime you do the split you get the same? test and training.
x_tr,x_ts,y_tr,y_ts = train_test_split(one_zero,one_zero['country_destination'],test_size=0.25,random_state=1)


In [90]:
#checking the training dataset
x_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160088 entries, 72140 to 128037
Columns: 176 entries, secs_elapsed to secs_elapsed_unknown
dtypes: datetime64[ns](2), float64(2), int64(10), object(3), uint8(159)
memory usage: 46.3+ MB


Importing Libraries to be used in 3 different types of models:
  1) Random Forest Classifier
  2) Bernoulli Naïve Bayes
  3) Neural Network

In [33]:
#importing useful libraries for ML
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score,f1_score,recall_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neural_network import MLPClassifier

Defined a score function to calculate the accuracy level of the models

In [98]:
def score(true,pred):
    Pres_Score = "Precision_Score:" + str(precision_score(true,pred,average='weighted')) #Precision = TP/TP+FP
    Recall_Score="Recall_Score:" + str(recall_score(true,pred,average='weighted')) #Recall = TP/TP+FN
    F1_Score= "F1_Score:" + str(f1_score(true,pred,average='weighted')) #F1 Score = 2*(Recall * Precision) / (Recall + Precision)
    return (Pres_Score, Recall_Score, F1_Score)

# Random Forest Classifier Model

- Do more trees
- Finetune existing one

In [92]:
#defining RFC as Random Forest Classifier
rfc = RandomForestClassifier()

In [93]:
#fitting RF model into our data
rfc.fit(x_tr.iloc[:,8:175],y_tr)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [94]:
# predicting the outcome on our test dataset
preds = rfc.predict(x_ts.iloc[:,8:175])

In [95]:
#checking our predicted dataset
preds

array(['US', 'NDF', 'NDF', ..., 'NDF', 'NDF', 'NDF'], dtype=object)

In [96]:
# Adding the predicted column on our dataset
x_ts['predicted_country'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [97]:
#calculating the accuracy_score(recall_score)
accuracy_score(x_ts['country_destination'],x_ts['predicted_country'])
score(x_ts['country_destination'],x_ts['predicted_country'])

('Precision_Score:0.542810010445764',
 'Recall_Score:0.6149017109232989',
 'F1_Score:0.5728016383993015')

In [99]:
# Calculatig feature importance based on RF Classifier
featureImportance = pd.DataFrame(rfc.feature_importances_,one_zero.columns[8:175],columns=['feature_importance']).sort_values(["feature_importance"],ascending=False)

In [100]:
featureImportance.iloc[:35,:]

Unnamed: 0,feature_importance
Age_unknown,0.103423
signup_method_facebook,0.04343
Age_31-39,0.042
gender_-unknown-,0.036909
Age_Under31,0.034218
Age_Over40,0.032743
signup_method_basic,0.027768
first_affiliate_tracked_untracked,0.026938
first_affiliate_tracked_linked,0.024524
first_browser_Chrome,0.024419


In [101]:
#Trimming the dataset only to have 35 relevant columns
x_tr,x_ts,y_tr,y_ts = train_test_split(one_zero.loc[:,list(featureImportance[:35].index)],one_zero['country_destination'],test_size=0.25,random_state=1)

In [102]:
# Fitting RFC again on model
rfc = RandomForestClassifier()
rfc.fit(x_tr,y_tr)
preds = rfc.predict(x_ts)

In [103]:
#calculating the accuracy_score for RFC
RF = accuracy_score(y_ts,preds)
RF

0.6167381893821562

Random Forest Model generated an accuracy score of 61.69%

In [104]:
#Other Scores
RF_score=score(y_ts, preds)
RF_score

('Precision_Score:0.5421048097560853',
 'Recall_Score:0.6167381893821562',
 'F1_Score:0.5734662552222307')

# Bernoulli Naïve Bayes Model

In [107]:
# Fitting NB on model
bnb = BernoulliNB()
bnb.fit(x_tr,y_tr)
preds = bnb.predict(x_ts)

In [110]:
#calculating the accuracy_score for NN
BNB = accuracy_score(y_ts,preds)
BNB

0.5652043550774881

Bernoulli Naïve Bayes Model generated an accuracy score of 56.52%

In [111]:
#Other Scores
BNB_score=score(y_ts, preds)
BNB_score

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


('Precision_Score:0.5249771972603676',
 'Recall_Score:0.5652043550774881',
 'F1_Score:0.5387249688490177')

# Neural Network Model

In [112]:
# Fitting NN on model
mc = MLPClassifier()
mc.fit(x_tr,y_tr)
preds = mc.predict(x_ts)

In [113]:
#calculating the accuracy_score for NN
NN = accuracy_score(y_ts,preds)
NN

0.634016078556303

Neural Network Model generated an accuracy score of 63.40%

In [115]:
#Other Scores
NN_score=score(y_ts, preds)
NN_score

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


('Precision_Score:0.5457433832047138',
 'Recall_Score:0.634016078556303',
 'F1_Score:0.5846263687773887')