# Assignment 7: Determing the Best Model for Prediciton
###### AirBnb New York Dataset: Prediciting Neighborhood
###### Nichole Page

### Preparing the Data

In [6]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import warnings; warnings.simplefilter('ignore')

In [23]:
# Import the data
df = pd.read_csv('C:/Users/student/Documents/My SAS Files/airbnb.csv')

# Assign input variables
X = df.loc[:,['price','room_type','minimum_nights','number_of_reviews','availability_365']]

# Assign target variable
y = df['neighbourhood_group']

In [24]:
# Change neighbourhood to categorical variable
X['room_type'] = X['room_type'].astype(object)

# Encode categorical variable
X = pd.get_dummies(X)

In [25]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2)

#### Decision Tree

In [26]:
# Create a decision tree and train
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.9284436036404541
Testing Accuracy: 0.48716637693015646


##### Tuning

In [27]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'max_depth': range(1,10), 'criterion':['gini', 'entropy']}

# Create a list of trees
from sklearn.model_selection import GridSearchCV
model = GridSearchCV(DecisionTreeClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'criterion': 'entropy', 'max_depth': 6}
Training Accuracy: 0.5783311177011965
Testing Accuracy: 0.5681562531956232


### Adaboost

In [28]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.5687953778504959
Testing Accuracy: 0.5587483382758973


##### Tuning

In [32]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(1,20, 2), 'learning_rate':np.linspace(0.001,1,2)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(AdaBoostClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'learning_rate': 1.0, 'n_estimators': 19}
Training Accuracy: 0.5561662746702116
Testing Accuracy: 0.5470907045710195


### Gradient Boosting

In [33]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.592800899887514
Testing Accuracy: 0.5776664280601288


##### Tuning

In [39]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(1,20, 2), 'learning_rate':np.linspace(0.001,1,3)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(GradientBoostingClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'learning_rate': 0.5005, 'n_estimators': 15}
Training Accuracy: 0.5894007567235914
Testing Accuracy: 0.5761325288884344


### Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.909934553635341
Testing Accuracy: 0.5334901319153288


##### Tuning

In [43]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_estimators': np.arange(2,30, 2), 'max_features':np.arange(2,5)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(RandomForestClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'max_features': 2, 'n_estimators': 26}
Training Accuracy: 0.9264239697310563
Testing Accuracy: 0.5398302484916658


### KNN

In [44]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.6485836997648021
Testing Accuracy: 0.5131404029041824


##### Tuning

In [45]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'n_neighbors': np.arange(2,20, 2)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(KNeighborsClassifier(), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'n_neighbors': 18}
Training Accuracy: 0.5899120564474896
Testing Accuracy: 0.530933633295838


### ElasticNet

In [46]:
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(loss='log', penalty='elasticnet', alpha=1, l1_ratio=.1)

model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.5379384395132427
Testing Accuracy: 0.5270477553942121


##### Tuning

In [52]:
# Decide what hyperparameter to tune then decide the searching range
param_grid = {'alpha':np.linspace(0.1,4, 10), 'l1_ratio':np.linspace(0,1,2)}

from sklearn.model_selection import GridSearchCV
model = GridSearchCV(SGDClassifier(loss='log', penalty='elasticnet'), param_grid, cv = 5)
model.fit(x_train, y_train)

# Show the best found paramters
print('Best parameters are:', model.best_params_)

# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Best parameters are: {'alpha': 1.4000000000000001, 'l1_ratio': 0.0}
Training Accuracy: 0.5435371714899274
Testing Accuracy: 0.5341036915840065


### Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(x_train, y_train)
# Accuracry on training data
print('Training Accuracy:', model.score(x_train, y_train))

# Prediction and accuracy on testing data
y_pred = model.predict(x_test)
print('Testing Accuracy:', metrics.accuracy_score(y_pred, y_test))

Training Accuracy: 0.5405460681051232
Testing Accuracy: 0.5314449330197362


###### The best model for this dataset is Gradient Boosting