
Team 4 / Modelling / D03

In [1]:
import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import joblib

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

Modelling

In [2]:
df = pd.read_csv ('data/preprocessed_data.csv')

In [3]:
X = df.drop(columns='price')
y = df.price
X.shape, y.shape

((9501, 17), (9501,))

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [5]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6650, 17), (2851, 17), (6650,), (2851,))

In [6]:
# Dummy Model to use as a baseline

dummy = DummyRegressor()

dummy.fit(X_train,y_train)

preds = dummy.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Dummy results: {mae} and {mape}')

Dummy results: 55.28437841358923 and 0.9643676635504734


In [7]:
#Linear Regression

lr = LinearRegression()

lr.fit(X_train,y_train)

preds = lr.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Linear Regression results: {mae} and {mape}')

Linear Regression results: 48.23650953531794 and 0.7778661990180143


In [8]:
# KNN Regression

knn = KNeighborsRegressor()

knn.fit(X_train,y_train)

preds = knn.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'KNN Regression results: {mae} and {mape}')


KNN Regression results: 53.98751315327956 and 0.7891653974951648


In [9]:
# SVM 

svm = SVR()

svm.fit(X_train,y_train)

preds = svm.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'SVM Regression results: {mae} and {mape}')

SVM Regression results: 43.064306120373885 and 0.4538081330422991


In [10]:
#Random Forest

rf = RandomForestRegressor()

rf.fit(X_train,y_train)

preds = rf.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Random Forest results: {mae} and {mape}')


Random Forest results: 44.13826376709926 and 0.6248254995905056


In [11]:
#Gradient Boosting

gb = GradientBoostingRegressor()

gb.fit(X_train,y_train)

preds = gb.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Gradient Boosting results: {mae} and {mape}')


Gradient Boosting results: 48.356448381168825 and 0.7314689310014085


In [12]:
#Ada Boosting

ada = GradientBoostingRegressor()

ada.fit(X_train,y_train)

preds = ada.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'ADA results: {mae} and {mape}')

ADA results: 48.069459256785976 and 0.7259075244248003


Standarization to improve the models

In [13]:
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [14]:
# SVM in scaled data

svm2 = SVR(kernel='linear' )

svm2.fit(X_train_sc,y_train)

preds = svm2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'SVM Regression scaled results: {mae} and {mape}')

SVM Regression scaled results: 38.446252492612075 and 0.3931985197105078


In [15]:
#Random Forest in scaled data

rf2 = RandomForestRegressor()

rf2.fit(X_train_sc,y_train)

preds = rf2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Random Forest scaled results: {mae} and {mape}')


Random Forest scaled results: 45.538095405121005 and 0.6577979519940482


In [16]:
#Linear Regression in scaled data

lr2 = LinearRegression()

lr2.fit(X_train_sc,y_train)

preds = lr2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Linear Regression scaled results: {mae} and {mape}')

Linear Regression scaled results: 48.20865116784456 and 0.7768305426341179


In [17]:
#Gradient Boosting in scaled data

gb2 = GradientBoostingRegressor()

gb2.fit(X_train_sc,y_train)

preds = gb2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Gradient Boosting scaled results: {mae} and {mape}')

Gradient Boosting scaled results: 48.60583664987096 and 0.7316694610499395


Handling outliers to improve the models

In [18]:
# Describe the prepocessed data set.

df.describe()

Unnamed: 0.1,Unnamed: 0,neighbourhood_cleansed,accommodates,bedrooms,beds,price,minimum_nights,number_of_reviews,Home or apt,Hotel room,Private room,Shared room,Private Bath,Shared Bath,bathrooms_number,amenities,not_a_superhost,superhost
count,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0
mean,4765.6453,58909.771919,3.842227,1.455005,2.103358,82.759815,3.228502,42.717609,0.88717,0.013788,0.091043,0.007999,0.950532,0.049468,1.206978,15.709925,0.600463,0.399537
std,2758.526004,56496.229,1.958101,0.775141,1.55335,289.474447,19.657534,70.161898,0.316402,0.116616,0.287685,0.089084,0.216855,0.216855,0.51603,3.669025,0.489829,0.489829
min,0.0,66.0,1.0,1.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2380.0,16742.0,2.0,1.0,1.0,36.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,14.0,0.0,0.0
50%,4758.0,33086.0,4.0,1.0,2.0,50.0,2.0,12.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,17.0,1.0,0.0
75%,7143.0,80939.0,4.0,2.0,3.0,80.0,2.0,52.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,19.0,1.0,1.0
max,9581.0,165084.0,16.0,12.0,44.0,8000.0,1000.0,745.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,20.0,1.0,1.0


In [19]:
df.median()

Unnamed: 0                 4758.0
neighbourhood_cleansed    33086.0
accommodates                  4.0
bedrooms                      1.0
beds                          2.0
price                        50.0
minimum_nights                2.0
number_of_reviews            12.0
Home or apt                   1.0
Hotel room                    0.0
Private room                  0.0
Shared room                   0.0
Private Bath                  1.0
Shared Bath                   0.0
bathrooms_number              1.0
amenities                    17.0
not_a_superhost               1.0
superhost                     0.0
dtype: float64

In [20]:
df.shape

(9501, 18)

Since the data doesn’t follow a normal distribution, we will calculate the outlier data points using the statistical method called interquartile range (IQR). Using the IQR, the outlier data points are the ones falling below Q1–1.5 IQR or above Q3 + 1.5 IQR. The Q1 is the 25th percentile and Q3 is the 75th percentile of the dataset, and IQR represents the interquartile range calculated by Q3 minus Q1 (Q3–Q1).

In [22]:
# Replacing outliers as if they were missing values.

def impute_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   upper = df[~(df>(q3+1.5*IQR))].max()

   lower = df[~(df<(q1-1.5*IQR))].min()

   df = np.where(df > upper,

       df.median(),

       np.where(

           df < lower,

           df.median(),

           df

           )

       )

   return df

In [23]:
# New dataframe, called df_no_outliers.

df_no_outliers = impute_outliers_IQR(df)
df_no_outliers = pd.DataFrame(df_no_outliers, columns = df.columns)
df_no_outliers.describe()

Unnamed: 0.1,Unnamed: 0,neighbourhood_cleansed,accommodates,bedrooms,beds,price,minimum_nights,number_of_reviews,Home or apt,Hotel room,Private room,Shared room,Private Bath,Shared Bath,bathrooms_number,amenities,not_a_superhost,superhost
count,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0,9501.0
mean,4765.6453,58909.771919,3.581097,1.377434,1.992211,55.391748,1.680349,22.509736,1.0,0.0,0.0,0.0,1.0,0.0,1.0,15.950742,0.600463,0.399537
std,2758.526004,56496.229,1.369163,0.584404,1.136172,26.975088,0.663343,29.883434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.309587,0.489829,0.489829
min,0.0,66.0,1.0,1.0,0.0,9.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,7.0,0.0,0.0
25%,2380.0,16742.0,2.0,1.0,1.0,36.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,14.0,0.0,0.0
50%,4758.0,33086.0,4.0,1.0,2.0,50.0,2.0,12.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,17.0,1.0,0.0
75%,7143.0,80939.0,4.0,2.0,3.0,68.0,2.0,31.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,19.0,1.0,1.0
max,9581.0,165084.0,7.0,3.0,6.0,146.0,3.0,127.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,20.0,1.0,1.0


Testing the models with the new dataset

In [24]:
X = df_no_outliers.drop(columns='price')
y = df_no_outliers.price
X.shape, y.shape

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6650, 17), (2851, 17), (6650,), (2851,))

In [25]:
#Linear Regression

lr = LinearRegression()

lr.fit(X_train,y_train)

preds = lr.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Linear Regression results: {mae} and {mape}')

Linear Regression results: 18.982645153893664 and 0.4165136758860566


In [26]:
# KNN Regression

knn = KNeighborsRegressor()

knn.fit(X_train,y_train)

preds = knn.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'KNN Regression results: {mae} and {mape}')

KNN Regression results: 20.184707120308666 and 0.43658771341581043


In [27]:
# SVM 

svm = SVR()

svm.fit(X_train,y_train)

preds = svm.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'SVM Regression results: {mae} and {mape}')

SVM Regression results: 19.471358320448825 and 0.40159115664689876


In [28]:
#Random Forest

rf = RandomForestRegressor()

rf.fit(X_train,y_train)

preds = rf.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Random Forest results: {mae} and {mape}')

Random Forest results: 17.283917923535604 and 0.3576671106666772


In [29]:
#Gradient Boosting

gb = GradientBoostingRegressor()

gb.fit(X_train,y_train)

preds = gb.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Gradient Boosting results: {mae} and {mape}')

Gradient Boosting results: 17.72976536611729 and 0.3733739096149616


In [30]:
#Ada Boosting

ada = GradientBoostingRegressor()

ada.fit(X_train,y_train)

preds = ada.predict(X_test)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'ADA results: {mae} and {mape}')

ADA results: 17.730195847496187 and 0.37338657083198784


In [31]:
# Scaling

In [32]:
sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [33]:
# SVM in scaled data

svm2 = SVR(kernel='linear' )

svm2.fit(X_train_sc,y_train)

preds = svm2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'SVM Regression results: {mae} and {mape}')

SVM Regression results: 18.411189565588714 and 0.3615855331352892


In [34]:
#Random Forest in scaled data

rf2 = RandomForestRegressor()

rf2.fit(X_train_sc,y_train)

preds = rf2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Random Forest results: {mae} and {mape}')

Random Forest results: 17.183931953700455 and 0.35605532234484744


In [35]:
#Linear Regression in scaled data

lr2 = LinearRegression()

lr2.fit(X_train_sc,y_train)

preds = lr2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Linear Regression results: {mae} and {mape}')

Linear Regression results: 18.983368009312127 and 0.41656513235100295


In [36]:
#Gradient Boosting in scaled data

gb2 = GradientBoostingRegressor()

gb2.fit(X_train_sc,y_train)

preds = gb2.predict(X_test_sc)

mae = mean_absolute_error(y_test, preds)
mape = mean_absolute_percentage_error(y_test, preds)

print(f'Gradient Boosting results: {mae} and {mape}')

Gradient Boosting results: 17.72461429832971 and 0.3733002757541409


Testing if cross validation can improve the model even more

In [37]:
"""Grid Search for cross validation"""

'Grid Search for cross validation'

In [38]:
# Gradient Boosting Grid Search
'''
GBR=GradientBoostingRegressor()
gridGBR={'n_estimators':[500,1000,2000],'learning_rate':[.001,0.01,.1],'max_depth':[1,2,4],'subsample':[.5,.75,1],'random_state':[1]}
gridGBR=GridSearchCV(estimator=GBR,param_grid=search_grid,scoring='neg_mean_squared_error',n_jobs=1,cv=10,verbose=2)
gridGBR.fit(X_train_sc,y_train)
gridGBR.best_params_
'''

"\nGBR=GradientBoostingRegressor()\ngridGBR={'n_estimators':[500,1000,2000],'learning_rate':[.001,0.01,.1],'max_depth':[1,2,4],'subsample':[.5,.75,1],'random_state':[1]}\ngridGBR=GridSearchCV(estimator=GBR,param_grid=search_grid,scoring='neg_mean_squared_error',n_jobs=1,cv=10,verbose=2)\ngridGBR.fit(X_train_sc,y_train)\ngridGBR.best_params_\n"

In [39]:
# print(gridGBR.best_params_)
# Best parametres:
# {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 1000, 'random_state': 1, 'subsample': 0.5}

In [40]:
# preds = gridGBR.predict(X_test_sc)

# mae = mean_absolute_error(y_test, preds)
# mape = mean_absolute_percentage_error(y_test, preds)

# print(f'Gradient Boosting results: {mae} and {mape}')

Best model appears to be Random Forest

In [41]:
# Save the best model

filename = 'data/best_model.sav'
joblib.dump(rf, filename)

['data/best_model.sav']