### Model design

**Importing libraries and dataset**

In [1]:
# imports
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC
from catboost import CatBoostRegressor
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

dataset_path = "data/data_after_FE.csv"
dataset = pd.read_csv(dataset_path, index_col=0)

In [3]:
# printing the first 5 data rows
dataset.head()

Unnamed: 0,room_type_Private room,longitude,latitude,id,bairro_group_Manhattan,host_id,disponibilidade_365,room_type_Shared room,minimo_noites,numero_de_reviews,calculado_host_listings_count,bairro_Williamsburg,bairro_Midtown,price
0,False,0.490469,0.614199,0.0,True,1e-06,0.972603,False,0.0,0.071542,0.003067,False,True,225
1,True,0.569257,0.748252,2.9e-05,True,8e-06,1.0,False,0.001601,0.0,0.0,False,False,150
2,False,0.535649,0.448496,3.4e-05,False,9e-06,0.531507,False,0.0,0.429253,0.0,False,False,89
3,False,0.565324,0.72282,6.7e-05,True,1.7e-05,0.0,False,0.007206,0.014308,0.0,False,False,80
4,False,0.506972,0.599802,6.9e-05,True,1.8e-05,0.353425,False,0.001601,0.117647,0.0,False,False,200


In [4]:
# dataset shape
dataset.shape

(45931, 14)

In [5]:
# splitting the data
X = dataset.drop(['price'], axis=1)
Y = dataset['price']

print(X.shape)
print(Y.shape)

(45931, 13)
(45931,)


In [6]:
X.head()

Unnamed: 0,room_type_Private room,longitude,latitude,id,bairro_group_Manhattan,host_id,disponibilidade_365,room_type_Shared room,minimo_noites,numero_de_reviews,calculado_host_listings_count,bairro_Williamsburg,bairro_Midtown
0,False,0.490469,0.614199,0.0,True,1e-06,0.972603,False,0.0,0.071542,0.003067,False,True
1,True,0.569257,0.748252,2.9e-05,True,8e-06,1.0,False,0.001601,0.0,0.0,False,False
2,False,0.535649,0.448496,3.4e-05,False,9e-06,0.531507,False,0.0,0.429253,0.0,False,False
3,False,0.565324,0.72282,6.7e-05,True,1.7e-05,0.0,False,0.007206,0.014308,0.0,False,False
4,False,0.506972,0.599802,6.9e-05,True,1.8e-05,0.353425,False,0.001601,0.117647,0.0,False,False


**Model and accuracy**

- As we are dealing with a regression problem (the output values are continuous values), the algorithms that will be used are:
    - SVM-Support Vector Machine;
    - Random Forest Regressor;
    - Linear Regressor.
- To calculate the loss we will use the Mean Absolute Percentage Error.

**Linear Regressor**

In [7]:
# splitting the data
X_train ,X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=2)

In [8]:
lr_model = LinearRegression()
lr_model.fit(X_train, Y_train)

print("By using Linear Regression we got:")
print("Training data accuracy", lr_model.score(X_train, Y_train))
print("Testing data accuracy", lr_model.score(X_test, Y_test))

**Random Forest Regressor**

In [9]:
rfm = RandomForestRegressor()
rfm.fit(X_train,Y_train)

print(rfm.score(X_train, Y_train))
print(rfm.score(X_test, Y_test))

In [10]:
pred = rfm.predict(X_test)

print(mean_absolute_error(Y_test,pred))
print(metrics.mean_absolute_error(Y_test,pred))

**Support Vector Machine (SVM)**

In [11]:
# instantiate the model
model_SVR = svm.SVR()

# fit the data
model_SVR.fit(X_train, Y_train)

# generate the predictions
Y_pred = model_SVR.predict(X_test)

# calculate the Mean Absolute Error
print(mean_absolute_error(Y_test, Y_pred))
print(metrics.mean_absolute_error(Y_test, pred))

# calculate the R2 score
SVR_R2_score = r2_score(Y_test, Y_pred)
print(f"SVR R2 score: {SVR_R2_score}")

**CatBoost Classifier**

In [19]:
# instantiate the model
cb_model = CatBoostRegressor()

# fit the data
cb_model.fit(X_train, Y_train)

# generate the predictions
preds = cb_model.predict(X_test)

Learning rate set to 0.070846
0:	learn: 65.9383882	total: 6.95ms	remaining: 6.94s
1:	learn: 63.8457396	total: 11.6ms	remaining: 5.78s
2:	learn: 61.9061656	total: 16.3ms	remaining: 5.42s
3:	learn: 60.2079095	total: 20.9ms	remaining: 5.21s
4:	learn: 58.7142247	total: 25.5ms	remaining: 5.07s
5:	learn: 57.4458163	total: 30.3ms	remaining: 5.02s
6:	learn: 56.2263958	total: 35.6ms	remaining: 5.04s
7:	learn: 55.1442595	total: 40.1ms	remaining: 4.97s
8:	learn: 54.1771477	total: 44.7ms	remaining: 4.92s
9:	learn: 53.3225309	total: 49.4ms	remaining: 4.88s
10:	learn: 52.5386600	total: 54ms	remaining: 4.85s
11:	learn: 51.8313190	total: 58.4ms	remaining: 4.81s
12:	learn: 51.2128296	total: 62.9ms	remaining: 4.77s
13:	learn: 50.7178992	total: 67.5ms	remaining: 4.75s
14:	learn: 50.2124171	total: 72ms	remaining: 4.73s
15:	learn: 49.7680970	total: 76.4ms	remaining: 4.7s
16:	learn: 49.3679057	total: 80.9ms	remaining: 4.68s
17:	learn: 49.0023623	total: 85.3ms	remaining: 4.65s
18:	learn: 48.7024773	total: 89

In [20]:
print(cb_model.get_params())

# CatBoost classifier score
cb_R2_score = r2_score(Y_test, preds)
print(f"CB R2 score: {cb_R2_score}")

{'loss_function': 'RMSE'}
CB R2 score: 0.5893839558843295


**The classifiers with the best scores are:**
1. CatBoost: 0.5893
2. Random Forest Regressor: 0.5774
3. Support Vector Machine: 0.4847

- We can apply fine-tuning in order to try to improve the performance. 
- The technique that will be used is GridSearchCV

**Applying GridSearch on CatBoostRegressor**

In [15]:
# # applying grid search
# model_CBR = CatBoostRegressor()

# parameters = {'learning_rate': [0.03, 0.1],
#               'depth': [4, 6, 10],
#               'l2_leaf_reg': [1, 3, 5, 7, 9]}

# grid = GridSearchCV(estimator=model_CBR, param_grid = parameters, cv = 2, n_jobs=-1, verbose=1)
# print(grid)

GridSearchCV(cv=2,
             estimator=<catboost.core.CatBoostRegressor object at 0x0000017A75234340>,
             n_jobs=-1,
             param_grid={'depth': [4, 6, 10], 'l2_leaf_reg': [1, 3, 5, 7, 9],
                         'learning_rate': [0.03, 0.1]},
             verbose=1)


In [16]:
# grid.fit(X_train, Y_train)

Fitting 2 folds for each of 30 candidates, totalling 60 fits
0:	learn: 67.2527835	total: 172ms	remaining: 2m 51s
1:	learn: 66.2652688	total: 194ms	remaining: 1m 36s
2:	learn: 65.3483347	total: 217ms	remaining: 1m 12s
3:	learn: 64.4691214	total: 239ms	remaining: 59.6s
4:	learn: 63.6245673	total: 262ms	remaining: 52.1s
5:	learn: 62.7860664	total: 285ms	remaining: 47.3s
6:	learn: 61.9961542	total: 308ms	remaining: 43.7s
7:	learn: 61.2614177	total: 331ms	remaining: 41s
8:	learn: 60.5464408	total: 353ms	remaining: 38.9s
9:	learn: 59.8734873	total: 375ms	remaining: 37.1s
10:	learn: 59.2006173	total: 401ms	remaining: 36s
11:	learn: 58.5679654	total: 424ms	remaining: 34.9s
12:	learn: 57.9716444	total: 446ms	remaining: 33.9s
13:	learn: 57.3998274	total: 469ms	remaining: 33s
14:	learn: 56.8499854	total: 490ms	remaining: 32.2s
15:	learn: 56.3266500	total: 512ms	remaining: 31.5s
16:	learn: 55.8038005	total: 535ms	remaining: 30.9s
17:	learn: 55.3317886	total: 558ms	remaining: 30.4s
18:	learn: 54.88

In [17]:
# print(" Results from Grid Search " )
# print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
# print("\n The best score across ALL searched params:\n", grid.best_score_)
# print("\n The best parameters across ALL searched params:\n", grid.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostRegressor object at 0x0000017A79C6DDC0>

 The best score across ALL searched params:
 0.584900402682129

 The best parameters across ALL searched params:
 {'depth': 10, 'l2_leaf_reg': 9, 'learning_rate': 0.03}


Results from Grid Search: 
- The best score across all searched params: 0.584900402682129
- The best parameters across all searched parameters:
  - 'depth': 10
  - 'l2_leaf_reg': 9
  - 'learning_rate': 0.03

In [26]:
# instantiate the model
cb_model_ = CatBoostRegressor(depth=10, l2_leaf_reg=9, learning_rate=0.03)

# fit the data
cb_model_.fit(X_train, Y_train)

# generate the predictions
preds_ = cb_model_.predict(X_test)

# CatBoost classifier score
cb_R2_score = r2_score(Y_test, preds_)

0:	learn: 67.2527835	total: 24.3ms	remaining: 24.3s
1:	learn: 66.2652688	total: 44.4ms	remaining: 22.2s
2:	learn: 65.3483347	total: 66.1ms	remaining: 22s
3:	learn: 64.4691214	total: 88.1ms	remaining: 21.9s
4:	learn: 63.6245673	total: 110ms	remaining: 21.8s
5:	learn: 62.7860664	total: 131ms	remaining: 21.8s
6:	learn: 61.9961542	total: 153ms	remaining: 21.7s
7:	learn: 61.2614177	total: 174ms	remaining: 21.6s
8:	learn: 60.5464408	total: 196ms	remaining: 21.5s
9:	learn: 59.8734873	total: 216ms	remaining: 21.4s
10:	learn: 59.2006173	total: 237ms	remaining: 21.4s
11:	learn: 58.5679654	total: 259ms	remaining: 21.3s
12:	learn: 57.9716444	total: 280ms	remaining: 21.3s
13:	learn: 57.3998274	total: 302ms	remaining: 21.3s
14:	learn: 56.8499854	total: 323ms	remaining: 21.2s
15:	learn: 56.3266500	total: 344ms	remaining: 21.2s
16:	learn: 55.8038005	total: 366ms	remaining: 21.1s
17:	learn: 55.3317886	total: 387ms	remaining: 21.1s
18:	learn: 54.8824146	total: 408ms	remaining: 21.1s
19:	learn: 54.451323

In [27]:
print(f"CB R2 score: {cb_R2_score}")

CB R2 score: 0.5907115669959188


- There was a slight increase in the score from 0.5893 to 0.5907 after hyperparameter tuning;
- This is the final model that will be used;
- Another possibility to improve the model's performance is to use a larger database.

**Saving the best model - CatBoostRegressor after hyperparameter tuning**

In [33]:
# # create and save the model in a pickle file
# with open('apt_rental_model','wb') as file:
#     pickle.dump(cb_model_, file)

In [34]:
# loading the model
with open('apt_rental_model','rb') as f:
    mod=pickle.load(f)

Testing with the data:

{'id': 2595,
'nome': 'Skylit Midtown Castle',
'host_id': 2845,
'host_name': 'Jennifer',
'bairro_group': 'Manhattan',
'bairro': 'Midtown',
'latitude': 40.75362,
'longitude': -73.98377,
'room_type': 'Entire home/apt',
'price': 225,
'minimo_noites': 1,
'numero_de_reviews': 45,
'ultima_review': '2019-05-21',
'reviews_por_mes': 0.38,
'calculado_host_listings_count': 2,
'disponibilidade_365': 355}

In [37]:
X.columns

Index(['room_type_Private room', 'longitude', 'latitude', 'id',
       'bairro_group_Manhattan', 'host_id', 'disponibilidade_365',
       'room_type_Shared room', 'minimo_noites', 'numero_de_reviews',
       'calculado_host_listings_count', 'bairro_Williamsburg',
       'bairro_Midtown'],
      dtype='object')

In [43]:
# just for reference
model_input = [['room_type_Private room', 'longitude', 'latitude', 'id',
       'bairro_group_Manhattan', 'host_id', 'disponibilidade_365',
       'room_type_Shared room', 'minimo_noites', 'numero_de_reviews',
       'calculado_host_listings_count', 'bairro_Williamsburg',
       'bairro_Midtown']]

test_input = [False, -73.98377, 40.75362, 2595, True, 2845, 355, False, 1, 45, 2, False, True]

apt_rental_price_pred = mod.predict([test_input])
print(f"Predicted price: {apt_rental_price_pred}")

Predicted price: [157.6715565]


- The predicted price was 157.67 dollars;
- The target price is 225;
- The difference between the predicted price and the target price is $67.33.