In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [2]:
df= pd.read_csv('./CSVs/Modelling.csv')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",None)

In [4]:
df.head()

Unnamed: 0,maximum_nights,extra_people,guests_included,beds,availability_365,accommodates,bedrooms,host_listings_count,host_for,number_of_reviews_ltm,minimum_nights,bathrooms,secure,cleaning_fee,room_type_private_room,host_acceptance_rate,elevator,white_goods,host_response_rate,security_deposit,government,review_scores_value,area_name_salamanca,area_name_san_blas,price
0,365,15.0,2,0.0,77,2,1.0,1.0,4010,14,1,1.0,0.0,1.0,1,3.0,1.0,1.0,4.0,0.0,1.0,10.0,0,0,70.0
1,40,8.0,1,1.0,200,1,1.0,2.0,3699,0,4,1.0,0.0,0.0,1,4.0,1.0,1.0,4.0,0.0,1.0,9.0,0,0,17.0
2,730,10.0,1,5.0,249,6,3.0,9.0,3703,0,15,2.0,0.0,1.0,0,3.0,1.0,1.0,4.0,1.0,1.0,9.0,0,0,50.0
3,730,10.0,2,1.0,364,3,0.0,1.0,3661,7,4,1.0,0.0,1.0,0,3.0,1.0,1.0,4.0,1.0,1.0,10.0,0,0,89.0
4,10,0.0,4,3.0,322,4,2.0,1.0,3661,16,3,1.5,0.0,0.0,0,4.0,1.0,1.0,4.0,1.0,0.0,10.0,0,0,115.0


In [5]:
df.dtypes

maximum_nights              int64
extra_people              float64
guests_included             int64
beds                      float64
availability_365            int64
accommodates                int64
bedrooms                  float64
host_listings_count       float64
host_for                    int64
number_of_reviews_ltm       int64
minimum_nights              int64
bathrooms                 float64
secure                    float64
cleaning_fee              float64
room_type_private_room      int64
host_acceptance_rate      float64
elevator                  float64
white_goods               float64
host_response_rate        float64
security_deposit          float64
government                float64
review_scores_value       float64
area_name_salamanca         int64
area_name_san_blas          int64
price                     float64
dtype: object

# Train Test Split

In [6]:
splitter = train_test_split
"-----------------------"

df_train, df_test = splitter(df, test_size = 0.3, random_state = 42)
print("Dataset shape: {shape}".format(shape = df_train.shape))
print("Dataset shape: {shape}".format(shape = df_test.shape))

Dataset shape: (15283, 25)
Dataset shape: (6550, 25)


In [7]:
target = 'price'

X_train= df_train.drop('price',axis=1)
y_train = df_train[target]

X_test= df_test.drop('price',axis=1)
y_test = df_test[target]

# Standardizing and Scaling

In [8]:
scaler_train = StandardScaler()
scaler_test = StandardScaler()
X_train_sc = scaler_train.fit_transform(X_train)
X_test_sc = scaler_test.fit_transform(X_test)
X_train = pd.DataFrame(X_train_sc, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_sc, index=X_test.index, columns=X_test.columns)

In [9]:
min_scaler_train = MinMaxScaler()
min_scaler_test = MinMaxScaler()
X_trains = min_scaler_train.fit_transform(X_train_sc)
X_tests = min_scaler_test.fit_transform(X_test_sc)
X_train = pd.DataFrame(X_trains, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_tests, index=X_test.index, columns=X_test.columns)

In [10]:
X_train.head()

Unnamed: 0,maximum_nights,extra_people,guests_included,beds,availability_365,accommodates,bedrooms,host_listings_count,host_for,number_of_reviews_ltm,minimum_nights,bathrooms,secure,cleaning_fee,room_type_private_room,host_acceptance_rate,elevator,white_goods,host_response_rate,security_deposit,government,review_scores_value,area_name_salamanca,area_name_san_blas
3693,1.0116e-05,0.0,0.0,0.02,0.0,0.052632,0.111111,0.001898,0.331536,0.0,0.003559,0.090909,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.875,0.0,0.0
6385,1.0116e-05,0.0,0.0,0.02,0.715068,0.0,0.111111,0.009488,0.478314,0.039877,0.024021,0.090909,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.875,0.0,0.0
4029,1.26e-07,0.0,0.0,0.04,0.0,0.157895,0.111111,0.001898,0.635874,0.0,0.003559,0.090909,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875,1.0,0.0
7220,8.01e-07,0.055762,0.066667,0.04,0.873973,0.263158,0.222222,0.091082,0.520706,0.079755,0.001779,0.090909,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
3995,1.8e-07,0.037175,0.0,0.02,0.158904,0.052632,0.111111,0.001898,0.32394,0.015337,0.001779,0.090909,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.875,0.0,0.0


10 components

# SVR

In [11]:
clf = SVR()
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [12]:
pred_train = clf.predict(X_train)
pred_test = clf.predict(X_test)

In [17]:
def mean_absolute_percentage_error(y_true,y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return (np.sum(np.abs(y_true-y_pred)/y_true)/len(y_true))*100

In [18]:
print('MAPE TRAIN:',mean_absolute_percentage_error(y_train,pred_train) )
print('MAPE TEST:',mean_absolute_percentage_error(y_test,pred_test) )

MAPE TRAIN: 42.67382108987092
MAPE TEST: 42.50242216548482


# Fine Tuning

In [19]:
param_grid = {'C': [0.5,1,1.5],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVR(),param_grid,refit=True,verbose=2)
grid.fit(X_train,y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=0.5, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ................................ C=0.5, kernel=rbf, total=  10.5s
[CV] C=0.5, kernel=rbf ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   10.5s remaining:    0.0s


[CV] ................................ C=0.5, kernel=rbf, total=  10.5s
[CV] C=0.5, kernel=rbf ...............................................
[CV] ................................ C=0.5, kernel=rbf, total=  10.4s
[CV] C=0.5, kernel=rbf ...............................................
[CV] ................................ C=0.5, kernel=rbf, total=  10.6s
[CV] C=0.5, kernel=rbf ...............................................
[CV] ................................ C=0.5, kernel=rbf, total=  10.2s
[CV] C=0.5, kernel=poly ..............................................
[CV] ............................... C=0.5, kernel=poly, total=   9.3s
[CV] C=0.5, kernel=poly ..............................................
[CV] ............................... C=0.5, kernel=poly, total=   8.0s
[CV] C=0.5, kernel=poly ..............................................
[CV] ............................... C=0.5, kernel=poly, total=   8.0s
[CV] C=0.5, kernel=poly ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  8.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                           epsilon=0.1, gamma='scale', kernel='rbf',
                           max_iter=-1, shrinking=True, tol=0.001,
                           verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.5, 1, 1.5],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [20]:
print(grid.best_estimator_)

SVR(C=1.5, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)


In [21]:
train_pred= grid.predict(X_train)
test_pred= grid.predict(X_test)

In [22]:
print('MAPE TRAIN:',mean_absolute_percentage_error(y_train,train_pred) )
print('MAPE TEST:',mean_absolute_percentage_error(y_test,test_pred) )

MAPE TRAIN: 42.20379258095625
MAPE TEST: 42.60881391846855


The best model from both of these is the one prior to fine tuning, we will keep those parameters:

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

Camila Barbagallo, Ryan Daher, Paula García, Rocío González Lantero