In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import xgboost
from xgboost import plot_importance
from collections import OrderedDict
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt  
from sklearn.model_selection import GridSearchCV

In [2]:
df= pd.read_csv('./CSVs/Modelling.csv')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option("display.max_rows",None)

In [4]:
df.head()

Unnamed: 0,maximum_nights,extra_people,guests_included,beds,availability_365,accommodates,bedrooms,host_listings_count,host_for,number_of_reviews_ltm,minimum_nights,bathrooms,secure,cleaning_fee,room_type_private_room,host_acceptance_rate,elevator,white_goods,host_response_rate,security_deposit,government,review_scores_value,area_name_salamanca,area_name_san_blas,price
0,365,15.0,2,0.0,77,2,1.0,1.0,4010,14,1,1.0,0.0,1.0,1,3.0,1.0,1.0,4.0,0.0,1.0,10.0,0,0,70.0
1,40,8.0,1,1.0,200,1,1.0,2.0,3699,0,4,1.0,0.0,0.0,1,4.0,1.0,1.0,4.0,0.0,1.0,9.0,0,0,17.0
2,730,10.0,1,5.0,249,6,3.0,9.0,3703,0,15,2.0,0.0,1.0,0,3.0,1.0,1.0,4.0,1.0,1.0,9.0,0,0,50.0
3,730,10.0,2,1.0,364,3,0.0,1.0,3661,7,4,1.0,0.0,1.0,0,3.0,1.0,1.0,4.0,1.0,1.0,10.0,0,0,89.0
4,10,0.0,4,3.0,322,4,2.0,1.0,3661,16,3,1.5,0.0,0.0,0,4.0,1.0,1.0,4.0,1.0,0.0,10.0,0,0,115.0


# Train Test Split

In [5]:
splitter = train_test_split
"-----------------------"

df_train, df_test = splitter(df, test_size = 0.3, random_state = 42)
print("Dataset shape: {shape}".format(shape = df_train.shape))
print("Dataset shape: {shape}".format(shape = df_test.shape))

Dataset shape: (15283, 25)
Dataset shape: (6550, 25)


In [6]:
target = 'price'

X_train= df_train.drop('price',axis=1)
y_train = df_train[target]

X_test= df_test.drop('price',axis=1)
y_test = df_test[target]

# Standardizing and Normalizing

In [7]:
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.fit_transform(X_test)
X_train = pd.DataFrame(X_train_sc, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_test_sc, index=X_test.index, columns=X_test.columns)

In [8]:
scaler = MinMaxScaler()
X_trains = scaler.fit_transform(X_train_sc)
X_tests = scaler.fit_transform(X_test_sc)
X_train = pd.DataFrame(X_trains, index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(X_tests, index=X_test.index, columns=X_test.columns)

In [9]:
X_train.head()

Unnamed: 0,maximum_nights,extra_people,guests_included,beds,availability_365,accommodates,bedrooms,host_listings_count,host_for,number_of_reviews_ltm,minimum_nights,bathrooms,secure,cleaning_fee,room_type_private_room,host_acceptance_rate,elevator,white_goods,host_response_rate,security_deposit,government,review_scores_value,area_name_salamanca,area_name_san_blas
3693,1.0116e-05,0.0,0.0,0.02,0.0,0.052632,0.111111,0.001898,0.331536,0.0,0.003559,0.090909,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.875,0.0,0.0
6385,1.0116e-05,0.0,0.0,0.02,0.715068,0.0,0.111111,0.009488,0.478314,0.039877,0.024021,0.090909,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.875,0.0,0.0
4029,1.26e-07,0.0,0.0,0.04,0.0,0.157895,0.111111,0.001898,0.635874,0.0,0.003559,0.090909,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875,1.0,0.0
7220,8.01e-07,0.055762,0.066667,0.04,0.873973,0.263158,0.222222,0.091082,0.520706,0.079755,0.001779,0.090909,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
3995,1.8e-07,0.037175,0.0,0.02,0.158904,0.052632,0.111111,0.001898,0.32394,0.015337,0.001779,0.090909,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.875,0.0,0.0


# XGBoost


In [11]:
def mean_absolute_percentage_error(y_true,y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    return (np.sum(np.abs(y_true-y_pred)/y_true)/len(y_true))*100

In [11]:
xgb_model = xgboost.XGBRegressor()

parameters = {
    'max_depth': range(5,17),
    'min_child_weight': range(3,20),
    'n_estimators': range(7,20)}
randm = RandomizedSearchCV(estimator=xgb_model, param_distributions = parameters, 
                               cv = 4, n_iter = 500,scoring='neg_mean_absolute_error')
randm.fit(X_train, y_train)

    
print("\n The best estimator across ALL searched params:\n",randm.best_estimator_)
    
print("\n The best score across ALL searched params:\n",randm.best_score_)
    
print("\n The best parameters across ALL searched params:\n",randm.best_params_)


 The best estimator across ALL searched params:
 XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=15,
             min_child_weight=3, missing=nan, monotone_constraints=None,
             n_estimators=8, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

 The best score across ALL searched params:
 -75.61930009505241

 The best parameters across ALL searched params:
 {'n_estimators': 8, 'min_child_weight': 3, 'max_depth': 15}


In [12]:
xgb_model = xgboost.XGBRegressor()
parameters = {
'max_depth': range(12,17),
'min_child_weight': range(16,22),
'n_estimators': range(20,26)}

clf = GridSearchCV(estimator=xgb_model, param_grid=parameters, n_jobs=5, cv=5, scoring='neg_mean_absolute_error')
clf.fit(X_train, y_train)
    
print("\n The best estimator across ALL searched params:\n",clf.best_estimator_)
    
print("\n The best score across ALL searched params:\n",clf.best_score_)
    
print("\n The best parameters across ALL searched params:\n",clf.best_params_)


 The best estimator across ALL searched params:
 XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=12,
             min_child_weight=18, missing=nan, monotone_constraints=None,
             n_estimators=20, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
             validate_parameters=False, verbosity=None)

 The best score across ALL searched params:
 -84.64163870681526

 The best parameters across ALL searched params:
 {'max_depth': 12, 'min_child_weight': 18, 'n_estimators': 20}


In [12]:
model = xgboost.XGBRegressor(max_depth=12, min_child_weight= 18, n_estimators= 20)
#'max_depth': 14, 'min_child_weight': 18, 'n_estimators': 22
#max_depth=12, min_child_weight= 18, n_estimators= 20
#n_estimators= 19, min_child_weight= 10, max_depth= 14
#{'n_estimators': 19, 'min_child_weight': 18, 'max_depth': 15}
#{'n_estimators': 21, 'min_child_weight': 3, 'max_depth': 8}

model.fit(X_train,y_train)

predT = model.predict(X_train)
pred = model.predict(X_test)


print('TRAIN',mean_absolute_percentage_error(y_train, predT))
print('TEST',mean_absolute_percentage_error(y_test, pred))

TRAIN 75.18915282364763
TEST 89.668037996871


Camila Barbagallo, Ryan Daher, Paula García, Rocío González Lantero