# Price Prediction Modeling

### Import dependencies

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

import plotly.express as px

# --- Run configuration ---
# When True, get_train_test_data reads each city's 'input_filename_synthetic'
# file instead of its 'input_filename' file.
USE_SYNTHETIC_DATA = False
# Fraction of rows handed to the test split by train_test_split.
TEST_SIZE = 0.25
# Seed shared by train_test_split and every GradientBoostingRegressor.
RANDOM_STATE = 42
# Scorer passed to GridSearchCV in the fold-count experiments; the value 'r2'
# also switches fit_and_score_model from RMSE to R^2.
# Alternatives: 'neg_root_mean_squared_error', 'r2'.
GRID_SEARCH_SCORING = 'neg_mean_squared_error'

### Specify list of cities to model

In [None]:
# Cities to load for this run. Candidate names: 'Asheville', 'Austin', 'Denver',
# 'Nashville', 'WashingtonDC'. Every name listed here is looked up in
# city_data_dict, so it needs an entry there.
CITIES_TO_MODEL = ['Nashville']

In [None]:
# Per-city file locations, keyed by city name. Every filename is appended
# directly to 'filepath' (plain string concatenation), so 'filepath' must end
# with a path separator.
city_data_dict = {
    'Nashville': {
        # Prefix shared by all files of this city.
        'filepath': '../../data/',
        # Model input read by get_train_test_data.
        'input_filename': 'processed/train_test_data_nashville.csv',
        # Alternative input used when synthetic data is requested.
        'input_filename_synthetic': 'processed/CTGAN_SDV.csv',
        # Destination of the test-set predictions.
        'preds_filename': 'processed/model_preds_nashville.csv',
        # Destination of the fitted model's feature importances.
        'feature_importances_filename': 'processed/feature_importances_nashville.csv',
    },
}

### Read and examine the data

In [None]:
def get_train_test_data(cities, use_synthetic=False, max_price=1500, city_data=None):
    """Load, filter and stack the modelling data for the requested cities.

    For each city the configured CSV is read with its first column as the
    index, rows whose 'price' exceeds ``max_price`` are discarded, and rows
    containing any missing value are dropped. The per-city frames are then
    concatenated row-wise in the order the cities were given.

    Parameters
    ----------
    cities : iterable of str
        City names; each must be a key of the configuration mapping.
    use_synthetic : bool, default False
        When truthy and the city has a non-empty 'input_filename_synthetic'
        entry, that file is read instead of 'input_filename'.
    max_price : float, default 1500
        Inclusive upper bound on 'price'; rows above it are removed.
    city_data : dict or None, default None
        Configuration mapping ``{city: {'filepath', 'input_filename',
        'input_filename_synthetic' (optional)}}``. When None, the
        notebook-level ``city_data_dict`` is used.

    Returns
    -------
    pandas.DataFrame or None
        The stacked data, or None when ``cities`` is empty.

    Raises
    ------
    KeyError
        If a city is not configured or a file has no 'price' column.
    """
    if city_data is None:
        city_data = city_data_dict

    city_frames = []
    for city in cities:
        city_cfg = city_data[city]
        input_filename = city_cfg['input_filename']
        # A missing synthetic entry is treated like an empty one.
        synthetic_filename = city_cfg.get('input_filename_synthetic', '')
        if use_synthetic and synthetic_filename != '':
            input_filename = synthetic_filename
        # Echo which file is being read so the cell output records it.
        print(input_filename)

        # Filenames are appended to the prefix as plain strings.
        city_df = pd.read_csv(city_cfg['filepath'] + input_filename, index_col=0)
        city_df = city_df.loc[city_df['price'] <= max_price, :].dropna()
        city_frames.append(city_df)

    if not city_frames:
        return None
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(city_frames, axis=0)

In [None]:
# Load the configured cities (real or synthetic, per USE_SYNTHETIC_DATA), then
# show the frame's shape and first rows as a sanity check.
train_test_df = get_train_test_data(CITIES_TO_MODEL, use_synthetic=USE_SYNTHETIC_DATA)
print(train_test_df.shape)
train_test_df.head()

processed/train_test_data_nashville.csv
(6738, 68)


Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,nightlife_within_2_4,restaurants_within_2_4,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4,price
72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,...,1,3,0,0,0,0,0,0,0,104.616438
431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,...,2,3,0,0,0,0,0,0,0,351.986301
329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,...,2,3,0,0,0,0,0,0,0,127.887671
1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,...,4,8,0,0,0,0,1,2,0,133.876712
632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,...,2,3,0,0,0,0,0,0,0,163.739726


In [None]:
# List the available columns; every column except 'price' is used as a feature.
train_test_df.columns

Index(['accommodates', 'num_bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_ind',
       'host_is_superhost_ind', 'latitude', 'longitude', 'Free Parking',
       'Kitchen Appliances', 'Patio or Balcony', 'Kitchen', 'Hair Dryer',
       'Long Term Stays Allowed', 'Toiletries', 'Kitchen Essentials',
       'Hot Water', 'Fire Extinguisher', 'Carbon Monoxide Alarm', 'Bed Linens',
       'Self Check-in', 'Private Entrance', 'First Aid Kit',
       'Extra Pillows and Blankets', 'Dedicated Workspace', 'Surveillance',
       'Backyard', 'Clothing Storage', 'Wine Glasses', 'Cleaning Products',
       'Keypad', 'BBQ', 'Shades', 'Luggage Dropoff Allowed', 'Smart Lock',
       'Pets Allowed', 'Baby Essentials', 'Pool', 'Gym', 'Fire Pit',
       'Elevator', 'Lockbox', 'Fireplace', 'Paid Parking', 'Laundromat Nearby',
       'Exercise Equipment', 'View', 'active_within_0_2', 'arts_within_0_2',
       'food_within_0_2', 'nightlife_within_0_2', 

### Split the data into training and test sets

In [None]:
# Every column except the 'price' target is a model feature.
features_list = train_test_df.columns.tolist()
features_list.remove('price')

# Target vector and feature matrix share train_test_df's index.
avg_price = train_test_df['price']
model_features_df = train_test_df[features_list]

In [None]:
# Hold out TEST_SIZE of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    model_features_df,
    avg_price,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

### Perform a Grid Search to identify optimal parameters

In [None]:
def perform_grid_search(params, cv_folds=5, scoring='neg_mean_squared_error'):
    """Run a cross-validated grid search for a GradientBoostingRegressor.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and
    every candidate regressor is seeded with ``RANDOM_STATE``.

    Parameters
    ----------
    params : dict
        Passed to GridSearchCV as ``param_grid``.
    cv_folds : int, default 5
        Passed to GridSearchCV as ``cv``.
    scoring : str, default 'neg_mean_squared_error'
        Passed to GridSearchCV as ``scoring``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(
        estimator=GradientBoostingRegressor(random_state=RANDOM_STATE),
        param_grid=params,
        scoring=scoring,
        cv=cv_folds,
        n_jobs=1,
    )
    # fit() returns the search object itself.
    return search.fit(X_train, y_train)

### Test an initial set of parameters

In [None]:

# Coarse grid: 2 depths x 2 split sizes x 2 leaf sizes x 2 feature fractions.
params_dict = {
    'n_estimators': [1000],
    'max_depth': range(2, 4),
    'min_samples_split': [10, 25],
    'min_samples_leaf': [10, 25],
    'max_features': [0.1, 0.25],
}

# Relies on perform_grid_search's defaults: 5 folds, negative-MSE scoring.
gs = perform_grid_search(params_dict)

print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 3, 'max_features': 0.1, 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 1000}
-13647.321575879261


### Test a smaller parameter grid across several cross-validation fold counts

In [None]:
def fit_and_score_model(params):
    """Fit a GradientBoostingRegressor and score it on train, test and all rows.

    The model is trained on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on the training split, the test split (``X_test`` / ``y_test``)
    and the full data set (``model_features_df`` / ``avg_price``).

    Parameters
    ----------
    params : dict
        Must provide 'n_estimators', 'max_depth', 'min_samples_split',
        'min_samples_leaf' and 'max_features' (for example
        ``GridSearchCV.best_params_``).

    Returns
    -------
    dict
        'feature_importances_dict' maps each name in ``features_list`` to its
        importance; 'train_score', 'test_score' and 'full_score' hold R^2 when
        ``GRID_SEARCH_SCORING == 'r2'`` and RMSE otherwise.

    Notes
    -----
    With the 'neg_mean_squared_error' scorer the grid search reports a negated
    MSE while this function reports RMSE, so the two are on different scales.
    """
    gb_regr = GradientBoostingRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        max_features=params['max_features'],
        random_state=RANDOM_STATE,
    )
    gb_regr.fit(X_train, y_train)

    def rmse(features, target):
        # sqrt(MSE) rather than mean_squared_error(..., squared=False): the
        # `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
        return np.sqrt(mean_squared_error(target, gb_regr.predict(features)))

    results = {
        'feature_importances_dict': dict(zip(features_list, gb_regr.feature_importances_)),
    }

    if GRID_SEARCH_SCORING == 'r2':
        results['train_score'] = gb_regr.score(X_train, y_train)
        results['test_score'] = gb_regr.score(X_test, y_test)
        results['full_score'] = gb_regr.score(model_features_df, avg_price)
    else:
        results['train_score'] = rmse(X_train, y_train)
        results['test_score'] = rmse(X_test, y_test)
        results['full_score'] = rmse(model_features_df, avg_price)

    return results

In [None]:
# Narrower grid (single depth and leaf size), searched under several CV fold counts.
params_dict = {
    'n_estimators': [1000],
    'max_depth': range(2, 3),
    'min_samples_split': [10, 25],
    'min_samples_leaf': [10],
    'max_features': [0.1, 0.25],
}

for folds in (3, 4, 5, 10):
    gs = perform_grid_search(params_dict, cv_folds=folds, scoring=GRID_SEARCH_SCORING)
    # Refit the winning configuration to get train / test / full-data scores.
    model_results = fit_and_score_model(gs.best_params_)

    print('num folds:', folds)
    print('best params:', gs.best_params_)
    print('best score:', gs.best_score_)
    print('train score:', model_results['train_score'])
    print('test score:', model_results['test_score'])
    print('full score:', model_results['full_score'])

num folds: 3
best params: {'max_depth': 2, 'max_features': 0.1, 'min_samples_leaf': 10, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -15090.59171417339
train score: 108.11391045641403
test score: 114.74109410552852
full score: 109.8087086240394
num folds: 4
best params: {'max_depth': 2, 'max_features': 0.1, 'min_samples_leaf': 10, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -14939.306503608817
train score: 108.11391045641403
test score: 114.74109410552852
full score: 109.8087086240394
num folds: 5
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 10, 'min_samples_split': 10, 'n_estimators': 1000}
best score: -14829.179613723783
train score: 104.02948969998916
test score: 113.72338962560458
full score: 106.53642579872722
num folds: 10
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 10, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -14665.789679796851
train score: 103.63864448768798
test score: 113.45

### Set final parameters and check for model robustness w.r.t. number of folds

In [None]:
# Final configuration. Each entry is a one-element list so the same dict can be
# passed to the grid search as a single-candidate grid.
# NOTE(review): the searches above selected min_samples_leaf=10; the value 25
# here looks like a manual choice - confirm it is intended.
params_dict_final = {
    'n_estimators': [1000],
    'max_depth': [2],
    'min_samples_split': [25],
    'min_samples_leaf': [25],
    'max_features': [0.25],
}

In [None]:
# Cross-validate the fixed final configuration under 3 to 10 folds to see how
# much the score depends on the fold count. The grid holds one candidate, so
# best_score_ is simply that candidate's mean CV score.
# Only the search result is reported, so the model is not refitted and
# rescored here (the result of that extra 1000-tree fit was never used).
for folds in range(3, 11):
    gs = perform_grid_search(params_dict_final, cv_folds=folds, scoring=GRID_SEARCH_SCORING)
    print('num folds:', folds)
    print('best params:', gs.best_params_)
    print('best score:', round(gs.best_score_, 3))

num folds: 3
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -15226.873
num folds: 4
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -15121.007
num folds: 5
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -14914.034
num folds: 6
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -14856.865
num folds: 7
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -14880.137
num folds: 8
best params: {'max_depth': 2, 'max_features': 0.25, 'min_samples_leaf': 25, 'min_samples_split': 25, 'n_estimators': 1000}
best score: -14879.772
num folds: 9
best params: {'max_depth': 2, 'ma

### Fit a model using the final parameters

In [None]:
# Build the final regressor from the single-valued entries of params_dict_final.
gb_regr = GradientBoostingRegressor(n_estimators = params_dict_final['n_estimators'][0],
                                max_depth = params_dict_final['max_depth'][0],
                                min_samples_split = params_dict_final['min_samples_split'][0],
                                min_samples_leaf = params_dict_final['min_samples_leaf'][0],
                                max_features = params_dict_final['max_features'][0],
                                random_state=RANDOM_STATE)

# Train on the training split only.
gb_regr.fit(X_train, y_train)

# Predictions for each split, indexed like their inputs so they can be joined
# back onto the feature frames later.
gb_regr_pred_train = pd.Series(gb_regr.predict(X_train), name='pred_price_gb_regr', index=X_train.index)
gb_regr_pred_test = pd.Series(gb_regr.predict(X_test), name='pred_price_gb_regr', index=X_test.index)
gb_regr_pred_full = pd.Series(gb_regr.predict(model_features_df), name='pred_price_gb_regr', index=model_features_df.index)
# Constant baseline: predicts the mean of the test targets for every test row.
y_naive = np.zeros_like(y_test) + np.mean(y_test)

# Feature importances, in features_list order.
feature_importances_dict = dict(zip(features_list, gb_regr.feature_importances_))
print('feature importances')
for k, v in feature_importances_dict.items():
    print(k,':', round(v,4))

print('train r2 score:', gb_regr.score(X_train, y_train))
print('test r2 score:', gb_regr.score(X_test, y_test))
print('full r2 score:', gb_regr.score(model_features_df, avg_price))

# RMSE is computed as sqrt(MSE): mean_squared_error's `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print('train RMSE:', np.sqrt(mean_squared_error(y_train, gb_regr_pred_train)))
print('test RMSE:', np.sqrt(mean_squared_error(y_test, gb_regr_pred_test)))
print('full RMSE:', np.sqrt(mean_squared_error(avg_price, gb_regr_pred_full)))
print('naive RMSE:', np.sqrt(mean_squared_error(y_test, y_naive)))


feature importances
accommodates : 0.1761
num_bathrooms : 0.2848
bedrooms : 0.174
beds : 0.0682
minimum_nights_avg_ntm : 0.0519
maximum_nights_avg_ntm : 0.0171
availability_ind : 0.0001
host_is_superhost_ind : 0.0097
latitude : 0.0085
longitude : 0.0578
Free Parking : 0.0033
Kitchen Appliances : 0.0009
Patio or Balcony : 0.0067
Kitchen : 0.0
Hair Dryer : 0.0004
Long Term Stays Allowed : 0.0
Toiletries : 0.0001
Kitchen Essentials : 0.0002
Hot Water : 0.0012
Fire Extinguisher : 0.0018
Carbon Monoxide Alarm : 0.001
Bed Linens : 0.0004
Self Check-in : 0.0004
Private Entrance : 0.0036
First Aid Kit : 0.0015
Extra Pillows and Blankets : 0.0014
Dedicated Workspace : 0.0009
Surveillance : 0.0011
Backyard : 0.0014
Clothing Storage : 0.0007
Wine Glasses : 0.0001
Cleaning Products : 0.0004
Keypad : 0.0034
BBQ : 0.0016
Shades : 0.0014
Luggage Dropoff Allowed : 0.0007
Smart Lock : 0.0011
Pets Allowed : 0.0001
Baby Essentials : 0.0003
Pool : 0.0036
Gym : 0.001
Fire Pit : 0.0011
Elevator : 0.0044
Loc

### Create a DataFrame with the target and predicted values

In [None]:
# Build a 'price' column aligned to X_test's index without modifying y_test
# itself, so re-running this cell or later cells sees the original split.
y_test_named = pd.Series(y_test.to_numpy(), index=X_test.index, name='price')

# Test features, actual price and predicted price side by side.
df_model_preds = pd.concat([X_test, y_test_named, gb_regr_pred_test], axis=1)
df_model_preds.head()

Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,restaurants_within_2_4,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4,price,pred_price_gb_regr
9090089,6,1.0,3.0,3.0,1.0,1125.0,1,1,36.12625,-86.7961,...,3,0,0,0,0,0,0,0,180.427397,278.365002
42009956,5,1.0,2.0,3.0,1.9,1125.0,1,0,36.16581,-86.74321,...,6,0,0,0,0,0,0,0,119.021918,193.556162
45556783,2,1.0,1.0,1.0,30.0,1125.0,1,0,36.16996,-86.75298,...,4,0,0,0,0,0,0,0,70.0,44.565634
36386015,2,1.0,1.0,1.0,2.0,14.0,1,0,36.10966,-86.74022,...,8,0,0,0,0,1,2,0,64.69589,99.320921
37936461,8,1.0,2.0,4.0,1.0,3.0,1,1,36.13692,-86.85645,...,3,0,0,0,0,1,6,0,159.076712,237.325286


### Add the Actual-to-Expected Price Ratio to the DataFrame

In [None]:
# Actual-to-expected price ratio, limited to the range [0, 5].
df_model_preds['aepr_gb_regr'] = (
    df_model_preds['price'] / df_model_preds['pred_price_gb_regr']
).clip(lower=0, upper=5)

In [None]:
# Sanity check: rows in the test split, columns = features + price + prediction + ratio.
print(df_model_preds.shape)

(1685, 70)


In [None]:
# List the columns that will be written to the predictions file.
df_model_preds.columns

Index(['accommodates', 'num_bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_ind',
       'host_is_superhost_ind', 'latitude', 'longitude', 'Free Parking',
       'Kitchen Appliances', 'Patio or Balcony', 'Kitchen', 'Hair Dryer',
       'Long Term Stays Allowed', 'Toiletries', 'Kitchen Essentials',
       'Hot Water', 'Fire Extinguisher', 'Carbon Monoxide Alarm', 'Bed Linens',
       'Self Check-in', 'Private Entrance', 'First Aid Kit',
       'Extra Pillows and Blankets', 'Dedicated Workspace', 'Surveillance',
       'Backyard', 'Clothing Storage', 'Wine Glasses', 'Cleaning Products',
       'Keypad', 'BBQ', 'Shades', 'Luggage Dropoff Allowed', 'Smart Lock',
       'Pets Allowed', 'Baby Essentials', 'Pool', 'Gym', 'Fire Pit',
       'Elevator', 'Lockbox', 'Fireplace', 'Paid Parking', 'Laundromat Nearby',
       'Exercise Equipment', 'View', 'active_within_0_2', 'arts_within_0_2',
       'food_within_0_2', 'nightlife_within_0_2', 

### Output the final predictions and feature importance .csv files

In [None]:
# Output paths come from the first configured city only.
city = CITIES_TO_MODEL[0]
path_to_use, preds_filename = (
    city_data_dict[city]['filepath'],
    city_data_dict[city]['preds_filename'],
)

# Write the test-set features, actual prices, predictions and ratio.
df_model_preds.to_csv(path_to_use + preds_filename)

In [None]:
# One row per feature, in features_list order, with a single 'importance' column.
feature_importances_filename = city_data_dict[city]['feature_importances_filename']
feature_importances_df = pd.DataFrame(
    data={'importance': gb_regr.feature_importances_},
    index=features_list,
)
feature_importances_df.to_csv(path_to_use + feature_importances_filename)

In [None]:
# Preview the first rows of the saved feature-importance table.
feature_importances_df.head()

Unnamed: 0,importance
accommodates,0.176089
num_bathrooms,0.284808
bedrooms,0.174011
beds,0.06816
minimum_nights_avg_ntm,0.051854


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cb0b277f-d226-41e6-8798-2eb04c8159dd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>