In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

# Seed passed to the train/test split and to every DecisionTreeRegressor so
# that repeated runs give the same results.
RANDOM_STATE = 42
# Fraction of the rows held out as the test split.
TEST_SIZE = 0.25

# Cities to load and model. Names considered for this notebook:
#   ['Asheville', 'Austin', 'Denver', 'Nashville', 'WashingtonDC']
# NOTE(review): every name listed here must have an entry in city_data_dict.
CITIES_TO_MODEL = ['Nashville']

In [None]:
# Input file locations, keyed by city name. Each entry holds:
#   filepath                 - base directory; concatenated verbatim with the
#                              file name, so it must end with a path separator
#   input_filename           - CSV read by default
#   input_filename_synthetic - CSV read instead when synthetic data is requested
city_data_dict = {
    'Nashville': dict(
        filepath='../../data/',
        input_filename='raw/AirBnB/Samples/train_test_data_nashville.csv',
        input_filename_synthetic='raw/AirBnB/Samples/CopulaGAN_SDV.csv',
    ),
}

In [None]:
def get_train_test_data(cities, use_synthetic=False, max_price=1500):
    """Load, filter and stack the modelling data for the requested cities.

    For every city the CSV configured in ``city_data_dict`` is read (first
    column used as the index), rows priced above ``max_price`` are discarded,
    rows containing any missing value are dropped, and the resulting shape is
    printed.

    Parameters
    ----------
    cities : iterable of str
        Keys of ``city_data_dict`` to load.
    use_synthetic : bool, default False
        When truthy, read the city's ``'input_filename_synthetic'`` file
        instead of ``'input_filename'``. A city whose synthetic file name is
        missing or empty silently falls back to its regular file.
    max_price : float, default 1500
        Inclusive upper bound on ``price``; rows above it are removed.

    Returns
    -------
    pandas.DataFrame or None
        The per-city frames stacked row-wise in the order given, or ``None``
        when ``cities`` is empty.

    Raises
    ------
    KeyError
        If a city is missing from ``city_data_dict``, its entry lacks
        ``'filepath'`` or ``'input_filename'``, or the CSV has no ``price``
        column.
    """
    city_frames = []
    for city in cities:
        city_cfg = city_data_dict[city]
        input_filename = city_cfg['input_filename']
        # The synthetic entry is optional: look it up with .get() so a city
        # configured without it can still be loaded from its regular file.
        synthetic_filename = city_cfg.get('input_filename_synthetic', '')
        if use_synthetic and synthetic_filename:
            input_filename = synthetic_filename
        city_df = pd.read_csv(city_cfg['filepath'] + input_filename, index_col=0)
        # A NaN price compares False against the bound, so such rows are
        # removed here as well; dropna() then removes rows with any other gap.
        city_df = city_df.loc[city_df['price'] <= max_price, :].dropna()
        print(city_df.shape)
        city_frames.append(city_df)

    if not city_frames:
        return None
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(city_frames, axis=0)

In [None]:
# Load the configured cities from their regular (non-synthetic) input files,
# then show the combined shape and the first rows.
train_test_df = get_train_test_data(cities=CITIES_TO_MODEL, use_synthetic=False)
print(train_test_df.shape)
train_test_df.head(n=5)

(6738, 68)
(6738, 68)


Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,nightlife_within_2_4,restaurants_within_2_4,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4,price
72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,...,1,3,0,0,0,0,0,0,0,104.616438
431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,...,2,3,0,0,0,0,0,0,0,351.986301
329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,...,2,3,0,0,0,0,0,0,0,127.887671
1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,...,4,8,0,0,0,0,1,2,0,133.876712
632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,...,2,3,0,0,0,0,0,0,0,163.739726


In [None]:
# Display every column name of the loaded frame; 'price' is the modelling
# target and all remaining columns are used as features below.
train_test_df.columns

Index(['accommodates', 'num_bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_ind',
       'host_is_superhost_ind', 'latitude', 'longitude', 'Free Parking',
       'Kitchen Appliances', 'Patio or Balcony', 'Kitchen', 'Hair Dryer',
       'Long Term Stays Allowed', 'Toiletries', 'Kitchen Essentials',
       'Hot Water', 'Fire Extinguisher', 'Carbon Monoxide Alarm', 'Bed Linens',
       'Self Check-in', 'Private Entrance', 'First Aid Kit',
       'Extra Pillows and Blankets', 'Dedicated Workspace', 'Surveillance',
       'Backyard', 'Clothing Storage', 'Wine Glasses', 'Cleaning Products',
       'Keypad', 'BBQ', 'Shades', 'Luggage Dropoff Allowed', 'Smart Lock',
       'Pets Allowed', 'Baby Essentials', 'Pool', 'Gym', 'Fire Pit',
       'Elevator', 'Lockbox', 'Fireplace', 'Paid Parking', 'Laundromat Nearby',
       'Exercise Equipment', 'View', 'active_within_0_2', 'arts_within_0_2',
       'food_within_0_2', 'nightlife_within_0_2', 

In [None]:
# Separate the predictors from the target: every column except 'price' is a
# feature, and 'price' itself is what the model will predict.
features_list = train_test_df.columns.tolist()
features_list.remove('price')
model_features_df = train_test_df.loc[:, features_list]
avg_price = train_test_df.loc[:, 'price']

In [None]:
# Hold out TEST_SIZE of the rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    model_features_df,
    avg_price,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Exhaustive hyper-parameter search for the decision tree: every combination
# in the grid is scored by R^2 with 5-fold cross-validation on the training
# split only.
model = DecisionTreeRegressor(random_state=RANDOM_STATE)

params_dict = dict(
    max_depth=range(1, 6),
    min_samples_leaf=[1, 2, 5, 10, 20, 35, 50],
    min_samples_split=[2, 5, 10, 15, 25, 35, 50],
)

gs = GridSearchCV(
    estimator=model,
    param_grid=params_dict,
    scoring='r2',
    cv=5,
    n_jobs=1,
)
gs.fit(X_train, y_train)

# NOTE(review): the recorded run picked max_depth=5, the largest depth in the
# grid, so deeper trees have not been ruled out - consider widening the range.
print(gs.best_params_)
print(gs.best_score_)

{'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 35}
0.5928256810505207


In [None]:
# Build a decision tree regressor with the hyper-parameters selected by the
# grid search.
dt_regr = DecisionTreeRegressor(max_depth=gs.best_params_['max_depth'],
                                min_samples_leaf=gs.best_params_['min_samples_leaf'],
                                min_samples_split=gs.best_params_['min_samples_split'],
                                random_state=RANDOM_STATE)

# Train the model on the training split
dt_regr.fit(X_train, y_train)

# Predict on the training split, the held-out test split and the full data set.
# Every Series carries the index of the rows it was predicted for, so it lines
# up label-wise with y_train / y_test / avg_price (a default RangeIndex would
# silently misalign in any index-aligned arithmetic or concat).
dt_regr_pred_train = pd.Series(dt_regr.predict(X_train), index=X_train.index)
dt_regr_pred_test = pd.Series(dt_regr.predict(X_test), index=X_test.index)
dt_regr_pred_full = pd.Series(dt_regr.predict(model_features_df), name='pred_price_dt_regr', index=model_features_df.index)

# Feature importances, keyed by feature name in column order
feature_importances_dict = dict(zip(features_list, dt_regr.feature_importances_))
print('feature importances')
for feature_name, importance in feature_importances_dict.items():
    print(feature_name,':', round(importance,4))

# R^2 on each split; "full" mixes training and test rows, so it is not an
# out-of-sample figure.
print('train r2 score:', dt_regr.score(X_train, y_train))
print('test r2 score:', dt_regr.score(X_test, y_test))
print('full r2 score:', dt_regr.score(model_features_df, avg_price))

# Root mean squared error on the same three splits
print('train RMSE:', np.sqrt(mean_squared_error(y_train, dt_regr_pred_train)))
print('test RMSE:', np.sqrt(mean_squared_error(y_test, dt_regr_pred_test)))
print('full RMSE:', np.sqrt(mean_squared_error(avg_price, dt_regr_pred_full)))

feature importances
accommodates : 0.6692
num_bathrooms : 0.1495
bedrooms : 0.0343
beds : 0.0
minimum_nights_avg_ntm : 0.0176
maximum_nights_avg_ntm : 0.0
availability_ind : 0.0
host_is_superhost_ind : 0.0026
latitude : 0.0
longitude : 0.0148
Free Parking : 0.0
Kitchen Appliances : 0.0
Patio or Balcony : 0.0
Kitchen : 0.0
Hair Dryer : 0.0
Long Term Stays Allowed : 0.0
Toiletries : 0.0
Kitchen Essentials : 0.0
Hot Water : 0.0
Fire Extinguisher : 0.0
Carbon Monoxide Alarm : 0.0
Bed Linens : 0.0
Self Check-in : 0.0
Private Entrance : 0.0
First Aid Kit : 0.0
Extra Pillows and Blankets : 0.0
Dedicated Workspace : 0.0
Surveillance : 0.0
Backyard : 0.0
Clothing Storage : 0.0
Wine Glasses : 0.0
Cleaning Products : 0.0
Keypad : 0.0022
BBQ : 0.0
Shades : 0.0
Luggage Dropoff Allowed : 0.0
Smart Lock : 0.0
Pets Allowed : 0.0
Baby Essentials : 0.0
Pool : 0.0
Gym : 0.0
Fire Pit : 0.0
Elevator : 0.0102
Lockbox : 0.0
Fireplace : 0.0
Paid Parking : 0.0048
Laundromat Nearby : 0.0
Exercise Equipment : 0.

In [None]:
# Put features, actual price and predicted price side by side.
# The three objects are all derived from train_test_df and share its index, so
# concat lines them up row by row. rename() returns a new Series, which avoids
# overwriting the name/index of avg_price (a column taken from train_test_df)
# in place and keeps this cell safe to re-run.
df_model_preds = pd.concat(
    [model_features_df, avg_price.rename('price'), dt_regr_pred_full],
    axis=1,
)
df_model_preds.head()

Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,restaurants_within_2_4,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4,price,pred_price_dt_regr
72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,...,3,0,0,0,0,0,0,0,104.616438,165.784555
431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,...,3,0,0,0,0,0,0,0,351.986301,254.237453
329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,...,3,0,0,0,0,0,0,0,127.887671,165.784555
1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,...,8,0,0,0,0,1,2,0,133.876712,250.328957
632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,...,3,0,0,0,0,0,0,0,163.739726,165.784555


In [None]:
# Sanity check: features, target and predictions should report the same number
# of rows and the same leading index labels.
data_list = [model_features_df, avg_price, dt_regr_pred_full]
for pandas_obj in data_list:
    first_labels = list(pandas_obj.index)[:5]
    print(pandas_obj.shape, first_labels)

(6738, 67) [72906, 431258, 329997, 1885504, 632636]
(6738,) [72906, 431258, 329997, 1885504, 632636]
(6738,) [72906, 431258, 329997, 1885504, 632636]


In [None]:
# Confirm the combined frame kept every row and gained the 'price' and
# 'pred_price_dt_regr' columns on top of the feature columns.
print(df_model_preds.shape)

(6738, 69)


In [None]:
# Ratio of the actual price to the tree's predicted price for every listing:
# values above 1 mean the listing is priced higher than the model predicts.
df_model_preds['aepr_dt_regr'] = df_model_preds['price'].div(df_model_preds['pred_price_dt_regr'])


In [None]:
# Summary statistics over df_model_preds, computed with SQL: min / max / avg of
# the actual price, the predicted price, their difference (actual - predicted)
# and the actual-to-predicted ratio.
# NOTE(review): _deepnote_execute_sql is not defined in this notebook; it is
# presumably injected by the Deepnote runtime, so this cell will not run in a
# plain Jupyter kernel - confirm before porting.
df_1 = _deepnote_execute_sql('select \n    min(price) as min_price\n    ,max(price) as max_price\n    ,avg(price) as avg_price\n    ,min(pred_price_dt_regr) as min_price_pred\n    ,max(pred_price_dt_regr) as max_price_pred\n    ,avg(pred_price_dt_regr) as avg_price_pred\n    ,min(price-pred_price_dt_regr) as min_price_diff\n    ,max(price-pred_price_dt_regr) as max_price_diff\n    ,avg(price-pred_price_dt_regr) as avg_price_diff\n    ,min(aepr_dt_regr) as min_ratio_dt_regr\n    ,max(aepr_dt_regr) as max_ratio_dt_regr\n    ,avg(aepr_dt_regr) as avg_ratio_dt_regr\nfrom df_model_preds;', 'SQL_DEEPNOTE_DATAFRAME_SQL')
df_1

Unnamed: 0,min_price,max_price,avg_price,min_price_pred,max_price_pred,avg_price_pred,min_price_diff,max_price_diff,avg_price_diff,min_ratio_dt_regr,max_ratio_dt_regr,avg_ratio_dt_regr
0,10.0,1500.0,341.368923,108.478611,1230.871624,341.787629,-806.871624,1054.629947,-0.418707,0.045726,7.263532,1.00087


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cb0b277f-d226-41e6-8798-2eb04c8159dd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>