In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import mean_squared_error, r2_score

# --- Run configuration -------------------------------------------------------
# Seed handed to train_test_split so the train/test partition is repeatable.
RANDOM_STATE = 42
# Fraction of rows held out as the test split.
TEST_SIZE = 0.25

# Cities whose samples are loaded and modelled together.
# Candidate names noted by the author:
#   'Asheville', 'Austin', 'Denver', 'Nashville', 'WashingtonDC'
# Every name listed here is looked up in city_data_dict when the data is loaded.
CITIES_TO_MODEL = ['Nashville']

In [None]:
# Data locations per city, keyed by city name.
#   filepath                 - directory prefix that the file names are appended to
#   input_filename           - CSV with the real train/test sample
#   input_filename_synthetic - CSV read instead when synthetic data is requested
city_data_dict = {
    'Nashville': dict(
        filepath='../../data/',
        input_filename='raw/AirBnB/Samples/train_test_data_nashville.csv',
        input_filename_synthetic='raw/AirBnB/Samples/CopulaGAN_SDV.csv',
    ),
}

In [None]:
def get_train_test_data(cities, use_synthetic=False, max_price=1500):
    """Load, filter and stack the modelling data for the given cities.

    For every city the CSV configured in ``city_data_dict`` is read with its
    first column as the index, rows whose ``price`` exceeds ``max_price`` are
    removed, and rows containing any missing value are dropped. The shape of
    each cleaned city frame is printed as it is loaded.

    Parameters
    ----------
    cities : iterable of str
        Keys of ``city_data_dict`` to load.
    use_synthetic : bool, default False
        When truthy, read the city's ``'input_filename_synthetic'`` file
        instead of ``'input_filename'``. Cities with no (or an empty)
        synthetic file name fall back to the real file.
    max_price : float, default 1500
        Upper bound (inclusive) on ``price``; rows above it are discarded.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned city frames concatenated row-wise in the order given, or
        ``None`` when ``cities`` is empty.

    Raises
    ------
    KeyError
        If a city is not present in ``city_data_dict``.
    """
    city_frames = []
    for city in cities:
        city_cfg = city_data_dict[city]
        input_filename = city_cfg['input_filename']
        # .get(): a city without a synthetic entry must not raise; it simply
        # keeps using its real sample.
        synthetic_filename = city_cfg.get('input_filename_synthetic')
        if use_synthetic and synthetic_filename:
            input_filename = synthetic_filename

        city_df = pd.read_csv(city_cfg['filepath'] + input_filename, index_col=0)
        city_df = city_df.loc[city_df['price'] <= max_price, :].dropna()
        print(city_df.shape)
        city_frames.append(city_df)

    if not city_frames:
        # Nothing requested: keep the historical "no data" return value.
        return None
    # One concat at the end instead of re-copying the growing frame per city.
    return pd.concat(city_frames, axis=0)

In [None]:
# Load the real (non-synthetic) sample for every configured city, report the
# combined shape and preview the first rows.
train_test_df = get_train_test_data(cities=CITIES_TO_MODEL, use_synthetic=False)
print(train_test_df.shape)
train_test_df.head(5)

(6738, 68)
(6738, 68)


Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,nightlife_within_2_4,restaurants_within_2_4,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4,price
72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,...,1,3,0,0,0,0,0,0,0,104.616438
431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,...,2,3,0,0,0,0,0,0,0,351.986301
329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,...,2,3,0,0,0,0,0,0,0,127.887671
1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,...,4,8,0,0,0,0,1,2,0,133.876712
632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,...,2,3,0,0,0,0,0,0,0,163.739726


In [None]:
# Display the column labels of the loaded frame (the features plus 'price').
train_test_df.columns

Index(['accommodates', 'num_bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_ind',
       'host_is_superhost_ind', 'latitude', 'longitude', 'Free Parking',
       'Kitchen Appliances', 'Patio or Balcony', 'Kitchen', 'Hair Dryer',
       'Long Term Stays Allowed', 'Toiletries', 'Kitchen Essentials',
       'Hot Water', 'Fire Extinguisher', 'Carbon Monoxide Alarm', 'Bed Linens',
       'Self Check-in', 'Private Entrance', 'First Aid Kit',
       'Extra Pillows and Blankets', 'Dedicated Workspace', 'Surveillance',
       'Backyard', 'Clothing Storage', 'Wine Glasses', 'Cleaning Products',
       'Keypad', 'BBQ', 'Shades', 'Luggage Dropoff Allowed', 'Smart Lock',
       'Pets Allowed', 'Baby Essentials', 'Pool', 'Gym', 'Fire Pit',
       'Elevator', 'Lockbox', 'Fireplace', 'Paid Parking', 'Laundromat Nearby',
       'Exercise Equipment', 'View', 'active_within_0_2', 'arts_within_0_2',
       'food_within_0_2', 'nightlife_within_0_2', 

In [None]:
# Target: the listing price column.
avg_price = train_test_df['price']

# Predictors: every other column, kept in their original order.
features_list = [*train_test_df.columns]
features_list.remove('price')
model_features_df = train_test_df.loc[:, features_list]

In [None]:
# Hold out TEST_SIZE of the rows for evaluation; RANDOM_STATE makes the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    model_features_df,
    avg_price,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Fit a linear-regression baseline on the training split.
regr = LinearRegression()
regr.fit(X_train, y_train)

# Predict once for each data set (train split, test split, all rows).
# Every prediction Series carries the row index of the frame it was predicted
# from, so it lines up label-for-label with the matching target values.
regr_pred_train = pd.Series(regr.predict(X_train), index=X_train.index)
regr_pred_test = pd.Series(regr.predict(X_test), index=X_test.index)
regr_pred_full = pd.Series(regr.predict(model_features_df), name='pred_price_regr', index=model_features_df.index)

# One coefficient per feature column, in the column order of X_train.
print("Coefficients: \n", np.round(regr.coef_, 2))

# R^2 as reported by the estimator's own score().
print('train r2 score:', regr.score(X_train, y_train))
print('test r2 score:', regr.score(X_test, y_test))
print('full r2 score:', regr.score(model_features_df, avg_price))

# RMSE = square root of the mean squared error, in the units of 'price'.
print('train RMSE:', np.sqrt(mean_squared_error(y_train, regr_pred_train)))
print('test RMSE:', np.sqrt(mean_squared_error(y_test, regr_pred_test)))
print('full RMSE:', np.sqrt(mean_squared_error(avg_price, regr_pred_full)))

Coefficients: 
 [  22.39   62.51   12.81    1.8    -0.36    0.     -1.18   29.68  -96.53
 -133.3   -47.39  -41.41   10.85  -20.98   13.33   -7.08   13.22  -11.51
    9.61   -8.78   -9.44   21.33   30.55  -22.16   13.42    2.07   -5.39
   13.31    8.64  -15.24    0.62   -7.74  -16.21    2.33   11.43  -17.25
  -22.46   -6.59   -2.84   21.37  -39.28   10.01   38.08  -31.26   29.04
   17.85    8.02   -9.62   45.47  -16.71   -4.17   -0.84   11.31  -11.97
   -3.05   -2.02   -5.37   -2.62    2.09  -15.23  -39.78   38.91   20.74
  -12.63  -12.73   -9.72 -113.21]
train r2 score: 0.6548729119925445
test r2 score: 0.6761238991706949
full r2 score: 0.6600611328791806
train RMSE: 138.9118525138864
test RMSE: 132.4354332740648
full RMSE: 137.32091125059435


In [None]:
# Give the target the row index of the feature frame and the column label
# 'price' so the column-wise concat below lines up row for row.
avg_price.index = model_features_df.index
avg_price.name = 'price'

# Features | actual price | predicted price, side by side.
df_model_preds = pd.concat(
    [model_features_df, avg_price, regr_pred_full],
    axis=1,
)
df_model_preds.head()

Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,restaurants_within_2_4,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4,price,pred_price_regr
72906,4,1.0,2.0,2.0,2.0,1125.0,1,1,36.13122,-86.80066,...,3,0,0,0,0,0,0,0,104.616438,248.082956
431258,4,2.5,2.0,2.0,2.3,1101.9,1,1,36.1758,-86.7995,...,3,0,0,0,0,0,0,0,351.986301,366.420439
329997,2,1.0,1.0,1.0,2.2,1086.4,1,1,36.1758,-86.7995,...,3,0,0,0,0,0,0,0,127.887671,210.481242
1885504,6,2.0,2.0,3.0,2.6,1125.0,1,0,36.10963,-86.74195,...,8,0,0,0,0,1,2,0,133.876712,176.722821
632636,2,1.5,1.0,1.0,2.3,60.0,1,1,36.1723,-86.7925,...,3,0,0,0,0,0,0,0,163.739726,204.435003


In [None]:
# Sanity check: the features, the target and the predictions should report the
# same number of rows and start with the same index labels.
data_list = [model_features_df, avg_price, regr_pred_full]
for piece in data_list:
    print(piece.shape, list(piece.index[:5]))

(6738, 67) [72906, 431258, 329997, 1885504, 632636]
(6738,) [72906, 431258, 329997, 1885504, 632636]
(6738,) [72906, 431258, 329997, 1885504, 632636]


In [None]:
# Confirm the combined frame's dimensions (rows, columns).
print(df_model_preds.shape)

(6738, 69)


In [None]:
# Ratio of the actual price to the regression's predicted price, per row
# (above 1: actual price is higher than predicted; below 1: lower).
# NOTE(review): the linear model can predict zero or negative prices (the
# summary further down reports a minimum prediction of about -77), which makes
# this ratio negative or extremely large for those rows - confirm whether such
# rows should be excluded before the ratio is interpreted.
df_model_preds['aepr_regr'] = df_model_preds['price'].div(df_model_preds['pred_price_regr'])

In [None]:
# Summarise df_model_preds with one SQL query: min / max / mean of the actual
# price, the predicted price, their difference (price - pred_price_regr) and
# the aepr_regr ratio.
# NOTE(review): _deepnote_execute_sql is not defined or imported anywhere in
# this notebook; presumably it is injected by the Deepnote runtime - confirm
# before running this cell in another environment.
df_1 = _deepnote_execute_sql('select \n    min(price) as min_price\n    ,max(price) as max_price\n    ,avg(price) as avg_price\n    ,min(pred_price_regr) as min_price_pred\n    ,max(pred_price_regr) as max_price_pred\n    ,avg(pred_price_regr) as avg_price_pred\n    ,min(price-pred_price_regr) as min_price_diff\n    ,max(price-pred_price_regr) as max_price_diff\n    ,avg(price-pred_price_regr) as avg_price_diff\n    ,min(aepr_regr) as min_ratio_regr\n    ,max(aepr_regr) as max_ratio_regr\n    ,avg(aepr_regr) as avg_ratio_regr\nfrom df_model_preds;', 'SQL_DEEPNOTE_DATAFRAME_SQL')
df_1

Unnamed: 0,min_price,max_price,avg_price,min_price_pred,max_price_pred,avg_price_pred,min_price_diff,max_price_diff,avg_price_diff,min_ratio_regr,max_ratio_regr,avg_ratio_regr
0,10.0,1500.0,341.368923,-77.195554,1521.549654,341.774032,-798.922666,1006.812144,-0.405109,-147.864018,199.510848,1.108813


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cb0b277f-d226-41e6-8798-2eb04c8159dd' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>