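# AEPR Analysis

This notebook analyzes AEPR (`aepr_gb_regr`): the ratio of a listing's actual `price` to its model-predicted price (`pred_price_gb_regr`). Values above 1 mean a listing is priced higher than the model predicts; values below 1 mean it is priced lower.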

### Import dependencies

In [None]:
import numpy as np
import pandas as pd
import scipy.stats

import plotly.express as px

try:
  from mlxtend.frequent_patterns import apriori
except:
  !pip install mlxtend
  from mlxtend.frequent_patterns import apriori

# key into city_data_dict selecting which city's files are loaded
CITY = 'Nashville'

In [None]:
# Per-city input locations: a base directory plus the two CSV files
# (model predictions and feature importances), given relative to that directory.
city_data_dict = {
    'Nashville': dict(
        filepath='../../data/',
        preds_filename='processed/model_preds_nashville.csv',
        feature_importances_filename='processed/feature_importances_nashville.csv',
    ),
}

## Read and examine the data

In [None]:
def get_preds_data(city):
    """Load the prediction and feature-importance CSVs for one city.

    Parameters
    ----------
    city : str
        Key into the module-level ``city_data_dict``.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The predictions frame with rows containing any missing value dropped,
        and the feature-importance frame as read. Both use the first CSV
        column as the index.

    Raises
    ------
    KeyError
        If ``city`` (or one of the expected file keys) is not in ``city_data_dict``.
    """
    city_cfg = city_data_dict[city]
    base_path = city_cfg['filepath']

    preds = pd.read_csv(base_path + city_cfg['preds_filename'], index_col=0).dropna()
    importances = pd.read_csv(base_path + city_cfg['feature_importances_filename'], index_col=0)

    return preds, importances

In [None]:
# load the predictions and feature importances for the configured city,
# then show the frame's shape and first rows
preds_df, feature_importances_df = get_preds_data(CITY)
print(preds_df.shape)
preds_df.head()

(1685, 70)


Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,shopping_within_2_4,active_beyond_4,arts_beyond_4,food_beyond_4,nightlife_beyond_4,restaurants_beyond_4,shopping_beyond_4,price,pred_price_gb_regr,aepr_gb_regr
9090089,6,1.0,3.0,3.0,1.0,1125.0,1,1,36.12625,-86.7961,...,0,0,0,0,0,0,0,180.427397,278.365002,0.648168
42009956,5,1.0,2.0,3.0,1.9,1125.0,1,0,36.16581,-86.74321,...,0,0,0,0,0,0,0,119.021918,193.556162,0.614922
45556783,2,1.0,1.0,1.0,30.0,1125.0,1,0,36.16996,-86.75298,...,0,0,0,0,0,0,0,70.0,44.565634,1.570717
36386015,2,1.0,1.0,1.0,2.0,14.0,1,0,36.10966,-86.74022,...,0,0,0,0,1,2,0,64.69589,99.320921,0.651382
37936461,8,1.0,2.0,4.0,1.0,3.0,1,1,36.13692,-86.85645,...,0,0,0,0,1,6,0,159.076712,237.325286,0.67029


### Add indicator columns based on property features

In [None]:

# Each source column is split into three 0/1 indicator columns.
# Every bin is (exclusive lower bound, inclusive upper bound], and consecutive
# bins share an edge so that no positive value is left without a bucket
# (the previous `accommodates > 9` test left accommodates == 9 unassigned).
indicator_bins = [
    ('accommodates', [('lt5', 0, 4), ('5_8', 4, 8), ('gt8', 8, 9999)]),
    ('num_bathrooms', [('eq1', 0, 1), ('eq2', 1, 2), ('gt2', 2, 9999)]),
    ('bedrooms', [('eq1', 0, 1), ('eq2', 1, 2), ('gt2', 2, 9999)]),
    ('beds', [('lt3', 0, 2), ('3_4', 2, 4), ('gt4', 4, 9999)]),
]

for source_col, bins in indicator_bins:
    for suffix, lower, upper in bins:
        in_bin = (preds_df[source_col] > lower) & (preds_df[source_col] <= upper)
        # e.g. 'accommodates' + '_' + 'lt5' -> 'accommodates_lt5'
        preds_df[source_col + '_' + suffix] = np.where(in_bin, 1, 0)

In [None]:
# preview the first rows of preds_df
preds_df.head()

Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,accommodates_gt8,num_bathrooms_eq1,num_bathrooms_eq2,num_bathrooms_gt2,bedrooms_eq1,bedrooms_eq2,bedrooms_gt2,beds_lt3,beds_3_4,beds_gt4
9090089,6,1.0,3.0,3.0,1.0,1125.0,1,1,36.12625,-86.7961,...,0,1,0,0,0,0,1,0,1,0
42009956,5,1.0,2.0,3.0,1.9,1125.0,1,0,36.16581,-86.74321,...,0,1,0,0,0,1,0,0,1,0
45556783,2,1.0,1.0,1.0,30.0,1125.0,1,0,36.16996,-86.75298,...,0,1,0,0,1,0,0,1,0,0
36386015,2,1.0,1.0,1.0,2.0,14.0,1,0,36.10966,-86.74022,...,0,1,0,0,1,0,0,1,0,0
37936461,8,1.0,2.0,4.0,1.0,3.0,1,1,36.13692,-86.85645,...,0,1,0,0,0,1,0,0,1,0


In [None]:
# list every column currently in preds_df
preds_df.columns

Index(['accommodates', 'num_bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_ind',
       'host_is_superhost_ind', 'latitude', 'longitude', 'Free Parking',
       'Kitchen Appliances', 'Patio or Balcony', 'Kitchen', 'Hair Dryer',
       'Long Term Stays Allowed', 'Toiletries', 'Kitchen Essentials',
       'Hot Water', 'Fire Extinguisher', 'Carbon Monoxide Alarm', 'Bed Linens',
       'Self Check-in', 'Private Entrance', 'First Aid Kit',
       'Extra Pillows and Blankets', 'Dedicated Workspace', 'Surveillance',
       'Backyard', 'Clothing Storage', 'Wine Glasses', 'Cleaning Products',
       'Keypad', 'BBQ', 'Shades', 'Luggage Dropoff Allowed', 'Smart Lock',
       'Pets Allowed', 'Baby Essentials', 'Pool', 'Gym', 'Fire Pit',
       'Elevator', 'Lockbox', 'Fireplace', 'Paid Parking', 'Laundromat Nearby',
       'Exercise Equipment', 'View', 'active_within_0_2', 'arts_within_0_2',
       'food_within_0_2', 'nightlife_within_0_2', 

### Categorize the DataFrame columns

In [None]:
# Column names of preds_df grouped by category. Insertion order matters:
# it drives the order of the printed counts below.
cols_by_cat = {
    'property_details': [
        'accommodates', 'num_bathrooms', 'bedrooms', 'beds',
        'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
        'availability_ind', 'host_is_superhost_ind',
        'latitude', 'longitude',
    ],
    'amenities': [
        'Free Parking', 'Kitchen Appliances', 'Patio or Balcony', 'Kitchen',
        'Hair Dryer', 'Long Term Stays Allowed', 'Toiletries',
        'Kitchen Essentials', 'Hot Water', 'Fire Extinguisher',
        'Carbon Monoxide Alarm', 'Bed Linens', 'Self Check-in',
        'Private Entrance', 'First Aid Kit', 'Extra Pillows and Blankets',
        'Dedicated Workspace', 'Surveillance', 'Backyard', 'Clothing Storage',
        'Wine Glasses', 'Cleaning Products', 'Keypad', 'BBQ', 'Shades',
        'Luggage Dropoff Allowed', 'Smart Lock', 'Pets Allowed',
        'Baby Essentials', 'Pool', 'Gym', 'Fire Pit', 'Elevator', 'Lockbox',
        'Fireplace', 'Paid Parking', 'Laundromat Nearby',
        'Exercise Equipment', 'View',
    ],
    # six attraction kinds x three distance bands, grouped by kind
    'attractions': [
        kind + band
        for kind in ('active', 'arts', 'food', 'nightlife', 'restaurants', 'shopping')
        for band in ('_within_0_2', '_within_2_4', '_beyond_4')
    ],
    'price_details': ['price', 'pred_price_gb_regr', 'aepr_gb_regr'],
    'property_indicators': [
        'accommodates_lt5', 'accommodates_5_8', 'accommodates_gt8',
        'num_bathrooms_eq1', 'num_bathrooms_eq2', 'num_bathrooms_gt2',
        'bedrooms_eq1', 'bedrooms_eq2', 'bedrooms_gt2',
        'beds_lt3', 'beds_3_4', 'beds_gt4',
    ],
}

# report how many columns fall in each category, e.g. "property details: 10"
for category, category_cols in cols_by_cat.items():
    print(category.replace('_', ' ') + ':', len(category_cols))

property details: 10
amenities: 39
attractions: 18
price details: 3
property indicators: 12


## Create a treemap of feature importances

In [None]:
# create a mapping of column name to column category
# (inverts cols_by_cat; a column listed under several categories keeps the last one)
map_col_to_cat = {
    col_name: category
    for category, col_names in cols_by_cat.items()
    for col_name in col_names
}

map_col_to_cat

{'accommodates': 'property_details',
 'num_bathrooms': 'property_details',
 'bedrooms': 'property_details',
 'beds': 'property_details',
 'minimum_nights_avg_ntm': 'property_details',
 'maximum_nights_avg_ntm': 'property_details',
 'availability_ind': 'property_details',
 'host_is_superhost_ind': 'property_details',
 'latitude': 'property_details',
 'longitude': 'property_details',
 'Free Parking': 'amenities',
 'Kitchen Appliances': 'amenities',
 'Patio or Balcony': 'amenities',
 'Kitchen': 'amenities',
 'Hair Dryer': 'amenities',
 'Long Term Stays Allowed': 'amenities',
 'Toiletries': 'amenities',
 'Kitchen Essentials': 'amenities',
 'Hot Water': 'amenities',
 'Fire Extinguisher': 'amenities',
 'Carbon Monoxide Alarm': 'amenities',
 'Bed Linens': 'amenities',
 'Self Check-in': 'amenities',
 'Private Entrance': 'amenities',
 'First Aid Kit': 'amenities',
 'Extra Pillows and Blankets': 'amenities',
 'Dedicated Workspace': 'amenities',
 'Surveillance': 'amenities',
 'Backyard': 'ameniti

In [None]:
# create a df with columns necessary for treemap creation
treemap_df = feature_importances_df.copy()
treemap_df['feature'] = treemap_df.index
treemap_df['feature_category'] = treemap_df['feature'].map(map_col_to_cat)

# total importance per category, attached to every feature row of that category
tm_grouped_by_cat = (
    treemap_df[['feature_category', 'importance']]
    .groupby('feature_category')
    .sum()
    .rename(columns={'importance': 'cat_total_importance'})
)
treemap_df = pd.merge(treemap_df, tm_grouped_by_cat,
                      left_on='feature_category', right_on='feature_category')

# share of the category's importance carried by each feature
treemap_df['importance_scaled'] = treemap_df['importance'] / treemap_df['cat_total_importance']

# features below 2% of their category are pooled under 'All Others';
# the two long night-count names are shortened for display
display_names = {
    'minimum_nights_avg_ntm': 'min_nights_avg',
    'maximum_nights_avg_ntm': 'max_nights_avg',
}
treemap_df['feature_alt'] = (
    treemap_df['feature']
    .where(treemap_df['importance_scaled'] >= 0.02, 'All Others')
    .replace(display_names)
)

# drop features with exactly zero importance
treemap_df = treemap_df[treemap_df['importance'].ne(0)]

treemap_df

Unnamed: 0,importance,feature,feature_category,cat_total_importance,importance_scaled,feature_alt
0,0.176089,accommodates,property_details,0.848132,0.207620,accommodates
1,0.284808,num_bathrooms,property_details,0.848132,0.335807,num_bathrooms
2,0.174011,bedrooms,property_details,0.848132,0.205169,bedrooms
3,0.068160,beds,property_details,0.848132,0.080365,beds
4,0.051854,minimum_nights_avg_ntm,property_details,0.848132,0.061140,min_nights_avg
...,...,...,...,...,...,...
62,0.000127,arts_beyond_4,attractions,0.086693,0.001459,All Others
63,0.001683,food_beyond_4,attractions,0.086693,0.019415,All Others
64,0.005061,nightlife_beyond_4,attractions,0.086693,0.058382,nightlife_beyond_4
65,0.004906,restaurants_beyond_4,attractions,0.086693,0.056586,restaurants_beyond_4


In [None]:
# create and show the treemap: All -> feature category -> feature,
# with tile size and color both driven by importance
fig = px.treemap(
    treemap_df,
    path=[px.Constant('All'), 'feature_category', 'feature_alt'],
    values='importance',
    color='importance',
    color_continuous_scale='RdBu_r',
    title='Treemap of Price Prediction Model Feature Importance',
)
fig.update_layout(width=800, height=600, margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()

## Examine the distribution of AEPR values

In [None]:
# histogram of the aepr_gb_regr column
fig = px.histogram(preds_df, x='aepr_gb_regr')
fig.show()

## Compare the average feature value across AEPR quantiles

### Split the data into quantiles based on AEPR 

In [None]:
quartile_labels = ['q1', 'q2', 'q3', 'q4']
quintile_labels = ['quint1', 'quint2', 'quint3', 'quint4', 'quint5']
tertile_labels = ['t1', 't2', 't3']

# For each split, add two columns to preds_df: '<name>' holds the label of the
# AEPR quantile the row falls in, '<name>_bin' holds the interval itself.
for quantile_name, quantile_names in (
    ('quartile', quartile_labels),
    ('quintile', quintile_labels),
    ('tertile', tertile_labels),
):
    n_groups = len(quantile_names)
    preds_df[quantile_name] = pd.qcut(preds_df['aepr_gb_regr'], n_groups, labels=quantile_names)
    preds_df[quantile_name + '_bin'] = pd.qcut(preds_df['aepr_gb_regr'], n_groups)

preds_df.head()

Unnamed: 0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,bedrooms_gt2,beds_lt3,beds_3_4,beds_gt4,quartile,quartile_bin,quintile,quintile_bin,tertile,tertile_bin
9090089,6,1.0,3.0,3.0,1.0,1125.0,1,1,36.12625,-86.7961,...,1,0,1,0,q1,"(0.196, 0.787]",quint1,"(0.196, 0.749]",t1,"(0.196, 0.841]"
42009956,5,1.0,2.0,3.0,1.9,1125.0,1,0,36.16581,-86.74321,...,0,0,1,0,q1,"(0.196, 0.787]",quint1,"(0.196, 0.749]",t1,"(0.196, 0.841]"
45556783,2,1.0,1.0,1.0,30.0,1125.0,1,0,36.16996,-86.75298,...,0,1,0,0,q4,"(1.155, 5.0]",quint5,"(1.225, 5.0]",t3,"(1.074, 5.0]"
36386015,2,1.0,1.0,1.0,2.0,14.0,1,0,36.10966,-86.74022,...,0,1,0,0,q1,"(0.196, 0.787]",quint1,"(0.196, 0.749]",t1,"(0.196, 0.841]"
37936461,8,1.0,2.0,4.0,1.0,3.0,1,1,36.13692,-86.85645,...,0,0,1,0,q1,"(0.196, 0.787]",quint1,"(0.196, 0.749]",t1,"(0.196, 0.841]"


### Calculate mean values by AEPR quantile group

In [None]:
# feature columns drawn from four categories (price columns are kept separate)
all_features = cols_by_cat['property_details'] + cols_by_cat['property_indicators'] + cols_by_cat['amenities'] + cols_by_cat['attractions']
# columns to average, plus the quartile label used as the grouping key
cols = all_features + cols_by_cat['price_details'] + ['quartile']
# one row per quartile label, one column per averaged column
mean_values_by_quartile = preds_df[cols].groupby(['quartile']).mean()
mean_values_by_quartile

Unnamed: 0_level_0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,nightlife_beyond_4,restaurants_within_0_2,restaurants_within_2_4,restaurants_beyond_4,shopping_within_0_2,shopping_within_2_4,shopping_beyond_4,price,pred_price_gb_regr,aepr_gb_regr
quartile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
q1,6.725118,2.046209,2.459716,3.914692,9.315877,737.028673,0.976303,0.549763,36.165352,-86.766043,...,0.5,10.291469,5.28436,2.194313,0.037915,0.009479,0.007109,213.636975,324.737648,0.650664
q2,7.377672,2.152019,2.456057,4.458432,5.662945,743.839667,0.990499,0.586698,36.163875,-86.771066,...,0.299287,11.767221,4.446556,1.532067,0.049881,0.007126,0.011876,311.611681,355.724677,0.873672
q3,7.971496,2.438242,2.819477,5.220903,6.985036,734.542993,0.992874,0.570071,36.163801,-86.774548,...,0.377672,11.275534,4.874109,1.574822,0.083135,0.007126,0.0,405.294966,385.638561,1.049764
q4,6.712589,2.042755,2.380048,3.916865,8.587173,721.798575,0.988124,0.470309,36.162609,-86.768615,...,0.60095,10.387173,5.087886,2.235154,0.092637,0.014252,0.007126,436.350913,305.34272,1.485605


In [None]:
# same aggregation as for quartiles, grouped by the quintile label instead
# (note: rebinds `cols`)
cols = all_features + cols_by_cat['price_details'] + ['quintile']
mean_values_by_quintile = preds_df[cols].groupby(['quintile']).mean()
mean_values_by_quintile

Unnamed: 0_level_0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,nightlife_beyond_4,restaurants_within_0_2,restaurants_within_2_4,restaurants_beyond_4,shopping_within_0_2,shopping_within_2_4,shopping_beyond_4,price,pred_price_gb_regr,aepr_gb_regr
quintile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
quint1,6.409496,1.989614,2.397626,3.712166,10.627893,716.889318,0.976261,0.525223,36.165818,-86.764238,...,0.563798,9.973294,5.37092,2.424332,0.029674,0.011869,0.008902,192.425181,308.839571,0.62126
quint2,7.382789,2.163205,2.477745,4.495549,4.851632,776.083383,0.982196,0.617211,36.163364,-86.771854,...,0.308605,11.385757,4.804154,1.51632,0.05638,0.005935,0.011869,292.604559,356.799129,0.819787
quint3,7.84273,2.25816,2.655786,4.851632,5.118101,734.918694,0.994065,0.548961,36.165042,-86.771735,...,0.311573,11.845697,4.4273,1.486647,0.062315,0.008902,0.002967,358.385196,372.474063,0.961933
quint4,7.718101,2.394659,2.715134,4.946588,8.353116,701.250148,0.994065,0.554896,36.163453,-86.773241,...,0.4273,11.163205,4.866469,1.709199,0.089021,0.002967,0.0,415.522572,375.351327,1.110742
quint5,6.62908,2.043027,2.397626,3.881306,9.243027,742.378932,0.988131,0.474777,36.161873,-86.769259,...,0.611276,10.281899,5.148368,2.284866,0.091988,0.017804,0.008902,449.300581,300.786639,1.559828


In [None]:
# same aggregation as for quartiles, grouped by the tertile label instead
# (note: rebinds `cols`, which now ends with the 'tertile' column)
cols = all_features + cols_by_cat['price_details'] + ['tertile']
mean_values_by_tertile = preds_df[cols].groupby(['tertile']).mean()
mean_values_by_tertile

Unnamed: 0_level_0,accommodates,num_bathrooms,bedrooms,beds,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_ind,host_is_superhost_ind,latitude,longitude,...,nightlife_beyond_4,restaurants_within_0_2,restaurants_within_2_4,restaurants_beyond_4,shopping_within_0_2,shopping_within_2_4,shopping_beyond_4,price,pred_price_gb_regr,aepr_gb_regr
tertile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
t1,6.80427,2.057829,2.437722,3.991103,8.338612,740.885765,0.976868,0.553381,36.164811,-86.76811,...,0.464413,10.501779,5.225979,2.033808,0.042705,0.008897,0.007117,228.36399,327.203487,0.691292
t2,7.670232,2.285205,2.638146,4.834225,6.973262,741.102139,0.99287,0.588235,36.163725,-86.771166,...,0.306595,11.746881,4.493761,1.490196,0.067736,0.00713,0.00713,358.130164,372.18015,0.960773
t3,7.115658,2.16637,2.510676,4.307829,7.603203,720.936477,0.991103,0.491103,36.163194,-86.770922,...,0.562278,10.542705,5.049822,2.128114,0.087189,0.012456,0.005338,438.478029,329.21899,1.39197


### Scale the quantile averages by the overall mean for each feature

In [None]:
# Overall mean of each feature and price column. The column list is built
# explicitly rather than reusing `cols` from the previous cell: that list ends
# with the categorical 'tertile' label, which cannot be averaged, and its
# contents depend on which grouping cell happened to run last.
numeric_cols = all_features + cols_by_cat['price_details']
mean_values = pd.DataFrame(preds_df[numeric_cols].mean(), columns=['mean_value'])
mean_values_list = [mean_values, mean_values_by_quartile, mean_values_by_quintile, mean_values_by_tertile]

# Join the per-quantile means (transposed so that features become rows) onto
# the overall means; the result has one row per feature.
mean_values_merged = mean_values_list[0].copy()
for mv_df in mean_values_list[1:]:
    mean_values_merged = pd.merge(mean_values_merged,
                            mv_df.transpose(),
                            left_index=True, right_index=True)

# Snapshot the quantile columns before adding the '<col>_scaled' columns, so
# the loop never depends on iterating a column index that is being extended.
quantile_cols = [col for col in mean_values_merged.columns if col != 'mean_value']
for col in quantile_cols:
    # ratio of the quantile mean to the overall mean (1.0 == same as overall)
    mean_values_merged[col+'_scaled'] = np.round(mean_values_merged[col] / mean_values_merged['mean_value'], 3)

# fixed seed so the displayed sample is the same on every run
mean_values_merged.sample(10, random_state=0)

Unnamed: 0,mean_value,q1,q2,q3,q4,quint1,quint2,quint3,quint4,quint5,...,q3_scaled,q4_scaled,quint1_scaled,quint2_scaled,quint3_scaled,quint4_scaled,quint5_scaled,t1_scaled,t2_scaled,t3_scaled
Fireplace,0.136499,0.175355,0.135392,0.147268,0.087886,0.175074,0.142433,0.139466,0.139466,0.086053,...,1.079,0.644,1.283,1.043,1.022,1.022,0.63,1.264,0.979,0.756
Luggage Dropoff Allowed,0.241543,0.263033,0.230404,0.19715,0.275534,0.228487,0.272997,0.219585,0.186944,0.299703,...,0.816,1.141,0.946,1.13,0.909,0.774,1.241,1.061,0.886,1.053
arts_within_2_4,0.079525,0.078199,0.066508,0.068884,0.104513,0.080119,0.083086,0.071217,0.062315,0.10089,...,0.866,1.314,1.007,1.045,0.896,0.784,1.269,1.052,0.785,1.163
nightlife_within_0_2,4.335312,4.011848,4.76247,4.489311,4.078385,3.872404,4.560831,4.774481,4.41543,4.053412,...,1.036,0.941,0.893,1.052,1.101,1.018,0.935,0.951,1.097,0.952
minimum_nights_avg_ntm,7.638754,9.315877,5.662945,6.985036,8.587173,10.627893,4.851632,5.118101,8.353116,9.243027,...,0.914,1.124,1.391,0.635,0.67,1.094,1.21,1.092,0.913,0.995
Fire Pit,0.157864,0.225118,0.142518,0.123515,0.140143,0.210682,0.189911,0.118694,0.121662,0.148368,...,0.782,0.888,1.335,1.203,0.752,0.771,0.94,1.319,0.802,0.879
active_within_2_4,0.023145,0.014218,0.033254,0.014252,0.030879,0.017804,0.023739,0.023739,0.017804,0.032641,...,0.616,1.334,0.769,1.026,1.026,0.769,1.41,0.923,0.847,1.23
bedrooms_gt2,0.462908,0.436019,0.448931,0.543943,0.422803,0.41543,0.445104,0.522255,0.522255,0.409496,...,1.175,0.913,0.897,0.962,1.128,1.128,0.885,0.946,1.067,0.988
Gym,0.175668,0.158768,0.187648,0.173397,0.182898,0.151335,0.178042,0.186944,0.189911,0.172107,...,0.987,1.041,0.861,1.014,1.064,1.081,0.98,0.922,1.045,1.033
Pool,0.18635,0.187204,0.192399,0.159145,0.206651,0.189911,0.178042,0.175074,0.189911,0.198813,...,0.854,1.109,1.019,0.955,0.939,1.019,1.067,0.993,0.899,1.108


### Create heatmaps by quantile for groups of features

In [None]:
def get_heatmap_fig(features, quantiles='quartiles', col_descr='', fig_text=''):
    """Build a heatmap of scaled quantile means for the given features.

    Reads the module-level ``mean_values_merged`` frame and plots its
    ``*_scaled`` columns for the requested quantile split.

    Parameters
    ----------
    features : list of str
        Row labels of ``mean_values_merged`` to include, in display order.
    quantiles : str, default 'quartiles'
        'tertiles' or 'quintiles' select those splits; any other value
        selects quartiles.
    col_descr : str, default ''
        If non-empty, 'Scaled Mean Values for <col_descr>' is added to the title.
    fig_text : str, default ''
        If non-empty, placed at the start of the title (e.g. 'Fig. 1 - ').

    Returns
    -------
    plotly Figure
        A 600x600 heatmap with one row per feature and one column per quantile.

    Raises
    ------
    KeyError
        If a feature is not a row label of ``mean_values_merged``.
    """
    if quantiles == 'tertiles':
        quants = ['t1_scaled', 't2_scaled', 't3_scaled']
    elif quantiles == 'quintiles':
        quants = ['quint1_scaled', 'quint2_scaled', 'quint3_scaled', 'quint4_scaled', 'quint5_scaled']
    else:
        quants = ['q1_scaled', 'q2_scaled', 'q3_scaled', 'q4_scaled']

    # Display labels for every split (previously only defined for quartiles,
    # so 'tertiles' and 'quintiles' failed with an unbound variable):
    # 'q1_scaled' -> 'Q1', 't2_scaled' -> 'T2', 'quint3_scaled' -> 'Quint3'
    rpt_text_remap = {q: q[:-len('_scaled')].capitalize() for q in quants}

    df = mean_values_merged[quants].loc[features]
    df = df.rename(columns=rpt_text_remap)

    rpt_title = ''
    if fig_text != '':
        rpt_title = rpt_title + fig_text
    if col_descr != '':
        rpt_title = rpt_title + 'Scaled Mean Values for ' + col_descr

    # text_auto=True prints each cell's value on the heatmap
    fig = px.imshow(df, color_continuous_scale='RdBu_r', text_auto=True, title=rpt_title)
    fig.update_layout(width=600,height=600)

    return fig

In [None]:
# quartile heatmap (the default split) for the property detail columns
fig = get_heatmap_fig(cols_by_cat['property_details'], col_descr='Property Detail Features')
fig.show()

In [None]:
# quartile heatmap for the binned size indicators; cell annotations are
# already enabled inside get_heatmap_fig (px.imshow(..., text_auto=True)),
# and text_auto is not an option of fig.show(), so show() takes no arguments
fig = get_heatmap_fig(cols_by_cat['property_indicators'], col_descr='Property Size Indicators')
fig.show()


In [None]:
# amenities are plotted in slices of ten; this is the first slice
fig = get_heatmap_fig(cols_by_cat['amenities'][:10], col_descr='Amenities')
fig.show()

In [None]:
# amenities 11-20 (no col_descr is passed, so the figure title is empty)
fig = get_heatmap_fig(cols_by_cat['amenities'][10:20])
fig.show()

In [None]:
# amenities 21-30 (no col_descr is passed, so the figure title is empty)
fig = get_heatmap_fig(cols_by_cat['amenities'][20:30])
fig.show()

In [None]:
# remaining amenities, from the 31st onward (figure title is empty)
fig = get_heatmap_fig(cols_by_cat['amenities'][30:])
fig.show()

In [None]:
# first nine attraction columns: the active, arts and food groups
fig = get_heatmap_fig(cols_by_cat['attractions'][:9])
fig.show()

In [None]:
# last nine attraction columns: the nightlife, restaurants and shopping groups
fig = get_heatmap_fig(cols_by_cat['attractions'][9:])
fig.show()

## Create heatmaps to include in the report

In [None]:
# report figure 1: size indicators by AEPR quartile. Cell annotations are
# already enabled inside get_heatmap_fig (px.imshow(..., text_auto=True)),
# and text_auto is not an option of fig.show(), so show() takes no arguments
fig = get_heatmap_fig(cols_by_cat['property_indicators'], col_descr='Property Size Indicators',
                        fig_text='Fig. 1 - ')
fig.show()

In [None]:
# report figure 2: hand-picked features.
# NOTE: this rebinds `cols`; the first nine entries are property/amenity
# columns (plotted here) and the remaining six are attraction columns.
cols = ['host_is_superhost_ind', 'Patio or Balcony', 'Backyard', 'Keypad', 'Baby Essentials', 
        'Pool', 'Fire Pit', 'Fireplace', 'View', 'arts_within_2_4', 'arts_beyond_4', 
        'food_beyond_4', 'nightlife_beyond_4', 'shopping_within_0_2', 'shopping_within_2_4']
fig = get_heatmap_fig(cols[:9], col_descr='Selected Features', fig_text='Fig. 2 - ')
fig.show()

In [None]:
# report figure 3: the entries of `cols` from index 9 onward
# (requires the `cols` list defined in the previous cell)
fig = get_heatmap_fig(cols[9:], col_descr='Selected Attractions', fig_text='Fig. 3 - ')
fig.show()

## Examine the correlation between each feature and AEPR

In [None]:
# build a DataFrame of correlations by feature:
# Pearson r between each feature column and AEPR (the p-value is discarded)
all_features = cols_by_cat['property_details'] + cols_by_cat['property_indicators'] + cols_by_cat['amenities'] + cols_by_cat['attractions']
feature_names = list(all_features)
corr_coefs = [
    scipy.stats.pearsonr(preds_df[name], preds_df['aepr_gb_regr'])[0]
    for name in feature_names
]

corr_coefs_df = pd.DataFrame({'feature': feature_names, 'aepr_correlation': corr_coefs})

# determine the quintile for each feature based on correlation
# (quint1 = lowest correlations, quint5 = highest)
quintile_labels = ['quint1', 'quint2', 'quint3', 'quint4', 'quint5']
n_quintiles = len(quintile_labels)
corr_coefs_df['quintile'] = pd.qcut(corr_coefs_df['aepr_correlation'], n_quintiles, labels=quintile_labels)
corr_coefs_df['quintile_bins'] = pd.qcut(corr_coefs_df['aepr_correlation'], n_quintiles)
corr_coefs_df

Unnamed: 0,feature,aepr_correlation,quintile,quintile_bins
0,accommodates,-0.044743,quint1,"(-0.0861, -0.0325]"
1,num_bathrooms,-0.030809,quint2,"(-0.0325, -0.0129]"
2,bedrooms,-0.034141,quint1,"(-0.0861, -0.0325]"
3,beds,-0.031355,quint2,"(-0.0325, -0.0129]"
4,minimum_nights_avg_ntm,0.024705,quint4,"(0.0057, 0.026]"
...,...,...,...,...
74,restaurants_within_2_4,0.001212,quint3,"(-0.0129, 0.0057]"
75,restaurants_beyond_4,0.026234,quint5,"(0.026, 0.0937]"
76,shopping_within_0_2,0.083619,quint5,"(0.026, 0.0937]"
77,shopping_within_2_4,0.005757,quint4,"(0.0057, 0.026]"


In [None]:
# function to return (feature name, correlation value) for a specified quintile
def get_feat_corr_pairs(quint):
    """Return (feature, correlation) pairs for one correlation quintile.

    Reads the module-level ``corr_coefs_df``. Correlations are rounded to
    three decimals and the pairs are ordered from largest to smallest
    absolute correlation.

    Parameters
    ----------
    quint : str
        Quintile label to select, e.g. 'quint1' or 'quint5'.

    Returns
    -------
    list of (str, float)
    """
    in_quintile = corr_coefs_df[corr_coefs_df['quintile'] == quint]
    pairs = list(zip(in_quintile['feature'], np.round(in_quintile['aepr_correlation'], 3)))
    # -|r| as the key puts the strongest correlation first, whatever its sign
    return sorted(pairs, key=lambda pair: -abs(pair[1]))

### Features that are most negatively correlated with AEPR

In [None]:
# features in the lowest correlation quintile, strongest correlation first
most_neg_corr = get_feat_corr_pairs('quint1')
for item in most_neg_corr:
    print(item)

### Features that are most positively correlated with AEPR

In [None]:
# features in the highest correlation quintile, strongest correlation first
most_pos_corr = get_feat_corr_pairs('quint5')
for item in most_pos_corr:
    print(item)

## Perform Frequent Itemset Analysis on Property Features

This code was developed with guidance from the example at https://hands-on.cloud/apriori-algorithm-python-implementation/


In [None]:
# the size indicators and amenities are the "items" used for itemset mining
cols_itemset_analysis = cols_by_cat['property_indicators'] + cols_by_cat['amenities'] 

In [None]:
def _frequent_itemsets(df, min_support):
    """Return the set of frequent itemsets (frozensets of column names) in df."""
    # apriori warns on non-bool input and may stop supporting it, so the
    # indicator frame is cast to bool first.
    # NOTE(review): the cast treats any non-zero value as "present" — this
    # assumes every column is a 0/1 indicator; confirm against the input data.
    found = apriori(df.astype(bool), min_support=min_support, use_colnames=True)
    return set(found['itemsets'])


# create a function that identifies frequent itemsets in each of two dfs
# and returns the items that are unique to each df
def compare_groups(df_a, df_b, min_support):
    """Find the frequent itemsets that occur in only one of two groups.

    Parameters
    ----------
    df_a, df_b : DataFrame
        One row per property, one indicator column per item.
    min_support : float
        Minimum support passed to ``apriori`` for both groups.

    Returns
    -------
    tuple of (set, set)
        ``(a_not_b, b_not_a)``: itemsets frequent in ``df_a`` but not in
        ``df_b``, and itemsets frequent in ``df_b`` but not in ``df_a``.
        Each itemset is a frozenset of column names.
    """
    frozenset_set_a = _frequent_itemsets(df_a, min_support)
    frozenset_set_b = _frequent_itemsets(df_b, min_support)

    a_not_b = frozenset_set_a - frozenset_set_b
    b_not_a = frozenset_set_b - frozenset_set_a

    return a_not_b, b_not_a


In [None]:
# create dfs containing the properties in the bottom (q1, lowest AEPR) and
# top (q4, highest AEPR) quartiles, restricted to the itemset-analysis columns
preds_df_q1 = preds_df.loc[preds_df['quartile']=='q1', cols_itemset_analysis]
preds_df_q4 = preds_df.loc[preds_df['quartile']=='q4', cols_itemset_analysis]

### Test a few support threshold options

In [None]:
support_thresholds = [0.65, 0.70, 0.75]

# one (q1-only itemsets, q4-only itemsets) pair per support threshold
comparisons = [
    compare_groups(preds_df_q1, preds_df_q4, thresh)
    for thresh in support_thresholds
]

itemsets_q1_not_q4 = [pair[0] for pair in comparisons]
itemsets_q4_not_q1 = [pair[1] for pair in comparisons]
len_q1_not_q4 = [len(itemsets) for itemsets in itemsets_q1_not_q4]
len_q4_not_q1 = [len(itemsets) for itemsets in itemsets_q4_not_q1]

# summary table: one row per threshold
df = pd.DataFrame({
    'support_threshold': support_thresholds,
    'itemsets_q1_not_q4': itemsets_q1_not_q4,
    'itemsets_q4_not_q1': itemsets_q4_not_q1,
    'len_q1_not_q4': len_q1_not_q4,
    'len_q4_not_q1': len_q4_not_q1,
})


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type


DataFrames with non-bool types result in worse computationalperformance and their support might be discontinued in the future.Please use a DataFrame with bool type



In [None]:
# show the per-threshold summary table built in the previous cell
df

In [None]:
# select a final support threshold: this is an index into support_thresholds
# (index 1 is the 0.70 entry) and into the per-threshold itemset lists
final_support_thresh_idx = 1

In [None]:
# examine the itemsets from one iteration of the apriori method:
# those frequent in q1 but not in q4 at the selected threshold
for itemset in itemsets_q1_not_q4[final_support_thresh_idx]:
    print(set(itemset))

In [None]:
# itemsets frequent in q4 but not in q1 at the selected threshold
for itemset in itemsets_q4_not_q1[final_support_thresh_idx]:
    print(set(itemset))

In [None]:
# count the number of times each item appears in a list of frequent itemsets
def get_item_counts(itemsets):
    """Count how many itemsets each item appears in.

    Parameters
    ----------
    itemsets : iterable of iterables
        Each inner iterable yields hashable items.

    Returns
    -------
    dict
        Item -> number of occurrences, ordered from most to least frequent.
        Items with equal counts keep the order in which they were first seen.
    """
    tally = {}
    for members in itemsets:
        for member in members:
            tally[member] = tally.get(member, 0) + 1

    # a reversed sort is still stable, so ties keep first-seen order
    return dict(sorted(tally.items(), key=lambda pair: pair[1], reverse=True))

In [None]:
# item frequencies across the itemsets found only in q1
get_item_counts(itemsets_q1_not_q4[final_support_thresh_idx])

In [None]:
# item frequencies across the itemsets found only in q4
get_item_counts(itemsets_q4_not_q1[final_support_thresh_idx])

{'Kitchen Appliances': 15,
 'Kitchen': 14,
 'Long Term Stays Allowed': 13,
 'Kitchen Essentials': 13,
 'Toiletries': 8,
 'Bed Linens': 8,
 'Fire Extinguisher': 6,
 'Self Check-in': 5,
 'Hair Dryer': 4,
 'Hot Water': 2,
 'Free Parking': 2}

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=cb0b277f-d226-41e6-8798-2eb04c8159dd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>