In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
from sklearn.linear_model import LinearRegression
from functools import reduce

In [2]:
# Load the input CSV into `df`; the path is relative to the current working directory.
# The file carries an "Unnamed: 0" column (presumably a saved index), which a later cell drops.
df = pd.read_csv("Resources/test_yelp_zillow.csv")

In [3]:
df.shape

(30454, 17)

In [4]:
df.dtypes

Unnamed: 0        int64
business_id      object
city             object
state            object
postal_code       int64
stars           float64
review_count      int64
Diversity         int64
State            object
City             object
CountyName       object
2016-01-31      float64
2017              int64
2018              int64
2019              int64
2020              int64
2021              int64
dtype: object

In [5]:
# Cast Diversity from int64 to str so the category codes can be compared
# against string labels ("1".."5") further down.
df = df.astype({"Diversity": str})

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,business_id,city,state,postal_code,stars,review_count,Diversity,State,City,CountyName,2016-01-31,2017,2018,2019,2020,2021
0,0,mpf3x-BjTdTEA3yCZrAYPw,Affton,MO,63123,3.0,15,3,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
1,1,lB6jIOb1UBATmdfot4KJew,Saint Louis,MO,63123,4.0,126,2,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
2,2,fBO0Cb-tbx5fvaWsp4sKtw,Saint Louis,MO,63123,3.5,83,1,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
3,3,cXEx8dhmoFZcsJjXS2lLuQ,Saint Louis,MO,63123,3.5,51,4,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744
4,4,-D_L4GHXJQvy2wv0RkrupA,St. Louis,MO,63123,2.5,21,2,MO,Affton,Saint Louis County,139727.0,152645,158655,165220,173803,198744


In [7]:
# Drop the columns this analysis does not use: every house-price column except
# 2017, plus the lower-case city/state and business_id columns (the capitalised
# State/City columns are kept).
# Pass the labels directly via `columns=`; the original built a DataFrame slice
# only so that drop() could iterate over its column names.
col_drop = ["2016-01-31", "2018", "2019", "2020", "2021", "city", "state", "business_id"]
df = df.drop(columns=col_drop)
df.head()

Unnamed: 0.1,Unnamed: 0,postal_code,stars,review_count,Diversity,State,City,CountyName,2017
0,0,63123,3.0,15,3,MO,Affton,Saint Louis County,152645
1,1,63123,4.0,126,2,MO,Affton,Saint Louis County,152645
2,2,63123,3.5,83,1,MO,Affton,Saint Louis County,152645
3,3,63123,3.5,51,4,MO,Affton,Saint Louis County,152645
4,4,63123,2.5,21,2,MO,Affton,Saint Louis County,152645


In [8]:
# Drop the leftover "Unnamed: 0" column.
# Drop by label rather than by position: `df.columns[0]` removes whichever
# column happens to be first, so re-running the cell would silently drop
# postal_code. errors="ignore" makes a re-run a no-op.
col = "Unnamed: 0"
df = df.drop(columns=col, errors="ignore")

In [9]:
df.head()

Unnamed: 0,postal_code,stars,review_count,Diversity,State,City,CountyName,2017
0,63123,3.0,15,3,MO,Affton,Saint Louis County,152645
1,63123,4.0,126,2,MO,Affton,Saint Louis County,152645
2,63123,3.5,83,1,MO,Affton,Saint Louis County,152645
3,63123,3.5,51,4,MO,Affton,Saint Louis County,152645
4,63123,2.5,21,2,MO,Affton,Saint Louis County,152645


In [10]:
df.shape

(30454, 8)

In [11]:
#postal_list = df["postal_code"].unique()

In [12]:
#for z_code in postal_list[0:1]:
    #df_zc = df.loc[df['postal_code']==z_code, 'Diversity']
    #category = df_zc.value_counts().to_frame()
    #print(z_code, category)
   
    


# Create a dataframe that has 

* Number of restaurants per postal_code, counted separately for each category (Diversity)
* Average stars per category (Diversity) for each zip code
* Total review_count per category (Diversity) for each zip code

In [13]:
# Split df into one DataFrame per Diversity code (stored as strings "1".."5").
Mexican = df.loc[df["Diversity"].eq("1")]
Chinese = df.loc[df["Diversity"].eq("2")]
Italian = df.loc[df["Diversity"].eq("3")]
Cafes = df.loc[df["Diversity"].eq("4")]
Fastfood = df.loc[df["Diversity"].eq("5")]

# Report the (rows, columns) shape of each subset; the labels carry their own padding.
for label, subset in (
    (" Mexican: ", Mexican),
    (" Chinese: ", Chinese),
    (" Italian: ", Italian),
    (" Cafes:   ", Cafes),
    (" Fastfood:", Fastfood),
):
    print(f"{label}{subset.shape}")

 Mexican: (6061, 8)
 Chinese: (5955, 8)
 Italian: (6183, 8)
 Cafes:   (6176, 8)
 Fastfood:(6079, 8)


### Create Category Dataframe

In [15]:
# Number of restaurants per postal_code for each category subset.
# Each result is a Series named "Diversity", indexed by postal_code.
M1, C2, I1, Cafe1, Fast1 = (
    subset.groupby("postal_code")["Diversity"].count()
    for subset in (Mexican, Chinese, Italian, Cafes, Fastfood)
)
print(M1)

postal_code
18054     1
18073     1
18901    20
18902     2
18912     3
         ..
93105    18
93108    11
93109     6
93111     1
93117    25
Name: Diversity, Length: 588, dtype: int64


In [16]:
# Combine the per-category counts into one frame, one column per category.
# The Series are aligned on postal_code; postal codes missing from a
# category show up as NaN in that column.
category_df = pd.DataFrame(
    dict(Mexican=M1, Chinese=C2, Italian=I1, Cafe=Cafe1, Fastfood=Fast1)
)
category_df.head()

Unnamed: 0_level_0,Mexican,Chinese,Italian,Cafe,Fastfood
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18054,1.0,1.0,1.0,2.0,1.0
18073,1.0,3.0,1.0,3.0,3.0
18074,,,3.0,1.0,
18076,,,,1.0,
18084,,,1.0,,1.0


In [17]:
# A missing count means the postal code has no restaurant in that category: use 0.
category_df = category_df.fillna(value=0)
category_df.head()

Unnamed: 0_level_0,Mexican,Chinese,Italian,Cafe,Fastfood
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18054,1.0,1.0,1.0,2.0,1.0
18073,1.0,3.0,1.0,3.0,3.0
18074,0.0,0.0,3.0,1.0,0.0
18076,0.0,0.0,0.0,1.0,0.0
18084,0.0,0.0,1.0,0.0,1.0


In [18]:
# Spot check: show the source rows for one postal code to compare with category_df.
df[df["postal_code"].eq(18076)]

Unnamed: 0,postal_code,stars,review_count,Diversity,State,City,CountyName,2017
30430,18076,4.5,20,4,PA,Red Hill,Montgomery County,197720


### Create Stars dataframe

In [19]:
# Mean "stars" per postal_code for each category subset, rounded to 1 decimal.
# Select "stars" before aggregating: calling .mean() on the whole group also
# tries to average the string columns (Diversity, State, City, CountyName),
# which raises TypeError on pandas >= 2.0 and is wasted work on older versions.
Mstar = Mexican.groupby('postal_code')['stars'].mean().round(1)
Cstar = Chinese.groupby('postal_code')['stars'].mean().round(1)
Istar = Italian.groupby('postal_code')['stars'].mean().round(1)
Castar = Cafes.groupby('postal_code')['stars'].mean().round(1)
Fastar = Fastfood.groupby('postal_code')['stars'].mean().round(1)


In [20]:
# Combine the per-category mean star ratings into one frame, aligned on postal_code.
star_df = pd.DataFrame(
    dict(
        Mexican_stars=Mstar,
        Chinese_stars=Cstar,
        Italian_stars=Istar,
        Cafe_stars=Castar,
        Fastfood_stars=Fastar,
    )
)
star_df.head()

Unnamed: 0_level_0,Mexican_stars,Chinese_stars,Italian_stars,Cafe_stars,Fastfood_stars
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18054,4.5,3.5,3.5,4.5,5.0
18073,2.0,3.3,3.5,3.3,2.7
18074,,,4.0,4.0,
18076,,,,4.5,
18084,,,4.0,,4.0


In [21]:
# Replace missing averages with 0.
# NOTE(review): a 0 here means "no restaurants in this category", not a
# zero-star rating; confirm downstream consumers treat it that way.
star_df = star_df.fillna(value=0)
star_df.head()

Unnamed: 0_level_0,Mexican_stars,Chinese_stars,Italian_stars,Cafe_stars,Fastfood_stars
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18054,4.5,3.5,3.5,4.5,5.0
18073,2.0,3.3,3.5,3.3,2.7
18074,0.0,0.0,4.0,4.0,0.0
18076,0.0,0.0,0.0,4.5,0.0
18084,0.0,0.0,4.0,0.0,4.0


### Create Reviews dataframe

In [22]:
# Total review_count per postal_code for each category subset.
# Select "review_count" before aggregating so that .sum() is not also applied
# to every other column (including the string columns) only to be discarded.
Mreview = Mexican.groupby('postal_code')['review_count'].sum()
print(Mreview)
Creview = Chinese.groupby('postal_code')['review_count'].sum()
Ireview = Italian.groupby('postal_code')['review_count'].sum()
Careview = Cafes.groupby('postal_code')['review_count'].sum()
Freview = Fastfood.groupby('postal_code')['review_count'].sum()

postal_code
18054      13
18073      17
18901    1375
18902     112
18912     165
         ... 
93105    4596
93108    1607
93109    1425
93111     365
93117    2967
Name: review_count, Length: 588, dtype: int64


In [23]:
# Combine the per-category review totals into one frame, aligned on postal_code.
# NOTE(review): "Italian__review_count" has a double underscore, unlike its
# siblings; it is kept as-is because the name ends up in the exported CSV.
review_df = pd.DataFrame(
    dict(
        Mexican_review_count=Mreview,
        Chinese_review_count=Creview,
        Italian__review_count=Ireview,
        Cafe_review_count=Careview,
        Fastfood_review_count=Freview,
    )
)
review_df.head()

Unnamed: 0_level_0,Mexican_review_count,Chinese_review_count,Italian__review_count,Cafe_review_count,Fastfood_review_count
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18054,13.0,25.0,7.0,14.0,9.0
18073,17.0,74.0,5.0,150.0,109.0
18074,,,71.0,16.0,
18076,,,,20.0,
18084,,,16.0,,20.0


In [24]:
# No restaurants in a category means no reviews: fill the gaps with 0.
review_df = review_df.fillna(value=0)
review_df.head()

Unnamed: 0_level_0,Mexican_review_count,Chinese_review_count,Italian__review_count,Cafe_review_count,Fastfood_review_count
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
18054,13.0,25.0,7.0,14.0,9.0
18073,17.0,74.0,5.0,150.0,109.0
18074,0.0,0.0,71.0,16.0,0.0
18076,0.0,0.0,0.0,20.0,0.0
18084,0.0,0.0,16.0,0.0,20.0


### Merge the dataframes

In [25]:

# The three per-postal_code feature frames (counts, mean stars, review totals) to be merged.
frames = [category_df, star_df, review_df]


In [26]:
# Merge the frames left to right on postal_code (pd.merge's default inner join).
df_final = frames[0]
for right_frame in frames[1:]:
    df_final = pd.merge(df_final, right_frame, on='postal_code')

In [27]:
df_final.head()

Unnamed: 0_level_0,Mexican,Chinese,Italian,Cafe,Fastfood,Mexican_stars,Chinese_stars,Italian_stars,Cafe_stars,Fastfood_stars,Mexican_review_count,Chinese_review_count,Italian__review_count,Cafe_review_count,Fastfood_review_count
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
18054,1.0,1.0,1.0,2.0,1.0,4.5,3.5,3.5,4.5,5.0,13.0,25.0,7.0,14.0,9.0
18073,1.0,3.0,1.0,3.0,3.0,2.0,3.3,3.5,3.3,2.7,17.0,74.0,5.0,150.0,109.0
18074,0.0,0.0,3.0,1.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,71.0,16.0,0.0
18076,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,20.0,0.0
18084,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,16.0,0.0,20.0


In [28]:
df_final.shape

(650, 15)

### Create a dataframe to define neighborhoods

In [29]:
# Keep only the columns needed to relate house prices to counties:
# postal code, county name and the 2017 price column.
neighborhood_df = df.loc[:, ["postal_code", "CountyName", "2017"]]

In [30]:
neighborhood_df.head(20)

Unnamed: 0,postal_code,CountyName,2017
0,63123,Saint Louis County,152645
1,63123,Saint Louis County,152645
2,63123,Saint Louis County,152645
3,63123,Saint Louis County,152645
4,63123,Saint Louis County,152645
5,63123,Saint Louis County,152645
6,63123,Saint Louis County,152645
7,63123,Saint Louis County,152645
8,63123,Saint Louis County,152645
9,63123,Saint Louis County,152645


In [31]:
# Count the rows whose postal_code already appeared in an earlier row.
neighborhood_df.duplicated(subset="postal_code", keep="first").sum()

29804

In [32]:
neighborhood_df["postal_code"].value_counts()

70130    361
19107    323
37203    305
19147    260
19104    247
        ... 
34604      1
37026      1
28801      1
19520      1
85653      1
Name: postal_code, Length: 650, dtype: int64

In [33]:
# Keep the first occurrence of each distinct (postal_code, CountyName, 2017) row.
neighborhood_df = neighborhood_df.loc[~neighborhood_df.duplicated()]

In [34]:
neighborhood_df.shape

(650, 3)

In [35]:
neighborhood_df.head(20)

Unnamed: 0,postal_code,CountyName,2017
0,63123,Saint Louis County,152645
62,19107,Philadelphia County,311628
385,37015,Cheatham County,178560
405,37207,Davidson County,202363
457,33602,Hillsborough County,311364
622,46227,Marion County,132797
745,19106,Philadelphia County,385030
884,19147,Philadelphia County,380347
1144,46250,Marion County,205275
1264,89502,Washoe County,239851


In [36]:
# Mean 2017 price across the postal codes of each county, rounded to 0 decimals.
# NOTE(review): grouping by CountyName alone pools same-named counties from
# different states, if the data contains any; confirm, and consider adding
# State to the key (it is still available in df).
per_county_mean_price = neighborhood_df.groupby('CountyName')['2017'].mean().round(0)

In [37]:
per_county_mean_price

CountyName
Ada County               282338.0
Berks County             194420.0
Boone County             311783.0
Bucks County             366606.0
Buncombe County          335640.0
Cannon County            140111.0
Cheatham County          221621.0
Chester County           375250.0
Cochise County           318185.0
Davidson County          320490.0
Dekalb County            120969.0
Delaware County          258208.0
Duval County             185298.0
Fayette County           297127.0
Hamilton County          307620.0
Hancock County           209791.0
Hendricks County         192964.0
Hernando County          150363.0
Hillsborough County      236447.0
Jefferson County         173781.0
Jefferson Parish         205778.0
Johnson County           176369.0
Madison County           122994.0
Manatee County           219548.0
Marion County            145908.0
Monroe County            194700.0
Montgomery County        342479.0
Morgan County            176770.0
Nevada County            655230.0
New

In [38]:
# Turn the per-county means into a one-column frame indexed by CountyName.
county_price_df = per_county_mean_price.to_frame(name="Average County House Price")

In [39]:
county_price_df.head()

Unnamed: 0_level_0,Average County House Price
CountyName,Unnamed: 1_level_1
Ada County,282338.0
Berks County,194420.0
Boone County,311783.0
Bucks County,366606.0
Buncombe County,335640.0


In [40]:
# Attach each county's average price to every postal code in that county
# (inner join on CountyName).
frames2 = [neighborhood_df, county_price_df]
neighborhood_df2 = frames2[0].merge(frames2[1], on='CountyName')

In [41]:
neighborhood_df2.head(10)

Unnamed: 0,postal_code,CountyName,2017,Average County House Price
0,63123,Saint Louis County,152645,216979.0
1,63122,Saint Louis County,330901,216979.0
2,63136,Saint Louis County,34029,216979.0
3,63033,Saint Louis County,110009,216979.0
4,63017,Saint Louis County,370736,216979.0
5,63141,Saint Louis County,432247,216979.0
6,63011,Saint Louis County,289950,216979.0
7,63131,Saint Louis County,588293,216979.0
8,63021,Saint Louis County,251257,216979.0
9,63026,Saint Louis County,225149,216979.0


In [42]:
# Neighborhood Indicator = the postal code's 2017 price divided by its county's
# average price; values above 1 mean the postal code is above the county average.
neighborhood_df2["Neighborhood Indicator"] = neighborhood_df2["2017"].div(
    neighborhood_df2["Average County House Price"]
)

In [43]:
neighborhood_df2.head(20)

Unnamed: 0,postal_code,CountyName,2017,Average County House Price,Neighborhood Indicator
0,63123,Saint Louis County,152645,216979.0,0.703501
1,63122,Saint Louis County,330901,216979.0,1.525037
2,63136,Saint Louis County,34029,216979.0,0.156831
3,63033,Saint Louis County,110009,216979.0,0.507003
4,63017,Saint Louis County,370736,216979.0,1.708626
5,63141,Saint Louis County,432247,216979.0,1.992114
6,63011,Saint Louis County,289950,216979.0,1.336304
7,63131,Saint Louis County,588293,216979.0,2.71129
8,63021,Saint Louis County,251257,216979.0,1.157978
9,63026,Saint Louis County,225149,216979.0,1.037653


### Create neighborhood tiers
* Tier1: indicator > 1.18
* Tier2: 0.71 < indicator <= 1.18
* Tier3: indicator <= 0.71

In [44]:
# check ranges for determining bins

#check the lower quantile indicator
neighborhood_df2["Neighborhood Indicator"].quantile(q=0.25)


0.7117787417366801

In [45]:
#check the upper quantile indicator
neighborhood_df2["Neighborhood Indicator"].quantile(q=0.75)

1.1834832916312188

In [46]:
#check the max indicator
neighborhood_df2["Neighborhood Indicator"].max()

3.8277344812170764

In [47]:
# Bin edges and labels for turning the neighborhood indicator into tiers.
# The inner edges 0.71 and 1.18 are the rounded 25th and 75th percentiles of
# the indicator. The top bin is open-ended: a fixed upper edge (previously 4)
# would turn any indicator above it into NaN if the data changes.
# pd.cut's bins are right-closed: (0, 0.71] -> Tier3, (0.71, 1.18] -> Tier2,
# (1.18, inf] -> Tier1.
spending_bins = [0, 0.71, 1.18, float("inf")]
group_names = ["Tier3", "Tier2", "Tier1"]


In [48]:
# Label every postal code with the tier whose bin contains its indicator.
neighborhood_df2["Neighborhood Tiers"] = pd.cut(
    neighborhood_df2["Neighborhood Indicator"],
    bins=spending_bins,
    labels=group_names,
)
neighborhood_df2.head(20)

Unnamed: 0,postal_code,CountyName,2017,Average County House Price,Neighborhood Indicator,Neighborhood Tiers
0,63123,Saint Louis County,152645,216979.0,0.703501,Tier3
1,63122,Saint Louis County,330901,216979.0,1.525037,Tier1
2,63136,Saint Louis County,34029,216979.0,0.156831,Tier3
3,63033,Saint Louis County,110009,216979.0,0.507003,Tier3
4,63017,Saint Louis County,370736,216979.0,1.708626,Tier1
5,63141,Saint Louis County,432247,216979.0,1.992114,Tier1
6,63011,Saint Louis County,289950,216979.0,1.336304,Tier1
7,63131,Saint Louis County,588293,216979.0,2.71129,Tier1
8,63021,Saint Louis County,251257,216979.0,1.157978,Tier2
9,63026,Saint Louis County,225149,216979.0,1.037653,Tier2


In [49]:
# Work on an independent (deep) copy so neighborhood_df2 stays intact for later use.
neighborhood_copy_df = neighborhood_df2.copy(deep=True)

In [50]:
# Keep only postal_code and "Neighborhood Tiers" for the final merge.
# Name the columns explicitly: the positional slice columns[1:5] depends on the
# current column order, and on a re-run of this cell it would select (and
# drop) "Neighborhood Tiers" instead. errors="ignore" makes a re-run a no-op.
col_drop = pd.Index(['CountyName', '2017', 'Average County House Price',
                     'Neighborhood Indicator'])
neighborhood_copy_df = neighborhood_copy_df.drop(columns=col_drop, errors="ignore")

In [51]:
col_drop

Index(['CountyName', '2017', 'Average County House Price',
       'Neighborhood Indicator'],
      dtype='object')

In [52]:
# Attach each postal code's tier to its restaurant features (inner join on postal_code).
frames3 = [df_final, neighborhood_copy_df]
joined_df = frames3[0].merge(frames3[1], on='postal_code')

In [53]:
joined_df.head()

Unnamed: 0,postal_code,Mexican,Chinese,Italian,Cafe,Fastfood,Mexican_stars,Chinese_stars,Italian_stars,Cafe_stars,Fastfood_stars,Mexican_review_count,Chinese_review_count,Italian__review_count,Cafe_review_count,Fastfood_review_count,Neighborhood Tiers
0,18054,1.0,1.0,1.0,2.0,1.0,4.5,3.5,3.5,4.5,5.0,13.0,25.0,7.0,14.0,9.0,Tier2
1,18073,1.0,3.0,1.0,3.0,3.0,2.0,3.3,3.5,3.3,2.7,17.0,74.0,5.0,150.0,109.0,Tier2
2,18074,0.0,0.0,3.0,1.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,71.0,16.0,0.0,Tier2
3,18076,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,20.0,0.0,Tier3
4,18084,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,16.0,0.0,20.0,Tier3


In [55]:
# Write the merged features and tiers to CSV, leaving out the row index.
file = "Resources/neighborhood_tier.csv"
joined_df.to_csv(path_or_buf=file, index=False)