In [255]:
import pandas as pd
import numpy as np
import json
import os
from copy import deepcopy

# 1. Create geojson file that includes Boston + Cambridge neighborhoods with rent data

In [228]:
# Load rent data
rent_df = pd.read_csv("rent/Rent_averages_BHA.csv")
boston_zip_df = pd.read_csv("geographic/Boston_Neighborhoods_Zipcodes.csv")

# Map zip codes to neighborhoods
boston_zip2hood_d = boston_zip_df.set_index('Zip Code').to_dict()['Neighborhood']

# Add missing Boston neighborhood zip codes
missing_boston_zip_l = [(2215, "Fenway"), (2113, "North End"), (2113, "West End")]
for pair in missing_boston_zip_l:
    boston_zip2hood_d[pair[0]] = pair[1]

# Hand-currated Cambridge data
cambridge_zip_l = [('Harbor Islands', 33154), ('Mid-Cambridge', 2139), ('Riverside', 2139), ('The Port', 2139),
                   ('Cambridgeport', 2139), ('Area 2/MIT', 2139), ('Strawberry Hill', 2138), 
                   ('West Cambridge', 2138), ('North Cambridge', 2140), ('Neighborhood Nine', 2140), 
                   ('Wellington-Harrington', 2141), ('East Cambridge', 2141), ('Baldwin', 2138), 
                   ('Cambridge Highlands', 2138)]
cambridge_zip2hood_d = {}
for pair in cambridge_zip_l:
    if pair[1] not in cambridge_zip2hood_d:
        cambridge_zip2hood_d[pair[1]] = [pair[0]]
    else:
        cambridge_zip2hood_d[pair[1]].append(pair[0])

In [229]:
# Some Zip codes are associated with multiple cities, but the data is the same. Drop duplicated zip codes
# Combine city names
rent_df = rent_df.groupby('Zip').agg({**{'City': lambda x: '/'.join(x)}, 
                                      **{col: lambda x: x.iloc[0] for col in rent_df.columns[2:]}}
                                    ).reset_index()
# Canton has Zip code labeled as 0. Fix to real zip code 02021
rent_df['Zip'] = rent_df.apply(lambda x: x['Zip'] if x['City']!='Canton' else 2021, 1)

# Parse neighborhood data. Some rows have multiple neighborhoods (e.g. Dorchester / Roxbury). Explode into 2 rows
rent_df['Neighborhood'] = rent_df['City'].map(lambda x: x.split(' - ')[1].split(' / ') 
                                              if 'Boston -' in x else np.nan)
rent_df = rent_df.explode('Neighborhood')
rent_df['Neighborhood'] = rent_df['Neighborhood'].map(lambda x: x.strip() if type(x) == str else x)
# Use mapping file as well. Sometimes multiple neighborhoods in the same zip code
rent_df['Neighborhood'] = rent_df.apply(
    lambda x: x['Neighborhood'] if x['Zip'] not in boston_zip2hood_d else
        np.unique([x['Neighborhood'], boston_zip2hood_d[x['Zip']]]), 1)
rent_df = rent_df.explode('Neighborhood')

# Add Cambridge neighborhood data
rent_df['Neighborhood'] = rent_df.apply(lambda x: x['Neighborhood'] if x['Zip'] not in cambridge_zip2hood_d 
                                        else cambridge_zip2hood_d[x['Zip']], 1)
# Explode and drop nan
rent_df = rent_df.explode('Neighborhood').dropna(subset=['Neighborhood'])

# Average for each neighborhood
rent_df

Unnamed: 0,Zip,City,SRO,0 BR,1BR,2BR,3BR,4BR,5BR,6BR,7 BR,8 BR,avg_per_bed,Neighborhood
108,2108,Boston - Beacon Hill,2449,3266,3518,4179,5051,5565,6400,7235,8070,8904,1855.56746,Beacon Hill
109,2109,Boston,2615,3486,3749,4452,5387,5933,6822,7713,8602,9492,1978.352646,Downtown
109,2109,Boston,2615,3486,3749,4452,5387,5933,6822,7713,8602,9492,1978.352646,
110,2110,Boston - Financial Disctrict,2615,3486,3749,4452,5387,5933,6822,7713,8602,9492,1978.352646,Downtown
110,2110,Boston - Financial Disctrict,2615,3486,3749,4452,5387,5933,6822,7713,8602,9492,1978.352646,Financial Disctrict
111,2111,Boston - Chinatown,2272,3030,3250,3870,4680,5150,5922,6695,7467,8240,1717.716402,Chinatown
112,2113,Boston - North End,2127,2836,3046,3623,4380,4821,5543,6267,6989,7713,1608.267063,North End
112,2113,Boston - North End,2127,2836,3046,3623,4380,4821,5543,6267,6989,7713,1608.267063,West End
113,2114,Boston - West End,2490,3320,3570,4240,5130,5650,6497,7345,8192,9040,1884.039153,Beacon Hill
113,2114,Boston - West End,2490,3320,3570,4240,5130,5650,6497,7345,8192,9040,1884.039153,West End


In [227]:
rent_df

Unnamed: 0,Zip,City,SRO,0 BR,1BR,2BR,3BR,4BR,5BR,6BR,7 BR,8 BR,avg_per_bed,Neighborhood
0,2021,Canton,1875,2499,2688,3192,3864,4253,4890,5529,6166,6804,1418.345238,
1,1432,Ayer,1577,2102,2259,2686,3248,3578,4114,4650,5187,5724,1192.829630,
2,1451,Harvard,1441,1922,2079,2499,3035,3350,3852,4355,4857,5360,1106.639683,
3,1460,Littleton,1743,2324,2497,2969,3590,3955,4547,5140,5734,6327,1318.555688,
4,1701,Framingham,1577,2102,2259,2686,3248,3578,4114,4650,5187,5724,1192.829630,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,2771,Seekonk,1236,1649,1785,2163,2615,3245,3731,4218,4704,5192,996.512963,
243,2777,Swansea,1019,1360,1474,1785,2159,2674,3075,3476,3877,4278,821.956349,
244,2779,Berkley/Freetown,1063,1418,1607,2037,2531,2825,3248,3672,4095,4520,889.446296,
245,2780,Taunton,887,1184,1329,1746,2205,2360,2713,3067,3420,3775,747.245899,


geojson file format:

* Dictionary with `type`, `name`, `crs`, and `features` keys. `type`, `name`, and `crs` is not so relevant. Cambridge neighborhood geojson only contains `type` and `features`, so we will drop the others from the combined geojson file. `features` stores relevant information

* `features` is a dictionary
    * `geometry`: Dictionary with `type` (e.g. Polygon') and `coordinates` (2D list of coordinates) keys
    * `type`: e.g. `Feature`
    * `id`: e.g. 1, 2, 3, etc.
    * `properties`: Stores neighborhood-level data (e.g. name, census data, etc.) 
    
We will add rent averages and other relevant information to `properties`

We can combine the Boston and Cambridge files by just combining the features lists

In [220]:
# Load geojson files
with open("geographic/Cambridge_Neighborhoods.geojson", "r") as oFile:
    cambridge_json = json.load(oFile)
with open("geographic/Boston_Neighborhoods.geojson", "r") as oFile:
    boston_json = json.load(oFile)
# Drop keys that are not shared between the two files
boston_json.pop('crs')
boston_json.pop('name')

'Boston_Neighborhood_Boundaries_approximated_by_2020_Census_Block_Groups'

In [221]:
neighborhood_set = []
# Iterate through Boston geojson
for feature_d in boston_json['features']:
    neighborhood_set.append(feature_d['properties']['blockgr2020_ctr_neighb_name'])
# Iterate through Cambridge geojson
for feature_d in cambridge_json['features']:
    neighborhood_set.append(feature_d['properties']['NAME'])

In [222]:
[x for x in neighborhood_set if x not in set(rent_df['Neighborhood'].unique())]

['Harbor Islands']

In [224]:
# Drop Harbor Islands (nobody lives there)
boston_json['features'] = [d for d in boston_json['features'] if 
                           d['properties']['blockgr2020_ctr_neighb_name']!='Harbor Islands']

In [309]:
# Collapse rent to neighborhood level
hood2rent_df = rent_df.groupby(['Neighborhood']).agg({**{'Zip': lambda x: '/'.join(x.astype(str).unique()), 
                                                         'City': lambda x: '/'.join(x.unique())}, 
                                                      **{col: lambda x: x.iloc[0] for col in rent_df.columns[2:-1]}}
                                                    )
hood2rent_df.iloc[:, 2:-1] = hood2rent_df.iloc[:, 2:-1].astype(float)

In [310]:
# Neighborhoods to include
include_hood_set = set(hood2rent_df.index)
# Standardize keys for neighborhood in both json dictionaries
std_boston_features_l = []
std_cambridge_features_l = []
for feature_d in boston_json['features']:
    std_feature_d = deepcopy(feature_d)
    std_feature_d.pop('properties')
    properties_d = {}
    neighborhood = feature_d['properties']['blockgr2020_ctr_neighb_name']
    if neighborhood not in include_hood_set:
        continue
    properties_d['neighborhood'] = neighborhood
    rent_s = hood2rent_df.loc[neighborhood]
    for col in hood2rent_df:
        properties_d[col.strip().replace(' ', '')] = rent_s.loc[col]
    std_feature_d['properties'] = properties_d
    std_boston_features_l.append(std_feature_d)
    
for feature_d in cambridge_json['features']:
    std_feature_d = deepcopy(feature_d)
    std_feature_d.pop('properties')
    properties_d = {}
    neighborhood = feature_d['properties']['NAME']
    if neighborhood not in include_hood_set:
        continue
    properties_d['neighborhood'] = neighborhood
    rent_s = hood2rent_df.loc[neighborhood]
    for col in hood2rent_df:
        properties_d[col.strip().replace(' ', '')] = rent_s.loc[col]
    std_feature_d['properties'] = properties_d
    std_cambridge_features_l.append(std_feature_d)

In [311]:
# Combine json files
combined_features_l = std_boston_features_l + std_cambridge_features_l
combined_json = {}
combined_json['type'] = 'FeatureCollection'
combined_json['features'] = combined_features_l

In [312]:
# Write geojson file
with open("geographic/Boston_Cambridge_rent.geojson", 'w') as oFile:
    json.dump(combined_json, oFile)