In [1]:
import pandas as pd
import numpy as np

## General Notes:

- Finding the right scale at which to create the dataframe (either the city, neighborhood, zip code, or other) was an initial challenge. Notes in the headers that follow.

## CoB: Building Inventory

### Building Inv 2020 DF

[link](https://data.boston.gov/dataset/boston-buildings-inventory)

In [26]:
# SET THE MEMORY TYPE TO FALSE TO PREVENT "DTYPES" ERROR ON IMPORT
bos_buildings = pd.read_csv("../data/building_inventory_021020.csv", low_memory=False)
bos_buildings.dtypes

id                                  int64
pid_long                           object
cm_id                             float64
building_typology                  object
building_subtypology               object
                                   ...   
interior_wall_insulation_board     object
insulate_spandrel                  object
asbestos                           object
seal_elevator_vent_shafts          object
env_recommendation_count            int64
Length: 107, dtype: object

In [40]:
# =============================
# DATA DICTIONARY FOR EACH COL IN THE DF BELOW
bos_build_data_dict = pd.read_csv("../data/bos-buildings-data-dict.csv")
# bos_build_data_dict

In [41]:
bos_buildings.head(5)

Unnamed: 0,id,pid_long,cm_id,building_typology,building_subtypology,use_class,sqft_class,year_built_class,st_num,st_name,...,insulate_exposed_ducts,insulate_exposed_pipes,interior_wall_insulation_blow_in,exterior_wall_insulation_at_replacement,exterior_wall_insulation,interior_wall_insulation_board,insulate_spandrel,asbestos,seal_elevator_vent_shafts,env_recommendation_count
0,88905,,,,,,,,,,...,f,f,f,f,f,f,f,f,f,0
1,51137,1600083000.0,,Single-Family,"Single-Family, 1915-1949",Residential,"< 25,000 sqft",1915-1949,5.0,EVERETT,...,f,t,f,f,t,t,f,f,f,4
2,18045,202560000.0,,Single-Family,"Single-Family, Pre-1915",Residential,"< 25,000 sqft",Pre-1915,43.0,ESSEX,...,f,t,f,f,f,t,f,f,f,3
3,94001,2200577000.0,,Convention/Assembly,"Convention/Assembly, 1950-1979",Public & Quasi-Public,"< 25,000 sqft",1950-1979,525.0,WESTERN,...,f,f,f,f,f,f,f,f,f,0
4,92940,1600058000.0,,,,,,,,,...,f,f,f,f,f,f,f,f,f,0


In [28]:
bos_buildings.shape

(98930, 107)

In [29]:
list(bos_buildings.columns)

['id',
 'pid_long',
 'cm_id',
 'building_typology',
 'building_subtypology',
 'use_class',
 'sqft_class',
 'year_built_class',
 'st_num',
 'st_name',
 'st_name_suf',
 'unit_num',
 'zipcode',
 'yr_built',
 'land_sf',
 'gross_area',
 'living_area',
 'sqft',
 'num_floors',
 'num_bldgs',
 'units_res',
 'units_com',
 'units_mixed',
 'ptype',
 'ptype_list',
 'lu',
 'lu_list',
 'assessor_category',
 'assessor_description',
 'own_occ',
 'own_occ_list',
 'owner_list',
 'structure_class',
 'structure_class_list',
 'bldg_styl',
 'bldg_styl_list',
 'r_roof_typ',
 'ext_fin',
 'ext_fin_list',
 'ext_cond_list',
 'insulation_type',
 'foundation_type',
 'heat_typ',
 'heat_typ_list',
 'heating_system_fuel',
 'ac',
 'ac_list',
 'ac_system_type',
 'hot_water_system_fuel',
 'electric_panel_info',
 'has_pv',
 'existing_ashp',
 'existing_gshp',
 'existing_shw',
 'historic_district',
 'landmark',
 'flood',
 'stormwater',
 'last_major_renovation_date',
 'total_site_energy_kbtu',
 'perc_electricity',
 'perc_gas

In [32]:
sample_bos_buildings = bos_buildings[["building_typology", "use_class", "st_num", "st_name", "zipcode"]]

In [36]:
sample_bos_buildings.loc[sample_bos_buildings["st_name"] == "BOULEVARD"]

Unnamed: 0,building_typology,use_class,st_num,st_name,zipcode
5365,Multi-Family (2 units),Residential,29 31,BOULEVARD,2135.0
5612,Multi-Family (2 units),Residential,2 4,BOULEVARD,2134.0
5745,Multi-Family (2 units),Residential,30 32,BOULEVARD,2134.0
5858,Multi-Family (3 units),Residential,39 41,BOULEVARD,2134.0
6278,Multi-Family (3 units),Residential,43 45,BOULEVARD,2135.0
8825,Multi-Family (2 units),Residential,19 21,BOULEVARD,2135.0
8995,Multi-Family (2 units),Residential,10 12,BOULEVARD,2134.0
9096,Multi-Family (2 units),Residential,26 28,BOULEVARD,2134.0
9372,Multi-Family (2 units),Residential,33 35,BOULEVARD,2135.0
9373,Multi-Family (2 units),Residential,18 20,BOULEVARD,2134.0


### Bos Property Assessment 2020

[link](https://data.boston.gov/dataset/property-assessment/resource/fd351943-c2c6-4630-992d-3f895360febd)

In [50]:
bos_prop_assess = pd.read_csv("../data/data2020-full.txt", low_memory=False)
bos_prop_assess.head()

Unnamed: 0,PID,CM_ID,GIS_ID,ST_NUM,ST_NAME,ST_NAME_SUF,UNIT_NUM,ZIPCODE,PTYPE,LU,...,U_BTH_STYLE2,U_BTH_STYLE3,U_KITCH_TYPE,U_KITCH_STYLE,U_HEAT_TYP,U_AC,U_FPLACE,U_INT_FIN,U_INT_CND,U_VIEW
0,100271002,100271000.0,100271000,356.0,PRINCETON,ST,1.0,2128.0,102,CD,...,,,F - Full Eat In,M - Modern,W - Ht Water/Steam,N - None,0.0,N - Normal,G - Good,A - Average
1,100271004,100271000.0,100271000,356.0,PRINCETON,ST,2.0,2128.0,102,CD,...,,,F - Full Eat In,M - Modern,W - Ht Water/Steam,N - None,0.0,N - Normal,G - Good,A - Average
2,100271006,100271000.0,100271000,356.0,PRINCETON,ST,3.0,2128.0,102,CD,...,,,F - Full Eat In,M - Modern,W - Ht Water/Steam,N - None,0.0,N - Normal,G - Good,A - Average
3,100272000,,100272000,352.0,PRINCETON,ST,,2128.0,105,R3,...,,,,,,,,,,
4,100273000,,100273000,,PRINCETON,ST,,2128.0,132,RL,...,,,,,,,,,,


In [51]:
bos_prop_assess.shape

(175052, 75)

## Zoning Subdistricts

[link](https://bostonopendata-boston.opendata.arcgis.com/datasets/zoning-subdistricts)

In [2]:
zone_subdist = pd.read_csv("../data/Zoning_Subdistricts.csv")

**Notes**

- Ok, this is by sub-district
    - **Could DISTRICT be used as a different approach, than zipcode?**
- Contains some NaN's
- Has 37 unique districts, which is == to the zum of zipcodes
    - But, there are multiple rows containing the same district.
    - So, the **ZONE_** is a **Nominal** ID of a zone type within a (or any) district
- **Unique_Code**?
    - Has a count of 540.....
- SUBDISTRICT
    - Exclude industrial?

### General Info:

In [7]:
zone_subdist.head(30)

Unnamed: 0,OBJECTID,ZONE_,DISTRICT,MAPNO,ARTICLE,SUBDISTRIC,Unique_Code,FAR,Shape_STArea__,Shape_STLength__,Zone_Desc
0,62441,CC,Mission Hill Neighborhood,6D,59,Business,Mission Hill Neighborhood CC,3.0,0,0,Community Commercial
1,62442,WM,South Boston Neighborhood,4F,68,Industrial,South Boston Neighborhood WM,2.0,0,0,Waterfront Manufacturing
2,62443,M-4,South Boston,4,Underlying Zoning,Industrial,South Boston M-4,4.0,0,0,Restricted Manufacturing
3,62444,D St. NDA,South Boston Neighborhood,4F,68,Mixed Use,South Boston Neighborhood D St. NDA,2.0,0,0,Neighborhood Development Area
4,62445,SUMMER ST. LI,South Boston Neighborhood,4F,68,Industrial,South Boston Neighborhood Summer St. LI,3.0,0,0,Local Industrial
5,62446,E St. LI,South Boston Neighborhood,4F,68,Industrial,South Boston Neighborhood E St. LI,2.0,0,0,Local Industrial
6,62447,WC,South Boston Neighborhood,4F,68,Business,South Boston Neighborhood WC,2.0,0,0,Waterfront Commercial
7,62448,I-2,South Boston,4,Underlying Zoning,Industrial,South Boston I-2,2.0,0,0,General Industrial
8,62449,First St. LI,South Boston Neighborhood,4F,68,Industrial,South Boston Neighborhood First St. LI,2.0,0,0,Local Industrial
9,62450,EDA CENTRAL,South End Neighborhood,1P,64,Miscellaneous,South End Neighborhood EDA CENTRAL,4.0,0,0,Economic Development Area


In [46]:
zone_subdist.tail()

Unnamed: 0,OBJECTID,ZONE_,DISTRICT,MAPNO,ARTICLE,SUBDISTRIC,Unique_Code,FAR,Shape_STArea__,Shape_STLength__,Zone_Desc
1644,64085,NS,Jamaica Plain Neighborhood,9A-9C,55,Business,Jamaica Plain Neighborhood NS,2.0,0,0,Neighborhood Shopping
1645,64086,NS,Jamaica Plain Neighborhood,9A-9C,55,Business,Jamaica Plain Neighborhood NS,2.0,0,0,Neighborhood Shopping
1646,64087,OS-CM,Jamaica Plain Neighborhood,9A-9C,55,Open Space,Jamaica Plain Neighborhood OS-CM,,0,0,Cemetery Open Space
1647,64088,OS-BZ,Jamaica Plain Neighborhood,9A-9C,55,Open Space,Jamaica Plain Neighborhood OS-BZ,,0,0,Botanical/Zoological Garden Open Space
1648,64089,NS,Dorchester Neighborhood,5E,65,Business,Dorchester Neighborhood NS,1.0,0,0,Neighborhood Shopping


In [17]:
zone_subdist.shape

(1649, 11)

In [19]:
zone_subdist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1649 entries, 0 to 1648
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   OBJECTID          1649 non-null   int64  
 1   ZONE_             1648 non-null   object 
 2   DISTRICT          1649 non-null   object 
 3   MAPNO             1649 non-null   object 
 4   ARTICLE           1649 non-null   object 
 5   SUBDISTRIC        1649 non-null   object 
 6   Unique_Code       1649 non-null   object 
 7   FAR               1195 non-null   float64
 8   Shape_STArea__    1649 non-null   int64  
 9   Shape_STLength__  1649 non-null   int64  
 10  Zone_Desc         1649 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 141.8+ KB


### Columns Specific:

In [6]:
zone_subdist.head()

Unnamed: 0,OBJECTID,ZONE_,DISTRICT,MAPNO,ARTICLE,SUBDISTRIC,Unique_Code,FAR,Shape_STArea__,Shape_STLength__,Zone_Desc
0,62441,CC,Mission Hill Neighborhood,6D,59,Business,Mission Hill Neighborhood CC,3.0,0,0,Community Commercial
1,62442,WM,South Boston Neighborhood,4F,68,Industrial,South Boston Neighborhood WM,2.0,0,0,Waterfront Manufacturing
2,62443,M-4,South Boston,4,Underlying Zoning,Industrial,South Boston M-4,4.0,0,0,Restricted Manufacturing
3,62444,D St. NDA,South Boston Neighborhood,4F,68,Mixed Use,South Boston Neighborhood D St. NDA,2.0,0,0,Neighborhood Development Area
4,62445,SUMMER ST. LI,South Boston Neighborhood,4F,68,Industrial,South Boston Neighborhood Summer St. LI,3.0,0,0,Local Industrial


In [20]:
# HOW MANY ZONES ARE THERE?
number_zones = zone_subdist["ZONE_"].nunique()

# HOW MANY DISTRICTS?
number_dists = zone_subdist["DISTRICT"].nunique()

# HOW MANY SUBDISTRICTS?
number_subdists = zone_subdist["SUBDISTRIC"].nunique()

# HOW MANY UNIQUE CODES?
number_unique_codes = zone_subdist["Unique_Code"].nunique()

# HOW MANY ZONE DESCRIPTIONS?
number_zone_desc = zone_subdist["Zone_Desc"].nunique()


print(f"Number of Zones: {number_zones}")
print(f"Number of Subdistricts: {number_subdists}")
print(f"Number of Unique Codes: {number_unique_codes}")
print(f"Number of Districts: {number_dists}")
print(f"Number of Zone Descriptions: {number_zone_desc}")

Number of Zones: 304
Number of Subdistricts: 7
Number of Unique Codes: 540
Number of Districts: 37
Number of Zone Descriptions: 69


In [3]:
boston_districts = set(zone_subdist["DISTRICT"])
print(len(boston_districts))
boston_districts

37


{'Allston/Brighton Neighborhood',
 'Audbon Circle Neighborhood',
 'Bay Village Neighborhood',
 'Boston Harbor',
 'Boston Proper',
 'Bulfinch Triangle',
 'Cambridge Street North',
 'Central Artery Special',
 'Charlestown Neighborhood',
 'Chinatown',
 'City Square Neighborhood',
 'Dorchester Neighborhood',
 'East Boston Neighborhood',
 'Fenway Neighborhood',
 'Government Center/Markets',
 'Greater Mattapan Neighborhood',
 'Harborpark: Charlestown Waterfront',
 'Harborpark: Dorchester Bay/Neponset River Waterfront',
 'Harborpark: Fort Point Waterfront',
 'Harborpark: North End Waterfront',
 'Huntington Avenue/Prudential Center',
 'Hyde Park Neighborhood',
 'Jamaica Plain Neighborhood',
 'Leather District',
 'Midtown Cultural',
 'Mission Hill Neighborhood',
 'Newmarket Industrial Commercial Neighborhood District',
 'North End Neighborhood',
 'North Station Economic Development Area',
 'Roslindale Neighborhood',
 'Roxbury Neighborhood',
 'South Boston',
 'South Boston Neighborhood',
 'South

In [61]:
zips = list(np.random.randint(0, 1, 37))
zips_dict = dict(zip(boston_districts, zips))
zips_dict

{'Roslindale Neighborhood': 0,
 'Dorchester Neighborhood': 0,
 'Greater Mattapan Neighborhood': 0,
 'Boston Proper': 0,
 'City Square Neighborhood': 0,
 'Charlestown Neighborhood': 0,
 'Midtown Cultural': 0,
 'Newmarket Industrial Commercial Neighborhood District': 0,
 'Bulfinch Triangle': 0,
 'Harborpark: Fort Point Waterfront': 0,
 'Huntington Avenue/Prudential Center': 0,
 'Harborpark: Dorchester Bay/Neponset River Waterfront': 0,
 'Mission Hill Neighborhood': 0,
 'South End Neighborhood': 0,
 'Cambridge Street North': 0,
 'Leather District': 0,
 'Fenway Neighborhood': 0,
 'Allston/Brighton Neighborhood': 0,
 'Harborpark: North End Waterfront': 0,
 'North Station Economic Development Area': 0,
 'Harborpark: Charlestown Waterfront': 0,
 'Government Center/Markets': 0,
 'Roxbury Neighborhood': 0,
 'Central Artery Special': 0,
 'Jamaica Plain Neighborhood': 0,
 'South Station Economic Development Area': 0,
 'West Roxbury Neighborhood': 0,
 'Audbon Circle Neighborhood': 0,
 'Stuart Stre

## Zoning Districts

[link](https://bostonopendata-boston.opendata.arcgis.com/datasets/zoning-districts?geometry=-71.494%2C42.223%2C-70.570%2C42.401)

**Notes**



In [66]:
zone_districts = pd.read_csv("../data/Zoning_Districts.csv")
print(zone_districts.shape)
zone_districts.head()

(36, 8)


Unnamed: 0,OBJECTID_12,DISTRICT,STAGE,SHAPE_STArea__,SHAPE_STLength__,MAPNO,ARTICLE,VOLUME
0,474,Charlestown Neighborhood,Adopted,20316070.0,25248.354293,2E,62,Volume III
1,475,Leather District,Adopted,681271.7,3237.140284,1C/1G/1N,44,Volume II
2,476,Harborpark: North End Waterfront,Adopted,10473950.0,18508.987912,1E-1F,42A,Volume II
3,477,Cambridge Street North,Adopted,798468.3,4908.021397,1B/1J/1K/1L,47A,Volume II
4,478,Harborpark: Charlestown Waterfront,Adopted,28737510.0,38889.395007,2B/2C,42B,Volume II


In [67]:
set(zone_districts["DISTRICT"])

{'Allston/Brighton Neighborhood',
 'Audbon Circle Neighborhood',
 'Bay Village Neighborhood',
 'Boston Proper',
 'Bulfinch Triangle',
 'Cambridge Street North',
 'Central Artery Special',
 'Charlestown Neighborhood',
 'Chinatown',
 'City Square Neighborhood',
 'Dorchester Neighborhood',
 'East Boston Neighborhood',
 'Fenway Neighborhood',
 'Government Center/Markets',
 'Greater Mattapan Neighborhood',
 'Harborpark: Charlestown Waterfront',
 'Harborpark: Dorchester Bay/Neponset River Waterfront',
 'Harborpark: Fort Point Waterfront',
 'Harborpark: North End Waterfront',
 'Huntington Avenue/Prudential Center',
 'Hyde Park Neighborhood',
 'Jamaica Plain Neighborhood',
 'Leather District',
 'Midtown Cultural',
 'Mission Hill Neighborhood',
 'Newmarket Industrial Commercial Neighborhood District',
 'North End Neighborhood',
 'North Station Economic Development Area',
 'Roslindale Neighborhood',
 'Roxbury Neighborhood',
 'South Boston',
 'South Boston Neighborhood',
 'South End Neighborhood'

## Planning Districts

[link](https://bostonopendata-boston.opendata.arcgis.com/datasets/planning-districts?geometry=-71.519%2C42.223%2C-70.595%2C42.401&page=2)

**Notes**



In [25]:
planning_districts = pd.read_csv("../data/Planning_Districts.csv")
print(planning_districts.shape)
planning_districts.head()

(17, 8)


Unnamed: 0,OBJECTID_1,OBJECTID,PLANNING_D,ID,PD,bragis92BRAEDITORBos_Districts_PlanningAREA,SHAPESTArea,SHAPESTLength
0,1,1,1,1,East Boston,132161900.0,132161900.0,127431.24166
1,2,2,2,2,Charlestown,38513730.0,38513730.0,56156.269664
2,3,3,3,3,South Boston,85759850.0,85759850.0,89461.981964
3,4,4,4,4,Central,35972210.0,35972210.0,51791.018425
4,5,5,5,5,Back Bay/Beacon Hill,25420700.0,25420700.0,39404.751559


In [26]:
# HOW MANY DIFFERENT PD?
planning_districts["PD"].nunique()

17

## "Codes": Zipcodes

[link](http://bostonopendata-boston.opendata.arcgis.com/datasets/53ea466a189b4f43b3dfb7b38fa7f3b6_1?geometry=-71.534%2C42.226%2C-70.610%2C42.404&selectedAttribute=Shape.STArea())

1) Boston covers 37 zip codes:
    - 02108 02109 02110 02111 02112 02117 02118 02127 02113 02114 
    - 02115 02116 02123 02128 02133 02163 02196 02199 02205 02206 
    - 02212 02215 02266 02283 02201 02203 02204 02210 02211 02217 
    - 02222 02241 02284 02293 02295 02297 02298 

**From Boston Open Data** ArcGIS OpenData

Not very meaningful, is mostly for providing geo-info

In [2]:
zip_codes = pd.read_csv("../data/ZIP_Codes.csv")

In [4]:
zip_codes.head() # THE ZIP5 COL HAS CUT OFF THE '0' IN THE ZIP CODE

Unnamed: 0,OBJECTID,ZIP5,ShapeSTArea,ShapeSTLength
0,1,2134,37219360.0,40794.182396
1,2,2125,64760520.0,62224.52144
2,3,2110,6637284.0,18358.213496
3,4,2118,31161580.0,32353.407618
4,5,2126,60785850.0,45488.394711


In [7]:
zip_codes["ZIP5"].nunique() # VERIFIES THE COUNT ABOVE

37

## Boston Neighborhoods

[link](https://data.boston.gov/dataset/boston-neighborhoods)

In [8]:
# READING IN THE BOSTON NEIGHBORHOODS CSV FILE
    # COMBO OF ZONING BOUNDARIES, ZIP CODES, AND 2010 CENSUS TRACT BOUNDS
    # ***NOT OFFICIAL BOUNDS FOR THE CITY OF BOSTON***
        # These boundaries are used in the broad sense for visualization purposes 
        # for zoning and planning studies.
nb_boston = pd.read_csv("../data/Boston_Neighborhoods.csv")

**Notes**

- This is a small df, with only 26 rows and 7 columns.
- Does not contain much 'gleanable' info:
    - No zipcodes, no population, no demographics data.

In [9]:
nb_boston.head()

Unnamed: 0,OBJECTID,Name,Acres,Neighborhood_ID,SqMiles,ShapeSTArea,ShapeSTLength
0,27,Roslindale,1605.568237,15,2.51,69938270.0,53563.912597
1,28,Jamaica Plain,2519.245394,11,3.94,109737900.0,56349.937161
2,29,Mission Hill,350.853564,13,0.55,15283120.0,17918.724113
3,30,Longwood,188.611947,28,0.29,8215904.0,11908.757148
4,31,Bay Village,26.539839,33,0.04,1156071.0,4650.635493


In [8]:
nb_boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   OBJECTID         26 non-null     int64  
 1   Name             26 non-null     object 
 2   Acres            26 non-null     float64
 3   Neighborhood_ID  26 non-null     int64  
 4   SqMiles          26 non-null     float64
 5   ShapeSTArea      26 non-null     float64
 6   ShapeSTLength    26 non-null     float64
dtypes: float64(4), int64(2), object(1)
memory usage: 1.5+ KB


In [6]:
nb_boston.shape

(26, 7)

In [7]:
nb_boston.isna().sum()

OBJECTID           0
Name               0
Acres              0
Neighborhood_ID    0
SqMiles            0
ShapeSTArea        0
ShapeSTLength      0
dtype: int64

In [14]:
nb_boston["Name"].count()

26

In [10]:
# ARE ALL THE NB'S UNIQUE?
nb_boston["Name"].value_counts()

North End                  1
Jamaica Plain              1
Dorchester                 1
Roslindale                 1
Charlestown                1
Mattapan                   1
Downtown                   1
West End                   1
Chinatown                  1
Bay Village                1
Mission Hill               1
South Boston Waterfront    1
Back Bay                   1
East Boston                1
Allston                    1
Beacon Hill                1
Hyde Park                  1
Longwood                   1
South End                  1
Brighton                   1
Roxbury                    1
West Roxbury               1
Fenway                     1
South Boston               1
Harbor Islands             1
Leather District           1
Name: Name, dtype: int64

In [11]:
nb_boston.loc[nb_boston["Name"] == "Dorchester"]

Unnamed: 0,OBJECTID,Name,Acres,Neighborhood_ID,SqMiles,ShapeSTArea,ShapeSTLength
21,48,Dorchester,4662.879457,6,7.29,203114200.0,104344.034005


## Non-Identifiers

### Census: Poverty Threholds

[link](https://www.census.gov/data/tables/2019/demo/income-poverty/p60-266.html)

In [45]:
census_pov_thresh = pd.read_excel("../data/thresh18.xls")
census_pov_thresh

Unnamed: 0,Table with row headings in column A and column headings in rows 4 to 8.,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Poverty Thresholds for 2018 by Size of Family ...,,,,,,,,,,
1,,,,,,,,,,,
2,Size of family unit,,Related children under 18 years,,,,,,,,
3,,Weighted,,,,,,,,,
4,,average,,One,Two,Three,Four,Five,Six,Seven,Eight or more
5,,thresholds,,,,,,,,,
6,,,,,,,,,,,
7,,,,,,,,,,,
8,One person (unrelated individual):,12784,,,,,,,,,
9,Under age 65,13064,13064,,,,,,,,


### Kaggle: Bos Housing Data

In [22]:
kaggle_bos = pd.read_csv("../data/boston-housing.csv")
kaggle_bos.head()

Unnamed: 0,0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00
0,0.02731 0.00 7.070 0 0.4690 6.4210 78...
1,0.02729 0.00 7.070 0 0.4690 7.1850 61...
2,0.03237 0.00 2.180 0 0.4580 6.9980 45...
3,0.06905 0.00 2.180 0 0.4580 7.1470 54...
4,0.02985 0.00 2.180 0 0.4580 6.4300 58...


### Fed Reserve: Unemp

[link](https://fred.stlouisfed.org/series/BOST625URN)

In [20]:
fred_unemp = pd.read_csv("../data/BOST625URN.csv")
fred_unemp.head()

Unnamed: 0,DATE,BOST625URN
0,1990-01-01,5.1
1,1990-02-01,5.2
2,1990-03-01,5.3
3,1990-04-01,5.2
4,1990-05-01,5.4


### Fed Reserve: All Transactions House Price Index

In [18]:
fred_house_price = pd.read_csv("../data/ATNHPIUS14454Q.csv")
fred_house_price.head()

Unnamed: 0,DATE,ATNHPIUS14454Q
0,1977-10-01,24.97
1,1978-01-01,25.02
2,1978-04-01,26.64
3,1978-07-01,30.07
4,1978-10-01,29.94


### Zillow: Rentals by Metro & US 

In [16]:
zillow_rentals = pd.read_csv("../data/Metro_ZORI_AllHomesPlusMultifamily_Smoothed.csv")
zillow_rentals.head()

Unnamed: 0,RegionID,RegionName,SizeRank,2014-01,2014-02,2014-03,2014-04,2014-05,2014-06,2014-07,...,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02,2020-03,2020-04
0,102001,United States,0,1254.0,1262,1265,1279,1293,1304,1313,...,1576,1577,1575,1572,1570,1568,1574,1583,1592,1594
1,394913,"New York, NY",1,2150.0,2182,2180,2206,2232,2255,2268,...,2576,2575,2569,2572,2559,2549,2549,2569,2547,2549
2,753899,"Los Angeles-Long Beach-Anaheim, CA",2,1820.0,1815,1828,1855,1880,1908,1932,...,2485,2491,2490,2489,2489,2488,2497,2506,2516,2503
3,394463,"Chicago, IL",3,1419.0,1432,1435,1453,1471,1483,1492,...,1686,1685,1676,1664,1655,1649,1655,1669,1680,1682
4,394514,"Dallas-Fort Worth, TX",4,1201.0,1199,1205,1217,1231,1244,1249,...,1528,1528,1523,1524,1520,1515,1519,1528,1538,1535


### Zillow: Home Values by Neighborhood

In [4]:
zillow_nb = pd.read_csv("../data/Neighborhood_Zhvi_AllHomes.csv")
zillow_nb

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1996-01-31,...,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30
0,274772,0,Northeast Dallas,Neighborhood,TX,TX,Dallas,Dallas-Fort Worth-Arlington,Dallas County,134197.0,...,328763.0,329075.0,330792.0,331337.0,332285.0,331414.0,330948.0,330181.0,329760.0,330354.0
1,112345,1,Maryvale,Neighborhood,AZ,AZ,Phoenix,Phoenix-Mesa-Scottsdale,Maricopa County,,...,184177.0,185737.0,187041.0,188210.0,189345.0,190887.0,193014.0,195117.0,197585.0,200393.0
2,192689,2,Paradise,Neighborhood,NV,NV,Las Vegas,Las Vegas-Henderson-Paradise,Clark County,140042.0,...,268101.0,267143.0,266860.0,267374.0,268508.0,269481.0,270883.0,272056.0,274457.0,276646.0
3,270958,3,Upper West Side,Neighborhood,NY,NY,New York,New York-Newark-Jersey City,New York County,247356.0,...,1245653.0,1231122.0,1218079.0,1211863.0,1214994.0,1223361.0,1223522.0,1218477.0,1207736.0,1208018.0
4,118208,4,South Los Angeles,Neighborhood,CA,CA,Los Angeles,Los Angeles-Long Beach-Anaheim,Los Angeles County,135240.0,...,509245.0,511662.0,514662.0,518097.0,521237.0,525139.0,529323.0,534713.0,540251.0,545001.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16142,107413,17008,Weatherly Heights,Neighborhood,AL,AL,Huntsville,Huntsville,Madison County,,...,197230.0,197816.0,198844.0,199879.0,201312.0,203808.0,206727.0,210711.0,214471.0,218473.0
16143,117927,17008,Santa Rita,Neighborhood,CA,CA,Salinas,Salinas,Monterey County,,...,429375.0,430777.0,430652.0,430944.0,431372.0,432134.0,434540.0,435296.0,436818.0,436868.0
16144,122375,17008,Green Valley,Neighborhood,DE,DE,Newark,Philadelphia-Camden-Wilmington,New Castle County,84278.0,...,206160.0,206546.0,206738.0,206565.0,206923.0,207921.0,208717.0,209470.0,209763.0,210848.0
16145,107518,17008,Whitesburg Estates,Neighborhood,AL,AL,Huntsville,Huntsville,Madison County,,...,212496.0,213294.0,214132.0,214923.0,215820.0,217251.0,219044.0,222129.0,225813.0,229489.0


In [11]:
zillow_boston = zillow_nb.loc[zillow_nb["City"] == "Boston"]
zillow_boston.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1996-01-31,...,2019-07-31,2019-08-31,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30
105,275424,108,South Dorchester,Neighborhood,MA,MA,Boston,Boston-Cambridge-Newton,Suffolk County,95256.0,...,517997.0,518236.0,519207.0,520479.0,521065.0,521382.0,523552.0,527318.0,531277.0,532499.0
144,54232,148,Roxbury,Neighborhood,MA,MA,Boston,Boston-Cambridge-Newton,Suffolk County,84257.0,...,481126.0,481201.0,481360.0,483578.0,485154.0,487124.0,489969.0,494370.0,498139.0,501643.0
209,154795,214,Jamaica Plain,Neighborhood,MA,MA,Boston,Boston-Cambridge-Newton,Suffolk County,127436.0,...,643710.0,643359.0,643905.0,644961.0,644675.0,644855.0,646482.0,650255.0,653018.0,653160.0
217,154399,222,East Boston,Neighborhood,MA,MA,Boston,Boston-Cambridge-Newton,Suffolk County,74513.0,...,528068.0,527799.0,527828.0,528229.0,527209.0,526372.0,526952.0,529028.0,530139.0,527698.0
230,37575,235,Brighton,Neighborhood,MA,MA,Boston,Boston-Cambridge-Newton,Suffolk County,135788.0,...,542368.0,541797.0,541719.0,542438.0,542323.0,542861.0,543797.0,546377.0,548034.0,548527.0


In [14]:
zillow_boston["RegionName"].value_counts() # THERE ARE 26 TOTAL

Leather District     1
South Boston         1
Downtown Crossing    1
Kenmore              1
Hyde Park            1
Fenway               1
South Dorchester     1
North End            1
West End             1
Mission Hill         1
West Roxbury         1
Roslindale           1
Roxbury              1
Allston              1
Mattapan             1
Brighton             1
South End            1
Chinatown            1
Bay Village          1
Back Bay             1
Downtown             1
Charlestown          1
Beacon Hill          1
East Boston          1
North Dorchester     1
Jamaica Plain        1
Name: RegionName, dtype: int64

### Sidewalks

[link](https://data.boston.gov/dataset/sidewalk-inventory)

In [28]:
sidewalks = pd.read_csv("../data/Sidewalk_Inventory.csv")
sidewalks.head(10)

Unnamed: 0,OBJECTID,SWK_ID,new_insp_d,INSP,MATERIAL,SWK_WIDTH,SWK_SLOPE,DAM_LENGTH,DAM_WIDTH,SCI,...,PARENT,Snow_Route,SEG_ID,SIDE,ROUTE,inspected,Shape_STArea__,Shape_STLength__,ShapeSTArea,ShapeSTLength
0,1,1,2013/12/26 00:00:00+00,Tan Pham,OT,7.0,2.2,15,5.5,97.7,...,ALBIO1,1A-3-1,ALBIO1_0,LEFT,,yes,3648.35791,1448.226939,3648.361328,1448.226939
1,2,2,2013/12/26 00:00:00+00,Derek Chan,CC,6.0,2.5,101,6.0,47.1,...,AUBUR1,1A-3-1,AUBUR1_0,LEFT,,yes,1145.14209,464.846589,1145.141602,464.846589
2,3,3,2013/12/26 00:00:00+00,Derek Chan,CC,6.0,3.0,97,6.0,63.7,...,AUBUR1,1A-3-1,AUBUR1_0,RIGHT,,yes,1603.231201,529.743387,1603.230713,529.743387
3,4,4,2013/12/26 00:00:00+00,Derek Chan,CC,6.0,5.0,30,6.0,56.2,...,AUBUR1,1A-3-1,AUBUR1_612,RIGHT,,yes,411.096191,145.202968,411.096436,145.202968
4,5,5,2013/12/26 00:00:00+00,Derek Chan,CC,5.5,1.8,25,5.5,86.7,...,ALBIO1,1A-3-1,ALBIO1_0,RIGHT,,yes,1031.328369,418.812686,1031.329102,418.812686
5,6,6,2013/12/17 00:00:00+00,Derek Chan,CC,6.5,3.6,68,6.5,95.0,...,ALFOR1,1A-1-2,ALFOR1_2700,RIGHT,,yes,8797.416748,2739.046477,8797.41748,2739.046476
6,7,7,2013/12/26 00:00:00+00,Derek Chan,CC,6.5,1.8,152,6.5,70.4,...,BALDW1,1A-3-1,BALDW1_208,RIGHT,,yes,3337.608643,993.030589,3337.608154,993.030589
7,8,8,2013/12/26 00:00:00+00,Derek Chan,CC,7.0,3.7,86,7.0,81.7,...,BALDW1,1A-3-1,BALDW1_208,LEFT,,yes,3280.870117,1030.425319,3280.87207,1030.425319
8,9,9,2014/12/26 00:00:00+00,Mike Haggerty,BC,5.0,4.7,1400,5.0,58.1,...,BEACO5,4-4-3,BEACO5_649,LEFT,,yes,16703.152588,5195.124707,16703.154541,5195.124707
9,10,10,2014/12/18 00:00:00+00,Serge Lindor,CC,9.5,0.7,5,9.5,95.7,...,CHEST6,4-3-2,CHEST6_2365,RIGHT,,yes,1111.786865,240.025509,1111.786865,240.025509


### Boston City Score

[link](https://data.boston.gov/dataset/cityscore)

In [16]:
# CSV FILE
# "METRICS ON OVERALL CITY HEALTH BASED ON WORK DONE ACROSS ALL FACETS OF THE CITY"

city_score_boston = pd.read_csv("../data/tmpnbpkibsv.csv")

In [17]:
city_score_boston.head()

Unnamed: 0,metric_name,score_calculated_ts,target,metric_logic,day_score,day_numerator,day_denominator,week_score,week_numerator,week_denominator,month_score,month_numerator,month_denominator,quarter_score,quarter_numerator,quarter_denominator,latest_score_flag
0,BFD INCIDENTS,2020-05-18 06:05:17.022092,,historical_average / current_average,1.480498,225.035714,152.0,1.396498,225.035714,161.142857,1.369727,205.641667,150.133333,1.297955,215.777778,166.244444,1
1,BFD INCIDENTS,2020-05-17 06:01:26.914554,,historical_average / current_average,1.152266,214.321429,186.0,1.376376,214.321429,155.714286,1.369727,205.641667,150.133333,1.297955,215.777778,166.244444,0
2,BFD INCIDENTS,2020-05-16 06:01:35.813394,,historical_average / current_average,1.060997,214.321429,202.0,1.376376,214.321429,155.714286,1.369727,205.641667,150.133333,1.297955,215.777778,166.244444,0
3,LIBRARY USERS,2020-05-15 06:04:23.127541,,current_average / historical_average,1.129332,8669.0,7676.222222,1.154977,8865.857143,7676.222222,1.134916,8700.1,7665.854701,1.202469,9533.411111,7928.19888,0
4,LIBRARY USERS,2020-05-14 14:17:41.861023,,current_average / historical_average,1.211924,9303.0,7676.222222,1.154977,8865.857143,7676.222222,1.134916,8700.1,7665.854701,1.202469,9533.411111,7928.19888,0


In [18]:
city_score_boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20202 entries, 0 to 20201
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   metric_name          20202 non-null  object 
 1   score_calculated_ts  20202 non-null  object 
 2   target               15118 non-null  float64
 3   metric_logic         20202 non-null  object 
 4   day_score            13825 non-null  float64
 5   day_numerator        16047 non-null  float64
 6   day_denominator      14079 non-null  float64
 7   week_score           18402 non-null  float64
 8   week_numerator       19370 non-null  float64
 9   week_denominator     18604 non-null  float64
 10  month_score          19772 non-null  float64
 11  month_numerator      20019 non-null  float64
 12  month_denominator    19874 non-null  float64
 13  quarter_score        19453 non-null  float64
 14  quarter_numerator    19626 non-null  float64
 15  quarter_denominator  20015 non-null 

In [21]:
city_score_boston.shape

(20202, 17)

In [22]:
# CONTAINS MORE NAN
city_score_boston.isna().sum()

metric_name               0
score_calculated_ts       0
target                 5084
metric_logic              0
day_score              6377
day_numerator          4155
day_denominator        6123
week_score             1800
week_numerator          832
week_denominator       1598
month_score             430
month_numerator         183
month_denominator       328
quarter_score           749
quarter_numerator       576
quarter_denominator     187
latest_score_flag         0
dtype: int64

In [24]:
# WHAT IS THE TARGET COLUMN?
city_score_boston["target"].value_counts()

0.80    9230
4.00    1869
0.95    1868
0.90     934
6.00     806
1.00     411
Name: target, dtype: int64

### Economic Indicators

[csv link](https://data.boston.gov/dataset/economic-indicators-legacy-portal)

In [33]:
# HAS FORECLOSURE INFO
    # TOURISM
    # UNEMPLOYMENT
        # TOTAL JOBS (WITHIN THE AREA?)
        # RATES
    # HOUSING SALE PRICES
        # SALES VOLUME
        # AFFORDABLE HOUSING UNIT PERMITS
econ_inds_boston = pd.read_csv("../data/economic-indicators.csv")

**Notes**

- What are the rows?
    - Possibly just different days
        - Sometimes within the same month, often within the same year.
- This is city wide
    - Hard to see how this would have import to a more granular approach to either
        neighborhoods or zip codes 

In [28]:
econ_inds_boston.head()

Unnamed: 0,Year,Month,logan_passengers,logan_intl_flights,hotel_occup_rate,hotel_avg_daily_rate,total_jobs,unemp_rate,labor_force_part_rate,pipeline_unit,pipeline_total_dev_cost,pipeline_sqft,pipeline_const_jobs,foreclosure_pet,foreclosure_deeds,med_housing_price,housing_sales_vol,new_housing_const_permits,new-affordable_housing_permits
0,2013,1,2019662,2986,0.572,158.925868,0,0.071,0.594,329,80000000,313107,241.6,44,11,380000,405,534,134
1,2013,2,1878731,2587,0.645,165.497062,0,0.063,0.591,557,360700000,1001911,1089.314,50,10,359500,332,196,13
2,2013,3,2469155,3250,0.819,187.45,0,0.061,0.591,803,343200000,820556,1036.464,6,6,400000,479,12,0
3,2013,4,2551246,3408,0.855,233.435374,0,0.061,0.593,325,150000000,350000,453.0,14,8,403000,558,91,24
4,2013,5,2676291,3240,0.858,237.89021,0,0.069,0.597,122,222200000,457374,671.044,12,4,425500,809,63,0


In [29]:
econ_inds_boston.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Year                            25 non-null     int64  
 1   Month                           25 non-null     int64  
 2   logan_passengers                25 non-null     int64  
 3   logan_intl_flights              25 non-null     int64  
 4   hotel_occup_rate                25 non-null     float64
 5   hotel_avg_daily_rate            25 non-null     float64
 6   total_jobs                      25 non-null     int64  
 7   unemp_rate                      25 non-null     float64
 8   labor_force_part_rate           25 non-null     float64
 9   pipeline_unit                   25 non-null     int64  
 10  pipeline_total_dev_cost         25 non-null     int64  
 11  pipeline_sqft                   25 non-null     int64  
 12  pipeline_const_jobs             25 non

In [31]:
econ_inds_boston.shape

(25, 19)

In [32]:
econ_inds_boston.isna().sum()

Year                              0
Month                             0
logan_passengers                  0
logan_intl_flights                0
hotel_occup_rate                  0
hotel_avg_daily_rate              0
total_jobs                        0
unemp_rate                        0
labor_force_part_rate             0
pipeline_unit                     0
pipeline_total_dev_cost           0
pipeline_sqft                     0
pipeline_const_jobs               0
foreclosure_pet                   0
foreclosure_deeds                 0
med_housing_price                 0
housing_sales_vol                 0
new_housing_const_permits         0
new-affordable_housing_permits    0
dtype: int64

In [34]:
econ_inds_boston.loc[econ_inds_boston["Month"]]

Unnamed: 0,Year,Month,logan_passengers,logan_intl_flights,hotel_occup_rate,hotel_avg_daily_rate,total_jobs,unemp_rate,labor_force_part_rate,pipeline_unit,pipeline_total_dev_cost,pipeline_sqft,pipeline_const_jobs,foreclosure_pet,foreclosure_deeds,med_housing_price,housing_sales_vol,new_housing_const_permits,new-affordable_housing_permits
1,2013,2,1878731,2587,0.645,165.497062,0,0.063,0.591,557,360700000,1001911,1089.314,50,10,359500,332,196,13
2,2013,3,2469155,3250,0.819,187.45,0,0.061,0.591,803,343200000,820556,1036.464,6,6,400000,479,12,0
3,2013,4,2551246,3408,0.855,233.435374,0,0.061,0.593,325,150000000,350000,453.0,14,8,403000,558,91,24
4,2013,5,2676291,3240,0.858,237.89021,0,0.069,0.597,122,222200000,457374,671.044,12,4,425500,809,63,0
5,2013,6,2824862,3402,0.911,244.454324,0,0.078,0.608,296,124500000,433868,375.99,14,13,445000,864,352,47
6,2013,7,2978718,3660,0.886,221.195479,0,0.077,0.608,1228,610200000,1385259,1842.774,19,11,429000,909,462,44
7,2013,8,3020074,3700,0.917,223.003079,0,0.073,0.606,134,57897696,283502,175.0,11,8,435000,989,20,3
8,2013,9,2504491,3181,0.881,255.497825,0,0.072,0.598,886,1037000000,1441886,3131.74,19,6,438000,677,357,23
9,2013,10,2672640,3056,0.915,287.465922,0,0.069,0.598,860,849324048,1794758,2564.958625,15,10,465000,619,42,185
10,2013,11,2296667,2708,0.786,222.5491,0,0.064,0.6,1005,2325500000,4642045,7023.01,15,3,450000,570,712,166
