In [3]:
# Dependencies and Setup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import rcParams
import scipy.stats as sts
import os
from collections import Counter
import requests
import json
from census import Census

# API Keys
from api_keys import gkey
from api_keys import census_key

## Using Census API to get data for different US zip codes 

In [None]:
#Recording different year census data
all_census=[]
for i in [2012,2014,2015,2017,2019]:
    c = Census(census_key, year=i)
    census_data = c.acs5.get(("NAME", "B19013_001E", "B02001_002E","B02001_003E",
                           "B03001_003E","B02001_005E","B01003_001E", "B01002_001E",
                          "B19301_001E","B17001_002E","B23025_005E","B25077_001E","B08136_003E",
                          "B25035_001E","B25088_002E","B25064_001E","B08301_001E",
                          "B08301_010E","B08301_003E","B08136_007E","B15003_017E",
                          "B15003_022E","B15003_002E","B16001_002E","B16001_003E"),
                          {'for': 'zip code tabulation area:*'})

    # Convert to DataFrame
    census_df = pd.DataFrame(census_data)

    # Column Reordering
    census_df = census_df.rename(columns={"B01003_001E": "Population",
                                      "B02001_002E":"Population White",
                                      "B02001_003E": "Population Black",
                                      "B03001_003E": "Population Hispanic",
                                      "B02001_005E": "Population Asian",
                                      "B01002_001E": "Median Age",
                                      "B19013_001E": "Household Income",
                                      "B19301_001E": "Per Capita Income",
                                      "B17001_002E": "Poverty Count",
                                      "B23025_005E": "Unemployment Count",
                                      "B25077_001E":"House Value",
                                      "B25035_001E": "House Construction Year",
                                      "B25088_002E":"Monthly Owner Cost",
                                      "B25064_001E":"Monthly Rent",
                                      "B08301_001E":"Total Transport",
                                      "B08301_010E":"Public Transport",
                                      "B08301_003E":"Personal Transport",
                                      "B08136_007E":"Commute Time Public",
                                      "B08136_003E":"Commute Time Car",
                                      "B15003_017E":"High School Count",
                                      "B15003_022E":"College Count",
                                      "B15003_002E":"Uneducated Count",
                                      "B16001_002E":"English Language",
                                      "B16001_003E":"Spanish Language",
                                      "NAME": "Name", "zip code tabulation area": "Zipcode"})

    # Add in Poverty Rate (Poverty Count / Population)
    census_df["Poverty Rate"] = (100 * 
                census_df["Poverty Count"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in Employment Rate (Employment Count / Population)
    census_df["Unemployment Rate"] = (100 * 
                census_df["Unemployment Count"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in High school education Rate (High School Count / Population)
    census_df["High School Rate"] = (100 * 
                census_df["High School Count"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in College Education Rate (College Count / Population)
    census_df["College Rate"] = (100 * 
                census_df["College Count"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in Uneducated Rate (uneducated Count / Population)
    census_df["Uneducated Rate"] = (100 * 
                census_df["Uneducated Count"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in Public Transport Rate (Public Transport / Total Transportation)
    census_df["Public Transport Rate"] = (100 * 
                census_df["Public Transport"].astype(
                    int) / census_df["Total Transport"].astype(int))

    # Add in Personal Transport Rate (Personal Transport  / Total transportation)
    census_df["Personal Transport Rate"] = (100 * 
                census_df["Personal Transport"].astype(
                    int) / census_df["Total Transport"].astype(int))
    
    # Add in English Language Rate
    #census_df["English Language Rate"] = (100 * 
                #census_df["English Language"].astype(
                    #int) / census_df["Population"].astype(int))
    # Add in Spanish Language Rate
    #census_df["Spanish Language Rate"] = (100 * 
                #census_df["Spanish Language"].astype(
                    #int) / census_df["Population"].astype(int))

    # Add in White Population Rate
    census_df["White Population Rate"] = (100 * 
                census_df["Population White"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in Black Population Rate
    census_df["Black Population Rate"] = (100 * 
                census_df["Population Black"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in Hispanic Population Rate (Population Hispanic / Total Population)
    census_df["Hispanic Population Rate"] = (100 * 
                census_df["Population Hispanic"].astype(
                    int) / census_df["Population"].astype(int))

    # Add in Asian Population Rate (Population Asian  / Total population)
    census_df["Asian Population Rate"] = (100 * 
                census_df["Population Asian"].astype(
                    int) / census_df["Population"].astype(int))

    # Final DataFrame
    census_df = census_df[["Zipcode", "Population", "Median Age", "Household Income",
                       "Per Capita Income", "Poverty Rate", 
                       "Unemployment Rate", "House Value","House Construction Year",
                      "Monthly Owner Cost", "Monthly Rent","Public Transport Rate",
                      "Personal Transport Rate","Commute Time Public","Commute Time Car",
                    "High School Rate","College Rate","Uneducated Rate", 'English Language',#"English Language Rate",
                      "Spanish Language","White Population Rate","Black Population Rate",
                      "Hispanic Population Rate","Asian Population Rate"]]
    #appending dataframe to all census list
    all_census.append(census_df)

In [None]:
# Save different years data as a csv 
for i,census in zip([2012,2014,2015,2017,2019],all_census):
    census.to_csv(f"output_census/census_data_{i}.csv", encoding="utf-8", index=False)

### Reading US census csv for different years 

In [4]:
#reading 2012 census data
path_2012=os.path.join('output_census', 'census_data_2012.csv')
census_2012=pd.read_csv(path_2012)

In [5]:
#reading 2014 census data
path_2014=os.path.join('output_census', 'census_data_2014.csv')
census_2014=pd.read_csv(path_2012)

In [6]:
#reading 2015 census data
path_2015=os.path.join('output_census', 'census_data_2015.csv')
census_2015=pd.read_csv(path_2012)

In [7]:
#reading 2017 census data
path_2017=os.path.join('output_census', 'census_data_2017.csv')
census_2017=pd.read_csv(path_2012)

In [8]:
#reading 2019 census data
path_2019=os.path.join('output_census', 'census_data_2019.csv')
census_2019=pd.read_csv(path_2012)

In [9]:
census_2012

Unnamed: 0,Zipcode,Population,Median Age,Household Income,Per Capita Income,Poverty Rate,Unemployment Rate,House Value,House Construction Year,Monthly Owner Cost,...,Commute Time Car,High School Rate,College Rate,Uneducated Rate,English Language Rate,Spanish Language Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate
0,2655,3846.0,54.6,73323.0,50951.0,5.460218,4.056162,443500.0,1971.0,2566.0,...,,13.624545,19.032761,0.234009,88.715549,1.248050,96.515861,0.780031,1.586063,0.000000
1,2657,2974.0,52.9,46031.0,45142.0,14.122394,8.675185,467100.0,1945.0,2118.0,...,,13.214526,29.690652,0.874243,89.307330,3.227976,89.845326,5.749832,2.958978,0.638870
2,2659,741.0,61.0,51466.0,36133.0,5.668016,4.183536,469800.0,1972.0,2082.0,...,,19.163293,16.329285,0.000000,89.203779,1.619433,95.546559,2.564103,2.564103,0.000000
3,2660,5881.0,51.3,48617.0,28784.0,13.977215,2.941677,342800.0,1976.0,1685.0,...,,18.143173,16.595817,0.357082,88.930454,0.136031,89.899677,6.869580,0.850196,0.170039
4,2663,96.0,34.7,21667.0,18307.0,0.000000,16.666667,1000001.0,1959.0,-666666666.0,...,,0.000000,50.000000,0.000000,100.000000,0.000000,100.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33115,97035,23820.0,41.9,72883.0,44637.0,7.371956,5.873216,423400.0,1981.0,2215.0,...,171275.0,6.351805,26.717045,0.062972,80.268682,2.963896,87.099076,0.449202,5.424013,8.673384
33116,97037,882.0,50.8,41048.0,19696.0,24.376417,7.256236,170800.0,1962.0,1458.0,...,,19.501134,11.224490,0.000000,92.290249,0.000000,96.031746,0.113379,0.340136,2.040816
33117,97038,15265.0,35.7,54225.0,23848.0,9.452997,6.269243,242000.0,1979.0,1763.0,...,,15.512611,4.801834,0.714052,81.978382,7.212578,93.999345,1.644284,9.040288,0.517524
33118,97039,411.0,46.4,33056.0,21714.0,31.873479,2.919708,130700.0,1952.0,925.0,...,,15.815085,9.002433,0.243309,85.644769,2.189781,96.836983,0.000000,2.433090,0.000000


In [24]:
combined_census= census_2012.merge(census_2014,
                        how='outer', on='Zipcode', 
                        suffixes=['(2012)', '(2014)']).merge(census_2015,
                        how='outer', on='Zipcode').merge(census_2017,
                        how='outer', on='Zipcode', suffixes=['(2015)', '(2017)']).merge(
                        census_2019,how='outer', on='Zipcode')

In [23]:
combined_census

Unnamed: 0,Zipcode,Population(2012),Median Age(2012),Household Income(2012),Per Capita Income(2012),Poverty Rate(2012),Unemployment Rate(2012),House Value(2012),House Construction Year(2012),Monthly Owner Cost(2012),...,Commute Time Car,High School Rate,College Rate,Uneducated Rate,English Language Rate,Spanish Language Rate,White Population Rate,Black Population Rate,Hispanic Population Rate,Asian Population Rate
0,2655,3846.0,54.6,73323.0,50951.0,5.460218,4.056162,443500.0,1971.0,2566.0,...,,13.624545,19.032761,0.234009,88.715549,1.248050,96.515861,0.780031,1.586063,0.000000
1,2657,2974.0,52.9,46031.0,45142.0,14.122394,8.675185,467100.0,1945.0,2118.0,...,,13.214526,29.690652,0.874243,89.307330,3.227976,89.845326,5.749832,2.958978,0.638870
2,2659,741.0,61.0,51466.0,36133.0,5.668016,4.183536,469800.0,1972.0,2082.0,...,,19.163293,16.329285,0.000000,89.203779,1.619433,95.546559,2.564103,2.564103,0.000000
3,2660,5881.0,51.3,48617.0,28784.0,13.977215,2.941677,342800.0,1976.0,1685.0,...,,18.143173,16.595817,0.357082,88.930454,0.136031,89.899677,6.869580,0.850196,0.170039
4,2663,96.0,34.7,21667.0,18307.0,0.000000,16.666667,1000001.0,1959.0,-666666666.0,...,,0.000000,50.000000,0.000000,100.000000,0.000000,100.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33115,97035,23820.0,41.9,72883.0,44637.0,7.371956,5.873216,423400.0,1981.0,2215.0,...,171275.0,6.351805,26.717045,0.062972,80.268682,2.963896,87.099076,0.449202,5.424013,8.673384
33116,97037,882.0,50.8,41048.0,19696.0,24.376417,7.256236,170800.0,1962.0,1458.0,...,,19.501134,11.224490,0.000000,92.290249,0.000000,96.031746,0.113379,0.340136,2.040816
33117,97038,15265.0,35.7,54225.0,23848.0,9.452997,6.269243,242000.0,1979.0,1763.0,...,,15.512611,4.801834,0.714052,81.978382,7.212578,93.999345,1.644284,9.040288,0.517524
33118,97039,411.0,46.4,33056.0,21714.0,31.873479,2.919708,130700.0,1952.0,925.0,...,,15.815085,9.002433,0.243309,85.644769,2.189781,96.836983,0.000000,2.433090,0.000000


In [16]:
combined_census.columns

Index(['Zipcode', 'Population(2012)', 'Median Age(2012)',
       'Household Income(2012)', 'Per Capita Income(2012)',
       'Poverty Rate(2012)', 'Unemployment Rate(2012)', 'House Value(2012)',
       'House Construction Year(2012)', 'Monthly Owner Cost(2012)',
       ...
       'Commute Time Car', 'High School Rate', 'College Rate',
       'Uneducated Rate', 'English Language Rate', 'Spanish Language Rate',
       'White Population Rate', 'Black Population Rate',
       'Hispanic Population Rate', 'Asian Population Rate'],
      dtype='object', length=116)