In [1]:
# Import Dependencies
import pandas as pd
from census import Census
from config import census_key
import cpi # this will be used to determine the cost of everything in today's dollars. cpi.inflate(amount,year of amount)
#install cpi with pip install cpi on command line
cpi.update()
low_memory=False

In [2]:
# Import CSVs 
filepath = "Resources/"
kent_zipcodes = pd.read_csv(filepath+"kent_zipcodes.csv",squeeze=True,header=None)
oakland_zipcodes = pd.read_csv(filepath+"oakland_zipcodes.csv",squeeze=True,header=None)
saginaw_zipcodes = pd.read_csv(filepath+"saginaw_zipcodes.csv",squeeze=True,header=None)



In [3]:
# Create dataframe for each imported CSV and add a column with the corresponding county
kent_zipcodes = pd.DataFrame(kent_zipcodes)
kent_zipcodes["County"] = "Kent"
oakland_zipcodes = pd.DataFrame(oakland_zipcodes)
oakland_zipcodes["County"] = "Oakland"
saginaw_zipcodes = pd.DataFrame(saginaw_zipcodes)
saginaw_zipcodes["County"] = "Saginaw"
# Merge zipcode dataframes and add a column name for Zipcode
query_zipcodes =pd.DataFrame( pd.merge(kent_zipcodes,oakland_zipcodes,how="outer"))
query_zipcodes = pd.DataFrame(pd.merge(query_zipcodes,saginaw_zipcodes,how="outer"))
query_zipcodes.columns = ["Zipcode","County"]
query_zipcodes = query_zipcodes.astype({"Zipcode":object})

In [4]:
query_zipcodes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 0 to 171
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Zipcode  172 non-null    object
 1   County   172 non-null    object
dtypes: object(2)
memory usage: 4.0+ KB


In [5]:
# Create dataframe to add census data to. Needed to put in starter values for it to function properly later. Will remove further down.
census_df = pd.DataFrame({"Year":[0],
                          "Zipcode":["0"],
                          "Total Population":[0],
                          "White Population":[0],
                          "Black Population":[0],
                          "Employed Count":[0],
                          "Unemployed Count":[0],
                          "Poverty Count":[0],
                          "Poverty Count - White":[0],
                          "Poverty Count - Black":[0],
                          "Median Income":[0],
                          "Median Rent":[0],
                          "Median Home Value":[0]})

# Get data from API for Census
census_df = census_df.astype({"Year":int,"Zipcode":object})

# For loop to iterate through the desired 10-year period
for year in [2011,2012,2013,2014,2015,2016,2017,2018,2019,2020]: 
#     for index, row in query_zipcodes.iterrows():
#         zip = query_zipcodes.iloc[index,0]
    c = Census(census_key,year) 
    census_data = c.acs5.get(("NAME", "B19013_001E", "B23025_004E", "B23025_005E", "B17001_002E", 
            "B17001A_002E","B17001B_002E", "B01003_001E",
            "B02001_002E", "B02001_003E","B25058_001E", "B25077_001E"
             ), {"for": "zip code tabulation area:*"})
    census_raw_df = pd.DataFrame(census_data)
    
# Add the year to the year column so each iteration can be distinguished
    year_count = len(census_data)
    year_series = pd.Series([year for x in range(year_count)])    
    census_raw_df["Year"] = year_series
    
# Rename Columns
    census_raw_df = census_raw_df.rename(columns = {
            "B19013_001E" : "Median Income", 
            "B23025_004E" : "Employed Count", 
            "B23025_005E" : "Unemployed Count", 
            "B17001_002E" : "Poverty Count", 
            "B17001A_002E" : "Poverty Count - White" , 
            "B17001B_002E" : "Poverty Count - Black" , 
            "B01003_001E" : "Total Population", 
            "B02001_002E" : "White Population", 
            "B02001_003E" : "Black Population", 
            "B25058_001E" : "Median Rent", 
            "B25077_001E" : "Median Home Value", 
            "NAME" : "Name",
            "zip code tabulation area": "Zipcode"})
    
# Recast all variables in the output dataframe to the correct type 
    census_raw_df = census_raw_df.astype({"Year":int,"Zipcode":object,"Median Income":float,"Employed Count":float,"Unemployed Count":float,"Poverty Count":float,
             "Poverty Count - White":float, "Poverty Count - Black":float, "Total Population":float,
             "White Population":float,"Black Population":float, "Median Rent":float,"Median Home Value":float})
    
# Recast the Zipcode in census_df to object. Was running into issues with the zipcode repeatedly trying to cast as a float.
    census_df = census_df.astype({"Zipcode":object})
    
# Merge the new raw df into the finished df. Use outer to maintain all values
    census_df = pd.merge(census_df,census_raw_df,how="outer")

In [7]:
census_df = census_df.sort_values(by=["Zipcode"])
test_df = pd.merge(census_df,query_zipcodes,how="inner", on="Zipcode")
test_df = test_df.sort_values("Total Population")
test_df

Unnamed: 0,Year,Zipcode,Total Population,White Population,Black Population,Employed Count,Unemployed Count,Poverty Count,Poverty Count - White,Poverty Count - Black,Median Income,Median Rent,Median Home Value,Name,state,County


In [None]:
# Remove the name column, which is useless here, and the state column, which only functioned for some years
census_df = census_df.drop(columns = ["Name","state"])

#Remove the sample data added to initialize the dataframe
census_df = census_df.loc[census_df["Year"] != 0]

In [10]:
test = census_df.loc[census_df["Zipcode"] == "48340"]
test2 = query_zipcodes.loc[query_zipcodes["Zipcode"]== 48340]
test2

Unnamed: 0,Zipcode,County
64,48340,Oakland


In [8]:
census_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 331201 entries, 0 to 264925
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Year                   331201 non-null  int64  
 1   Zipcode                331201 non-null  object 
 2   Total Population       331201 non-null  int64  
 3   White Population       331201 non-null  int64  
 4   Black Population       331201 non-null  int64  
 5   Employed Count         331166 non-null  float64
 6   Unemployed Count       331166 non-null  float64
 7   Poverty Count          331166 non-null  float64
 8   Poverty Count - White  331166 non-null  float64
 9   Poverty Count - Black  331166 non-null  float64
 10  Median Income          330203 non-null  float64
 11  Median Rent            330574 non-null  float64
 12  Median Home Value      330370 non-null  float64
 13  Name                   331200 non-null  object 
 14  state                  298080 non-nu

In [9]:
query_zipcodes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 0 to 171
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Zipcode  172 non-null    object
 1   County   172 non-null    object
dtypes: object(2)
memory usage: 4.0+ KB
