# Question 1 - Data wrangling

In [None]:
import pandas as pd
import geopandas as gpd
import requests, json

Here are the variables of interest:
- Median household income: B19013_001E
- Median gross rent: B25064_001E
- Units in housing: DP04_0006E to DP04_0013E

First we get median household income and median gross rent from U.S. Census via an API request.

In [None]:
# Median household income (B19013_001E) and median gross rent (B25064_001E) are
# requested from the base acs5 endpoint because they are not available under
# the "profile" subsection.
rs = "https://api.census.gov/data/2020/acs/acs5?get=NAME,B19013_001E,B25064_001E&for=tract:*&in=state:06"
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(rs, timeout=60)
# Stop here with the HTTP status if the request failed, instead of hitting a
# confusing JSON parsing error on an error page further down.
r.raise_for_status()
d = r.json()
# The first row of the response holds the column names; the rest are data rows.
IncomeRentDf = pd.DataFrame(d[1:], columns = d[0])

Then we repeat the same process, except this time we pull the housing-unit variables (DP04_0006E to DP04_0013E) from the ACS 5-year `profile` endpoint of the U.S. Census API.

In [None]:
# Housing-unit variables DP04_0006E..DP04_0013E for every California tract,
# requested from the acs5/profile endpoint.
rs2 = "https://api.census.gov/data/2020/acs/acs5/profile?get=NAME,DP04_0006E,DP04_0007E,DP04_0008E,DP04_0009E,DP04_0010E,DP04_0011E,DP04_0012E,DP04_0013E&for=tract:*&in=state:06"
# A timeout keeps the cell from hanging forever on a stalled connection.
r2 = requests.get(rs2, timeout=60)
# Stop here with the HTTP status if the request failed, instead of hitting a
# confusing JSON parsing error on an error page further down.
r2.raise_for_status()
d2 = r2.json()
# The first row of the response holds the column names; the rest are data rows.
HousingDf = pd.DataFrame(d2[1:], columns = d2[0])

We then rename the income and rent columns of `IncomeRentDf` to make them more understandable.

In [None]:
# Swap the Census variable codes for human-readable column names.
readable_names = {'B19013_001E': 'Income', 'B25064_001E': 'Rent'}
IncomeRentDf = IncomeRentDf.rename(columns=readable_names)

We create a `GEOID` column by combining the `state`, `county`, and `tract` columns in preparation for a tabular join.

In [None]:
# Build an integer GEOID (state + county + tract codes concatenated) on both
# census frames so they can be joined on a common key.
columns = ['tract', 'county', 'state']
for frame in (HousingDf, IncomeRentDf):
    # Make sure the pieces are strings so `+` concatenates rather than adds.
    for col in columns:
        frame[col] = frame[col].astype(str)
    # Use an explicit 64-bit integer: plain `int` can resolve to a 32-bit type
    # on some platforms, and California GEOIDs (about 6,000,000,000) do not
    # fit in 32 bits.
    frame['GEOID'] = (frame['state'] + frame['county'] + frame['tract']).astype('int64')

Here we join `HousingDf` and `IncomeRentDf` to a new dataframe - `censusDf`.

In [None]:
# The label/ID columns exist in both frames and are redundant once GEOID is the
# index, so strip them from each side before joining.
redundant = ['NAME', 'state', 'county', 'tract']
housingByGeoid = HousingDf.set_index("GEOID").drop(columns=redundant)
incomeRentByGeoid = IncomeRentDf.set_index("GEOID").drop(columns=redundant)
# Left join on the GEOID index: keeps every housing row.
censusDf = housingByGeoid.join(incomeRentByGeoid)

Here we create a geodataframe for the CalEnviroscreen 4.0 data by reading a shapefile retrieved from [OEHHA](https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-40). 

# Load the CalEnviroScreen 4.0 shapefile into a GeoDataFrame.
CalEnviroScreenGdf = gpd.read_file('data/CES4/CES4 Final Shapefile.shp')
# Cast the tract ID to an explicit 64-bit integer so it can serve as a join key.
# Plain `int` can resolve to a 32-bit type on some platforms, and California
# tract IDs (about 6,000,000,000) do not fit in 32 bits.
CalEnviroScreenGdf['Tract'] = CalEnviroScreenGdf['Tract'].astype('int64')

We then join `CalEnviroScreenGdf` and `censusDf` into `tractsDf`.

In [None]:
# Index the CalEnviroScreen polygons by tract ID, then attach the census
# attributes; the left join keeps every CalEnviroScreen tract even when no
# census row matches.
# NOTE(review): CalEnviroScreen 4.0 appears to be published on 2010 census
# tracts while the 2020 ACS 5-year tables use 2020 tracts — confirm the IDs
# actually line up (e.g. count rows with missing census values after the join).
cesByTract = CalEnviroScreenGdf.set_index('Tract')
tractsDf = cesByTract.join(censusDf, how='left')

Now we output the joined dataframe as a 