# Fix Population Data

Use Cornelia's mapping from census tracts -> zip code to correctly map the CA population to schools

In [1]:
# basics
import pandas as pd 
import numpy as np
import os 
import re
from datetime import datetime
from tqdm.notebook import tqdm
tqdm.pandas()
import requests
import urllib

# plotting
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import plotly.express as px
import seaborn as sns

# modeling
import statsmodels.api as sm

# show every column when a wide DataFrame is displayed
pd.set_option('display.max_columns', None)
# NOTE(review): this disables pandas' chained-assignment (SettingWithCopy) warning
# for the whole notebook, so assignments made on filtered frames will not be flagged
pd.options.mode.chained_assignment = None

In [5]:
# Where the shared Drive "Data" folder lives on this machine.
# path_source must be one of: 'gdrive' (Google Colab), 'local', 'work'.
path_source = 'local'

if path_source == 'gdrive':
  # Colab only: mount Google Drive so the shared folder is readable
  from google.colab import drive
  drive.mount('/content/gdrive')
  data_path = '/content/gdrive/MyDrive/Classes/W210_capstone/W210_Capstone/Data'

elif path_source == 'local':
  data_path = '/Users/tj/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'

elif path_source == 'work':
  data_path = '/Users/trevorjohnson/trevorj@berkeley.edu - Google Drive/My Drive/Classes/W210_capstone/W210_Capstone/Data'

else:
  # fail here with a clear message instead of a NameError on data_path in a later cell
  raise ValueError(f"unknown path_source {path_source!r}; expected 'gdrive', 'local' or 'work'")

# Read Data

In [65]:
# locations of the three input files, all relative to data_path
joined_file = os.path.join(data_path, 'joined_data/joined_open_schools_only_10-10-22.parquet')
census_file = os.path.join(data_path, 'census/census_bureau_clean/census_bureau.csv')
crosswalk_file = os.path.join(data_path, 'census/ZiptoZcta_Crosswalk_2021.csv')

# full joined dataset
df = pd.read_parquet(joined_file)

# census data
df_census = pd.read_csv(census_file)

# zcta -> zip crosswalk
df_zips = pd.read_csv(crosswalk_file)


# Data Clean

In [66]:
# --- zip -> ZCTA crosswalk cleaning ---
# lowercase the column names; the cells below refer to 'state', 'zcta', 'zip_code', ...
df_zips.columns = df_zips.columns.str.lower()
# keep California rows only; .copy() makes the result an independent frame so the
# column assignment below is not an assignment on a slice of the unfiltered frame
df_zips = df_zips.loc[df_zips['state'] == 'CA'].copy()
# store ZCTA codes as integers (astype(int) raises if a value cannot be converted)
df_zips['zcta'] = df_zips['zcta'].astype(int)


# --- census cleaning ---
# under-19 population per sex = sum of the 0-4, 5-9, 10-14 and 15-19 age bands
df_census['pop_under19_male'] = (
    df_census['population_0_4_male']
    + df_census['population_5_9_male']
    + df_census['population_10_14_male']
    + df_census['population_15_19_male']
)
df_census['pop_under19_female'] = (
    df_census['population_0_4_female']
    + df_census['population_5_9_female']
    + df_census['population_10_14_female']
    + df_census['population_15_19_female']
)

# Data Checks

Strangely, there are two zip code fields in our dataset. I think the one labeled "Zip Code" is for something else, and we should probably only worry about "school_zip". They have a ~99.9% match rate.

In [70]:
# population columns of interest: male, female and total under-19 counts
# (assumes df carries a 'pop_under19_male' column next to the female one -- TODO confirm)
df2 = df[['year', 'school_zip', 'year_month', 'pop_under19_male', 'pop_under19_female', 'total_pop_under19']]

# do the two zip codes in this file always align? pretty much always, but not 100%
# let's not worry about this, and focus on the school zips
# (mean of the boolean comparison = fraction of rows where both fields agree)
match_rate = (df['school_zip'].astype(str) == df['Zip Code']).mean()
print(match_rate)

0.9988100594282459


We will likely want to use only the crosswalk rows where `zip_join_type` = "Zip matches ZCTA".

In [43]:
# number of crosswalk rows for each value of zip_join_type
df_zips['zip_join_type'].value_counts()

Zip matches ZCTA               1761
Spatial join to ZCTA            833
populated ZCTA, missing zip       1
Name: zip_join_type, dtype: int64

In [49]:
# distinct zip codes in the (CA-filtered) crosswalk
a1 = len(df_zips['zip_code'].drop_duplicates())
print(f'num distinct CA zips in mapping file: {a1}')

# distinct school zip codes in the joined dataset
a2 = len(df[['school_zip']].drop_duplicates())
print(f'num distinct school zips in joined df file: {a2}')

# distinct values of the census file's 'zip' column
# NOTE(review): the printed label says "census tracts", but the count is taken over
# df_census['zip'] -- confirm which unit is intended
a3 = len(df_census['zip'].drop_duplicates())
print(f'num distinct census tracts in population file: {a3}')

num distinct CA zips in mapping file: 2590
num distinct school zips in joined df file: 1391
num distinct census tracts in population file: 1769


In [80]:
# crosswalk rows whose zip code matches its ZCTA directly
df_zips2 = df_zips[df_zips['zip_join_type'] == 'Zip matches ZCTA']
x1 = set(df_zips2['zcta'].to_list())

# codes present in the census population file
x2 = set(df_census['zip'].to_list())

# share of the census codes that also appear in the zip matching file
# (99.5% on the data this notebook was last run against)
print(len(x1 & x2) / len(x2))

# census codes missing from the matching file; the count in the message is
# computed rather than hard-coded so it stays correct if the inputs change
unmatched = x2 - x1
print(f'\n here are the {len(unmatched)} zctas that dont match')
print(unmatched)

0.9954776710005653

 here are the 8 zctas that dont match
{97635, 89060, 89061, 89010, 95250, 95314, 89019, 89439}


I looked up those 8 unmatched ZCTAs manually:
- 6 of them are actually outside of CA (on the border of Oregon or Nevada), and they are labeled as "Zip matches ZCTA" when I look them up online.
- 2 of them (95250 and 95314) are not found in the census tract list, but are valid CA zip codes.


## Thus, I think our joined dataset is fine to use as-is with the way we've used the zips.