## CENSUS DATABASE 2011-2018

In [None]:
import pandas as pd
from census import Census
import requests
import os


# Census API Key
from config import api_key

# Let each DataFrame column print up to 300 characters before being truncated
pd.set_option("max_colwidth", 300)

# Open the Census session from the API key and the year to work with
c = Census(api_key, year=2017)

In [None]:
# Ask the acs5 endpoint for its full table listing
tables = c.acs5.tables()

# The listing is a list of dicts, so pandas can build one row per dict directly
table_df = pd.DataFrame(data=tables)
table_count = table_df.shape[0]
print(f"Number of available tables: {table_count}")
table_df.head()

In [None]:
# Filter the table listing for income tables: keep the rows whose description
# contains "FAMILY" (case-sensitive match).
# na=False counts a missing description as "no match", so the boolean mask never
# holds NaN -- a NaN in the mask would make the row selection raise.
table_df[table_df['description'].str.contains("FAMILY", na=False)]

In [None]:
# Determine which table you're interested in
table_id = 'B19119'

# Look up the variables URL that table_df stores for this table
# (raises IndexError if no row of table_df has this name)
url = table_df.loc[table_df['name'] == table_id, 'variables'].values[0]

# Make the API call.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() reports an HTTP error right away instead of letting it
# surface later as a confusing JSON-parsing or KeyError failure.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
response = http_response.json()

# Convert the 'variables' entry of the response to a DataFrame; the transpose
# turns its top-level keys into rows (presumably one per variable -- see the count below)
variables = pd.DataFrame(response['variables']).transpose()

print(f"Number of available variables: {len(variables)}")
variables.head(15)

In [None]:
# Preview the first rows of the variables whose predicateType is 'int'
is_int_variable = variables['predicateType'] == 'int'
variables.loc[is_int_variable].head()

In [None]:
# Fields to request: the area name plus the four variables that are later
# relabelled as household income, population, median age and per capita income
acs_fields = ("NAME", "B19013_001E", "B01003_001E", "B01002_001E", "B19301_001E")

# Geography clause: every zip code tabulation area
zcta_geography = {'for': 'zip code tabulation area:*'}

census_data = c.acs5.get(acs_fields, zcta_geography)

# Peek at the first five records of the response
census_data[:5]

In [None]:
# User-friendly labels for the raw Census column names
column_labels = {
    "NAME": "Name",
    "zip code tabulation area": "Zipcode",
    "B01003_001E": "Population",
    "B01002_001E": "Median Age",
    "B19013_001E": "Household Income",
    "B19301_001E": "Per Capita Income",
}

# Columns kept in the final DataFrame, in display order ("Name" is left out)
final_columns = [
    "Zipcode",
    "Population",
    "Median Age",
    "Household Income",
    "Per Capita Income",
]

# Build the DataFrame from the API records, relabel it, then select and reorder
census_pd = pd.DataFrame(census_data).rename(columns=column_labels)
census_pd = census_pd[final_columns]

# Visualize
print(f"Total number of zip codes in response: {len(census_pd)}")
census_pd.head(50)

In [None]:
# Write the cleaned table to a UTF-8 CSV, leaving out the DataFrame index.
# NOTE(review): the Census session in this file is created with year=2017 while
# the file name says 2018 -- confirm the name matches the year actually queried.
output_csv = "census_data_2018.csv"
census_pd.to_csv(output_csv, index=False, encoding="utf-8")

## FORMATTING THE CSV FILES EXPORTED FROM THE CENSUS DATABASE

In [None]:
# Paths of the CSV files to load.
# The census file name follows the "census_data_<year>.csv" pattern used when the
# data is exported with to_csv in this file; the original path had no extension.
census_2015 = "census_data_2015.csv"
# NOTE(review): this clinical-trial path looks like a leftover from another
# notebook -- confirm it is still needed here.
clinical_trial_csv = os.path.join("data", "clinicaltrial_data.csv")

# Read with Pandas.
# The original read from `mouse_drug_csv`, a name that is never defined (NameError);
# the census path defined above is the file this section is about.
census_2015_pd = pd.read_csv(census_2015)
# Legacy name kept as an alias in case later cells still refer to it
mouse_pd = census_2015_pd
clinical_pd = pd.read_csv(clinical_trial_csv)
# census_2015_pd.head()
# clinical_pd.head()