In [77]:
import pandas as pd
import numpy as np
import requests
import plotly.express as px

# `Request Data from Census API`

In [41]:
# available datasets: https://api.census.gov/data.html

In [42]:
# get data from api
HOST = "https://api.census.gov/data"
year = "2018"
dataset = "acs/acs5"
base_url = "/".join([HOST, year, dataset])
predicates = {}
predicates["get"] = "NAME,B19083_001E,B19113_001E,B23025_003E,B23025_005E,B01001_001E,B25004_001E"
predicates["for"] = ('county', '*')
response = requests.get(base_url, params=predicates)
df = pd.read_json(response.text) 

# cleaning df
df. columns = df. iloc[0]
df = df. drop(0) 
df = df. reset_index(drop=True) 
df.columns = ['Location','gini_index','median_family_income','employed','unemployed','population','vacant_housing', 'state_id', 'county_id']
df

Unnamed: 0,Location,gini_index,median_family_income,employed,unemployed,population,vacant_housing,state_id,county_id
0,"Sedgwick County, Kansas",0.4554,69832,260607,13610,512064,21906,20,173
1,"Republic County, Kansas",0.4314,65655,2391,59,4686,634,20,157
2,"Graham County, Kansas",0.4176,57083,1191,42,2545,260,20,065
3,"Douglas County, Kansas",0.4714,82294,69358,3085,119319,3438,20,045
4,"Sheridan County, Kansas",0.4180,69637,1290,29,2506,123,20,179
...,...,...,...,...,...,...,...,...,...
3215,"Adams County, Idaho",0.4228,57050,1707,190,4019,1008,16,003
3216,"Jerome County, Idaho",0.4154,55933,11051,444,23431,622,16,053
3217,"Lewis County, Idaho",0.4586,53201,1609,94,3845,285,16,061
3218,"Owyhee County, Idaho",0.4761,49872,5198,234,11455,627,16,073


In [76]:
# convert str to number
df['gini_index'] = pd.to_numeric(df['gini_index'])
df['median_family_income'] = pd.to_numeric(df['median_family_income'])
df['employed'] = pd.to_numeric(df['employed'])
df['unemployed'] = pd.to_numeric(df['unemployed'])
df['population'] = pd.to_numeric(df['population'])
df['vacant_housing'] = pd.to_numeric(df['vacant_housing'])

# get percent unemployed
df['percent_unemployed'] = df.unemployed / df.employed * 100

# create new df of columns needed for analysis
dff = df[['gini_index','vacant_housing','percent_unemployed', 'median_family_income']]
dff.apply(pd.to_numeric)

Unnamed: 0,gini_index,vacant_housing,percent_unemployed,median_family_income
0,0.4554,21906,5.222423,69832.0
1,0.4314,634,2.467587,65655.0
2,0.4176,260,3.526448,57083.0
3,0.4714,3438,4.447937,82294.0
4,0.4180,123,2.248062,69637.0
...,...,...,...,...
3215,0.4228,1008,11.130639,57050.0
3216,0.4154,622,4.017736,55933.0
3217,0.4586,285,5.842138,53201.0
3218,0.4761,627,4.501731,49872.0


# `Distribution of Income`

In [71]:
fig = px.histogram (dff, x="median_family_income", nbins=100, marginal = 'box')
fig.show()

In [91]:
# split up income into quartiles
dff['income_quartile'] = pd.qcut(dff['median_family_income'],q=4,labels=False,precision=0,duplicates='raise')
# Zero-based, raise by 1
dff['income_quartile'] = dff['income_quartile'] + 1

dff.sort_values(by=['income_quartile'], inplace=True, ascending=True, na_position='first')
dff.dropna(subset = ['income_quartile'], inplace=True)
dff['income_quartile'] = dff['income_quartile'].astype(str)

dff

Unnamed: 0,gini_index,vacant_housing,percent_unemployed,median_family_income,income_quartile
1610,0.5034,662,18.431029,32229.0,1.0
1557,0.4804,7423,17.509443,29425.0,1.0
1556,0.5262,3943,25.864442,18166.0,1.0
1555,0.5814,13342,14.176822,21107.0,1.0
1554,0.4721,1447,26.959659,22155.0,1.0
...,...,...,...,...,...
2969,0.4895,790,4.014194,80491.0,4.0
2970,0.4237,3743,5.296145,77596.0,4.0
2971,0.4879,4522,1.445584,110592.0,4.0
1853,0.4279,1731,2.115739,116902.0,4.0


# `Income vs Unemployment by income quartile`

In [96]:
fig = px.scatter(dff, x='median_family_income',y='percent_unemployed', color='income_quartile')
fig.show()

# `Scatter plot and regression by income quartile`
# `Enter x and y axes`

In [95]:
x_axis = 'gini_index'

y_axis = 'percent_unemployed'


fig = px.scatter(dff, x=x_axis, y=y_axis, facet_col='income_quartile', facet_col_wrap = 4, color='income_quartile', trendline='ols')
fig.show()