# Ex 7.3: Part III (Outside Class)

In [1]:
import pandas as pd
import json
import requests
import pprint

import plotly.express as px

# 1. Build the API Request URL

### 1. How to Start your Data Request  
- Requests always begin with: https://api.census.gov/data  

In [2]:
base_url = "https://api.census.gov/data"

### 2. Add the Dataset Name


In [3]:
dataset_name = "/2020/acs/acs5/profile"

### 3. Start your Variable Request


In [4]:
get_start = "?get="

### 4. Add your Variables  
- **DP02_0066PE**: Percent of Population 25 years and over with Graduate or professional degree


In [5]:
get_variables = "NAME,DP04_0057E,DP04_0057PE,DP04_0058E,DP04_0058PE"

###   5. Add your Geography  


In [6]:
geography = "&for=county:*&in=state:37"

###   6. Put it all Together 

In [7]:
request_url = base_url + dataset_name + get_start + get_variables + geography
print("request_url = ", request_url)

request_url =  https://api.census.gov/data/2020/acs/acs5/profile?get=NAME,DP04_0057E,DP04_0057PE,DP04_0058E,DP04_0058PE&for=county:*&in=state:37


# 2. Use *requests* library to make the API call

In [8]:
# Make API Call
r = requests.get(request_url)

api_results = r.json()

In [9]:
#print(api_results)

In [10]:
# pprint makes it possible to see the structure of the returned data -- but it can be very, very long!
#pprint.pprint(api_results)

In [11]:
type(api_results)

list

# 3. Get the data into a Dataframe  
- These Census Data results are in a list and have a specific form:  
  - The first element is a list of column names  
  - The remaining list elements are data  
  
  

In [12]:
df = pd.DataFrame(api_results)

print(df.shape)
df.head()

(101, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,NAME,DP04_0057E,DP04_0057PE,DP04_0058E,DP04_0058PE,state,county
1,"Anson County, North Carolina",9803,9803,689,7.0,37,007
2,"Beaufort County, North Carolina",20219,20219,1282,6.3,37,013
3,"Brunswick County, North Carolina",59416,59416,1821,3.1,37,019
4,"Cabarrus County, North Carolina",72843,72843,2952,4.1,37,025


# 4. Get the first Row into columns and then get rid of it  

In [13]:
df.columns = df.iloc[0]

df =df.iloc[1:]

print(df.shape)
df.head()

(100, 7)


Unnamed: 0,NAME,DP04_0057E,DP04_0057PE,DP04_0058E,DP04_0058PE,state,county
1,"Anson County, North Carolina",9803,9803,689,7.0,37,7
2,"Beaufort County, North Carolina",20219,20219,1282,6.3,37,13
3,"Brunswick County, North Carolina",59416,59416,1821,3.1,37,19
4,"Cabarrus County, North Carolina",72843,72843,2952,4.1,37,25
5,"Carteret County, North Carolina",30060,30060,1265,4.2,37,31


# Part 3.1: Cleaning Data

In [14]:
two_new_cols = ['County Name', 'State_Name']

df[two_new_cols] = df['NAME'].str.split(' County',1, expand=True)

print(df.shape)
df.head()

(100, 9)


Unnamed: 0,NAME,DP04_0057E,DP04_0057PE,DP04_0058E,DP04_0058PE,state,county,County Name,State_Name
1,"Anson County, North Carolina",9803,9803,689,7.0,37,7,Anson,", North Carolina"
2,"Beaufort County, North Carolina",20219,20219,1282,6.3,37,13,Beaufort,", North Carolina"
3,"Brunswick County, North Carolina",59416,59416,1821,3.1,37,19,Brunswick,", North Carolina"
4,"Cabarrus County, North Carolina",72843,72843,2952,4.1,37,25,Cabarrus,", North Carolina"
5,"Carteret County, North Carolina",30060,30060,1265,4.2,37,31,Carteret,", North Carolina"


In [15]:
df["DP04_0057E"] = pd.to_numeric(df['DP04_0057E'])
df["DP04_0057PE"] = pd.to_numeric(df['DP04_0057PE']).astype(float)
df["DP04_0058E"] = pd.to_numeric(df['DP04_0058E'])
df["DP04_0058PE"] = pd.to_numeric(df['DP04_0058PE'])

In [16]:
df.dtypes

0
NAME            object
DP04_0057E       int64
DP04_0057PE    float64
DP04_0058E       int64
DP04_0058PE    float64
state           object
county          object
County Name     object
State_Name      object
dtype: object

In [17]:
cols_to_rename = {
                   'DP04_0057E':'Vehicles Available (DP04_0057E)', 
                   'DP04_0057PE':'Vehicles Available - Percent (DP04_0057PE)', 
                   'DP04_0058E':'No Vehicles Available (DP04_0058E)', 
                   'DP04_0058PE':'No Vehicles Available - Percent (DP04_0058PE)', 
                   'state' :'FIPS_State',
                   'county' :'FIPS_County'
                 }
df.rename(columns = cols_to_rename, inplace=True)

print(df.shape)
df.head()

(100, 9)


Unnamed: 0,NAME,Vehicles Available (DP04_0057E),Vehicles Available - Percent (DP04_0057PE),No Vehicles Available (DP04_0058E),No Vehicles Available - Percent (DP04_0058PE),FIPS_State,FIPS_County,County Name,State_Name
1,"Anson County, North Carolina",9803,9803.0,689,7.0,37,7,Anson,", North Carolina"
2,"Beaufort County, North Carolina",20219,20219.0,1282,6.3,37,13,Beaufort,", North Carolina"
3,"Brunswick County, North Carolina",59416,59416.0,1821,3.1,37,19,Brunswick,", North Carolina"
4,"Cabarrus County, North Carolina",72843,72843.0,2952,4.1,37,25,Cabarrus,", North Carolina"
5,"Carteret County, North Carolina",30060,30060.0,1265,4.2,37,31,Carteret,", North Carolina"


In [18]:
cols_to_keep = ['County Name', 'Vehicles Available (DP04_0057E)', 'Vehicles Available - Percent (DP04_0057PE)', 'No Vehicles Available (DP04_0058E)', 'No Vehicles Available - Percent (DP04_0058PE)', 'FIPS_State', 'FIPS_County']
df = df[cols_to_keep]

print("Part 3.1: Cleaned, Renamed and Reordered results from API call:")
print(df.shape)
df.head()

Part 3.1: Cleaned, Renamed and Reordered results from API call:
(100, 7)


Unnamed: 0,County Name,Vehicles Available (DP04_0057E),Vehicles Available - Percent (DP04_0057PE),No Vehicles Available (DP04_0058E),No Vehicles Available - Percent (DP04_0058PE),FIPS_State,FIPS_County
1,Anson,9803,9803.0,689,7.0,37,7
2,Beaufort,20219,20219.0,1282,6.3,37,13
3,Brunswick,59416,59416.0,1821,3.1,37,19
4,Cabarrus,72843,72843.0,2952,4.1,37,25
5,Carteret,30060,30060.0,1265,4.2,37,31


# Part 3.2: Top 10 NC Counties for Housing Units with No Vehicle Available (DP04_0058E)

In [19]:
df.sort_values(by="No Vehicles Available (DP04_0058E)", ascending=False, inplace=True)
df_Q2 = df.iloc[ : 10]
df_Q2

Unnamed: 0,County Name,Vehicles Available (DP04_0057E),Vehicles Available - Percent (DP04_0057PE),No Vehicles Available (DP04_0058E),No Vehicles Available - Percent (DP04_0058PE),FIPS_State,FIPS_County
19,Mecklenburg,421950,421950.0,23730,5.6,37,119
95,Wake,410552,410552.0,15217,3.7,37,183
13,Guilford,208234,208234.0,13328,6.4,37,81
49,Forsyth,148890,148890.0,10304,6.9,37,67
10,Durham,130128,130128.0,9128,7.0,37,63
8,Cumberland,127532,127532.0,8802,6.9,37,51
71,New Hanover,97998,97998.0,6219,6.3,37,129
80,Pitt,70683,70683.0,6137,8.7,37,147
33,Buncombe,105177,105177.0,5920,5.6,37,21
50,Gaston,85286,85286.0,5003,5.9,37,71


In [20]:
df_Q2.sort_values(by="No Vehicles Available (DP04_0058E)", ascending=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_Q2.sort_values(by="No Vehicles Available (DP04_0058E)", ascending=True, inplace=True)


In [21]:
fig = px.bar(df_Q2,              
             x='No Vehicles Available (DP04_0058E)', 
             y='County Name',
             text='No Vehicles Available (DP04_0058E)',
             orientation='h',   
             template='plotly_dark',
             title='Q3.2: Top 10 NC Counties for Housing Units with No Vehicle Available (DP04_0058E)')

fig.update_traces(textposition='auto', 
                  texttemplate='%{text:,.0f}'   # This adds commas and formats to zero decimal places
                 )

fig.show()

# Part 3.3: Top 10 NC Counties for % Housing Units with No Vehicle Available (DP04_0058PE)

In [22]:
df.sort_values(by="No Vehicles Available - Percent (DP04_0058PE)", ascending=False, inplace=True)
df_Q3 = df.iloc[ : 10]
df_Q3

Unnamed: 0,County Name,Vehicles Available (DP04_0057E),Vehicles Available - Percent (DP04_0057PE),No Vehicles Available (DP04_0058E),No Vehicles Available - Percent (DP04_0058PE),FIPS_State,FIPS_County
96,Washington,5237,5237.0,780,14.9,37,187
89,Scotland,12951,12951.0,1473,11.4,37,165
31,Bertie,8008,8008.0,905,11.3,37,15
54,Halifax,21061,21061.0,2207,10.5,37,83
41,Clay,5300,5300.0,554,10.5,37,43
17,Lenoir,23494,23494.0,2320,9.9,37,107
48,Edgecombe,20800,20800.0,2043,9.8,37,65
83,Richmond,18201,18201.0,1779,9.8,37,153
94,Vance,16895,16895.0,1636,9.7,37,181
99,Wilson,31968,31968.0,2962,9.3,37,195


In [23]:
df_Q3.sort_values(by="No Vehicles Available - Percent (DP04_0058PE)", ascending=True, inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [24]:
fig = px.bar(df_Q3,              
             x='No Vehicles Available - Percent (DP04_0058PE)', 
             y='County Name',
             text='No Vehicles Available - Percent (DP04_0058PE)',
             orientation='h',   
             template='plotly_white',
             title='Q3.3: Top 10 NC Counties for % Housing Units with No Vehicle Available (DP04_0058PE)')

fig.update_traces(textposition='auto', 
                  texttemplate='%{text:,.1f}'   # This adds commas and formats to zero decimal places
                 )

fig.show()

# Save in a csv

In [25]:
csv_file_to_create = "7_3_P3.csv"

filename_with_path = "Data/" + csv_file_to_create
df.to_csv(filename_with_path, index=False)