# Lab 10

In [2]:
import pandas as pd
import json
import requests
import pprint

import plotly.express as px

# 1. Build the API Request URL

### 1. Start your API Data Request  
- Requests always begin with: https://api.census.gov/data  

In [3]:
base_url = "https://api.census.gov/data"

### 2. Add the Dataset Vintage Year and Name  

In [4]:
dataset_name = "/2021/pep/population"

### 3. Start your Variable Request


In [5]:
get_start = "?get="

### 4. Add your Variables  
- Available Variables for Dataset: https://api.census.gov/data/2019/pep/population/variables.html


In [6]:
get_variables = "NAME,POP_2020,POP_2021,NPOPCHG_2021,PPOPCHG_2021"

###   5. Add your Geography  
- Available Geographies for Dataset:  https://api.census.gov/data/2021/pep/population/geography.html  



In [7]:
geography = "&for=state:*"

###   6. Put it all Together and Test  
- Click the Link below to see if you formed the API correctly and if the data coming back is what you want.

In [8]:
request_url = base_url + dataset_name + get_start + get_variables + geography
print("request_url = ", request_url)

request_url =  https://api.census.gov/data/2021/pep/population?get=NAME,POP_2020,POP_2021,NPOPCHG_2021,PPOPCHG_2021&for=state:*


# 2. Use *requests* library to make the API call

In [9]:
# Make API Call
r = requests.get(request_url)

api_results = r.json()

In [10]:
#print(api_results)

In [11]:
# pprint makes it possible to see the structure of the returned data -- but it can be very, very long!
# pprint.pprint(api_results)

# 3. Get the data into a Dataframe  
- These Census Data results are in a list and have a specific form:  
  - The first element is a list of column names  
  - The remaining list elements are data  
  
  

In [12]:
df = pd.DataFrame(api_results)

print(df.shape)
df.head()

(53, 6)


Unnamed: 0,0,1,2,3,4,5
0,NAME,POP_2020,POP_2021,NPOPCHG_2021,PPOPCHG_2021,state
1,Oklahoma,3962031,3986639,24608,0.6210955947,40
2,Nebraska,1961455,1963692,2237,0.1140479899,31
3,Hawaii,1451911,1441553,-10358,-0.7134046100,15
4,South Dakota,887099,895376,8277,0.9330412953,46


# 4. Get the first Row into columns and then get rid of it

### a. Grab the Column Names out of the First Row of the Dataframe
- Use iloc to point to the first row of the dataframe 

In [13]:
column_names = df.iloc[0]

print(column_names)

0            NAME
1        POP_2020
2        POP_2021
3    NPOPCHG_2021
4    PPOPCHG_2021
5           state
Name: 0, dtype: object


### b. Set the columns property of the Dataframe equal to the column names we grabbed  

In [14]:
df.columns = column_names

print(df.shape)
df.head()

(53, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,NPOPCHG_2021,PPOPCHG_2021,state
0,NAME,POP_2020,POP_2021,NPOPCHG_2021,PPOPCHG_2021,state
1,Oklahoma,3962031,3986639,24608,0.6210955947,40
2,Nebraska,1961455,1963692,2237,0.1140479899,31
3,Hawaii,1451911,1441553,-10358,-0.7134046100,15
4,South Dakota,887099,895376,8277,0.9330412953,46


### c. Now Get Rid of the First Row of the Dataframe

In [15]:
df = df.iloc[1:]

print("Lab 10")
print(df.shape)
df.head()

Lab 10
(52, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,NPOPCHG_2021,PPOPCHG_2021,state
1,Oklahoma,3962031,3986639,24608,0.6210955947,40
2,Nebraska,1961455,1963692,2237,0.1140479899,31
3,Hawaii,1451911,1441553,-10358,-0.71340461,15
4,South Dakota,887099,895376,8277,0.9330412953,46
5,Tennessee,6920119,6975218,55099,0.7962146316,47


## Cleaning data

In [16]:
df['NPOPCHG_2021'] = pd.to_numeric(df['NPOPCHG_2021'])
df['PPOPCHG_2021'] = pd.to_numeric(df['PPOPCHG_2021']).astype(float)

In [17]:
df.dtypes

0
NAME             object
POP_2020         object
POP_2021         object
NPOPCHG_2021      int64
PPOPCHG_2021    float64
state            object
dtype: object

In [18]:
cols_to_rename = {
                   'NPOPCHG_2021':'Pop_Change'
                 }
df.rename(columns = cols_to_rename, inplace=True)

print(df.shape)
df.head()

(52, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,Pop_Change,PPOPCHG_2021,state
1,Oklahoma,3962031,3986639,24608,0.621096,40
2,Nebraska,1961455,1963692,2237,0.114048,31
3,Hawaii,1451911,1441553,-10358,-0.713405,15
4,South Dakota,887099,895376,8277,0.933041,46
5,Tennessee,6920119,6975218,55099,0.796215,47


## Plot 1

In [19]:
df.sort_values(by="Pop_Change", ascending=False, inplace=True)

print(df.shape)
df.head()

(52, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,Pop_Change,PPOPCHG_2021,state
11,Texas,29217653,29527941,310288,1.061988,48
41,Florida,21569932,21781128,211196,0.979122,12
46,Arizona,7177986,7276316,98330,1.369883,4
16,North Carolina,10457177,10551162,93985,0.898761,37
26,Georgia,10725800,10799566,73766,0.687744,13


In [20]:
df_top = df.iloc[ : 10]

print(df_top.shape)
df_top

(10, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,Pop_Change,PPOPCHG_2021,state
11,Texas,29217653,29527941,310288,1.061988,48
41,Florida,21569932,21781128,211196,0.979122,12
46,Arizona,7177986,7276316,98330,1.369883,4
16,North Carolina,10457177,10551162,93985,0.898761,37
26,Georgia,10725800,10799566,73766,0.687744,13
18,South Carolina,5130729,5190705,59976,1.168957,45
47,Utah,3281684,3337975,56291,1.715308,49
5,Tennessee,6920119,6975218,55099,0.796215,47
35,Idaho,1847772,1900923,53151,2.876491,16
6,Nevada,3114071,3143991,29920,0.9608,32


In [21]:
df_top.sort_values(by="Pop_Change", ascending=True, inplace=True)

fig = px.bar(df_top,              
             x='Pop_Change', 
             y='NAME',
             text='Pop_Change',
             orientation='h',   
             template='plotly_dark',
             title='Top 10 State Population Changes: 2020 - 2021')

fig.update_traces(textposition='auto', 
                  texttemplate='%{text:.3s}'      # This displays two significant digits with $ 
                 )

fig.show()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_top.sort_values(by="Pop_Change", ascending=True, inplace=True)


## Plot 2

In [22]:
df.sort_values(by="Pop_Change", ascending=True, inplace=True)

print(df.shape)
df.head()

(52, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,Pop_Change,PPOPCHG_2021,state
38,New York,20154933,19835913,-319020,-1.582838,36
20,California,39499738,39237836,-261902,-0.663047,6
44,Illinois,12785245,12671469,-113776,-0.889901,17
43,Massachusetts,7022220,6984723,-37497,-0.533976,25
22,Louisiana,4651203,4624047,-27156,-0.583849,22


In [23]:
df_bottom = df.iloc[ : 10]

print(df_bottom.shape)
df_bottom

(10, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,Pop_Change,PPOPCHG_2021,state
38,New York,20154933,19835913,-319020,-1.582838,36
20,California,39499738,39237836,-261902,-0.663047,6
44,Illinois,12785245,12671469,-113776,-0.889901,17
43,Massachusetts,7022220,6984723,-37497,-0.533976,25
22,Louisiana,4651203,4624047,-27156,-0.583849,22
25,Pennsylvania,12989625,12964056,-25569,-0.196842,42
10,District of Columbia,690093,670050,-20043,-2.904391,11
39,Puerto Rico,3281538,3263584,-17954,-0.547122,72
14,Michigan,10067664,10050811,-16853,-0.167397,26
30,New Jersey,9279743,9267130,-12613,-0.13592,34


In [24]:
df_bottom.sort_values(by="Pop_Change", ascending=False, inplace=True)

fig = px.bar(df_bottom,              
             x='Pop_Change', 
             y='NAME',
             text='Pop_Change',
             orientation='h',   
             template='plotly_dark',
             title='Bottom 10 State Population Changes: 2020 - 2021')

fig.update_traces(textposition='auto', 
                  texttemplate='%{text:.3s}'      # This displays two significant digits with $ 
                 )

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Plot 3

In [25]:
df.sort_values(by="PPOPCHG_2021", ascending=False, inplace=True)

print(df.shape)
df.head()

(52, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,Pop_Change,PPOPCHG_2021,state
35,Idaho,1847772,1900923,53151,2.876491,16
47,Utah,3281684,3337975,56291,1.715308,49
37,Montana,1086193,1104271,18078,1.664345,30
46,Arizona,7177986,7276316,98330,1.369883,4
18,South Carolina,5130729,5190705,59976,1.168957,45


In [26]:
df['PPOPCHG_2021'] = df['PPOPCHG_2021'] / 100

In [27]:
df_percent = df.iloc[ : 10]

print(df_percent.shape)
df_percent

(10, 6)


Unnamed: 0,NAME,POP_2020,POP_2021,Pop_Change,PPOPCHG_2021,state
35,Idaho,1847772,1900923,53151,0.028765,16
47,Utah,3281684,3337975,56291,0.017153,49
37,Montana,1086193,1104271,18078,0.016643,30
46,Arizona,7177986,7276316,98330,0.013699,4
18,South Carolina,5130729,5190705,59976,0.01169,45
24,Delaware,991886,1003384,11498,0.011592,10
11,Texas,29217653,29527941,310288,0.01062,48
41,Florida,21569932,21781128,211196,0.009791,12
6,Nevada,3114071,3143991,29920,0.009608,32
4,South Dakota,887099,895376,8277,0.00933,46


In [28]:
df_percent.sort_values(by="PPOPCHG_2021", ascending=True, inplace=True)

fig = px.bar(df_percent,              
             x='PPOPCHG_2021', 
             y='NAME',
             text='PPOPCHG_2021',
             orientation='h',   
             template='plotly_dark',
             title='Top 10 Percent Population Changes: 2020 - 2021')

fig.update_traces(textposition='auto', 
                  texttemplate='%{text:.1%}'      # This displays two significant digits with $ 
                 )

fig.show()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Optional:  Save the Joined Dataframe as a CSV file

In [29]:
csv_file_to_create = "Lab 10.csv"

filename_with_path = "Data/" + csv_file_to_create
df.to_csv(filename_with_path, index=False)