In [1]:
# define dependencies
import pandas as pd
from sqlalchemy import create_engine

## Loading the Data Sets

In [2]:
# Read the county-level COVID case file into a DataFrame.
# dtype=str keeps every column as text, so nothing is converted on load.
covid_csv = "datasets/1_county_level_confirmed_cases.csv"
covid_df = pd.read_csv(filepath_or_buffer=covid_csv, dtype=str)
covid_df.head()

Unnamed: 0,last_update,location_type,state,county_name,county_name_long,fips_code,lat,lon,NCHS_urbanization,total_population,confirmed,confirmed_per_100000,deaths,deaths_per_100000
0,2020-09-09 01:28:59 UTC,county,Alabama,Autauga,"Autauga, Alabama, US",1001,32.53952745,-86.64408227,Medium metro,55200.0,1385,2509.06,23,41.67
1,2020-09-09 01:28:59 UTC,county,Alabama,Baldwin,"Baldwin, Alabama, US",1003,30.72774991,-87.72207058,Small metro,208107.0,4609,2214.73,42,20.18
2,2020-09-09 01:28:59 UTC,county,Alabama,Barbour,"Barbour, Alabama, US",1005,31.868263,-85.3871286,Non-core,25782.0,617,2393.14,7,27.15
3,2020-09-09 01:28:59 UTC,county,Alabama,Bibb,"Bibb, Alabama, US",1007,32.99642064,-87.1251146,Large fringe metro,22527.0,562,2494.78,6,26.63
4,2020-09-09 01:28:59 UTC,county,Alabama,Blount,"Blount, Alabama, US",1009,33.98210918,-86.56790593,Large fringe metro,57645.0,1071,1857.92,12,20.82


In [3]:
# Read the poverty-universe file into a DataFrame.
# dtype=str (same as the COVID load) guarantees the FIPS code columns stay
# text with their leading zeros ('01', '001'); later cells compare them with
# '000' and concatenate them as strings, so they must never be inferred as
# integers.
poverty_csv = "datasets/poverty_csv.csv"
poverty_df = pd.read_csv(poverty_csv, dtype=str)
poverty_df.head()

Unnamed: 0,Table with column headers in rows 3 and 4,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87
0,Table: Estimated Population in Poverty Univer...,,,,,,,,,,...,,,,,,,,,,
1,,,,,July 2018 ACS-Like Poverty Universe for 2018 E...,,,,July 2017 ACS-Like Poverty Universe for 2017 E...,,...,,,July 2000 CPS-Like Poverty Universe for IY 199...,,,,July 1999 CPS-Like Poverty Universe for IY 199...,,,
2,State FIPS code,County FIPS code,Name,State Postal Code,"Poverty Universe, All Ages","Poverty Universe, Age 5-17 related","Poverty Universe, Age 0-17","Poverty Universe, Age 0-4","Poverty Universe, All Ages","Poverty Universe, Age 5-17 related",...,"Poverty Universe, Age 0-17","Poverty Universe, Age 0-4","Poverty Universe, All Ages","Poverty Universe, Age 5-17 related","Poverty Universe, Age 0-17","Poverty Universe, Age 0-4","Poverty Universe, All Ages","Poverty Universe, Age 5-17 related","Poverty Universe, Age 0-17","Poverty Universe, Age 0-4"
3,00,000,United States,US,319184033,52529919,72163269,19301529,317741588,52669201,...,71741141,19181906,276207757,51642359,71684956,18968750,271059449,51060953,71338364,19382484
4,01,000,Alabama,AL,4763811,781913,1069994,284188,4752519,790771,...,1104080,296196,4368014,804291,1120718,293558,4348444,789510,1088427,295264


## Cleaning/Formatting the DataFrames

### Cleaning the COVID DF
* Columns removed: `last_update`, `location_type`, `county_name_long`, `NCHS_urbanization`.
* `last_update`: the file is only used once, so the column is dropped; the last update date is noted here for the record (2020-09-09 01:28:59 UTC in the rows displayed above).
* The data set was loaded in string format, so the columns `total_population`, `confirmed`, `confirmed_per_100000`, `deaths` and `deaths_per_100000` still need to be converted to numeric types.

In [4]:
# Keep only the columns needed downstream; last_update, location_type,
# county_name_long and NCHS_urbanization are left out.
covid_data = covid_df.loc[:, [
    'state',
    'county_name',
    'fips_code',
    'lat',
    'lon',
    'total_population',
    'confirmed',
    'confirmed_per_100000',
    'deaths',
    'deaths_per_100000',
]]
covid_data.head()

Unnamed: 0,state,county_name,fips_code,lat,lon,total_population,confirmed,confirmed_per_100000,deaths,deaths_per_100000
0,Alabama,Autauga,1001,32.53952745,-86.64408227,55200.0,1385,2509.06,23,41.67
1,Alabama,Baldwin,1003,30.72774991,-87.72207058,208107.0,4609,2214.73,42,20.18
2,Alabama,Barbour,1005,31.868263,-85.3871286,25782.0,617,2393.14,7,27.15
3,Alabama,Bibb,1007,32.99642064,-87.1251146,22527.0,562,2494.78,6,26.63
4,Alabama,Blount,1009,33.98210918,-86.56790593,57645.0,1071,1857.92,12,20.82


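### Cleaning the Poverty DF
* Removing the first three rows: the two title rows are dropped, and the third row (the real column headers) is promoted to the header in the next step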

In [5]:
# Drop the first two rows (the table title and the year-label row).
# The row that follows holds the real column headers and is kept for now.
poverty_modified = poverty_df.drop(index=poverty_df.index[:2])
poverty_modified.head()

Unnamed: 0,Table with column headers in rows 3 and 4,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82,Unnamed: 83,Unnamed: 84,Unnamed: 85,Unnamed: 86,Unnamed: 87
2,State FIPS code,County FIPS code,Name,State Postal Code,"Poverty Universe, All Ages","Poverty Universe, Age 5-17 related","Poverty Universe, Age 0-17","Poverty Universe, Age 0-4","Poverty Universe, All Ages","Poverty Universe, Age 5-17 related",...,"Poverty Universe, Age 0-17","Poverty Universe, Age 0-4","Poverty Universe, All Ages","Poverty Universe, Age 5-17 related","Poverty Universe, Age 0-17","Poverty Universe, Age 0-4","Poverty Universe, All Ages","Poverty Universe, Age 5-17 related","Poverty Universe, Age 0-17","Poverty Universe, Age 0-4"
3,00,000,United States,US,319184033,52529919,72163269,19301529,317741588,52669201,...,71741141,19181906,276207757,51642359,71684956,18968750,271059449,51060953,71338364,19382484
4,01,000,Alabama,AL,4763811,781913,1069994,284188,4752519,790771,...,1104080,296196,4368014,804291,1120718,293558,4348444,789510,1088427,295264
5,01,001,Autauga County,AL,55073,9677,12987,,55021,9911,...,12377,,43711,9245,12507,,43524,8856,12148,
6,01,003,Baldwin County,AL,215255,34508,46265,,209922,34058,...,34503,,139273,25048,34302,,136585,24609,33859,


In [6]:
# Promote the first remaining row to the column labels and keep only the
# data rows below it.
new_header = poverty_modified.iloc[0]
poverty_nh = poverty_modified.iloc[1:]
poverty_nh.columns = new_header
poverty_nh.head()

2,State FIPS code,County FIPS code,Name,State Postal Code,"Poverty Universe, All Ages","Poverty Universe, Age 5-17 related","Poverty Universe, Age 0-17","Poverty Universe, Age 0-4","Poverty Universe, All Ages.1","Poverty Universe, Age 5-17 related.1",...,"Poverty Universe, Age 0-17.1","Poverty Universe, Age 0-4.1","Poverty Universe, All Ages.2","Poverty Universe, Age 5-17 related.2","Poverty Universe, Age 0-17.2","Poverty Universe, Age 0-4.2","Poverty Universe, All Ages.3","Poverty Universe, Age 5-17 related.3","Poverty Universe, Age 0-17.3","Poverty Universe, Age 0-4.3"
3,0,0,United States,US,319184033,52529919,72163269,19301529.0,317741588,52669201,...,71741141,19181906.0,276207757,51642359,71684956,18968750.0,271059449,51060953,71338364,19382484.0
4,1,0,Alabama,AL,4763811,781913,1069994,284188.0,4752519,790771,...,1104080,296196.0,4368014,804291,1120718,293558.0,4348444,789510,1088427,295264.0
5,1,1,Autauga County,AL,55073,9677,12987,,55021,9911,...,12377,,43711,9245,12507,,43524,8856,12148,
6,1,3,Baldwin County,AL,215255,34508,46265,,209922,34058,...,34503,,139273,25048,34302,,136585,24609,33859,
7,1,5,Barbour County,AL,21979,3848,5106,,22224,3901,...,7148,,26480,5422,7341,,25482,5138,6966,


In [7]:
# Keep the first five columns, selected by position because the header names
# repeat across the table:
# 'State FIPS code', 'County FIPS code', 'Name', 'State Postal Code',
# 'Poverty Universe, All Ages'
poverty_select = poverty_nh.iloc[:, :5]
poverty_select.head()

2,State FIPS code,County FIPS code,Name,State Postal Code,"Poverty Universe, All Ages"
3,0,0,United States,US,319184033
4,1,0,Alabama,AL,4763811
5,1,1,Autauga County,AL,55073
6,1,3,Baldwin County,AL,215255
7,1,5,Barbour County,AL,21979


In [8]:
# Keep only the rows whose county FIPS code is not '000'; in the rows shown
# above, '000' marks the national and state-level totals.
poverty_drop = poverty_select.loc[poverty_select['County FIPS code'].ne('000')]
poverty_drop.head()

2,State FIPS code,County FIPS code,Name,State Postal Code,"Poverty Universe, All Ages"
5,1,1,Autauga County,AL,55073
6,1,3,Baldwin County,AL,215255
7,1,5,Barbour County,AL,21979
8,1,7,Bibb County,AL,20212
9,1,9,Blount County,AL,57238


In [9]:
# Combine the two FIPS columns (string concatenation) into the full
# state+county FIPS code.
# .assign() returns a new DataFrame instead of writing into a filtered slice,
# so pandas no longer raises SettingWithCopyWarning here.
# NOTE(review): confirm covid_df's fips_code uses the same zero-padded form
# before joining the two tables on this column.
poverty_drop = poverty_drop.assign(
    fips_code=poverty_drop['State FIPS code'] + poverty_drop['County FIPS code']
)
poverty_drop.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


2,State FIPS code,County FIPS code,Name,State Postal Code,"Poverty Universe, All Ages",fips_code
5,1,1,Autauga County,AL,55073,1001
6,1,3,Baldwin County,AL,215255,1003
7,1,5,Barbour County,AL,21979,1005
8,1,7,Bibb County,AL,20212,1007
9,1,9,Blount County,AL,57238,1009


In [10]:
# Select the columns for the final data frame (positions 2 through 5):
# county name, state postal code, poverty universe and the combined FIPS code.
poverty_data = poverty_drop.loc[
    :, ['Name', 'State Postal Code', 'Poverty Universe, All Ages', 'fips_code']
]

In [11]:
# Give the selected columns short snake_case names.
# NOTE(review): the source column is the "Poverty Universe" (the population
# base used for poverty estimates), so confirm 'poverty' is the intended label.
poverty_data = poverty_data.rename(
    columns={
        'Name': 'county_name',
        'State Postal Code': 'state_code',
        'Poverty Universe, All Ages': 'poverty',
    }
)
poverty_data.head()

2,county_name,state_code,poverty,fips_code
5,Autauga County,AL,55073,1001
6,Baldwin County,AL,215255,1003
7,Barbour County,AL,21979,1005
8,Bibb County,AL,20212,1007
9,Blount County,AL,57238,1009
