# Cleaning the data and exporting it to national, state, and county CSV files

In [1]:
import pandas as pd

In [2]:
# Read LocationID as text instead of letting pandas infer a numeric type, so
# that any leading zeroes present in the file are kept.
# NOTE(review): the values look like FIPS-style location codes rather than zip
# codes (e.g. 18 for Indiana, 0 for United States) -- confirm against the
# dataset documentation. The preview also shows 2122 for an Alaska county, so
# the source file itself may already lack the leading zero -- verify.
zip_code_column = 'LocationID'
raw_csv_path = "../Resources/Heart_Disease_Mortality2019-2021.csv"

heart_disease_df = pd.read_csv(
    raw_csv_path,
    dtype={zip_code_column: str},
)

# Preview the first rows of the raw data.
heart_disease_df.head()

Unnamed: 0,Year,LocationAbbr,LocationDesc,GeographicLevel,DataSource,Class,Topic,Data_Value,Data_Value_Unit,Data_Value_Type,...,Data_Value_Footnote,StratificationCategory1,Stratification1,StratificationCategory2,Stratification2,TopicID,LocationID,Y_lat,X_lon,Georeference
0,2020,AK,Kenai Peninsula,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,165.1,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Male,Race/Ethnicity,Hispanic,T2,2122,60.193263,-150.280744,POINT (-150.2807443 60.193262972)
1,2020,AL,Walker County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,109.0,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Overall,Race/Ethnicity,Hispanic,T2,1127,33.810226,-87.29707,POINT (-87.29707047 33.810226394)
2,2020,AL,St. Clair County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,90.0,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,,Gender,Overall,Race/Ethnicity,Asian,T2,1115,33.716065,-86.31496,POINT (-86.31496031 33.716065391)
3,2020,AR,Yell County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Insufficient Data,Gender,Female,Race/Ethnicity,Asian,T2,5149,35.005864,-93.401676,POINT (-93.40167591 35.00586398)
4,2020,AS,American Samoa County,County,NVSS,Cardiovascular Diseases,Heart Disease Mortality,,"per 100,000 population","Age-adjusted, Spatially Smoothed, 3-year Avera...",...,Insufficient Data,Gender,Male,Race/Ethnicity,Black,T2,60000,-14.301754,-170.719474,POINT (-170.7194738 -14.30175426)


In [3]:
# Show the column labels of the loaded frame to see which fields are available.
heart_disease_df.columns

Index(['Year', 'LocationAbbr', 'LocationDesc', 'GeographicLevel', 'DataSource',
       'Class', 'Topic', 'Data_Value', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'StratificationCategory1', 'Stratification1', 'StratificationCategory2',
       'Stratification2', 'TopicID', 'LocationID', 'Y_lat', 'X_lon',
       'Georeference'],
      dtype='object')

In [4]:
# Clean the dataset: keep only the nine columns listed below and rename them.
# This mapping is the single source of truth: keys are the raw column names,
# values are the cleaned labels, and the insertion order is the column order
# of the cleaned frame.
column_labels = {
    "LocationAbbr": "State",
    "LocationDesc": "LocationDesc",
    "GeographicLevel": "GeographicLevel",
    "Data_Value": "Data Value (Per 100,000 Population)",
    "Stratification1": "Gender",
    "Stratification2": "Race/Ethnicity",
    "LocationID": "LocationID",
    "Y_lat": "Latitude",
    "X_lon": "Longitude",
}

# Rename first, then select by the *new* labels. rename() ignores mapping keys
# that are not present, so re-running this cell on the already-cleaned frame
# leaves it unchanged instead of raising KeyError on the raw column names.
heart_disease_df = heart_disease_df.rename(columns=column_labels)[
    list(column_labels.values())
]
heart_disease_df.head()

Unnamed: 0,State,LocationDesc,GeographicLevel,"Data Value (Per 100,000 Population)",Gender,Race/Ethnicity,LocationID,Latitude,Longitude
0,AK,Kenai Peninsula,County,165.1,Male,Hispanic,2122,60.193263,-150.280744
1,AL,Walker County,County,109.0,Overall,Hispanic,1127,33.810226,-87.29707
2,AL,St. Clair County,County,90.0,Overall,Asian,1115,33.716065,-86.31496
3,AR,Yell County,County,,Female,Asian,5149,35.005864,-93.401676
4,AS,American Samoa County,County,,Male,Black,60000,-14.301754,-170.719474


In [5]:
# State-level subset: keep the rows whose GeographicLevel is "State".
is_state_level = heart_disease_df["GeographicLevel"].eq("State")
state_hd_df = heart_disease_df[is_state_level]
state_hd_df.head()

Unnamed: 0,State,LocationDesc,GeographicLevel,"Data Value (Per 100,000 Population)",Gender,Race/Ethnicity,LocationID,Latitude,Longitude
12,IN,Indiana,State,118.5,Female,Asian,18,39.9128,-86.2757
19,NE,Nebraska,State,120.1,Male,Asian,31,41.5228,-99.8085
22,NY,New York,State,83.4,Overall,More than one race,36,42.9465,-75.512
54,GU,Guam,State,,Female,Black,66,13.4431,144.777
86,AS,American Samoa,State,,Male,Overall,60,-14.2766,-170.367


In [6]:
# County-level subset: keep the rows whose GeographicLevel is "County".
is_county_level = heart_disease_df["GeographicLevel"].eq("County")
county_hd_df = heart_disease_df[is_county_level]
county_hd_df.head()

Unnamed: 0,State,LocationDesc,GeographicLevel,"Data Value (Per 100,000 Population)",Gender,Race/Ethnicity,LocationID,Latitude,Longitude
0,AK,Kenai Peninsula,County,165.1,Male,Hispanic,2122,60.193263,-150.280744
1,AL,Walker County,County,109.0,Overall,Hispanic,1127,33.810226,-87.29707
2,AL,St. Clair County,County,90.0,Overall,Asian,1115,33.716065,-86.31496
3,AR,Yell County,County,,Female,Asian,5149,35.005864,-93.401676
4,AS,American Samoa County,County,,Male,Black,60000,-14.301754,-170.719474


In [9]:
# National subset: keep the rows whose GeographicLevel is "Nation".
is_nation_level = heart_disease_df["GeographicLevel"].eq("Nation")
national_hd_df = heart_disease_df[is_nation_level]
national_hd_df.head()

Unnamed: 0,State,LocationDesc,GeographicLevel,"Data Value (Per 100,000 Population)",Gender,Race/Ethnicity,LocationID,Latitude,Longitude
67197,US,United States,Nation,165.1,Overall,Asian,0,,
67973,US,United States,Nation,253.6,Female,Overall,0,,
68318,US,United States,Nation,292.3,Male,Hispanic,0,,
69192,US,United States,Nation,218.5,Female,American Indian or Alaska Native,0,,
69416,US,United States,Nation,417.5,Male,Native Hawaiian or Other Pacific Islander,0,,


In [10]:
# Export the cleaned frame and each geographic subset to its own CSV file.
# NOTE(review): to_csv() is called with its defaults, so each frame's index is
# written as an unnamed first column -- confirm downstream readers expect that
# before switching to index=False.
csv_exports = {
    "../Resources/clean_Heart_Disease.csv": heart_disease_df,
    "../Resources/state_Heart_Disease.csv": state_hd_df,
    "../Resources/county_Heart_Disease.csv": county_hd_df,
    "../Resources/national_Heart_Disease.csv": national_hd_df,
}

for csv_path, frame in csv_exports.items():
    frame.to_csv(csv_path)