In [13]:
"""
map_data_clean.ipynb

File used to clean up the map/census data from 
https://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-Place-Data-202/eav7-hnsx/data

This dataset will be used to create map visualizations
"""

'\nmap_data_clean.ipynb\n\nFile used to clean up the map/census data from \nhttps://chronicdata.cdc.gov/500-Cities-Places/PLACES-Local-Data-for-Better-Health-Place-Data-202/eav7-hnsx/data\n\nThis dataset will be used to create map visualizations\n'

In [14]:
# Importing statements

import pandas as pd
import math

In [15]:
# Importing raw dataset

df = pd.read_csv('PLACES.csv')

In [16]:
# Preview dataset

print(df.columns)
df.head()

Index(['Year', 'StateAbbr', 'StateDesc', 'LocationName', 'DataSource',
       'Category', 'Measure', 'Data_Value_Unit', 'Data_Value_Type',
       'Data_Value', 'Data_Value_Footnote_Symbol', 'Data_Value_Footnote',
       'Low_Confidence_Limit', 'High_Confidence_Limit', 'TotalPopulation',
       'Geolocation', 'LocationID', 'CategoryID', 'MeasureId',
       'DataValueTypeID', 'Short_Question_Text'],
      dtype='object')


Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,...,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,Geolocation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text
0,2020,AL,Alabama,Albertville,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Age-adjusted prevalence,3.6,...,,3.4,3.8,21282,POINT (-86.21214093 34.26441274),100988,HLTHOUT,STROKE,AgeAdjPrv,Stroke
1,2020,AL,Alabama,Alexander City,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,4.6,...,,4.3,4.9,14972,POINT (-85.93972723 32.93023222),101132,HLTHOUT,STROKE,CrdPrv,Stroke
2,2020,AL,Alabama,Alexandria,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,Crude prevalence,26.2,...,,24.9,27.6,3917,POINT (-85.87920682 33.76747698),101180,HLTHOUT,DEPRESSION,CrdPrv,Depression
3,2020,AL,Alabama,Aliceville,BRFSS,Health Outcomes,Cancer (excluding skin cancer) among adults ag...,%,Crude prevalence,7.0,...,,6.7,7.3,2486,POINT (-88.15202475 33.12955557),101228,HLTHOUT,CANCER,CrdPrv,Cancer (except skin)
4,2019,AL,Alabama,Andalusia,BRFSS,Prevention,Cholesterol screening among adults aged >=18 y...,%,Age-adjusted prevalence,86.0,...,,85.4,86.5,9015,POINT (-86.47806878 31.31012814),101708,PREVENT,CHOLSCREEN,AgeAdjPrv,Cholesterol Screening


In [17]:
# Column values exploration

print('Unique Data_Value_Unit: ', df.Data_Value_Unit.unique(), '\n')
print('Unique StateDesc: ', df.StateDesc.unique(), '\n')
print('Unique Year: ', df.Year.unique(), '\n')
print('Unique Short_Question_Text: ', df.Short_Question_Text.unique(), '\n')
print('Unique Measure: ', df.Measure.unique(), '\n')

Unique Data_Value_Unit:  ['%'] 

Unique StateDesc:  ['Alabama' 'Alaska' 'Arizona' 'Arkansas' 'California' 'Louisiana' 'Iowa'
 'Colorado' 'Connecticut' 'Delaware' 'District of Columbia' 'Florida'
 'Georgia' 'Hawaii' 'Idaho' 'Illinois' 'Indiana' 'Kansas' 'Kentucky'
 'Maine' 'Maryland' 'Massachusetts' 'Michigan' 'Minnesota' 'Mississippi'
 'Missouri' 'Montana' 'Nebraska' 'Nevada' 'New Hampshire' 'New Jersey'
 'New Mexico' 'New York' 'South Dakota' 'North Carolina' 'North Dakota'
 'Ohio' 'Oklahoma' 'Oregon' 'Pennsylvania' 'Rhode Island' 'South Carolina'
 'Tennessee' 'Texas' 'Utah' 'Vermont' 'Virginia' 'Washington'
 'West Virginia' 'Wisconsin' 'Wyoming'] 

Unique Year:  [2020 2019] 

Unique Short_Question_Text:  ['Stroke' 'Depression' 'Cancer (except skin)' 'Cholesterol Screening'
 'Diabetes' 'Coronary Heart Disease' 'COPD' 'Arthritis' 'Current Asthma'
 'Annual Checkup' 'Cervical Cancer Screening' 'Obesity'
 'Chronic Kidney Disease' 'High Blood Pressure' 'Binge Drinking'
 'All Teeth Lost' 'H

In [18]:
# How many from 2020 vs 2019

print(len(df[df['Year'] == 2020]))
print(len(df[df['Year'] == 2019]))

912654
135921


In [19]:
# Splitting up dataset by years

df_2020 = df[df['Year'] == 2020]
df_2019 = df[df['Year'] == 2019]

In [20]:
state_codes = {
    'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
    'FL': '12', 'WY': '56', 'PR': '72', 'NJ': '34', 'NM': '35', 'TX': '48',
    'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
    'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
    'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
    'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
    'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
    'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
    'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46'
}

df_2020['FIP'] = df_2020.apply(lambda row: state_codes[row.StateAbbr], axis=1)
df_2019['FIP'] = df_2019.apply(lambda row: state_codes[row.StateAbbr], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2020['FIP'] = df_2020.apply(lambda row: state_codes[row.StateAbbr], axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_2019['FIP'] = df_2019.apply(lambda row: state_codes[row.StateAbbr], axis=1)


In [21]:
df_2020.head()

Unnamed: 0,Year,StateAbbr,StateDesc,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,...,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,Geolocation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text,FIP
0,2020,AL,Alabama,Albertville,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Age-adjusted prevalence,3.6,...,3.4,3.8,21282,POINT (-86.21214093 34.26441274),100988,HLTHOUT,STROKE,AgeAdjPrv,Stroke,1
1,2020,AL,Alabama,Alexander City,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,4.6,...,4.3,4.9,14972,POINT (-85.93972723 32.93023222),101132,HLTHOUT,STROKE,CrdPrv,Stroke,1
2,2020,AL,Alabama,Alexandria,BRFSS,Health Outcomes,Depression among adults aged >=18 years,%,Crude prevalence,26.2,...,24.9,27.6,3917,POINT (-85.87920682 33.76747698),101180,HLTHOUT,DEPRESSION,CrdPrv,Depression,1
3,2020,AL,Alabama,Aliceville,BRFSS,Health Outcomes,Cancer (excluding skin cancer) among adults ag...,%,Crude prevalence,7.0,...,6.7,7.3,2486,POINT (-88.15202475 33.12955557),101228,HLTHOUT,CANCER,CrdPrv,Cancer (except skin),1
5,2020,AL,Alabama,Anniston,BRFSS,Health Outcomes,Diagnosed diabetes among adults aged >=18 years,%,Age-adjusted prevalence,16.0,...,15.7,16.3,23093,POINT (-85.79933425 33.68300103),101852,HLTHOUT,DIABETES,AgeAdjPrv,Diabetes,1


In [22]:
# Removing unnecessary columns from datasets

desired_cols = ['Year', 'StateDesc', 'LocationName', 'Measure', 'Category', 'Data_Value',
                'TotalPopulation', 'Geolocation', 'Short_Question_Text', 'FIP', 'StateAbbr']
df_2020 = df_2020[desired_cols]
df_2019 = df_2019[desired_cols]

df_2020.head()

Unnamed: 0,Year,StateDesc,LocationName,Measure,Category,Data_Value,TotalPopulation,Geolocation,Short_Question_Text,FIP,StateAbbr
0,2020,Alabama,Albertville,Stroke among adults aged >=18 years,Health Outcomes,3.6,21282,POINT (-86.21214093 34.26441274),Stroke,1,AL
1,2020,Alabama,Alexander City,Stroke among adults aged >=18 years,Health Outcomes,4.6,14972,POINT (-85.93972723 32.93023222),Stroke,1,AL
2,2020,Alabama,Alexandria,Depression among adults aged >=18 years,Health Outcomes,26.2,3917,POINT (-85.87920682 33.76747698),Depression,1,AL
3,2020,Alabama,Aliceville,Cancer (excluding skin cancer) among adults ag...,Health Outcomes,7.0,2486,POINT (-88.15202475 33.12955557),Cancer (except skin),1,AL
5,2020,Alabama,Anniston,Diagnosed diabetes among adults aged >=18 years,Health Outcomes,16.0,23093,POINT (-85.79933425 33.68300103),Diabetes,1,AL


In [23]:
# Removing NaN from Data_Value and TotalPopulatioin columns

df_2020=df_2020.dropna(subset=['Data_Value','TotalPopulation'])

In [24]:
# Creating a TotalNumber column representing the actual amount of people instead of a percentage

df_2020['TotalNumber'] = df_2020.apply(lambda row: math.ceil((row.Data_Value / 100) * int(row.TotalPopulation)), axis=1)

In [25]:
df_2020.head()

Unnamed: 0,Year,StateDesc,LocationName,Measure,Category,Data_Value,TotalPopulation,Geolocation,Short_Question_Text,FIP,StateAbbr,TotalNumber
0,2020,Alabama,Albertville,Stroke among adults aged >=18 years,Health Outcomes,3.6,21282,POINT (-86.21214093 34.26441274),Stroke,1,AL,767
1,2020,Alabama,Alexander City,Stroke among adults aged >=18 years,Health Outcomes,4.6,14972,POINT (-85.93972723 32.93023222),Stroke,1,AL,689
2,2020,Alabama,Alexandria,Depression among adults aged >=18 years,Health Outcomes,26.2,3917,POINT (-85.87920682 33.76747698),Depression,1,AL,1027
3,2020,Alabama,Aliceville,Cancer (excluding skin cancer) among adults ag...,Health Outcomes,7.0,2486,POINT (-88.15202475 33.12955557),Cancer (except skin),1,AL,175
5,2020,Alabama,Anniston,Diagnosed diabetes among adults aged >=18 years,Health Outcomes,16.0,23093,POINT (-85.79933425 33.68300103),Diabetes,1,AL,3695


In [26]:
# Creating a grouped dataframe that sums the TotalNumber column in terms of states and measures

sum_2020 = df_2020.groupby(['StateDesc', 'Short_Question_Text', 'Measure', 'StateAbbr'])['TotalNumber'].sum().reset_index()

In [27]:
sum_2020.head()

Unnamed: 0,StateDesc,Short_Question_Text,Measure,StateAbbr,TotalNumber
0,Alabama,All Teeth Lost,All teeth lost among adults aged >=65 years,AL,1073619
1,Alabama,Annual Checkup,Visits to doctor for routine checkup within th...,AL,4736982
2,Alabama,Arthritis,Arthritis among adults aged >=18 years,AL,1819705
3,Alabama,Binge Drinking,Binge drinking among adults aged >=18 years,AL,907088
4,Alabama,COPD,Chronic obstructive pulmonary disease among ad...,AL,503812


In [28]:
# pulls the longitude from the Geolocation string
def pull_lon(x):
    str1 = x.split('(', 1)[1]
    str1 = str1.replace(')', '')
    return float(str1.split(' ', 1)[0])

# pulls the latitude from the Geolocation string
def pull_lat(x):
    str1 = x.split('(', 1)[1]
    str1 = str1.replace(')', '')
    return float(str1.split(' ', 1)[1])

In [29]:
# creating a latitude and longitude column

df_2020['Longitude'] = df_2020.apply(lambda row: pull_lon(row.Geolocation), axis=1)
df_2020['Latitude'] = df_2020.apply(lambda row: pull_lat(row.Geolocation), axis=1)

In [30]:
df_2020.head()

Unnamed: 0,Year,StateDesc,LocationName,Measure,Category,Data_Value,TotalPopulation,Geolocation,Short_Question_Text,FIP,StateAbbr,TotalNumber,Longitude,Latitude
0,2020,Alabama,Albertville,Stroke among adults aged >=18 years,Health Outcomes,3.6,21282,POINT (-86.21214093 34.26441274),Stroke,1,AL,767,-86.212141,34.264413
1,2020,Alabama,Alexander City,Stroke among adults aged >=18 years,Health Outcomes,4.6,14972,POINT (-85.93972723 32.93023222),Stroke,1,AL,689,-85.939727,32.930232
2,2020,Alabama,Alexandria,Depression among adults aged >=18 years,Health Outcomes,26.2,3917,POINT (-85.87920682 33.76747698),Depression,1,AL,1027,-85.879207,33.767477
3,2020,Alabama,Aliceville,Cancer (excluding skin cancer) among adults ag...,Health Outcomes,7.0,2486,POINT (-88.15202475 33.12955557),Cancer (except skin),1,AL,175,-88.152025,33.129556
5,2020,Alabama,Anniston,Diagnosed diabetes among adults aged >=18 years,Health Outcomes,16.0,23093,POINT (-85.79933425 33.68300103),Diabetes,1,AL,3695,-85.799334,33.683001


In [31]:
popdf = pd.read_csv('populations.csv')

popdf.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,NAME,ESTIMATESBASE2020,POPESTIMATE2020,POPESTIMATE2021,NPOPCHG_2020,NPOPCHG_2021,...,NETMIG2020,NETMIG2021,RESIDUAL2020,RESIDUAL2021,RBIRTH2021,RDEATH2021,RNATURALINC2021,RINTERNATIONALMIG2021,RDOMESTICMIG2021,RNETMIG2021
0,10,0,0,0,United States,331449281,331501080,331893745,51799,392665,...,12247,244622,0,0,10.798957,10.352637,0.446319,0.737485,0.0,0.737485
1,20,1,0,0,Northeast Region,57609148,57525633,57159838,-83515,-365795,...,-79429,-333592,-1024,-1151,9.950554,10.49207,-0.541516,0.977386,-6.794897,-5.817511
2,20,2,0,0,Midwest Region,68985454,68935174,68841444,-50280,-93730,...,-47266,-85268,-718,5156,10.860566,11.058248,-0.197682,0.549222,-1.786994,-1.237772
3,20,3,0,0,South Region,126266107,126409007,127225329,142900,816322,...,134454,770076,528,-3385,11.168835,10.777476,0.391359,0.886268,5.186064,6.072332
4,20,4,0,0,West Region,78588572,78631266,78667134,42694,35868,...,4488,-106594,1214,-620,10.767153,8.94791,1.819243,0.48757,-1.84288,-1.355309


In [32]:
popdf = popdf[popdf['STATE'] != 0]
popdf = popdf[['NAME', 'ESTIMATESBASE2020']]



popdf.rename(columns={'NAME': 'StateDesc'}, inplace=True)

popdf.head()

Unnamed: 0,StateDesc,ESTIMATESBASE2020
5,Alabama,5024279
6,Alaska,733391
7,Arizona,7151502
8,Arkansas,3011524
9,California,39538223


In [33]:
df_cd = pd.merge(sum_2020, popdf, how='outer', on='StateDesc')

In [34]:
df_cd.head()

Unnamed: 0,StateDesc,Short_Question_Text,Measure,StateAbbr,TotalNumber,ESTIMATESBASE2020
0,Alabama,All Teeth Lost,All teeth lost among adults aged >=65 years,AL,1073619.0,5024279
1,Alabama,Annual Checkup,Visits to doctor for routine checkup within th...,AL,4736982.0,5024279
2,Alabama,Arthritis,Arthritis among adults aged >=18 years,AL,1819705.0,5024279
3,Alabama,Binge Drinking,Binge drinking among adults aged >=18 years,AL,907088.0,5024279
4,Alabama,COPD,Chronic obstructive pulmonary disease among ad...,AL,503812.0,5024279


In [35]:
# Saving the sum_2020 to a csv

df_cd.to_csv('grouped_2020.csv')

In [36]:
# Saving the df_2020 to a csv

df_2020.to_csv('measures_2020.csv')