# Data Cleaning

In [2]:
import numpy as np
import pandas as pd

In [16]:
# load the raw child vaccination data
df_vacc = pd.read_csv('child_vaccination_data.csv')

# keep the raw row count so a later cell can report how many rows were dropped
num_rows_original = len(df_vacc)

# display (rows, columns) of the raw data
df_vacc.shape

(118423, 10)

In [17]:
# preview the first five rows of the raw data
df_vacc.head(n=5)

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,>400% FPL,64.2,55.3 to 73.1,189.0
1,Hep A,≥1 Dose,States/Local Areas,Tennessee,2016-2019,Race and Ethnicity,Hispanic,86.4,76.8 to 95.9,187.0
2,PCV,≥4 Doses,States/Local Areas,Tennessee,2016-2019,Insurance Coverage,Private Insurance Only,89.3,85.4 to 93.2,612.0
3,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,133% to <400% FPL,51.4,44.0 to 58.8,342.0
4,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,<133% FPL,39.0,31.9 to 46.1,298.0


In [18]:
# Reshape long -> wide: each distinct 'Dimension Type' becomes its own column, and every row's
# 'Dimension' value is placed in the column named by that row's 'Dimension Type'.
df_vacc = (
    df_vacc
    .pivot(
        # identifier for each row ('Dimension' is part of it, and is also the value being spread)
        index=[
            'Vaccine', 'Dose', 'Geography Type', 'Geography', 'Birth Year/Birth Cohort',
            'Dimension', 'Estimate (%)', '95% CI (%)', 'Sample Size',
        ],
        columns=['Dimension Type'],  # column containing the new column names
        values='Dimension',          # values for the new columns
    )
    .reset_index()                   # turn the identifier levels back into ordinary columns
    .rename_axis(None, axis=1)       # remove the leftover name on the column axis
)

In [19]:
# 'Dimension' has been spread into the pivoted columns, so it is no longer needed
df_vacc = df_vacc.drop(columns=['Dimension'])

In [20]:
# Split 'Birth Year/Birth Cohort' into two columns.
# A value containing a hyphen is treated as a cohort; anything else is treated as a single year.
birth_label = df_vacc['Birth Year/Birth Cohort']
is_cohort = birth_label.str.contains('-')

# each row keeps its value in exactly one of the two new columns; the other gets NaN
df_vacc['Birth Year'] = np.where(is_cohort, np.nan, birth_label)
df_vacc['Birth Cohort'] = np.where(is_cohort, birth_label, np.nan)

In [21]:
# the combined column is redundant once 'Birth Year' and 'Birth Cohort' exist
df_vacc = df_vacc.drop(columns=['Birth Year/Birth Cohort'])

In [22]:
# Reorder columns: identifiers first, then the estimate and its confidence interval,
# then the pivoted dimension columns, and finally the sample size.
# .copy() makes the result an independent frame. Without it, pandas (when copy-on-write is
# not enabled) flags the reordered selection as a slice of the previous frame, and the later
# in-place dropna / column assignments on df_vacc emit SettingWithCopyWarning.
df_vacc = df_vacc[[
    'Vaccine', 'Dose', 'Geography Type', 'Geography', 'Birth Year', 'Birth Cohort',
    'Estimate (%)', '95% CI (%)', 'Overall', 'Age', 'Race and Ethnicity', 'Poverty',
    'Insurance Coverage', 'Urbanicity', 'Sample Size',
]].copy()

In [23]:
# count rows missing 'Estimate (%)', the column that all data visualizations will use
df_vacc['Estimate (%)'].isnull().sum()

58

In [24]:
# remove every row with no 'Estimate (%)', since that column is used in all data visualizations
df_vacc.dropna(subset=['Estimate (%)'], inplace=True)

# report how many rows have been lost relative to the raw file
num_rows_current = len(df_vacc)
print(f"{num_rows_original - num_rows_current} rows dropped")

58 rows dropped


In [28]:
# Fill NA values for Dose for certain vaccines (based on CDC documentation of what these represent).
# Only missing Dose values are filled: a Dose that is already present for one of these vaccines
# is left untouched rather than being overwritten.
dose_when_missing = {
    'Combined 7 Series': 'Full Series',
    'Influenza': '≥2 Doses at least 24 days apart',
    'Rotavirus': 'Full Series',
}

# map() yields NaN for vaccines not listed above, so their missing Dose values stay missing
df_vacc['Dose'] = df_vacc['Dose'].fillna(df_vacc['Vaccine'].map(dose_when_missing))

In [30]:
# Spot-check ten random rows; the fixed random_state makes the displayed sample the same on
# every re-run of the notebook.
# (To inspect a single vaccine instead, filter e.g. df_vacc[df_vacc['Vaccine'] == 'Influenza'].)
df_vacc.sample(10, random_state=0)

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year,Birth Cohort,Estimate (%),95% CI (%),Overall,Age,Race and Ethnicity,Poverty,Insurance Coverage,Urbanicity,Sample Size
52798,Hib,Full Series,States/Local Areas,Oklahoma,2016.0,,79.8,73.8 to 85.1,,35 Months,,,,,319.0
65039,Hib,≥2 Doses,States/Local Areas,TX-City of Houston,2013.0,,86.0,80.8 to 89.9,,7 Months,,,,,205.0
113006,Varicella,≥1 Dose,States/Local Areas,California,2017.0,,82.8,75.7 to 88.2,,19 Months,,,,,418.0
31618,Hep B,≥1 Dose,States/Local Areas,Oklahoma,2017.0,,94.2,89.2 to 97.0,,3 Months,,,,,243.0
95654,PCV,≥4 Doses,States/Local Areas,Minnesota,,2014-2017,87.0,83.1 to 90.9,,,"White, Non-Hispanic",,,,605.0
107872,Polio,≥3 Doses,States/Local Areas,TX-Bexar County,,2019-2020,90.4,85.7 to 94.1,,24 Months,,,,,343.0
89302,PCV,≥3 Doses,States/Local Areas,Missouri,,2018-2019,86.1,81.9 to 89.4,,19 Months,,,,,818.0
67987,Hib,≥3 Doses,States/Local Areas,Illinois,2018.0,,90.9,87.0 to 94.0,,24 Months,,,,,542.0
102326,Polio,≥2 Doses,States/Local Areas,Ohio,,2016-2017,85.3,80.3 to 89.2,,7 Months,,,,,566.0
86027,PCV,≥3 Doses,HHS Regions/National,United States,2013.0,,88.2,87.2 to 89.2,,13 Months,,,,,14872.0


In [77]:
# save the cleaned data to csv, leaving out the DataFrame index
df_vacc.to_csv(path_or_buf='child_vaccination_data_cleaned.csv', index=False)