# Data Cleaned

In [45]:
import pandas as pd
import numpy as np

# Read the raw vaccination estimates and preview the first five rows.
raw_csv = 'child_vaccination_data.csv'
df_vacc = pd.read_csv(raw_csv)
df_vacc.head()

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,>400% FPL,64.2,55.3 to 73.1,189.0
1,Hep A,≥1 Dose,States/Local Areas,Tennessee,2016-2019,Race and Ethnicity,Hispanic,86.4,76.8 to 95.9,187.0
2,PCV,≥4 Doses,States/Local Areas,Tennessee,2016-2019,Insurance Coverage,Private Insurance Only,89.3,85.4 to 93.2,612.0
3,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,133% to <400% FPL,51.4,44.0 to 58.8,342.0
4,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,<133% FPL,39.0,31.9 to 46.1,298.0


In [46]:
# Spread 'Dimension Type' into one column per type; each new column holds that row's 'Dimension' value.
# The columns below together identify each row ('Dimension' is both part of the key and the value source).
row_key = [
    'Vaccine', 'Dose', 'Geography Type', 'Geography', 'Birth Year/Birth Cohort',
    'Dimension', 'Estimate (%)', '95% CI (%)', 'Sample Size',
]

df_vacc = (
    df_vacc
    .pivot(index=row_key, columns=['Dimension Type'], values='Dimension')
    .reset_index()                # turn the row-key levels back into ordinary columns
    .rename_axis(None, axis=1)    # clear the name left on the column axis by the pivot
)

In [47]:
# 'Dimension' is redundant once its values live in the pivoted columns, so remove it.
df_vacc = df_vacc.drop(columns='Dimension')

In [48]:
# Split the combined column in two: values containing a hyphen (e.g. '2016-2019') go to
# 'Birth Cohort', all other values go to 'Birth Year'; the column not used is set to NaN.
birth_combined = df_vacc['Birth Year/Birth Cohort']
has_hyphen = birth_combined.str.contains('-')
df_vacc['Birth Year'] = np.where(has_hyphen, np.nan, birth_combined)
df_vacc['Birth Cohort'] = np.where(has_hyphen, birth_combined, np.nan)

In [49]:
# The combined column has been split into 'Birth Year' and 'Birth Cohort', so remove it.
df_vacc = df_vacc.drop(columns='Birth Year/Birth Cohort')

In [50]:
# Reorder columns: vaccine/geography/birth fields, then the estimate and its CI,
# then the pivoted dimension columns, with sample size last.
column_order = [
    'Vaccine', 'Dose', 'Geography Type', 'Geography',
    'Birth Year', 'Birth Cohort',
    'Estimate (%)', '95% CI (%)',
    'Overall', 'Age', 'Race and Ethnicity', 'Poverty', 'Insurance Coverage', 'Urbanicity',
    'Sample Size',
]
df_vacc = df_vacc[column_order]

In [54]:
# Spot-check ten random rows; no random_state is passed, so the rows shown differ from run to run.
df_vacc.sample(n=10)

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year,Birth Cohort,Estimate (%),95% CI (%),Overall,Age,Race and Ethnicity,Poverty,Insurance Coverage,Urbanicity,Sample Size
60867,Hib,≥1 Dose,States/Local Areas,Alabama,,2018-2019,89.0,84.3 to 92.5,,3 Months,,,,,459.0
117800,Varicella,≥1 Dose,States/Local Areas,Utah,2015.0,,58.8,50.8 to 66.3,,13 Months,,,,,241.0
63402,Hib,≥2 Doses,States/Local Areas,Iowa,2015.0,,94.3,89.2 to 97.1,,13 Months,,,,,238.0
7605,DTaP,≥3 Doses,HHS Regions/National,United States,2020.0,,92.7,91.6 to 93.6,,19 Months,,,,,10468.0
76297,MMR,≥1 Dose,HHS Regions/National,Region 3,2020.0,,88.8,85.6 to 91.3,,19 Months,,,,,1457.0
90877,PCV,≥3 Doses,States/Local Areas,PA-Philadelphia,2012.0,,87.3,85.5 to 88.8,,13 Months,,,,,280.0
90920,PCV,≥3 Doses,States/Local Areas,PA-Philadelphia,,2015-2016,94.5,91.3 to 96.6,,19 Months,,,,,444.0
116647,Varicella,≥1 Dose,States/Local Areas,PA-Philadelphia,2019.0,,88.4,81.5 to 92.9,,19 Months,,,,,235.0
102826,Polio,≥2 Doses,States/Local Areas,TX-City of Houston,2012.0,,76.3,69.4 to 82.1,,5 Months,,,,,244.0
17980,DTaP,≥4 Doses,States/Local Areas,New York,2015.0,,80.6,75.7 to 85.0,,24 Months,,,,,503.0


In [55]:
# Save the cleaned data to CSV, leaving the row index out of the file.
cleaned_csv = 'child_vaccination_data_cleaned.csv'
df_vacc.to_csv(cleaned_csv, index=False)