# Data Cleaning

In [126]:
import pandas as pd

# load the raw child-vaccination estimates and preview the first five rows
df_vacc = pd.read_csv(filepath_or_buffer='child_vaccination_data.csv')
df_vacc.head(n=5)

Unnamed: 0,Vaccine,Dose,Geography Type,Geography,Birth Year/Birth Cohort,Dimension Type,Dimension,Estimate (%),95% CI (%),Sample Size
0,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,>400% FPL,64.2,55.3 to 73.1,189.0
1,Hep A,≥1 Dose,States/Local Areas,Tennessee,2016-2019,Race and Ethnicity,Hispanic,86.4,76.8 to 95.9,187.0
2,PCV,≥4 Doses,States/Local Areas,Tennessee,2016-2019,Insurance Coverage,Private Insurance Only,89.3,85.4 to 93.2,612.0
3,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,133% to <400% FPL,51.4,44.0 to 58.8,342.0
4,Influenza,,States/Local Areas,Tennessee,2014-2017,Poverty,<133% FPL,39.0,31.9 to 46.1,298.0


In [127]:
# normalise column names: lower-case every label and swap spaces for underscores
df_vacc.columns = [label.lower().replace(" ", "_") for label in df_vacc.columns]

In [128]:
# replace the labels that still contain '/', '%' or parentheses with plain identifiers
df_vacc = df_vacc.rename(
    columns={
        'birth_year/birth_cohort': 'birth_year_cohort',
        'estimate_(%)': 'percent_estimate',
        '95%_ci_(%)': 'percent_95_CI',
    }
)

In [129]:
# Spread 'dimension_type' into one column per type. Each new column holds the
# row's 'dimension' value when the row belongs to that type and NaN otherwise.
df_vacc = (
    df_vacc
    .pivot(
        # row keys: pivot requires each (keys, dimension_type) pair to occur only once
        index=[
            'vaccine',
            'dose',
            'geography_type',
            'geography',
            'birth_year_cohort',
            'dimension',
            'percent_estimate',
            'percent_95_CI',
            'sample_size',
        ],
        columns=['dimension_type'],  # its distinct values become the new column labels
        values='dimension',          # what gets written into those new columns
    )
    .reset_index()                   # turn the row keys back into ordinary columns
    .rename_axis(None, axis=1)       # clear the 'dimension_type' label left on the columns axis
)

In [130]:
# 'dimension' is now duplicated by the per-type columns created in the pivot, so remove it
df_vacc = df_vacc.drop(columns='dimension')

In [131]:
# preview 15 random rows of the reshaped frame; the fixed seed makes a
# Restart & Run All display the same rows every time
df_vacc.sample(15, random_state=42)

Unnamed: 0,vaccine,dose,geography_type,geography,birth_year_cohort,percent_estimate,percent_95_CI,sample_size,Age,Insurance Coverage,Overall,Poverty,Race and Ethnicity,Urbanicity
15502,DTaP,≥4 Doses,States/Local Areas,California,2017,86.2,79.1 to 91.8,418.0,35 Months,,,,,
50800,Hib,Full Series,States/Local Areas,Idaho,2016-2019,84.7,81.0 to 88.5,741.0,,Private Insurance Only,,,,
109524,Rotavirus,,States/Local Areas,Georgia,2016-2019,87.4,81.5 to 93.3,367.0,,,,>400% FPL,,
43630,Hep B,≥3 Doses,States/Local Areas,IL-Rest of state,2012,46.7,43.5 to 49.9,330.0,7 Months,,,,,
46296,Hep B,≥3 Doses,States/Local Areas,North Carolina,2011,93.1,87.8 to 96.7,252.0,35 Months,,,,,
52949,Hib,Full Series,States/Local Areas,PA-Philadelphia,2016-2017,78.3,72.9 to 82.8,430.0,19 Months,,,,,
31400,Hep B,≥1 Dose,States/Local Areas,Missouri,2018-2019,96.6,94.4 to 98.0,818.0,3 Months,,,,,
57207,Hib,Primary Series,States/Local Areas,Louisiana,2019,93.4,88.4 to 96.8,411.0,24 Months,,,,,
115691,Varicella,≥1 Dose,States/Local Areas,Nevada,2016-2019,92.6,88.1 to 97.1,200.0,,,,,"Other or Multiple Races, Non-Hispanic",
17858,DTaP,≥4 Doses,States/Local Areas,New Jersey,2018,81.0,73.8 to 87.2,269.0,24 Months,,,,,


In [132]:
# save the cleaned data to CSV, leaving out the row index
df_vacc.to_csv(path_or_buf='child_vaccination_data_cleaned.csv', index=False)