Dataset from [cdc.gov](https://www.cdc.gov/reproductivehealth/data_stats/abortion.htm)

State of Service are the index (first column) and State of Mother's residence are the columns

In [89]:
import numpy as np
import pandas as pd
import os

In [90]:
def long_format_df(raw_df, year_of_data):
    raw_df = raw_df.iloc[0:53, 1:-1]
    raw_df.columns = list(raw_df[0:1].values)
    raw_df = raw_df.drop(0, axis=0)
    raw_df.columns = [col[0] for col in raw_df.columns]

    if 'State' in raw_df.columns:
        raw_df = raw_df.melt('State')
        raw_df.rename({'State': 'state_area_of_service', 'variable': 'state_area_of_mothers_residence', 'value': year_of_data}, axis=1, inplace=True)
    elif 'State/Area' in raw_df.columns:
        raw_df = raw_df.melt('State/Area')
        raw_df.rename({'State/Area': 'state_area_of_service', 'variable': 'state_area_of_mothers_residence', 'value': year_of_data}, axis=1, inplace=True)

    return raw_df

In [171]:
def create_dataset(filepath):
    for i, file in enumerate(os.listdir(filepath)):
        yr = file[-8:-4]
        if i == 0:
            df = long_format_df(pd.read_csv(f'{filepath}{file}'), yr)
        else:
            df = df.merge(long_format_df(pd.read_csv(f'../data/cdc/{file}'), yr), on=['state_area_of_service', 'state_area_of_mothers_residence'])

        # Strip white space and ',' from numbers for conversion from str to float
        # Replace '--' which is equivalent to NaN in the dataset
        # Replace '**' which only occurs once in the dataset in 2012 for Washington/N Carolina combination
        df[yr] = df[yr].str.strip().str.replace(',', '').replace('--', np.nan).replace('**', np.nan).astype('float64')


    return df

In [172]:
df = create_dataset('../data/cdc/')
df

Unnamed: 0,state_area_of_service,state_area_of_mothers_residence,2016,2017,2014,2010,2011,2013,2012,2015,2019,2018
0,Alabama,Alabama,5525.0,5223.0,6650.0,8587.0,7989.0,6997.0,7464.0,5124.0,4969.0,5455.0
1,Alaska,Alabama,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arizona,Alabama,0.0,0.0,0.0,0.0,,,0.0,0.0,0.0,0.0
3,Arkansas,Alabama,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,0.0
4,California**,Alabama,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2491,Virginia,Out-of-state (exact residence unknown),,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
2492,Washington,Out-of-state (exact residence unknown),,,,0.0,0.0,0.0,,0.0,0.0,0.0
2493,West Virginia,Out-of-state (exact residence unknown),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2494,Wisconsin,Out-of-state (exact residence unknown),,,,,,,,,,


In [174]:
df.to_pickle('../data/cdc_2010-2019.pkl')