In [1]:
import pandas as pd 

In [2]:
# Wide per-state table covering 2011-2016 and 2018-2020; some states have
# missing data and are patched from a complement file in later cells.
# header=None makes pandas read the first line as data and assign positional
# integer column labels; meaningful labels are set in a later cell.
df = pd.read_csv('raw_data.csv', header=None)

In [3]:
# Complement for the data missing from raw_data.csv: per the preview shown
# further down, it holds one row per affected state with columns for the
# years 11, 12, 13, 15 and 18.
ab_complement = pd.read_csv('missing_11_18.csv')

In [4]:
# Tables with complete data: ab_14_16_17 covers 2014, 2016 and 2017,
# ab_19_20 covers 2019 and 2020.
# Both files are read with their header row, so their year columns are
# labelled with strings (e.g. '2014'), not integers.
ab_14_16_17 = pd.read_csv('abortion-incidence-service-availability-us-2017-tables.csv')
ab_19_20 = pd.read_csv('Copy of abortion_provider_census_2020_topline.csv')


In [5]:
# Preview the complement table to check its column labels before renaming them.
ab_complement.head()

Unnamed: 0,state,11,12,13,15,18
0,California,22.99,21.62,20.59,18.5,17.61
1,New Hampshire,12.9,11.11,9.46,9.9,8.8
2,Maryland,28.76,26.48,24.63,24.43,25.34
3,Wyoming,1.13,1.2,1.28,1.0,1.12


In [6]:
# Relabel positionally: the first column is the state name, the remaining ones
# are the file's two-digit years expanded to four-digit integers.
ab_complement.columns = ['state'] + [2000 + yy for yy in (11, 12, 13, 15, 18)]

In [7]:
# Label df's positional columns: the state name followed by the years the raw
# file covers (2011-2016 and 2018-2020; there is no 2017 column).
col = ['state', *range(2011, 2017), *range(2018, 2021)]
df.columns = col

In [8]:
# Keep only the years that are not covered by the complete tables
# (2011-2013, 2015, 2018) by removing the years that are covered elsewhere.
ab_rest_years = df.drop(labels=[2014, 2016, 2019, 2020], axis='columns')

In [9]:
# Remove the four states whose values are missing here; their rows are
# supplied by the complement table instead.
ab_rest_years = ab_rest_years.loc[
    lambda frame: ~frame['state'].isin(
        ['California', 'Maryland', 'New Hampshire', 'Wyoming']
    )
]

In [10]:
# Append the complement rows for the states with missing data.
# Any state already present in ab_rest_years that ab_complement also provides
# is filtered out first, so re-running this cell cannot stack duplicate rows
# (a duplicated state would multiply rows in the later merge on 'state').
# ignore_index=True renumbers the rows 0..n-1, as reset_index(drop=True) did.
ab_rest_years = pd.concat(
    [
        ab_rest_years[~ab_rest_years['state'].isin(ab_complement['state'])],
        ab_complement,
    ],
    ignore_index=True,
)

In [11]:
# clean ab_14_16_17: preview it first to see which columns need fixing
# (the state column label and the trailing unnamed column).
ab_14_16_17.head()

Unnamed: 0,Region and state,2014,2016,2017,Unnamed: 4
0,Connecticut,19.2,18.1,17.7,
1,Maine,9.5,8.9,8.8,
2,Massachusetts,15.3,14.0,13.5,
3,New Hampshire,10.4,9.6,9.2,
4,New Jersey,25.8,28.2,28.0,


In [12]:
# Rename the state column to 'state' so it matches the join key used by the
# other tables.
ab_14_16_17 = ab_14_16_17.rename(columns={'Region and state': 'state'})

In [13]:
# Drop the null trailing column, which pandas labelled 'Unnamed: 4' on read.
# Dropping by label with errors='ignore' (rather than by position 4) keeps the
# cell safe to re-run: once the column is gone, a positional lookup on the
# remaining four columns would raise IndexError.
ab_14_16_17 = ab_14_16_17.drop(columns='Unnamed: 4', errors='ignore')

In [14]:
# Join the three sources on the state name. Both merges use pandas' default
# inner join, so only states present in every table are kept.
ab_rate = (
    ab_14_16_17
    .merge(ab_19_20, on='state')
    .merge(ab_rest_years, on='state')
)

In [15]:
# Inspect the merged wide table: one row per state, one column per year.
ab_rate

Unnamed: 0,state,2014,2016,2017,2019,2020,2011,2012,2013,2015,2018
0,Connecticut,19.2,18.1,17.7,17.9,16.7,18.4,17.4,15.4,14.6,13.9
1,Maine,9.5,8.9,8.8,9.0,10.1,7.4,4.2,8.3,7.9,8.4
2,Massachusetts,15.3,14.0,13.5,13.6,12.2,15.4,14.8,14.3,13.5,13.1
3,New Hampshire,10.4,9.6,9.2,8.5,8.3,12.9,11.11,9.46,9.9,8.8
4,New Jersey,25.8,28.2,28.0,28.8,29.2,15.4,13.3,12.6,13.4,13.6
5,New York,29.6,27.6,26.3,30.2,28.8,27.6,25.8,24.3,23.1,19.8
6,Pennsylvania,13.3,13.5,13.1,13.1,13.6,14.9,14.2,13.3,13.3,12.7
7,Rhode Island,17.0,16.8,16.7,13.6,13.3,19.6,16.8,15.5,12.6,13.5
8,Vermont,12.1,12.0,11.4,10.4,10.7,11.5,10.9,10.5,10.9,10.5
9,Illinois,16.3,16.4,16.6,20.9,21.3,15.8,16.5,15.7,15.5,16.9


In [16]:
# Convert from wide form (one column per year) to long form with the columns
# state / year / ab_rate.
# The year labels arrive with mixed types: strings for the tables read with a
# CSV header row and ints for the tables whose columns were assigned by hand.
# Casting to int64 gives the 'year' column a single, sortable dtype; the
# values written to CSV later are unchanged.
ab_rate = (
    ab_rate
    .melt(id_vars=['state'], var_name='year', value_name='ab_rate')
    .astype({'year': 'int64'})
)

In [17]:
# Exclude the District of Columbia rows from the long table.
ab_rate = ab_rate.loc[ab_rate['state'].ne('District of Columbia')]

In [18]:
# Make sure the rate values are stored as floats.
# assign() builds a new frame instead of writing a column into the
# boolean-filtered frame produced by the previous cell, which avoids pandas'
# SettingWithCopyWarning. Column order is unchanged because the existing
# 'ab_rate' column is replaced in place.
ab_rate = ab_rate.assign(ab_rate=ab_rate['ab_rate'].astype(float))

In [19]:
# Write the long-format table to the cleaned_data folder two levels up.
# Forward slashes are accepted on Windows as well as on POSIX systems, whereas
# the backslash form is only a path on Windows (elsewhere the backslashes
# become part of a single file name in the current directory).
ab_rate.to_csv('../../cleaned_data/ab_rate.csv', index=False)