In [1]:
import pandas as pd

In [2]:
# Show every column when a DataFrame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

# Data Dictionary for Final Cleaned Data

This dataset covers unplanned births and unintended pregnancies in 2010, along with the public funding spent on them.

The raw data was pulled from the [Guttmacher Institute](https://data.guttmacher.org/states/table?state=AL+AK+AZ+AR+CA+CO+CT+DE+DC+FL+GA+HI+ID+IL+IN+IA+KS+KY+LA+ME+MD+MA+MI+MN+MS+MO+MT+NE+NV+NH+NJ+NM+NY+NC+ND+OH+OK+OR+PA+RI+SC+SD+TN+TX+UT+VT+VA+WA+WV+WI+WY&dataset=data&topics=166) and is stored at the path "../raw_data/unplanned_births.csv".

The cleaned data is stored at the path "../data/unplanned_births.csv".

| column name | meaning |
| ----------- | ------- |
| state | U.S. State |
| perc_funded | % of unplanned births that were publicly funded, 2010 |
| fed_cost | Federal costs for unintended pregnancies (in millions of dollars), 2010 |
| num_funded | No. of publicly funded unplanned births, 2010 |
| indv_cost | Public costs per woman 15-44 for unintended pregnancies (in dollars), 2010 |
| state_cost | State-level costs for unintended pregnancies (in millions of dollars), 2010 |
| total_cost | Total public costs for unintended pregnancies (in millions of dollars), 2010 |
| year | Year data is from |
| STATEFP20 | State FIPS |

Note that for Arizona, the District of Columbia, Indiana, Kansas, Montana, Nevada, New Hampshire, North Dakota, and South Dakota, the number of unplanned births and the proportion of planned and unplanned births that were publicly funded were estimated by regression analyses.

# Wrangling

## Load Data

In [3]:
# Locations of the two input files, relative to this notebook.
raw_births_path = "../raw_data/unplanned_births.csv"
state_fips_path = "../data/state_fips.csv"

# Raw unplanned-births table as downloaded.
births = pd.read_csv(raw_births_path)

# State-name -> FIPS lookup. STATEFP20 is read as object so the codes are
# kept as text and leading zeros are not lost to integer parsing.
fips = pd.read_csv(state_fips_path, dtype={"STATEFP20": object})

## Preview Data

In [4]:
# Preview the first rows of the state-name / FIPS lookup table.
fips.head()

Unnamed: 0,STATE,STATEFP20
0,Alabama,1
1,Alaska,2
2,Arizona,4
3,Arkansas,5
4,California,6


In [5]:
# Display the births table exactly as loaded, before any cleaning.
births

Unnamed: 0,U.S. State,"% of unplanned births that were publicly funded, 2010","Federal costs for unintended pregnancies (in millions of dollars), 2010","No. of publicly funded unplanned birth, 2010","Public costs per woman 15-44 for unintended pregnancies (in dollars), 2010","State-level costs for unintended pregnancies (in millions of dollars), 2010","Total public costs for unintended pregnancies (in millions of dollars) , 2010"
0,Alabama,61.6,250.5,18200,336,72.6,323.2
1,Alaska,64.3,70.8,3000,790,42.9,113.7
2,Arizona,64.6 [1],509.4 [1],"24,200 [1]",531 [1],161.5 [1],670.9 [1]
3,Arkansas,72.3,266.8,13800,576,61.9,328.7
4,California,64.3,1062,105300,222,689.3,1751
5,Colorado,63.8,146.1,15100,231,91.1,237.3
6,Connecticut,60.8,128.4,7900,301,80.1,208.5
7,Delaware,71.3,58.2,3300,526,36,94.2
8,District of Columbia,84.6 [1],50.9 [1],"3,700 [1]",393 [1],13.3 [1],64.1 [1]
9,Florida,70.6,892.8,71400,371,427.1,1320


## Clean Data

In [6]:
# List the raw column headers; these long labels are what the rename below maps from.
births.columns

Index(['U.S. State', '% of unplanned births that were publicly funded, 2010',
       'Federal costs for unintended pregnancies (in millions of dollars), 2010',
       'No. of publicly funded unplanned birth, 2010',
       'Public costs per woman 15-44 for unintended pregnancies (in dollars), 2010',
       'State-level costs for unintended pregnancies (in millions of dollars), 2010',
       'Total public costs for unintended pregnancies (in millions of dollars) , 2010'],
      dtype='object')

In [7]:
# Map each verbose source header to the short name used in the data dictionary
# at the top of this notebook. Keys must match the raw headers exactly.
short_names = {
    'U.S. State': 'state',
    '% of unplanned births that were publicly funded, 2010': 'perc_funded',
    'Federal costs for unintended pregnancies (in millions of dollars), 2010': 'fed_cost',
    'No. of publicly funded unplanned birth, 2010': 'num_funded',
    'Public costs per woman 15-44 for unintended pregnancies (in dollars), 2010': 'indv_cost',
    'State-level costs for unintended pregnancies (in millions of dollars), 2010': 'state_cost',
    'Total public costs for unintended pregnancies (in millions of dollars) , 2010': 'total_cost',
}
births = births.rename(columns=short_names)

In [8]:
# Drop the footer rows at index labels 51-53, which carry no data.
# errors="ignore" keeps this cell safe to re-run: without it, a second
# execution raises KeyError because those labels have already been removed.
footer_rows = [51, 52, 53]
births = births.drop(index=footer_rows, errors="ignore")

In [9]:
# Every measure in this table is reported for 2010, so tag all rows with that year.
births = births.assign(year=2010)

In [10]:
# Add the state FIPS code (STATEFP20) by matching on the state name.

# The merge below is an inner join, so a state whose name is missing from the
# lookup would be dropped without any warning. Check for that first and fail
# loudly, naming the offending rows.
unmatched = births.loc[~births["state"].isin(fips["STATE"]), "state"].tolist()
assert not unmatched, f"No FIPS code found for: {unmatched}"

births = births.merge(
    right=fips,
    left_on="state",
    right_on="STATE",
    validate="many_to_one",  # raises if a state name appears more than once in the lookup
).drop(columns=["STATE"])  # STATE duplicates the `state` column after the join

In [11]:
# Inspect the merged table; some values still carry "[1]" footnote markers and thousands separators.
births

Unnamed: 0,state,perc_funded,fed_cost,num_funded,indv_cost,state_cost,total_cost,year,STATEFP20
0,Alabama,61.6,250.5,18200,336,72.6,323.2,2010,1
1,Alaska,64.3,70.8,3000,790,42.9,113.7,2010,2
2,Arizona,64.6 [1],509.4 [1],"24,200 [1]",531 [1],161.5 [1],670.9 [1],2010,4
3,Arkansas,72.3,266.8,13800,576,61.9,328.7,2010,5
4,California,64.3,1062,105300,222,689.3,1751,2010,6
5,Colorado,63.8,146.1,15100,231,91.1,237.3,2010,8
6,Connecticut,60.8,128.4,7900,301,80.1,208.5,2010,9
7,Delaware,71.3,58.2,3300,526,36,94.2,2010,10
8,District of Columbia,84.6 [1],50.9 [1],"3,700 [1]",393 [1],13.3 [1],64.1 [1],2010,11
9,Florida,70.6,892.8,71400,371,427.1,1320,2010,12


In [12]:
# Columns to clean and convert: everything except the identifier columns.
id_cols = ["state", "year", "STATEFP20"]
cols = births.columns.drop(id_cols)

In [13]:
# Remove thousands separators and the " [1]" footnote marker so the values can
# be parsed as numbers.
# The footnote pattern is a raw string: in an ordinary literal "\[" is an
# invalid escape sequence, which current Python versions warn about and future
# versions are documented to reject. The regex itself is unchanged.
births[cols] = births[cols].replace({',': '', r' \[1\]': ''}, regex=True)

In [14]:
# Check that the footnote markers and commas are gone.
births

Unnamed: 0,state,perc_funded,fed_cost,num_funded,indv_cost,state_cost,total_cost,year,STATEFP20
0,Alabama,61.6,250.5,18200,336,72.6,323.2,2010,1
1,Alaska,64.3,70.8,3000,790,42.9,113.7,2010,2
2,Arizona,64.6,509.4,24200,531,161.5,670.9,2010,4
3,Arkansas,72.3,266.8,13800,576,61.9,328.7,2010,5
4,California,64.3,1062.0,105300,222,689.3,1751.0,2010,6
5,Colorado,63.8,146.1,15100,231,91.1,237.3,2010,8
6,Connecticut,60.8,128.4,7900,301,80.1,208.5,2010,9
7,Delaware,71.3,58.2,3300,526,36.0,94.2,2010,10
8,District of Columbia,84.6,50.9,3700,393,13.3,64.1,2010,11
9,Florida,70.6,892.8,71400,371,427.1,1320.0,2010,12


In [15]:
# Convert each measure column to a numeric dtype. With errors="coerce", any
# value that still fails to parse becomes NaN instead of raising.
for col in cols:
    births[col] = pd.to_numeric(births[col], errors="coerce")

In [16]:
# Preview the table after the numeric conversion.
births.head()

Unnamed: 0,state,perc_funded,fed_cost,num_funded,indv_cost,state_cost,total_cost,year,STATEFP20
0,Alabama,61.6,250.5,18200,336,72.6,323.2,2010,1
1,Alaska,64.3,70.8,3000,790,42.9,113.7,2010,2
2,Arizona,64.6,509.4,24200,531,161.5,670.9,2010,4
3,Arkansas,72.3,266.8,13800,576,61.9,328.7,2010,5
4,California,64.3,1062.0,105300,222,689.3,1751.0,2010,6


In [17]:
# NOTE(review): this repeats the preview from the previous cell; `births.dtypes`
# would show the converted dtypes more directly.
births.head()

Unnamed: 0,state,perc_funded,fed_cost,num_funded,indv_cost,state_cost,total_cost,year,STATEFP20
0,Alabama,61.6,250.5,18200,336,72.6,323.2,2010,1
1,Alaska,64.3,70.8,3000,790,42.9,113.7,2010,2
2,Arizona,64.6,509.4,24200,531,161.5,670.9,2010,4
3,Arkansas,72.3,266.8,13800,576,61.9,328.7,2010,5
4,California,64.3,1062.0,105300,222,689.3,1751.0,2010,6


## Save Cleaned Data

In [18]:
# Write the cleaned table to disk. index=False keeps the positional row index
# out of the file. This was previously commented out, so a fresh
# Restart & Run All never produced the file that the next cell reads back.
births.to_csv("../data/unplanned_births.csv", index=False)

In [19]:
# Read the saved file back in to check what is on disk. STATEFP20 is read as
# object so the FIPS codes stay text.
cleaned_path = "../data/unplanned_births.csv"
test = pd.read_csv(cleaned_path, dtype={"STATEFP20": object})

In [20]:
# Display the frame read back from disk for comparison with the cleaned `births` table.
test

Unnamed: 0,state,perc_funded,fed_cost,num_funded,indv_cost,state_cost,total_cost,year,STATEFP20
0,Alabama,61.6,250.5,18200,336,72.6,323.2,2010,1
1,Alaska,64.3,70.8,3000,790,42.9,113.7,2010,2
2,Arizona,64.6,509.4,24200,531,161.5,670.9,2010,4
3,Arkansas,72.3,266.8,13800,576,61.9,328.7,2010,5
4,California,64.3,1062.0,105300,222,689.3,1751.0,2010,6
5,Colorado,63.8,146.1,15100,231,91.1,237.3,2010,8
6,Connecticut,60.8,128.4,7900,301,80.1,208.5,2010,9
7,Delaware,71.3,58.2,3300,526,36.0,94.2,2010,10
8,District of Columbia,84.6,50.9,3700,393,13.3,64.1,2010,11
9,Florida,70.6,892.8,71400,371,427.1,1320.0,2010,12
