In [1]:
import pandas as pd
import numpy as np

In [2]:
# Show every column when a DataFrame is displayed rather than eliding the middle ones.
pd.options.display.max_columns = None

# Data Dictionary for Final Cleaned Data

This is data on publicly funded family planning in each state. Note that not all of the data is from the same year: the number of family planning centers and the total expenditures are from 2015; the remaining data is from 2016.

The raw data was pulled from the [Guttmacher Institute](https://data.guttmacher.org/states/table?state=AL+AK+AZ+AR+CA+CO+CT+DE+DC+FL+GA+HI+ID+IL+IN+IA+KS+KY+LA+ME+MD+MA+MI+MN+MS+MO+MT+NE+NV+NH+NJ+NM+NY+NC+ND+OH+OK+OR+PA+RI+SC+SD+TN+TX+UT+VT+VA+WA+WV+WI+WY&dataset=data&topics=105+102+256+125) and is stored at the path "../raw_data/family_planning.csv".

The cleaned data is stored at the path "../data/family_planning.csv".

| column name | meaning |
| ----------- | ------- |
| state | U.S. State |
| perc_need_met16 | % of likely need met by publicly funded centers, 2016 |
| num_served16 | No. of female contraceptive clients served at publicly funded centers, 2016 |
| num_u20_served16 | No. of female contraceptive clients younger than 20 served at publicly funded centers, 2016 |
| num_centers15 | No. of publicly funded family planning centers, 2015 |
| total_expend15 | Total reported public expenditures for family planning client services in dollars, 2015 |
| STATEFP20 | State FIPS code (stored as text) |

# Wrangling

## Load data

In [3]:
# State FIPS lookup: STATEFP20 is read as text so that leading zeros are kept.
fips = pd.read_csv(
    "../data/state_fips.csv",
    dtype={"STATEFP20": object},
)
# Raw Guttmacher export, loaded with pandas' default type inference.
planning = pd.read_csv("../raw_data/family_planning.csv")

## Preview data

In [4]:
# Peek at the first five rows of the FIPS lookup table.
fips.head(n=5)

Unnamed: 0,STATE,STATEFP20
0,Alabama,1
1,Alaska,2
2,Arizona,4
3,Arkansas,5
4,California,6


In [5]:
# Peek at the first five rows of the raw family-planning table.
planning.head(n=5)

Unnamed: 0,U.S. State,"% of likely need met by publicly funded centers, 2016","No. of female contraceptive clients served at publicly funded centers , 2016","No. of female contraceptive clients younger than 20 served at publicly funded centers, 2016","No. of publicly funded family planning centers, 2015","Total reported public expenditures for family planning client services (in 000s of dollars), 2015"
0,Alabama,28,97600,20300,187,69742
1,Alaska,54,21640,4750,157,10158
2,Arizona,19,88140,16500,232,57560
3,Arkansas,23,50960,10610,163,11397
4,California,64,1618010,252840,1697,454706


## Clean Data

In [6]:
# List the raw column headers so they can be copied verbatim into the rename mapping.
planning.columns

Index(['U.S. State', '% of likely need met by publicly funded centers, 2016',
       'No. of female contraceptive clients served at publicly funded centers , 2016',
       'No. of female contraceptive clients younger than 20 served at publicly funded centers, 2016',
       'No. of publicly funded family planning centers, 2015',
       'Total reported public expenditures for family planning client services (in 000s of dollars), 2015'],
      dtype='object')

In [7]:
# give columns shorter names
# Map the verbose Guttmacher headers to short snake_case names.
# Keys must match the raw headers exactly, including the stray space
# before ", 2016" in the total-clients header.
short_names = {
    'U.S. State': 'state',
    '% of likely need met by publicly funded centers, 2016': 'perc_need_met16',
    'No. of female contraceptive clients served at publicly funded centers , 2016': 'num_served16',
    'No. of female contraceptive clients younger than 20 served at publicly funded centers, 2016': 'num_u20_served16',
    'No. of publicly funded family planning centers, 2015': 'num_centers15',
    'Total reported public expenditures for family planning client services (in 000s of dollars), 2015': 'total_expend15',
}
planning.rename(columns=short_names, inplace=True)

In [8]:
# drop footer rows without data
# Remove the three footer rows (index labels 51-53), which carry no state data.
# Reassigning instead of mutating in place, together with errors="ignore",
# keeps the cell re-runnable: once the labels are gone (or the index has been
# rebuilt by the merge further down) a second run is a no-op rather than a KeyError.
planning = planning.drop(index=[51, 52, 53], errors="ignore")

In [9]:
# add state fips column

# The join below is an inner join on the state name, so any state whose name
# has no exact match in the lookup would be silently dropped. Fail loudly
# instead of relying on a manual comparison of the two name columns.
unmatched = planning.loc[~planning["state"].isin(fips["STATE"]), "state"].tolist()
assert not unmatched, f"no FIPS match for: {unmatched}"

# validate= makes pandas raise MergeError if a state name is duplicated on
# either side, which would otherwise duplicate rows in the result.
planning = planning.merge(
    right=fips,
    left_on="state",
    right_on="STATE",
    validate="one_to_one",
)
# The lookup's name column duplicates "state" after the join.
planning = planning.drop(columns=["STATE"])

In [10]:
# Confirm the STATEFP20 column was attached by the merge.
planning.head(n=5)

Unnamed: 0,state,perc_need_met16,num_served16,num_u20_served16,num_centers15,total_expend15,STATEFP20
0,Alabama,28,97600,20300,187,69742,1
1,Alaska,54,21640,4750,157,10158,2
2,Arizona,19,88140,16500,232,57560,4
3,Arkansas,23,50960,10610,163,11397,5
4,California,64,1618010,252840,1697,454706,6


In [11]:
# Inspect the dtypes to see which columns need converting to numeric
# (this cell only displays them; the conversion happens in a later cell).
planning.dtypes

state               object
perc_need_met16     object
num_served16        object
num_u20_served16    object
num_centers15       object
total_expend15      object
STATEFP20           object
dtype: object

In [12]:
# Every column except the two identifier columns is to be converted to numeric.
id_columns = ["state", "STATEFP20"]
cols = planning.columns.drop(id_columns)

In [13]:
# Strip thousands separators, then parse as numbers.
# errors="coerce" turns anything that still fails to parse into NaN.
planning[cols] = (
    planning[cols]
    .replace({',': ''}, regex=True)
    .apply(pd.to_numeric, errors="coerce")
)

In [14]:
# The source reports total expenditures in thousands of dollars;
# scale to whole dollars.
# NOTE: run this cell only once per load; each re-run multiplies by 1000 again.
planning["total_expend15"] = planning["total_expend15"].mul(1000)

In [15]:
# Display the cleaned table for a visual check before saving.
planning

Unnamed: 0,state,perc_need_met16,num_served16,num_u20_served16,num_centers15,total_expend15,STATEFP20
0,Alabama,28,97600,20300,187,69742000,1
1,Alaska,54,21640,4750,157,10158000,2
2,Arizona,19,88140,16500,232,57560000,4
3,Arkansas,23,50960,10610,163,11397000,5
4,California,64,1618010,252840,1697,454706000,6
5,Colorado,35,115490,23720,221,29252000,8
6,Connecticut,38,68130,14160,92,21462000,9
7,Delaware,30,16080,4510,41,4646000,10
8,District of Columbia,88,43600,7370,31,5518000,11
9,Florida,16,214450,40050,430,42764000,12


## Save Cleaned Data

In [16]:
# NOTE(review): the save below is commented out, so running the notebook does not
# (re)write the cleaned CSV, and the read-back in the next cell loads whatever file
# already exists on disk. Uncomment to regenerate "../data/family_planning.csv".
# planning.to_csv("../data/family_planning.csv", index=False)

In [17]:
# Read the saved file back to check that it round-trips.
# STATEFP20 is read as text, as in the original lookup load.
test = pd.read_csv(
    "../data/family_planning.csv",
    dtype={"STATEFP20": object},
)

In [18]:
# Display the frame loaded from the saved CSV.
test

Unnamed: 0,state,perc_need_met16,num_served16,num_u20_served16,num_centers15,total_expend15,STATEFP20
0,Alabama,28,97600,20300,187,69742000,1
1,Alaska,54,21640,4750,157,10158000,2
2,Arizona,19,88140,16500,232,57560000,4
3,Arkansas,23,50960,10610,163,11397000,5
4,California,64,1618010,252840,1697,454706000,6
5,Colorado,35,115490,23720,221,29252000,8
6,Connecticut,38,68130,14160,92,21462000,9
7,Delaware,30,16080,4510,41,4646000,10
8,District of Columbia,88,43600,7370,31,5518000,11
9,Florida,16,214450,40050,430,42764000,12
