In [1]:
import pandas as pd
import numpy as np

In [2]:
# Lift pandas' column-display cap so wide frames are shown without elided columns.
pd.options.display.max_columns = None

# Data Dictionary for Final Cleaned Data

This dataset gives the percentage of women in each state who use particular forms of contraception.

The raw data was pulled from the Guttmacher [institute](https://data.guttmacher.org/states/table?state=AL+AK+AZ+AR+CA+CO+CT+DE+DC+FL+GA+HI+ID+IL+IN+IA+KS+KY+LA+ME+MD+MA+MI+MN+MS+MO+MT+NE+NV+NH+NJ+NM+NY+NC+ND+OH+OK+OR+PA+RI+SC+SD+TN+TX+UT+VT+VA+WA+WV+WI+WY&dataset=data&topics=283+282#footnotes) and is stored at path "../raw_data/contraceptive_use.csv".

The cleaned data is stored at the path "../data/contraceptive_use.csv".

| column name | meaning |
| ----------- | ------- |
| state | U.S. state |
| using | % of women aged 18-49 using contraceptives, 2017 [1] |
| any | % of women at risk of unintended pregnancy using contraception, 2017 [3, 2] |
| fem_steril | % of women at risk of unintended pregnancy relying on female sterilization, 2017 [3, 2] |
| male_steril | % of women at risk of unintended pregnancy relying on male sterilization, 2017 [3, 2] |
| implant | % of women at risk of unintended pregnancy relying on the contraceptive implant, 2017 [3, 2] |
| iud | % of women at risk of unintended pregnancy relying on the IUD, 2017 [3, 2] |
| pill | % of women at risk of unintended pregnancy relying on the pill, 2017 [3, 2] |
| nonlarc_horm | % of women at risk of unintended pregnancy relying on other non-LARC hormonal methods, 2017 [3, 2] |
| condoms | % of women at risk of unintended pregnancy relying on condoms, 2017 [4, 3, 2] |
| withdrawal | % of women at risk of unintended pregnancy relying on withdrawal, 2017 [3, 2] |
| other | % of women at risk of unintended pregnancy relying on other methods of contraception, 2017 [5, 3, 2] |
| none | % of women at risk of unintended pregnancy not using a contraceptive method, 2017 [3, 2] |
| year | year data is from |
| STATEFP20 | state FIPS code |
| [1] | Data are available for the 38 states, plus the District of Columbia, that asked key questions regarding women's reproductive health as part of the Behavioral Risk Factor Surveillance System in 2017. Data are also available for Puerto Rico in the full report. |
| [2] | Women were considered at risk of unintended pregnancy if they are aged 18–49, sexually active with one or more male partners, were not currently pregnant or trying to become pregnant, and had not had a hysterectomy. |
| [3] | These data are available for the 38 states, plus the District of Columbia, that asked key questions regarding women's reproductive health as part of the 2017 Behavioral Risk Factor Surveillance System. Data for Puerto Rico are included in the full report. |
| [4] | "Other non-LARC hormonal methods" are injectables, patches and vaginal rings. |
| [5] | "Other contraceptive methods" are emergency contraception, diaphragms, cervical rings, cervical caps, rhythm method, natural family planning and spermicidal methods. |
| Some NaN values come from entries labelled with [6] | Estimate is unreliable (has a denominator of less than 50 respondents or a relative standard error greater than 30%). |

# Wrangling

## Load Data

In [3]:
# Raw contraceptive-use table, read exactly as downloaded.
contraceptives = pd.read_csv(filepath_or_buffer="../raw_data/contraceptive_use.csv")

# State FIPS lookup; the code column is read as object (not int) so that
# leading 0's are kept.
fips = pd.read_csv(
    filepath_or_buffer="../data/state_fips.csv",
    dtype={"STATEFP20": object},
)

## Preview Data

In [4]:
# Peek at the first five rows of the FIPS lookup table.
fips.head(n=5)

Unnamed: 0,STATE,STATEFP20
0,Alabama,1
1,Alaska,2
2,Arizona,4
3,Arkansas,5
4,California,6


In [5]:
# Show the raw table as loaded, before any renaming or cleaning is applied.
contraceptives

Unnamed: 0,U.S. State,"% of women aged 18-49 using contraceptives, 2017 [1]","% of women at risk of unintended pregnancy using contraception, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on female sterilization, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on male sterilization, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on the contraceptive implant, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on the IUD, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on the pill, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on other non-LARC hormonal methods, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on condoms, 2017 [4, 3, 2]","% of women at risk of unintended pregnancy relying on withdrawal, 2017 [3, 2]","% of women at risk of unintended pregnancy relying on other methods of contraception, 2017 [5, 3, 2]","% of women at risk of unintended pregnancy not using a contraceptive method, 2017 [3, 2]"
0,Alabama,71.5,75.9,19.3,4.7,u [6],5.4,20.6,u [6],17.1,u [6],4.2,24.1
1,Alaska,76.5,82.7,10.6,u [6],u [6],20.6,10.2,u [6],24.2,u [6],u [6],17.3
2,Arizona,64.2,67.4,12.3,4.1,2.2,9.3,14.7,u [6],16.5,1.1,6.3,32.6
3,Arkansas,,,,,,,,,,,,
4,California,72,76.6,9.1,1.3,3.5,9.4,19.7,3.1,22.7,2.3,5.4,23.4
5,Colorado,,,,,,,,,,,,
6,Connecticut,68.8,72.8,7.4,3.5,2.4,8.4,21.9,1.5,20.6,u [6],6.1,27.2
7,Delaware,62.5,66.2,14.6,u [6],6.4,6.7,13.4,u [6],14,u [6],6.1,33.8
8,District of Columbia,65.5,71.1,3.2,u [6],u [6],9.2,13.5,u [6],28.9,2.9,8.6,28.9
9,Florida,67.9,72.6,12.1,3.1,5,7.6,15.7,1.7,20.3,0.8,6.3,27.4


## Clean Data

In [6]:
# Map each verbose raw header to the short column name used from here on.
short_names = {
    "U.S. State": "state",
    "% of women aged 18-49 using contraceptives, 2017 [1]": "using",
    "% of women at risk of unintended pregnancy using contraception, 2017 [3, 2]": "any",
    "% of women at risk of unintended pregnancy relying on female sterilization, 2017 [3, 2]": "fem_steril",
    "% of women at risk of unintended pregnancy relying on male sterilization, 2017 [3, 2]": "male_steril",
    "% of women at risk of unintended pregnancy relying on the contraceptive implant, 2017 [3, 2]": "implant",
    "% of women at risk of unintended pregnancy relying on the IUD, 2017 [3, 2]": "iud",
    "% of women at risk of unintended pregnancy relying on the pill, 2017 [3, 2]": "pill",
    "% of women at risk of unintended pregnancy relying on other non-LARC hormonal methods, 2017 [3, 2]": "nonlarc_horm",
    "% of women at risk of unintended pregnancy relying on condoms, 2017 [4, 3, 2]": "condoms",
    "% of women at risk of unintended pregnancy relying on withdrawal, 2017 [3, 2]": "withdrawal",
    "% of women at risk of unintended pregnancy relying on other methods of contraception, 2017 [5, 3, 2]": "other",
    "% of women at risk of unintended pregnancy not using a contraceptive method, 2017 [3, 2]": "none",
}

# Headers that are not keys of the mapping are left untouched by rename().
contraceptives = contraceptives.rename(columns=short_names)

In [7]:
# every raw column header is labelled 2017, so record that year as a constant column
contraceptives = contraceptives.assign(year=2017)

In [8]:
# drop footer rows without data
# The labels are hard-coded positions from the raw file. errors="ignore" keeps
# this cell safe to re-run: once the rows are gone (or the later merge has
# rebuilt the index) the labels no longer exist, and a plain drop would raise
# KeyError instead of being a no-op.
contraceptives = contraceptives.drop(index=[51, 52, 53], errors="ignore")

In [9]:
# add state fips column

# Check that every state name has a match in the FIPS lookup before merging.
# The merge below is an inner join, so a name without a match would make that
# state's row disappear from the result without any warning.
unmatched_states = set(contraceptives["state"]) - set(fips["STATE"])
assert not unmatched_states, (
    f"no FIPS match for: {sorted(map(str, unmatched_states))}"
)

contraceptives = (
    contraceptives
    .merge(right=fips, how="inner", left_on="state", right_on="STATE")
    .drop(columns=["STATE"])  # duplicate of "state" once the join is done
)

In [10]:
# Check the first five rows after renaming, adding the year and joining the FIPS codes.
contraceptives.head(n=5)

Unnamed: 0,state,using,any,fem_steril,male_steril,implant,iud,pill,nonlarc_horm,condoms,withdrawal,other,none,year,STATEFP20
0,Alabama,71.5,75.9,19.3,4.7,u [6],5.4,20.6,u [6],17.1,u [6],4.2,24.1,2017,1
1,Alaska,76.5,82.7,10.6,u [6],u [6],20.6,10.2,u [6],24.2,u [6],u [6],17.3,2017,2
2,Arizona,64.2,67.4,12.3,4.1,2.2,9.3,14.7,u [6],16.5,1.1,6.3,32.6,2017,4
3,Arkansas,,,,,,,,,,,,,2017,5
4,California,72.0,76.6,9.1,1.3,3.5,9.4,19.7,3.1,22.7,2.3,5.4,23.4,2017,6


In [11]:
# Goal of the next cells: set all unavailable ("u [6]") and not applicable
# ("n/a [6]") entries to NaN.
# Start by listing the dtypes to see which columns are not numeric yet.
contraceptives.dtypes

state           object
using           object
any             object
fem_steril      object
male_steril     object
implant         object
iud             object
pill            object
nonlarc_horm    object
condoms         object
withdrawal      object
other           object
none            object
year             int64
STATEFP20       object
dtype: object

In [12]:
# Columns to convert to numbers: everything except the state name and FIPS code.
cols = contraceptives.drop(columns=["state", "STATEFP20"]).columns

In [13]:
# Convert one column at a time; errors="coerce" turns any value that cannot be
# parsed as a number into NaN.
for col in cols:
    contraceptives[col] = pd.to_numeric(contraceptives[col], errors="coerce")

In [16]:
# Inspect the frame after numeric conversion; values that could not be parsed now show as NaN.
contraceptives

Unnamed: 0,state,using,any,fem_steril,male_steril,implant,iud,pill,nonlarc_horm,condoms,withdrawal,other,none,year,STATEFP20
0,Alabama,71.5,75.9,19.3,4.7,,5.4,20.6,,17.1,,4.2,24.1,2017,1
1,Alaska,76.5,82.7,10.6,,,20.6,10.2,,24.2,,,17.3,2017,2
2,Arizona,64.2,67.4,12.3,4.1,2.2,9.3,14.7,,16.5,1.1,6.3,32.6,2017,4
3,Arkansas,,,,,,,,,,,,,2017,5
4,California,72.0,76.6,9.1,1.3,3.5,9.4,19.7,3.1,22.7,2.3,5.4,23.4,2017,6
5,Colorado,,,,,,,,,,,,,2017,8
6,Connecticut,68.8,72.8,7.4,3.5,2.4,8.4,21.9,1.5,20.6,,6.1,27.2,2017,9
7,Delaware,62.5,66.2,14.6,,6.4,6.7,13.4,,14.0,,6.1,33.8,2017,10
8,District of Columbia,65.5,71.1,3.2,,,9.2,13.5,,28.9,2.9,8.6,28.9,2017,11
9,Florida,67.9,72.6,12.1,3.1,5.0,7.6,15.7,1.7,20.3,0.8,6.3,27.4,2017,12


## Save Cleaned Data

In [17]:
# NOTE(review): the write below is commented out, so running this notebook does
# not regenerate "../data/contraceptive_use.csv"; the read-back that follows
# loads whatever file is already on disk. Uncomment to overwrite the saved file.
# contraceptives.to_csv("../data/contraceptive_use.csv", index=False)

In [22]:
# Read the saved file back, again loading the FIPS column as object rather than int.
test = pd.read_csv(
    filepath_or_buffer="../data/contraceptive_use.csv",
    dtype={"STATEFP20": object},
)

In [23]:
# First five rows of the file as read back from disk.
test.head(n=5)

Unnamed: 0,state,using,any,fem_steril,male_steril,implant,iud,pill,nonlarc_horm,condoms,withdrawal,other,none,year,STATEFP20
0,Alabama,71.5,75.9,19.3,4.7,,5.4,20.6,,17.1,,4.2,24.1,2017,1
1,Alaska,76.5,82.7,10.6,,,20.6,10.2,,24.2,,,17.3,2017,2
2,Arizona,64.2,67.4,12.3,4.1,2.2,9.3,14.7,,16.5,1.1,6.3,32.6,2017,4
3,Arkansas,,,,,,,,,,,,,2017,5
4,California,72.0,76.6,9.1,1.3,3.5,9.4,19.7,3.1,22.7,2.3,5.4,23.4,2017,6
