In [1]:
import pandas as pd
import pickle
import numpy as np

# Feature Engineering

In [2]:
# Load the data and column descriptions.
df = pd.read_csv('cleaned_nsfg_data.csv')
with open('2015_2017_FemPregData.pickle', 'rb') as fs:
    cols = pickle.load(fs)
df.drop(columns=['Unnamed: 0'], inplace=True)

# Add a constant column for regression later
df['const'] = 1

The first step is one-hot encoding all data fields that are not numeric. These data fields include marital status, race, religion, whether the child was wanted, the outcome of the pregnancy, and others. These fields should be one-hot encoded because otherwise regression cannot be performed on them.

In [3]:
one_hot_fields = ['EVUSEINT', 'STOPDUSE', 'WANTBOLD', 'HPWNOLD', 
                  'COHPBEG', 'COHPEND', 'OUTCOME', 'WANTPART',
                  'NEWWANTR', 'RMARITAL', 'RACE', 'LABORFOR',
                  'RELIGION']

for field in one_hot_fields:
    one_hot = pd.get_dummies(df[field],prefix=field)
    df.drop(columns=[field],inplace=True)
    df = df.join(one_hot)
    
df.head()

Unnamed: 0,CASEID,PREGORDR,BORNALIV,BIRTHWGT_LB1,BIRTHWGT_OZ1,PRGLNGTH,AGECON,EDUCAT,HISPANIC,PREGNUM,...,LABORFOR_Maternity-Family-leave,LABORFOR_Other,LABORFOR_Part-time,LABORFOR_School,LABORFOR_Stay-at-home,LABORFOR_Temp-or-ill,RELIGION_Catholic,RELIGION_None,RELIGION_Other,RELIGION_Protestant
0,70627,1,1,7.0,8.0,40,28,16,0,3,...,0,0,0,0,0,0,1,0,0,0
1,70627,2,0,,,14,32,16,0,3,...,0,0,0,0,0,0,1,0,0,0
2,70627,3,1,9.0,2.0,39,33,16,0,3,...,0,0,0,0,0,0,1,0,0,0
3,70628,1,1,6.0,9.0,39,17,15,0,3,...,0,0,0,0,0,0,0,0,0,1
4,70628,2,1,7.0,0.0,39,19,15,0,3,...,0,0,0,0,0,0,0,0,0,1


Add birthweights - originally, birthweights are recorded in two separate columns, one for pounds and one for ounces. We will convert both to kilograms and add them together in a new column. This is a good decision because it doesn't make sense to look at either of these columns separately. Just the number of ounces (less than a pound) that a baby weighed probably won't have any predictive power.

In [4]:
cols['BIRTHWGT_KG'] = 'Birthweight in Kilograms'
df['BIRTHWGT_KG'] = (
    pd.to_numeric(df['BIRTHWGT_LB1'])/2.205 
    + pd.to_numeric(df['BIRTHWGT_OZ1'])/35.74)
df.drop(columns=['BIRTHWGT_LB1', 'BIRTHWGT_OZ1'],inplace=True)
df['BIRTHWGT_KG']

0       3.398442
1            NaN
2       4.137592
3       2.972907
4       3.174603
          ...   
9347    3.684078
9348    3.230563
9349         NaN
9350    3.963876
9351    3.084827
Name: BIRTHWGT_KG, Length: 9352, dtype: float64

Add a column that shows whether a specific person (denoted by CASEID) has ever had a failed pregnancy before the current pregnancy. This feature could be very useful in predicting future miscarriages.

In [5]:
df['HAS_HAD_MISC'] = 0
for case_id in set(df['CASEID']):
    has_had_miscarriage = 0
    bornaliv_col = df[df['CASEID'] == case_id]['BORNALIV']
    for idx in bornaliv_col.index:
        df.loc[idx,'HAS_HAD_MISC'] = has_had_miscarriage
        if df.loc[idx,'BORNALIV'] == 0:
            has_had_miscarriage = 1
            
df.head()

Unnamed: 0,CASEID,PREGORDR,BORNALIV,PRGLNGTH,AGECON,EDUCAT,HISPANIC,PREGNUM,PUBASSIS,POVERTY,...,LABORFOR_Part-time,LABORFOR_School,LABORFOR_Stay-at-home,LABORFOR_Temp-or-ill,RELIGION_Catholic,RELIGION_None,RELIGION_Other,RELIGION_Protestant,BIRTHWGT_KG,HAS_HAD_MISC
0,70627,1,1,40,28,16,0,3,0,5.0,...,0,0,0,0,1,0,0,0,3.398442,0
1,70627,2,0,14,32,16,0,3,0,5.0,...,0,0,0,0,1,0,0,0,,0
2,70627,3,1,39,33,16,0,3,0,5.0,...,0,0,0,0,1,0,0,0,4.137592,1
3,70628,1,1,39,17,15,0,3,0,1.89,...,0,0,0,0,0,0,0,1,2.972907,0
4,70628,2,1,39,19,15,0,3,0,1.89,...,0,0,0,0,0,0,0,1,3.174603,0


In [6]:
# Save the feature-engineered data
df.to_csv('featured_nsfg_data.csv')