# NSFG Pregnancy Dataset ETL

In [None]:
from typing import List, DefaultDict
from collections import defaultdict
import numpy as np
import pandas as pd

In [None]:
%load_ext autoreload

In [None]:
import sys
sys.path.append('../lib')

In [None]:
import fwf

Load the data file

In [None]:
# Read the fixed-width pregnancy file through the project-local `fwf` helper.
# The first path is the .dct file and the second the gzipped .dat file
# (presumably the column dictionary and the raw records -- confirm against fwf).
# nrows = 10000  (leftover setting: no row limit is passed to the call below)
df = fwf.read_fixed_width(
    '../data/2002FemPreg.dct',
    '../data/2002FemPreg.dat.gz'
)

In [None]:
df.shape

The mother's age (`agepreg`) is encoded in centiyears (hundredths of a year); convert it to years.

In [None]:
df.agepreg

In [None]:
# Rescale the column from hundredths of a year to years.
# Note: re-running this cell divides the values again.
df['agepreg'] = df['agepreg'] / 100

`birthwgt_lb` contains at least one bogus value (51 lbs); replace every value above 20 lbs with NaN.

In [None]:
df.loc[df.birthwgt_lb > 20, 'birthwgt_lb']

In [None]:
# Blank out every pound value above 20 by overwriting it with NaN.
too_heavy = df['birthwgt_lb'] > 20
df.loc[too_heavy, 'birthwgt_lb'] = np.nan

Replace the special codes 97, 98 and 99 (used for 'not ascertained', 'refused' and 'don't know') with NaN.

In [None]:
# Codes 97/98/99 are sentinels rather than measurements; turn them into NaN.
na_vals = [97, 98, 99]

# Assign the result back instead of calling `replace(..., inplace=True)` on
# `df.<column>`: that form mutates a temporary Series, which raises a
# chained-assignment warning on recent pandas and leaves `df` untouched
# once Copy-on-Write is enabled.
for column in ('birthwgt_lb', 'birthwgt_oz', 'hpagelb'):
    df[column] = df[column].replace(na_vals, np.nan)

The sex of the baby contains some abnormal values

In [None]:
df.babysex.value_counts().sort_index()

In [None]:
# Codes 7 and 9 become NaN. The result is assigned back because an in-place
# replace on `df.babysex` acts on a temporary Series and does not reliably
# update `df` (it is a no-op under pandas Copy-on-Write).
df['babysex'] = df['babysex'].replace([7, 9], np.nan)

Same for the number of babies born alive (`nbrnaliv`): replace the code 9 with NaN.

In [None]:
df.nbrnaliv.value_counts()

In [None]:
# Code 9 becomes NaN. The result is assigned back because an in-place replace
# on `df.nbrnaliv` acts on a temporary Series and does not reliably update
# `df` (it is a no-op under pandas Copy-on-Write).
df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

Let's make them nullable 8-bit integers.

In [None]:
# Cast both columns to pandas' nullable 8-bit integer type, which can hold
# the missing values introduced above.
int8_columns = ['nbrnaliv', 'babysex']
df = df.astype(dict.fromkeys(int8_columns, pd.Int8Dtype()))

Birth weight is stored in two columns, pounds (`birthwgt_lb`) and ounces (`birthwgt_oz`). Combine them into a single column in pounds (`totalwgt_lb`).

In [None]:
# 16 oz per lb: express the ounce part as a fraction of a pound, then add it
# to the whole pounds to get one combined weight column.
ounces_as_lb = df['birthwgt_oz'] / 16
df = df.assign(totalwgt_lb=df['birthwgt_lb'] + ounces_as_lb)

In [None]:
df.head()

In [None]:
df.multbrth.value_counts()

Finally, convert some more columns to nullable integer types.

In [None]:
df.multbrth

In [None]:
# Preview only: the converted Series is displayed but not assigned back,
# so `df` itself is left unchanged by this cell.
df.multbrth.astype(pd.Int8Dtype())

In [None]:
# Columns grouped by the nullable integer width they are cast to.
int8_columns = ['pregend1', 'pregend2', 'moscurrp', 'multbrth']
int32_columns = [
    'howpreg_n',
    'howpreg_p',
    'nowprgdk',
    'birthwgt_lb',
    'birthwgt_oz',
    'birthord',
]

target_dtypes = {
    **dict.fromkeys(int8_columns, pd.Int8Dtype()),
    **dict.fromkeys(int32_columns, pd.Int32Dtype()),
}
df = df.astype(target_dtypes)

In [None]:
# save in a format that preserves the data types
# (writes the cleaned frame to a Feather file in the data directory)
df.to_feather('../data/2002FemPreg.feather')

In [None]:
# Read the Feather file back so the checks that follow run against the
# data as it was saved to disk, not the in-memory frame.
df = pd.read_feather('../data/2002FemPreg.feather')

Check data

In [None]:
# Spot checks: total row count and the caseid stored under index label 13592.
assert len(df) == 13593
assert df['caseid'][13592] == 12571

# Frequency checks as (column, value, number of rows holding that value).
expected_counts = [
    ('pregordr', 1, 5033),
    ('nbrnaliv', 1, 8981),
    ('babysex', 1, 4641),
    ('birthwgt_lb', 7, 3049),
    ('birthwgt_oz', 0, 1037),
    ('prglngth', 39, 4744),
    ('outcome', 1, 9148),
    ('birthord', 1, 4413),
    ('agepreg', 22.75, 100),
    ('totalwgt_lb', 7.5, 302),
]
for column, value, expected in expected_counts:
    assert df[column].value_counts()[value] == expected

In [None]:
# The largest `finalwgt` value should occur in exactly six rows.
weights = df['finalwgt'].value_counts()
key = weights.index.max()
assert weights[key] == 6

## Interpretation

To work with data effectively, you have to think on two levels at the same time: the level of statistics and the level of context.

As an example, let’s look at the sequence of outcomes for a few respondents. Because of the way the data files are organized, we have to do some processing to collect the pregnancy data for each respondent. Here’s a function that does that:

Later we will validate that the `pregnum` column in `resp` matches the number of entries in `preg`; the first step is a map from each `caseid` to the rows of its pregnancies.

In [None]:
def make_preg_map(df: pd.DataFrame) -> DefaultDict[int, List[int]]:
    """
    Make a map from caseid to list of preg indices.

    df: pregnancy DataFrame; must have a `caseid` column

    returns: defaultdict that maps each caseid to the list of index labels
             of its rows in `df`, in row order; looking up a caseid that
             has no rows yields an empty list
    """
    d = defaultdict(list)
    # Series.items() is the supported spelling; Series.iteritems() was
    # deprecated and then removed in pandas 2.0.
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d

In [None]:
preg_map = make_preg_map(df)

In [None]:
# case id 23 appears in lines 21 and 22
preg_map[23]

In [None]:
df.loc[preg_map[23], 'caseid']

Load the responses

In [None]:
import nsfg

In [None]:
# Load the respondent table through the project-local `nsfg` helper
# (presumably it locates the respondent files inside the given directory --
# confirm against nsfg.read_fem_resp).
resp = nsfg.read_fem_resp('../data')

In [None]:
# The respondent table should have 7643 rows, 1267 of them with pregnum == 1.
assert len(resp) == 7643
assert resp['pregnum'].value_counts()[1] == 1267

In [None]:
def validate_preg_num(resp: pd.DataFrame, preg: pd.DataFrame) -> bool:
    """Validate pregnum in the respondent file.

    resp: respondent DataFrame; must have `caseid` and `pregnum` columns
    preg: pregnancy DataFrame; must have a `caseid` column

    returns: True if, for every respondent, `pregnum` equals the number of
             rows with that caseid in `preg`. On the first mismatch the
             caseid, the record count and the pregnum are printed and
             False is returned.
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = make_preg_map(preg)

    # Walk the two respondent columns in lockstep. This replaces
    # Series.iteritems() (removed in pandas 2.0) and the per-row label
    # lookup `resp.caseid[index]`, which misbehaves on duplicate labels.
    for caseid, pregnum in zip(resp.caseid, resp.pregnum):
        # .get() keeps the probe from inserting empty entries into the map
        n_records = len(preg_map.get(caseid, []))

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if n_records != pregnum:
            print(caseid, n_records, pregnum)
            return False

    return True

In [None]:
validate_preg_num(resp, df)