# NSFG Pregnancy Dataset ETL

In [1]:
from typing import List, DefaultDict
from collections import defaultdict
import numpy as np
import pandas as pd

In [2]:
import sys
sys.path.append('../lib')

In [3]:
import fwf

Load the data file

In [4]:
# nrows = 10000
# NOTE(review): `fwf` is a project-local helper imported from ../lib; presumably
# the .dct file supplies the column layout used to parse the gzipped
# fixed-width .dat file - confirm in the helper's source.
df = fwf.read_stata_fixed_width(
    '../data/2002FemPreg.dct',
    '../data/2002FemPreg.dat.gz'
)

In [5]:
df.shape

(13593, 243)

Mother's age at the end of the pregnancy (`agepreg`) is encoded in centiyears; convert it to years.

In [6]:
df.agepreg

0        3316.0
1        3925.0
2        1433.0
3        1783.0
4        1833.0
          ...  
13588    1791.0
13589    1850.0
13590    1975.0
13591    2158.0
13592    2158.0
Name: agepreg, Length: 13593, dtype: float64

In [7]:
# Rescale from centiyears to years (e.g. 3316.0 -> 33.16).
# Not idempotent: re-running this cell divides by 100 again.
df['agepreg'] = df['agepreg'] / 100

`birthwgt_lb` contains at least one bogus value (51 lbs); replace every value above 20 lbs with NaN.

In [8]:
len(df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'])

60

In [9]:
# Any recorded weight above 20 lb is treated as invalid and blanked out.
implausible_lb = df['birthwgt_lb'] > 20
df.loc[implausible_lb, 'birthwgt_lb'] = np.nan

Replace the special codes 97, 98 and 99 (used for 'not ascertained', 'refused' and 'don't know') with NaN.

In [10]:
# 97, 98 and 99 are special survey codes, not measurements.
na_vals = [97, 98, 99]
coded_cols = ['birthwgt_lb', 'birthwgt_oz', 'hpagelb']

# Assign the result back to the frame. Calling
# `df.col.replace(..., inplace=True)` works on a Series pulled out of the
# frame (chained assignment); recent pandas warns about it, and under
# Copy-on-Write it leaves `df` untouched.
df[coded_cols] = df[coded_cols].replace(na_vals, np.nan)

The sex of the baby contains some abnormal values

In [11]:
df.babysex.value_counts().sort_index()

1.0    4641
2.0    4500
7.0       1
9.0       2
Name: babysex, dtype: int64

In [12]:
# 7 and 9 are not valid sex codes (only 1 and 2 carry data); blank them out.
# Assign back explicitly: a chained `df.babysex.replace(..., inplace=True)`
# is not guaranteed to modify `df` (it is a no-op under Copy-on-Write).
df['babysex'] = df['babysex'].replace([7, 9], np.nan)

The same applies to the number of babies born alive (`nbrnaliv`): the code 9 is not a real count, so replace it with NaN.

In [13]:
df.nbrnaliv.value_counts()

1.0    8981
2.0     138
3.0      14
5.0       6
4.0       5
9.0       4
Name: nbrnaliv, dtype: int64

In [14]:
# 9 is a special code rather than a real count; blank it out.
# Assign back explicitly: a chained `df.nbrnaliv.replace(..., inplace=True)`
# is not guaranteed to modify `df` (it is a no-op under Copy-on-Write).
df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

Birth weight is stored in two columns, pounds and ounces. Combine them into a single column in pounds.

In [15]:
# 16 oz to the pound; a NaN in either component makes the total NaN.
ounces_as_lb = df['birthwgt_oz'] / 16
df = df.assign(totalwgt_lb=df['birthwgt_lb'] + ounces_as_lb)

In [16]:
df.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231,6.1875


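Finally, convert some more columns to integers. Before that, drop the columns that are NaN in more than half of the rows and keep only live births (`outcome == 1`) that have a total birth weight.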

In [17]:
# Number of missing values per column, with the emptiest columns first.
na_counts = df.isna().sum().sort_values(ascending=False)

In [18]:
df.shape

(13593, 244)

In [19]:
# Columns that are missing in more than half of the rows.
mostly_missing = na_counts > len(df) // 2
columns_to_drop = na_counts[mostly_missing].index.values

In [20]:
# Remove the mostly-empty columns selected above.
df = df.drop(columns=columns_to_drop)

In [21]:
df.shape

(13593, 119)

In [22]:
# Missing-value count per column, restricted to rows with outcome == 1 that
# have a recorded `birthwgt_lb`.
# NOTE(review): the filter applied to `df` a few cells later drops on
# `totalwgt_lb` instead of `birthwgt_lb` - confirm the difference is intended.
live_with_lb = df[df['outcome'] == 1].dropna(subset=['birthwgt_lb'])
na_counts = live_with_lb.isna().sum()

In [23]:
len(na_counts[na_counts == 0])

100

In [24]:
# Keep rows with outcome == 1 whose combined birth weight is known.
is_live = df['outcome'] == 1
df = df[is_live].dropna(subset=['totalwgt_lb'])

In [25]:
df.shape

(9038, 119)

In [26]:
df.head()

Unnamed: 0,caseid,pregordr,pregend1,nbrnaliv,prgoutcome,cmprgend,cmprgbeg,gestasun_m,gestasun_w,wksgest,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,6.0,1.0,1.0,1093.0,1084.0,9.0,0.0,39.0,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231,8.8125
1,1,2,6.0,1.0,1.0,1166.0,1157.0,9.0,0.0,39.0,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,1231,7.875
2,2,1,5.0,3.0,1.0,1156.0,1147.0,0.0,39.0,39.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231,9.125
3,2,2,6.0,1.0,1.0,1198.0,1189.0,0.0,39.0,39.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231,7.0
4,2,3,6.0,1.0,1.0,1204.0,1195.0,0.0,39.0,39.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,1231,6.1875


Convert indicator columns to integers

In [27]:
# Names of every column whose name ends in '_i'.
i_cols = df.columns[df.columns.str.endswith('_i')].tolist()

In [28]:
# Cast every '_i' column to a plain integer dtype in one call.
df = df.astype(dict.fromkeys(i_cols, int))

In [29]:
df.poverty_i.value_counts()

0    8539
1     499
Name: poverty_i, dtype: int64

In [31]:
# Columns cast from float to int. astype(int) raises if any of them still
# contains NaN, so this only works on the filtered frame.
int_cols = [
    'pregend1',
    'nbrnaliv',
    'prgoutcome',
    'gestasun_m',
    'gestasun_w',
    'babysex',
    'birthwgt_lb',
    'birthord',
]
df = df.astype(dict.fromkeys(int_cols, int))

In [32]:
# Label each row 'firsts' (birthord == 1) or 'others' as a categorical column.
is_first = df['birthord'] == 1
df = df.assign(birthcat=pd.Categorical(np.where(is_first, 'firsts', 'others')))

  df['birthcat'] = pd.Categorical(np.where(df.birthord==1, 'firsts', 'others'))


In [33]:
# Replace the gappy row labels left by the filtering with a fresh 0..n-1
# index; without drop=True the old labels are kept as an `index` column.
df = df.reset_index()

In [34]:
# save in a format that preserves the data types
df.to_feather('../data/live_births.feather')

In [None]:
# NOTE(review): no visible cell writes ../data/2002FemPreg.feather (the save
# cell above writes ../data/live_births.feather), so a fresh run depends on a
# file produced elsewhere - confirm its origin. This also rebinds `df`, so the
# cleaned live-births frame is no longer reachable under that name.
df = pd.read_feather('../data/2002FemPreg.feather')

Check data

In [None]:
# Spot-check the reloaded frame. The expected row count (13593) matches the
# shape of the raw file loaded at the top of the notebook.
assert len(df) == 13593
assert df.caseid[13592] == 12571
# Frequency checks on raw and cleaned columns.
assert df.pregordr.value_counts()[1] == 5033
assert df.nbrnaliv.value_counts()[1] == 8981
assert df.babysex.value_counts()[1] == 4641
assert df.birthwgt_lb.value_counts()[7] == 3049
assert df.birthwgt_oz.value_counts()[0] == 1037


# Frequency checks on further columns, including the derived `totalwgt_lb`.
assert df.prglngth.value_counts()[39] == 4744
assert df.outcome.value_counts()[1] == 9148
assert df.birthord.value_counts()[1] == 4413
assert df.agepreg.value_counts()[22.75] == 100
assert df.totalwgt_lb.value_counts()[7.5] == 302

In [None]:
# The largest distinct `finalwgt` value is expected to occur exactly 6 times.
weights = df['finalwgt'].value_counts()
key = weights.index.max()
assert weights[key] == 6

## Interpretation

To work with data effectively, you have to think on two levels at the same time: the level of statistics and the level of context.

As an example, let’s look at the sequence of outcomes for a few respondents. Because of the way the data files are organized, we have to do some processing to collect the pregnancy data for each respondent. Here’s a function that does that:

Further below, the same map is used to validate that the `pregnum` column in `resp` matches the number of entries in `preg`.

In [None]:
def make_preg_map(df: pd.DataFrame) -> DefaultDict[int, List[int]]:
    """
    Make a map from caseid to list of preg indices.

    df: DataFrame with a `caseid` column

    returns: defaultdict that maps each caseid to the list of index labels of
    the rows of `df` carrying that caseid, in row order. Looking up a caseid
    that is absent yields (and stores) an empty list.
    """
    d = defaultdict(list)
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (index label, value) pairs and also works on older pandas versions.
    for index, caseid in df['caseid'].items():
        d[caseid].append(index)
    return d

In [None]:
preg_map = make_preg_map(df)

In [None]:
# case id 23 appears in lines 21 and 22
preg_map[23]

In [None]:
df.loc[preg_map[23], 'caseid']

Load the responses

In [None]:
import nsfg

In [None]:
resp = nsfg.read_fem_resp('../data')

In [None]:
# Sanity-check the respondent file: row count and number of respondents
# reporting exactly one pregnancy.
assert len(resp) == 7643
assert resp['pregnum'].value_counts()[1] == 1267

In [None]:
def validate_preg_num(resp: pd.DataFrame, preg: pd.DataFrame) -> bool:
    """Validate pregnum in the respondent file.

    resp: respondent DataFrame with `caseid` and `pregnum` columns
    preg: pregnancy DataFrame with a `caseid` column

    returns: True when every respondent's `pregnum` equals the number of
    rows in `preg` with the same caseid. On the first mismatch, prints
    `caseid, record count, pregnum` and returns False.
    """
    # make the map from caseid to list of pregnancy indices
    preg_map = make_preg_map(preg)

    # Walk the caseid/pregnum columns in lockstep. This replaces
    # Series.iteritems() (removed in pandas 2.0) and the per-row label lookup
    # `resp.caseid[index]`, which returns a Series instead of a scalar when
    # `resp` has duplicate index labels.
    for caseid, pregnum in zip(resp['caseid'], resp['pregnum']):
        n_records = len(preg_map[caseid])

        # check that pregnum from the respondent file equals
        # the number of records in the pregnancy file
        if n_records != pregnum:
            print(caseid, n_records, pregnum)
            return False

    return True

In [None]:
validate_preg_num(resp, df)