In [None]:
import re
import numpy as np

In [None]:
import gzip

In [None]:
import pandas as pd

In [None]:
# Adapted from Allen Downey's code, with some modifications

class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with at least 'start', 'end' and 'name' columns
        index_base: are the indices 0 or 1 based?

        Attributes:
        variables: the DataFrame that was passed in
        colspecs: list of [start, end] pairs of int
        names: Series of string variable names
        """
        self.variables = variables

        # index_base is subtracted from both start and end, so pass
        # index_base=1 to turn 1-based positions into 0-based offsets;
        # the default of 0 leaves the positions unchanged
        colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pairs of int; the builtin int is
        # used because the np.int alias was removed in NumPy 1.24
        self.colspecs = colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename (or anything else pd.read_fwf accepts)
        options: extra keyword arguments passed through to pd.read_fwf

        returns: DataFrame
        """
        # hand read_fwf a plain list of names rather than a Series
        df = pd.read_fwf(filename,
                         colspecs=self.colspecs,
                         names=list(self.names),
                         **options)
        return df

In [None]:
def CleanFemPreg(df):
    '''Recodes variables from the pregnancy frame.

    The frame is modified in place; nothing is returned.

    df: DataFrame
    '''
    # mother's age is encoded in centiyears; convert to years
    df['agepreg'] = df['agepreg'] / 100.0

    # birthwgt_lb contains at least one bogus value (51 lbs)
    # replace with NaN; mask() upcasts an integer column to float
    # instead of relying on setting NaN into it through .loc
    df['birthwgt_lb'] = df['birthwgt_lb'].mask(df['birthwgt_lb'] > 20)

    # replace 'not ascertained', 'refused', 'don't know' with NaN
    # NOTE: the result is assigned back to the column; calling
    # replace(..., inplace=True) on df.column acts on a temporary Series
    # and does not update the frame when Copy-on-Write is enabled
    na_vals = [97, 98, 99]
    for column in ['birthwgt_lb', 'birthwgt_oz', 'hpagelb']:
        df[column] = df[column].replace(na_vals, np.nan)

    df['babysex'] = df['babysex'].replace([7, 9], np.nan)
    df['nbrnaliv'] = df['nbrnaliv'].replace([9], np.nan)

    # birthweight is stored in two columns, lbs and oz.
    # convert to a single column in lb
    # NOTE: creating a new column requires dictionary syntax,
    # not attribute assignment (like df.totalwgt_lb)
    df['totalwgt_lb'] = df['birthwgt_lb'] + df['birthwgt_oz'] / 16.0

    # due to a bug in ReadStataDct, the last variable gets clipped;
    # so for now set it to NaN
    df['cmintvw'] = np.nan

In [None]:
# Parse a Stata dictionary (.dct) file into a FixedWidthVariables object

def ReadStataDct(dct_file):
    """Reads a Stata dictionary file.

    Every line that contains a `_column(N)` marker is treated as one
    variable: N is its start position and the following tokens are its
    type, name, format string and description.

    dct_file: string filename

    returns: FixedWidthVariables object

    raises: ValueError if the file contains no `_column(...)` lines
    """
    type_map = dict(byte=int, int=int, long=int, float=float, double=float)

    var_info = []
    # use a context manager so the file handle is always closed
    with open(dct_file) as dct_lines:
        for line in dct_lines:
            match = re.search(r'_column\(([^)]*)\)', line)
            if not match:
                continue
            start = int(match.group(1))
            t = line.split()
            vtype, name, fstring = t[1:4]
            name = name.lower()
            if vtype.startswith('str'):
                vtype = 'str'
            else:
                vtype = type_map[vtype]
            long_desc = ' '.join(t[4:]).strip('"')
            var_info.append((start, vtype, name, fstring, long_desc))

    if not var_info:
        # without this check the failure surfaces later as an obscure
        # NaN-to-int cast error inside FixedWidthVariables
        raise ValueError('no _column(...) definitions found in %s' % dct_file)

    columns = ['start', 'type', 'name', 'fstring', 'desc']
    variables = pd.DataFrame(var_info, columns=columns)

    # each variable ends where the next one starts
    variables['end'] = variables.start.shift(-1)

    # the last variable has no successor, so its end is set to 0; with
    # index_base=1 below this becomes -1
    variables.loc[len(variables)-1, 'end'] = 0

    dct = FixedWidthVariables(variables, index_base=1)
    return dct
    
    

In [None]:
def ReadFemPreg(dct_file='2002FemPreg.dct', dat_file='2002FemPreg.dat.gz'):
    """Reads the NSFG pregnancy data.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    # use the ReadStataDct defined in this notebook; the thinkstats2
    # module is never imported here
    dct = ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip')
    CleanFemPreg(df)
    return df

In [None]:
# Parse the Stata dictionary into dct, use dct to read the gzipped
# fixed-width data file into df, then recode df in place
dct=ReadStataDct("2002FemPreg.dct")
df=dct.ReadFixedWidth("2002FemPreg.dat.gz",compression='gzip')
CleanFemPreg(df)

# Uncomment one of these to inspect the parsed column layout
#dct.colspecs
#dct.variables
#dct.names

In [None]:
# Count the rows for each distinct outcome value, not sorted by frequency
df['outcome'].value_counts(sort=False)

In [None]:
# Count the rows for each distinct birthwgt_lb value, not sorted by frequency
df['birthwgt_lb'].value_counts(sort=False)

In [292]:
# Create a dictionary that maps each case ID to the list of row indices
# in the data set that belong to that case
def MakePregMap(df):
    """Makes a map from caseid to the rows recorded for that case.

    df: DataFrame with a caseid column

    returns: dict that maps from caseid to list of index values into df
    """
    # seed the keys from the sorted unique caseids so that the key order
    # does not depend on the row order of df
    d = {caseid: [] for caseid in np.unique(df.caseid)}

    # Series.iteritems() was removed in pandas 2.0; items() yields the
    # same (index, value) pairs
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d

In [301]:
# Look up every row recorded for one caseid and show the outcome values
caseid = 10229
preg_map = MakePregMap(df)
indices = preg_map[caseid]
df.loc[indices, 'outcome'].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)

## Compare the result with Birthord [here](https://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [316]:
# Count the rows for each distinct birthord value, not sorted by frequency
df['birthord'].value_counts(sort=False)

1.0     4413
2.0     2874
3.0     1234
4.0      421
5.0      126
6.0       50
7.0       20
8.0        7
10.0       1
9.0        2
Name: birthord, dtype: int64

In [318]:
# Count the rows for each distinct pregordr value, most frequent first
df['pregordr'].value_counts()

1     5033
2     3766
3     2334
4     1224
5      613
6      308
7      158
8       78
9       38
10      17
11       8
12       5
13       3
14       3
18       1
19       1
17       1
16       1
15       1
Name: pregordr, dtype: int64

**Use birthord to select the records for first babies and others. How many are there of each?**

In [328]:
# Split on birthord: rows where it is exactly 1 versus rows where it is
# greater than 1 (rows with a missing birthord fall in neither group)
firsts = df.loc[df['birthord'] == 1]
others = df.loc[df['birthord'] > 1]
(len(firsts), len(others))

(4413, 4735)

In [330]:
# Mean total birth weight (lb) for each group
(firsts['totalwgt_lb'].mean(), others['totalwgt_lb'].mean())

(7.201094430437772, 7.325855614973262)

**Compute the mean prglngth for first babies and others. Compute the difference in means, expressed in hours.**

In [335]:
# Mean prglngth for each group; second_preglngth_mean is the mean for `others`
firsts_preglngth_mean = firsts['prglngth'].mean()
second_preglngth_mean = others['prglngth'].mean()

In [340]:
# Difference between the two group means, converted to hours
# (x 7 days x 24 hours) -- assumes prglngth is measured in weeks; TODO confirm
(firsts_preglngth_mean - second_preglngth_mean)*7*24

13.11026081862832