Exercise from Think Stats, 2nd Edition (thinkstats2.com)<br>
Allen Downey

In [1]:
%matplotlib inline



Suppose one of your co-workers is expecting a baby and you are participating in an office pool to predict the date of birth. Assuming that bets are placed during the 30th week of pregnancy, what variables could you use to make the best prediction? You should limit yourself to variables that are known before the birth, and likely to be available to the people in the pool. 

In [2]:
import nsfg

data = nsfg.ReadFemPreg()

In [5]:
data_filtered = data[(data.prglngth > 30) & (data.prgoutcome == 1)]

In [6]:
import thinkstats2

def ReadFemResp(dct_file='2002FemResp.dct',
                dat_file='2002FemResp.dat.gz',
                nrows=None):
    """Reads the NSFG respondent data.

    dct_file: string file name
    dat_file: string file name

    returns: DataFrame
    """
    dct = thinkstats2.ReadStataDct(dct_file)
    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
    return df

respondents = ReadFemResp()

In [24]:
import pandas as pd
import operator
import numpy as np

joined = data_filtered.join(respondents, on='caseid', rsuffix='_r')
data_mining = {}
for column_name, column_data in joined.iteritems():
    if column_name == 'prglngth':
        continue
    if column_data.isnull().mean() < 0.10:
        relevant_columns = pd.concat((column_data, joined.prglngth), axis=1).dropna()
        try:
            intercept, slope = thinkstats2.LeastSquares(relevant_columns[column_name],
                                                        relevant_columns.prglngth)
            res = slope*relevant_columns[column_name]+intercept - relevant_columns.prglngth
            coef = thinkstats2.CoefDetermination(relevant_columns.prglngth, res)
            if not np.isnan(coef):
                data_mining[column_name] = coef
        except Exception as ex:
            print ex

sorted_vars = reversed(sorted(data_mining.items(), key=operator.itemgetter(1)))
for var in sorted_vars:
    print var

(u'wksgest', 0.80624341161392232)
('totalwgt_lb', 0.12445743148119393)
(u'birthwgt_lb', 0.11977307804917148)
(u'lbw1', 0.10372542204583945)
(u'mosgest', 0.095624319895933896)
(u'prglngth_i', 0.022053775796484043)
(u'nbrnaliv', 0.0045775657855329221)
(u'pregend1', 0.0022493894338145859)
(u'cmlastlb', 0.0020431424422129307)
(u'fmarcon5_i', 0.0019681593242774076)
(u'gestasun_m', 0.0016571319550200414)
(u'sest', 0.0013223681981762159)
(u'cmlstprg', 0.0012828619646515493)
(u'birthord', 0.0012372736736778744)
(u'poverty', 0.0011234153757080367)
(u'gestasun_w', 0.0010513799087294995)
(u'cmintstr', 0.00087396929583005711)
(u'fmarout5_i', 0.00083790938191452558)
(u'rmarout6_i', 0.0007917681629084905)
(u'hispanic', 0.00066009164476110715)
(u'pregordr', 0.00062224148607814556)
(u'educat', 0.00059068244034643502)
(u'fmarital', 0.00057247806406190982)
(u'rmarital', 0.00056462084046027261)
(u'oldwantr', 0.00050750617871297976)
(u'wantresp', 0.00049918540108428999)
(u'agepreg_i', 0.000428924881668013

Of these, only a subset would actually reasonably be available to us.  I'd choose: race, nbrnaliv, birthord, fmarcon5.

In [25]:
from sklearn.linear_model import LinearRegression
import numpy as np

model = LinearRegression()
filtered = joined.dropna(subset=['birthord', 'poverty', 'nbrnaliv','prglngth'])
X = filtered[['birthord', 'race', 'nbrnaliv', 'fmarcon5']]
y = filtered.prglngth
model.fit(X, y)
print "RMS model", np.sqrt(((model.predict(X) - y)**2).mean())
print "RMS baseline", np.sqrt(((y - y.mean())**2).mean())

RMS model 1.89264008596
RMS baseline 1.89815725743
