# Examples and Exercises from Think Stats, 2nd Edition

http://thinkstats2.com

Copyright 2016 Allen B. Downey

MIT License: https://opensource.org/licenses/MIT


In [1]:
from __future__ import print_function, division

from collections import defaultdict

import numpy as np

import nsfg

## Examples from Chapter 1

Read NSFG data into a Pandas DataFrame.

In [2]:
preg = nsfg.ReadFemPreg()
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


Print the column names.

In [3]:
preg.columns

Index(['caseid', 'pregordr', 'howpreg_n', 'howpreg_p', 'moscurrp', 'nowprgdk',
       'pregend1', 'pregend2', 'nbrnaliv', 'multbrth',
       ...
       'laborfor_i', 'religion_i', 'metro_i', 'basewgt', 'adj_mod_basewgt',
       'finalwgt', 'secu_p', 'sest', 'cmintvw', 'totalwgt_lb'],
      dtype='object', length=244)

Select a single column name.

In [4]:
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


In [5]:
# Subset of the pregnancy columns.  Each label is listed exactly once:
# repeating a label duplicates the column, and selecting a duplicated
# label returns a DataFrame instead of a Series.
used_preg = preg[['caseid', 'prglngth', 'pregordr', 'birthwgt_lb', 'birthwgt_oz']]

In [6]:
preg.columns[1]

'pregordr'

Select a column and check what type it is.

In [7]:
pregordr = preg['pregordr']
type(pregordr)

pandas.core.series.Series

Print a column.

In [8]:
pregordr

0        1
1        2
2        1
3        2
4        3
5        1
6        2
7        3
8        1
9        2
10       1
11       1
12       2
13       3
14       1
15       2
16       3
17       1
18       2
19       1
20       2
21       1
22       2
23       1
24       2
25       3
26       1
27       1
28       2
29       3
        ..
13563    2
13564    3
13565    1
13566    1
13567    1
13568    2
13569    1
13570    2
13571    3
13572    4
13573    1
13574    2
13575    1
13576    1
13577    2
13578    1
13579    2
13580    1
13581    2
13582    3
13583    1
13584    2
13585    1
13586    2
13587    3
13588    1
13589    2
13590    3
13591    4
13592    5
Name: pregordr, Length: 13593, dtype: int64

Select a single element from a column.

In [9]:
pregordr[0]

1

Select a slice from a column.

In [10]:
pregordr[2:5]

2    1
3    2
4    3
Name: pregordr, dtype: int64

Select a column using dot notation.

In [11]:
pregordr = preg.pregordr

Count the number of times each value occurs.

In [12]:
preg.outcome.value_counts().sort_index()

1    9148
2    1862
3     120
4    1921
5     190
6     352
Name: outcome, dtype: int64

Check the values of another variable.

In [13]:
preg.birthwgt_lb.value_counts().sort_index()

0.0        8
1.0       40
2.0       53
3.0       98
4.0      229
5.0      697
6.0     2223
7.0     3049
8.0     1889
9.0      623
10.0     132
11.0      26
12.0      10
13.0       3
14.0       3
15.0       1
Name: birthwgt_lb, dtype: int64

Make a dictionary that maps from each respondent's `caseid` to a list of indices into the pregnancy `DataFrame`.  Use it to select the pregnancy outcomes for a single respondent.

We have already seen two variables in the NSFG dataset, caseid and
pregordr, and we have seen that there are 244 variables in total. For the
explorations in this book, I use the following variables:
- caseid is the integer ID of the respondent.
- prglngth is the integer duration of the pregnancy in weeks.
- outcome is an integer code for the outcome of the pregnancy. The code
1 indicates a live birth.
-  pregordr is a pregnancy serial number; for example, the code for a
respondent’s first pregnancy is 1, for the second pregnancy is 2, and so
on.
- birthord is a serial number for live births; the code for a respondent’s
first child is 1, and so on. For outcomes other than live birth, this field
is blank.
- birthwgt_lb and birthwgt_oz contain the pounds and ounces parts
of the birth weight of the baby.
- agepreg is the mother’s age at the end of the pregnancy.
- finalwgt is the statistical weight associated with the respondent. It is
a floating-point value that indicates the number of people in the U.S.
population this respondent represents.

In [14]:
preg.head()

Unnamed: 0,caseid,pregordr,howpreg_n,howpreg_p,moscurrp,nowprgdk,pregend1,pregend2,nbrnaliv,multbrth,...,laborfor_i,religion_i,metro_i,basewgt,adj_mod_basewgt,finalwgt,secu_p,sest,cmintvw,totalwgt_lb
0,1,1,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,8.8125
1,1,2,,,,,6.0,,1.0,,...,0,0,0,3410.389399,3869.349602,6448.271112,2,9,,7.875
2,2,1,,,,,5.0,,3.0,5.0,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,9.125
3,2,2,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,7.0
4,2,3,,,,,6.0,,1.0,,...,0,0,0,7226.30174,8567.54911,12999.542264,2,12,,6.1875


In [15]:
def MakePregMap(df):
    """Make a map from caseid to list of preg indices.

    df: DataFrame with a `caseid` column

    returns: defaultdict that maps from caseid to the list of index labels
             of the rows in `df` that carry that caseid, in row order
    """
    d = defaultdict(list)
    # Series.items() yields (index label, value) pairs; it replaces
    # Series.iteritems(), which newer pandas releases no longer provide.
    for index, caseid in df.caseid.items():
        d[caseid].append(index)
    return d

In [16]:
caseid = 10229
preg_map = MakePregMap(preg)
indices = preg_map[caseid]
preg.outcome[indices].values

array([4, 4, 4, 4, 4, 4, 1], dtype=int64)

In [17]:
# The total birth weight column is named `totalwgt_lb`; `preg.totalwgt`
# does not exist and raises AttributeError.
preg.totalwgt_lb

## Exercises

Select the `birthord` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611933)

In [None]:
preg.birthord.value_counts()

We can also use `isnull` to count the number of nans.

In [None]:
preg.birthord.isnull().sum()

Select the `prglngth` column, print the value counts, and compare to results published in the [codebook](http://www.icpsr.umich.edu/nsfg6/Controller?displayPage=labelDetails&fileCode=PREG&section=A&subSec=8016&srtLabel=611931)

In [None]:
# Solution: count the pregnancies whose length (in weeks) falls in each range.
weeks = preg['prglngth']

count = int((weeks <= 13).sum())
print(f'0 - 13: {count}')
# Series.between is inclusive on both ends by default.
count = int(weeks.between(14, 26).sum())
print(f'14 - 26: {count}')
count = int(weeks.between(27, 50).sum())
print(f'27 - 50: {count}')

To compute the mean of a column, you can invoke the `mean` method on a Series.  For example, here is the mean birthweight in pounds:

In [None]:
preg.totalwgt_lb.mean()

Create a new column named <tt>totalwgt_kg</tt> that contains birth weight in kilograms.  Compute its mean.  Remember that when you create a new column, you have to use dictionary syntax, not dot notation.

In [None]:
# Solution: convert the birth weight from pounds to kilograms and report
# the mean of the new column, as the exercise asks.
KG_PER_LB = 0.453592  # kilograms per pound
preg['totalwgt_kg'] = preg.totalwgt_lb * KG_PER_LB
preg['totalwgt_kg'].mean()

`nsfg.py` also provides `ReadFemResp`, which reads the female respondents file and returns a `DataFrame`:

In [None]:
resp = nsfg.ReadFemResp()

`DataFrame` provides a method `head` that displays the first five rows:

In [None]:
resp.head()

Select the `age_r` column from `resp` and print the value counts.  How old are the youngest and oldest respondents?

In [None]:
# Solution: sort the counts by age so the youngest and oldest respondents
# appear as the first and last rows of the output.
resp.age_r.value_counts().sort_index()

In [None]:
import matplotlib.pyplot as plt
plt.hist(resp.age_r)

We can use the `caseid` to match up rows from `resp` and `preg`.  For example, we can select the row from `resp` for `caseid` 2298 like this:

In [None]:
resp[resp.caseid==2298]

And we can get the corresponding rows from `preg` like this:

In [None]:
preg[preg.caseid==2298]

How old is the respondent with `caseid` 1?

In [None]:
# Solution: select the respondent the question asks about (caseid 1).
resp.loc[resp.caseid == 1, 'age_r']

What are the pregnancy lengths for the respondent with `caseid` 2298?

In [None]:
# Solution goes here
preg[preg.caseid==2298]['prglngth']

What was the birthweight of the first baby born to the respondent with `caseid` 5012?

In [None]:
# Solution goes here

In [None]:
# Birth weight (in pounds) of the first baby born to respondent 5012.
# `birthord == 1` selects the first live birth; `pregordr == 1` would select
# the first pregnancy, which need not have ended in a live birth.
preg.loc[(preg.caseid == 5012) & (preg.birthord == 1), 'totalwgt_lb']

In [None]:
np.random.normal(0,1,7)

In [None]:
a = [-0.441, 1.774, -0.101, -1.138, 2.975, -2.138]

In [None]:
np.linalg.matrix_rank(np.random.randn(100))

In [None]:
a = np.array([-0.441, 1.774, -0.101, -1.138, 2.975, -2.138])

In [None]:
a.mean()

In [None]:
def RMSE(estimates,actual):
    """Compute the root mean squared error of a sequence of estimates.

    estimates: sequence of numbers
    actual: the true value the estimates are compared against

    returns: square root of the mean of the squared differences
    """
    squared_errors = []
    for value in estimates:
        squared_errors.append((value - actual) ** 2)
    mean_squared_error = np.sum(squared_errors) / len(estimates)
    return np.sqrt(mean_squared_error)

In [None]:
def estimate_1(n=7,m=1000):
    """Compare the sample mean and sample median as estimators of the mean.

    Draws `m` samples of size `n` from a normal distribution with mean 0
    and standard deviation 1, computes the mean and the median of each
    sample, and prints the RMSE of both sets of estimates against the
    true mean.

    n: size of each sample
    m: number of samples to simulate
    """
    true_mean, std_dev = 0, 1

    # Draw all samples first (same draw order as one-per-iteration).
    samples = [np.random.normal(true_mean, std_dev, n) for _ in range(m)]
    sample_means = [np.mean(sample) for sample in samples]
    sample_medians = [np.median(sample) for sample in samples]

    print(f'Mean RMSE: {RMSE(sample_means, true_mean)}')
    print(f'Median RMSE: {RMSE(sample_medians, true_mean)}')

In [None]:
estimate_1()

In [None]:
def RMSE(estimates, actual):
    """Compute the root mean squared error of a sequence of estimates.

    NOTE: this redefines the `RMSE` function declared earlier in the
    notebook; both compute the same quantity.

    estimates: sequence of numbers
    actual: the true value the estimates are compared against

    returns: square root of the mean of the squared differences
    """
    e2 = [(estimate-actual)**2 for estimate in estimates]
    mse = np.mean(e2)
    # np.sqrt instead of math.sqrt: `math` is never imported in this
    # notebook, so the previous version raised NameError when called.
    return np.sqrt(mse)

In [None]:
# The function defined in this notebook is `estimate_1`; `Estimate1` is undefined.
estimate_1()

In [None]:
def simulate_sample(mu=90, sigma=7.5, n=9, m=1000):
    """Simulate the sampling distribution of the mean.

    mu: mean of the normal distribution to draw from
    sigma: standard deviation of that distribution
    n: size of each sample
    m: number of samples to draw

    returns: list of `m` sample means, one per simulated sample
    """
    return [np.mean(np.random.normal(mu, sigma, n)) for _ in range(m)]

In [None]:
import seaborn as sns

In [None]:
sns.distplot(simulate_sample(),)

In [None]:
# Cumulative, normalized histogram of the simulated sample means.
# `density=True` replaces the `normed` keyword, which current Matplotlib
# releases no longer accept.
plt.hist(simulate_sample(), density=True, cumulative=True, label='CDF',
         histtype='step', alpha=0.8, color='k', bins=100)

In [None]:
samples_mean = simulate_sample()

In [None]:
np.percentile(samples_mean,5)

In [None]:
np.percentile(samples_mean,95)

In [None]:
np.random.choice(list('HT'))

In [3]:
import random

In [None]:
random.choice('HT')

In [5]:
import think_paulo

In [21]:
values = [1, 2, 3, 4, 5, 6]
a = [np.random.choice(values) for _ in range(60)]

In [29]:
d = think_paulo.HistP(a).d

In [31]:
d

[1, 2, 3, 4, 5, 6]

In [17]:
# Draw n values uniformly at random from the six faces of a die.
# The stray `np.random.choice(values)` call was removed: its result was discarded.
n = 100
values = [1, 2, 3, 4, 5, 6]
choices = [np.random.choice(values) for _ in range(n)]
# NOTE(review): `.d` is presumably a dict mapping each value to its count
# (it is used with `.items()` / `.get()` below) — confirm against think_paulo.
hist = think_paulo.HistP(choices).d

In [9]:
import collections

In [13]:
h = collections.OrderedDict(sorted(hist.items()))

In [14]:
h.values()

odict_values([22, 15, 12, 20, 16, 15])

In [23]:
[hist.get(i+1,0) for i in range(6)]

[18, 16, 13, 20, 22, 11]

In [24]:
np.ones(6) * 100 / 6

array([16.66666667, 16.66666667, 16.66666667, 16.66666667, 16.66666667,
       16.66666667])