# Exploring Data - Distributions

In [None]:
from collections import Counter
import numpy as np
import pandas as pd

In [None]:
import sys
# Add the relative 'lib' directory to the module search path
# (presumably where the `nsfg` module imported below lives — confirm).
sys.path.append('lib')

In [None]:
import nsfg

In [None]:
import seaborn as sns
from IPython.core.pylabtools import figsize
# Apply seaborn's default theme to the plots drawn in this notebook.
sns.set_theme()
# Default figure size for subsequent plots: 11 wide by 5 high.
figsize(11, 5)

Given a list of values, there are several ways to count the frequency of each value.

In [None]:
# Small sample list; the value 2 appears twice.
t = [1, 2, 2, 3, 5]

You can use a Python dictionary:

In [None]:
# Tally occurrences: start every distinct value at zero (in order of
# first appearance), then add one for each time it is seen.
hist = dict.fromkeys(t, 0)
for value in t:
    hist[value] += 1

hist

You can use a `Counter` (which is a dictionary with additional methods):

In [None]:
# Build the frequency table directly from the list, then display it.
counter = Counter(t)
counter

In [None]:
# Look up the frequency of the value 2.
counter[2]

If the value does not appear, it has frequency 0.

In [None]:
# Indexing a Counter with a key it has never seen yields 0
# (and does not add the key).
counter[4]

The `keys` method returns the values:

In [None]:
# View of the distinct values (the dictionary keys).
counter.keys()

So you can iterate the values and their frequencies like this:

In [None]:
# Visit the values in ascending order and look up each frequency.
for val in sorted(counter.keys()):
    print(val, counter[val])

Or you can use the `items` method:

In [None]:
# Walk the (value, frequency) pairs in the counter's own order.
for val, freq in counter.items():
    print(val, freq)

In [None]:
# A Counter is a dict, so pandas takes its keys as the index and its
# counts as the data.
series = pd.Series(counter)

In [None]:
# Bar chart of the frequency of each value; the trailing semicolon
# suppresses the notebook's echo of the returned object.
series.plot.bar(xlabel='value', ylabel='frequency');

As an example, I'll replicate some of the figures from the book.

First, I'll load the data from the pregnancy file and select the records for live births.

In [None]:
# Load the pregnancy file and keep only the live births (outcome == 1).
# `.copy()` makes `live` an independent frame, so adding a column to it
# later does not trigger pandas' SettingWithCopyWarning.
live = nsfg.read_fem_preg().query('outcome == 1').copy()

In [None]:
# Peek at the birth-weight column.
live.birthwgt_lb

In [None]:
# Count how often each value occurs, ordered by value rather than by count.
live.birthwgt_lb.value_counts().sort_index()

Here's the histogram of birth weights in pounds.

In [None]:
# One bar per distinct birth-weight value.
p = sns.histplot(data=live, x='birthwgt_lb', discrete=True)
p.set_xlabel('Birth weight (lbs)')
p.set_title('Histogram of live birth weights');

Before plotting the ages, I'll apply `floor` to round down:

In [None]:
# Round every age down to a whole number
# (assumes `agepreg` holds fractional years — confirm).
ages = np.floor(live.agepreg)

In [None]:
# One-unit-wide bins over the floored ages.
p = sns.histplot(x=ages, binwidth=1)
p.set_xlabel('Age (in years)')
p.set_title('Histogram of ages');

As an exercise, plot the histogram of pregnancy lengths (column `prglngth`).

In [None]:
# One-unit-wide bins over the pregnancy lengths of live births.
p = sns.histplot(data=live, x='prglngth', binwidth=1)
p.set_xlabel('Length (in weeks)')
p.set_title('Histogram of pregnancy lengths');

From live births, we can select first babies and others using `birthord`, then compute histograms of pregnancy length for the two groups.

In [None]:
# Label each live birth by birth order and count the two groups.
# A conditional expression replaces the `cond and a or b` idiom, which
# silently picks the wrong branch whenever `a` is falsy.
pd.Categorical(
    live.birthord.apply(lambda order: 'firsts' if order == 1 else 'others')
).value_counts()

In [None]:
# Same grouping, vectorised: np.where picks a label per row from the boolean mask.
pd.Categorical(np.where(live.birthord==1, 'firsts', 'others')).value_counts()

In [None]:
# Store the firsts/others label as a categorical column; the cells below
# group and colour by it.
live['birthcat'] = pd.Categorical(np.where(live.birthord==1, 'firsts', 'others'))

In [None]:
# Side-by-side histograms of pregnancy length for the two birth
# categories, restricted to pregnancies longer than 26 weeks.
p = sns.histplot(
    data=live[live['prglngth'] > 26],
    x='prglngth',
    hue='birthcat',
    multiple='dodge',
    binwidth=1,
)
legend = p.get_legend()
legend.set_title('Birth category')
p.set_xlabel('Length of pregnancy (in weeks)');

We can use `multiple='dodge'` to plot two histograms side-by-side.

`Series` provides methods to compute summary statistics:

In [None]:
# Summary statistics of pregnancy length over the live births.
mean = live.prglngth.mean()
var = live.prglngth.var()
std = live.prglngth.std()

Here are the mean and standard deviation:

In [None]:
# Display both values together as a tuple.
mean, std

As an exercise, confirm that `std` is the square root of `var`:

In [None]:
# Solution

# Compare with a tolerance: the two numbers are produced by different
# floating-point computations, so an exact `==` can report False even
# though they agree to within rounding error.
np.isclose(np.sqrt(var), std)

Here are the mean pregnancy lengths for first babies and others:

In [None]:
# Mean pregnancy length per birth category, kept as a one-column DataFrame.
live.groupby('birthcat')[['prglngth']].mean()

In [None]:
# Same means as a Series indexed by birth category, for the arithmetic below.
category_means = live.groupby('birthcat')['prglngth'].mean()

In [None]:
# Display the per-category means.
category_means

And here's the difference (in weeks):

In [None]:
# Absolute gap between the two category means (in weeks); `.item()`
# unwraps the one-element array into a plain scalar.
np.abs(np.diff(category_means)).item()

This function computes the Cohen effect size, which is the difference in means expressed in number of standard deviations:

In [None]:
def cohen_effect_size(
    group1: "pd.Series | pd.DataFrame",
    group2: "pd.Series | pd.DataFrame",
) -> "float | pd.Series":
    """Compute Cohen's effect size for two groups.

    The effect size is the difference between the group means divided by
    the square root of the size-weighted average of the group variances.

    group1: Series or DataFrame
    group2: Series or DataFrame

    returns: float if the arguments are Series;
             Series (one value per column) if the arguments are DataFrames
    """
    diff = group1.mean() - group2.mean()

    var1 = group1.var()
    var2 = group2.var()
    # Row counts serve as the weights of the two variances.
    # NOTE(review): len() also counts missing values, whereas pandas'
    # mean()/var() skip them — confirm the weights are as intended when a
    # group contains NaN.
    n1, n2 = len(group1), len(group2)

    # Weighted average of the two variances, weighted by group size.
    pooled_var = (n1 * var1 + n2 * var2) / (n1 + n2)
    return diff / np.sqrt(pooled_var)

Compute the Cohen effect size for the difference in pregnancy length for first babies and others.

In [None]:
# Pregnancy lengths plus one boolean mask per birth category; the masks
# are reused by later cells to split other columns the same way.
lengths = live.prglngth
firsts = live.birthcat == 'firsts'
others = live.birthcat == 'others'

In [None]:
# Effect size of the difference in pregnancy length: first babies vs. others.
cohen_effect_size(
    lengths[firsts],
    lengths[others]
)

## Exercises

Using the variable `totalwgt_lb`, investigate whether first babies are lighter or heavier than others. 

Compute Cohen’s effect size to quantify the difference between the groups.  How does it compare to the difference in pregnancy length?

In [None]:
# Mean `totalwgt_lb` for first babies and for others.
live.groupby('birthcat')['totalwgt_lb'].mean()

In [None]:
# Effect size of the difference in `totalwgt_lb`, split with the same
# firsts/others masks used for pregnancy length.
totalwgt_lb = live.totalwgt_lb
cohen_effect_size(
    totalwgt_lb[firsts],
    totalwgt_lb[others]
)

For the next few exercises, we'll load the respondent file:

In [None]:
# Load the respondent file.
resp = nsfg.read_fem_resp()

Make a histogram of <tt>totincr</tt>, the total income for the respondent's family.  To interpret the codes see the [codebook](ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/Dataset_Documentation/NSFG/Cycle6Codebook-Pregnancy.pdf).

In [None]:
# Income is recorded as category codes, so draw one bar per code.
p = sns.histplot(resp.totincr, discrete=True)
p.set(
    xlabel='Income category',
    title='Histogram of total respondent family income'
);

Make a histogram of <tt>age_r</tt>, the respondent's age at the time of interview.

In [None]:
# One bar per distinct respondent age.
p = sns.histplot(data=resp, x='ager', discrete=True)
p.set_xlabel('Age (in years)')
p.set_title('Histogram of respondents age at time of interview');

Make a histogram of <tt>numfmhh</tt>, the number of people in the respondent's household.

In [None]:
# One bar per distinct household size.
p = sns.histplot(data=resp, x='numfmhh', discrete=True)
p.set_xlabel('Number of people')
p.set_title('The number of people in the respondents household');

Make a histogram of <tt>parity</tt>, the number of children borne by the respondent.  How would you describe this distribution?

In [None]:
# One bar per distinct parity value.
p = sns.histplot(data=resp, x='parity', discrete=True)
p.set_xlabel('Parity')
p.set_title('The number of children borne by the respondent');

This distribution is positive-valued and skewed to the right.

Find the largest values of <tt>parity</tt>.

In [None]:
# Counts per parity, largest parity first; keep the first ten rows.
resp.parity.value_counts().sort_index(ascending=False).head(10)

To get them as a list of Python tuples:

In [None]:
# `Series.iteritems()` was removed in pandas 2.0; `items()` yields the same
# (index, value) pairs — here (parity, count), starting from the largest parity.
list(resp.parity.value_counts().sort_index(ascending=False).items())[:10]

Let's investigate whether people with higher income have higher parity.  Keep in mind that in this study, we are observing different people at different times during their lives, so this data is not the best choice for answering this question.  But for now let's take it at face value.

Use <tt>totincr</tt> to select the respondents with the highest income (level 14).  Plot the histogram of <tt>parity</tt> for just the high income respondents.

In [None]:
# The subset we want: the `parity` column for respondents whose income
# code is 14; show the first few rows.
resp.loc[resp.totincr==14, ['parity']].head()

In [None]:
# Parity histogram restricted to respondents with income code 14.
p = sns.histplot(
    data=resp[resp.totincr == 14],
    x='parity',
    discrete=True,
)
p.set_xlabel('Parity')
p.set_title('The number of children borne for high income respondents');

Find the largest parities for high income respondents.

In [None]:
# Two-level label per respondent: 'High' for income code 14, 'Low' otherwise.
income_level = pd.Categorical(np.where(resp.totincr == 14, 'High', 'Low'))

In [None]:
# Number of respondents in each income level.
income_level.value_counts()

In [None]:
# Parity counts for the 'High' respondents, largest parity first.
resp[income_level == 'High'].parity.value_counts().sort_index(ascending=False)

Compare the mean <tt>parity</tt> for high income respondents and others.

In [None]:
# Gather income code, parity and the High/Low label into one frame for grouping.
incomes = pd.DataFrame({'income': resp.totincr, 'parity': resp.parity, 'level': income_level})

In [None]:
# Number of rows in each income level of the combined frame.
incomes.level.value_counts()

In [None]:
# Mean parity for each income level.
incomes.groupby('level')['parity'].mean()

In [None]:
# Solution

# Split the respondents at the top income code (14). `rich` has to be
# defined here; without it the comparison below raises NameError.
rich = resp[resp.totincr == 14]
not_rich = resp[resp.totincr < 14]
rich.parity.mean(), not_rich.parity.mean()

Compute the Cohen effect size for this difference.  How does it compare with the difference in pregnancy length for first babies and others?

In [None]:
# Effect size of the difference in parity: 'High' vs. 'Low' income level.
cohen_effect_size(
    incomes.parity[incomes.level=='High'],
    incomes.parity[incomes.level=='Low']
)

This effect is about 10 times stronger than the difference in pregnancy length. But remembering the design of the study, we should not make too much of this apparent effect.