# Probability mass functions

In [None]:
import sys
sys.path.append('lib')

In [None]:
from typing import Dict, List, Tuple
import itertools

In [None]:
import numpy as np
import pandas as pd

In [None]:
import nsfg

In [None]:
# graphics
import seaborn as sns
from IPython.core.pylabtools import figsize
sns.set_theme()
figsize(11, 5)

Again, I'll load the NSFG pregnancy file and select live births:

In [None]:
live = nsfg.read_live_fem_preg()

Here's the histogram of birth weights:

In [None]:
p = sns.histplot(live.birthwgt_lb, discrete=True)
p.set(
    xlabel = 'Birth weight (lbs)',
    title = 'Histogram of birth weights'
);

In [None]:
birth_weight_hist = live.birthwgt_lb.value_counts().sort_index()

In [None]:
birth_weight_hist

To normalize the distribution, we could divide through by the total count:

In [None]:
n = np.sum(birth_weight_hist.values)
n

The result is a Probability Mass Function (PMF).

In [None]:
birth_weight_pmf = birth_weight_hist / n
# this can also be done in place
# birth_weight_hist /= n

In [None]:
np.round(birth_weight_pmf, 2)

In [None]:
np.sum(birth_weight_pmf)

In [None]:
p = sns.barplot(
    x=birth_weight_pmf.index,
    y=birth_weight_pmf.values,
    color='royalblue'
);
p.set(
    xlabel = 'Birth weight (lbs)',
    ylabel = 'PMF',
    title = 'PMF of birth weight'
);

Or plot a normalized histogram directly, letting seaborn compute the probabilities:

In [None]:
p = sns.histplot(live.birthwgt_lb, discrete=True, stat='probability')
p.set(
    xlabel = 'Birth weight (lbs)',
    title = 'PMF of birth weights'
);

More directly, we can create a Pmf object.

In [None]:
birth_weight_hist.sum()

In [None]:
def make_pmf(values: List[int]) -> pd.Series:
    '''
    Builds a probability mass function from a sequence of observations.

    The result is indexed by the distinct observed values in ascending
    order; each entry is that value's count divided by the total count.
    '''
    counts = pd.Series(values).value_counts()
    ordered = counts.sort_index()
    # dividing by the total count turns frequencies into probabilities
    total = ordered.sum()
    return ordered / total

In [None]:
pmf = make_pmf([1, 2, 2, 3, 5])
pmf

The index looks up a value and returns its probability:

In [None]:
pmf[2]

The `Incr` method adds to the probability associated with a given value.

In [None]:
2 in pmf

In [None]:
def incr(pmf: pd.Series, value: int, p: float):
    '''
    Adds p to the probability stored for value, modifying pmf in place.

    Args:
        pmf: Series mapping values (index) to probabilities
        value: the value whose probability is increased
        p: the amount to add

    A value that is not in the index is ignored; no new entry is created.
    '''
    if value in pmf:
        # increment the probability by the requested amount
        pmf[value] += p

In [None]:
incr(pmf, 2, 0.2)
pmf[2]

The `Mult` method multiplies the probability associated with a value.

In [None]:
def mult(pmf: pd.Series, value: int, amount: float):
    '''
    Multiplies the probability stored for value by amount, modifying pmf
    in place.

    Args:
        pmf: Series mapping values (index) to probabilities
        value: the value whose probability is scaled
        amount: the scaling factor

    A value that is not in the index is ignored.
    '''
    if value in pmf:
        pmf[value] *= amount

In [None]:
mult(pmf, 2, 0.5)
pmf[2]

`sum` returns the total probability (which is no longer 1, because we changed one of the probabilities).

In [None]:
pmf.sum()

`Normalize` divides through by the total probability, making it 1 again.

In [None]:
pmf /= pmf.sum()
pmf.sum()

Let's encapsulate this in a class.

In [None]:
class PMF:
    '''
    Probability mass function over a discrete set of values.

    The data lives in a pandas Series whose index holds the distinct values
    and whose entries hold the frequency/probability of each value.
    '''

    @classmethod
    def from_seq(cls, values: List[int]):
        '''
        Builds a normalized PMF by counting the occurrences of each
        distinct element of values
        '''
        return cls(pd.Series(values).value_counts().sort_index())

    @classmethod
    def from_dict(cls, data: Dict[int, int]):
        '''
        Builds a normalized PMF from a {value: frequency} mapping,
        keeping the iteration order of the mapping
        '''
        return cls(pd.Series(data=list(data.values()), index=list(data.keys())))

    def __init__(self, series: pd.Series, normalize=True):
        '''
        Args:
            series: frequencies (or probabilities) indexed by value
            normalize: when True, rescale the entries so they sum to 1
        '''
        # keep a private copy so the caller's Series is never modified
        self._series = series.copy()
        # compute the range of x values; incr/mult only touch values that
        # are already present, so the range cannot change afterwards
        self._min = self._series.index.min()
        self._max = self._series.index.max()
        # normalize the frequencies into probabilities
        if normalize:
            self.normalize()

    def normalize(self):
        '''
        Normalizes this PMF so the sum of all probabilities is 1
        '''
        # divide through by the sum of the values
        self._series /= self.total

    def incr(self, x: int, term: float):
        '''
        Increments the freq/prob associated with the value x.
        Does nothing when x is not one of the values of this PMF.
        '''
        if x in self._series:
            self._series[x] += term

    def mult(self, x: int, factor: float):
        '''
        Scales the freq/prob associated with the value x.
        Does nothing when x is not one of the values of this PMF.
        '''
        if x in self._series:
            self._series[x] *= factor

    def prob(self, x: int):
        '''
        Gets the probability associated with the value x
        (0 when x is not one of the values of this PMF)
        '''
        return self._series.get(x, 0)

    @property
    def total(self) -> float:
        '''
        Sum of all stored frequencies/probabilities
        '''
        return np.sum(self._series)

    def __getitem__(self, x):
        '''
        Implements the indexing operator
        '''
        return self.prob(x)

    def as_dataframe(self) -> pd.DataFrame:
        '''
        Returns the PMF as a one-column data frame named "probs"
        '''
        return pd.DataFrame(dict(probs=self.series))

    @property
    def series(self) -> pd.Series:
        '''
        The underlying Series (values in the index, probabilities as data)
        '''
        return self._series

    @property
    def min(self) -> int:
        # return the smallest value
        return self._min

    @property
    def max(self) -> int:
        # return the largest value
        return self._max

    def mean(self) -> float:
        '''
        Computes the mean of a PMF: the sum of value * probability
        '''
        return sum(p * x for x, p in self.items())

    def arange(self, increment=1) -> np.ndarray:
        '''
        Returns min, min + increment, ... up to and including max, so values
        that never occurred (missing from the index) are included as well
        '''
        return np.arange(self._min, self._max+1, increment)

    @property
    def probs(self) -> np.ndarray:
        '''
        Probabilities for every value of arange(), 0 for missing values
        '''
        return np.array([self[x] for x in self.arange()])

    @property
    def values(self) -> np.ndarray:
        '''
        The distinct values of this PMF, in index order
        '''
        return self._series.index.values

    @property
    def probabilities(self) -> np.ndarray:
        '''
        The stored probabilities, in the same order as values
        '''
        return self._series.values

    def items(self) -> List[Tuple[int, float]]:
        '''
        Returns the (value, probability) pairs of this PMF
        '''
        # Series.items() is the supported spelling; iteritems() no longer
        # exists in pandas 2.0 and later
        return list(self._series.items())

    def copy(self):
        '''
        Returns an independent copy of this PMF without renormalizing it
        '''
        # the constructor already copies the Series it is given
        return self.__class__(self._series, normalize=False)

    def __str__(self):
        return str(self._series)

In [None]:
values = [1, 2, 2, 3, 5]
pmf = PMF.from_seq(values)
pmf.series

In [None]:
# should sum to 1
pmf.total

In [None]:
pmf.probs

In [None]:
print(f'P(2) = {pmf[2]}')

In [None]:
pmf.incr(2, 0.2)
pmf.series

In [None]:
pmf.mult(2, 0.5)
pmf.series

In [None]:
pmf.total

In [None]:
pmf.normalize()
pmf.total

In [None]:
pmf.as_dataframe()

Here's the PMF of pregnancy length for live births.

In [None]:
pmf.arange()

In [None]:
pmf = PMF.from_seq(live.prglngth)

In [None]:
pmf.max

In [None]:
pmf.arange(5)

In [None]:
p = sns.barplot(
    x = pmf.arange(),
    y = pmf.probs,
    color='darkred',
)
p.set(
    xlabel = 'Pregnancy length (weeks)',
    ylabel = 'Pmf',
    xticks = pmf.arange(5)
);

Here is a more direct way of doing it, letting seaborn compute the probabilities:

In [None]:
p = sns.histplot(
    live.prglngth,
    discrete=True,
    stat = 'probability'
)
p.set(
    xlabel = 'Pregnancy length (weeks)'
);

Let's create a birth category that distinguishes first births from the others.

In [None]:
pd.Categorical(np.where(live.birthord == 1, 'Firsts', 'Others')).value_counts()

In [None]:
live['birthcat'] = pd.Categorical(np.where(live.birthord == 1, 'Firsts', 'Others'))

Here are the distributions of pregnancy length.

In [None]:
p = sns.histplot(
    data=live.query('prglngth > 26'),
    x='prglngth',
    hue='birthcat',
    multiple='dodge',
    stat='probability'
)
p.set(
    xlabel = 'Pregnancy length (weeks)',
    title = 'Pregnancy length for first births and others'
);

In [None]:
p = sns.histplot(
    data=live.query('prglngth > 26'),
    x='prglngth',
    hue='birthcat',
    stat='probability',
    element='step',
    fill=False
)
p.set(
    xlabel = 'Pregnancy length (weeks)',
    title = 'Pregnancy length for first births and others'
);

In [None]:
first_pmf = PMF.from_seq(live.prglngth[live.birthcat=='Firsts'].values)
other_pmf = PMF.from_seq(live.prglngth[live.birthcat=='Others'].values)

Here's the code that generates a plot of the difference in probability (in percentage points) between first babies and others, for each week of pregnancy (showing only pregnancies considered "full term"). 

In [None]:
# difference in probability between first babies and others, expressed in
# percentage points, for each week in the 35-45 range
weeks = range(35, 46)
diffs = [
    100 * (first_pmf.prob(week) - other_pmf.prob(week))
    for week in weeks
]

In [None]:
p = sns.barplot(
    x=list(weeks),
    y=diffs,
    color='darkred'
)
p.set(
    xlabel='Pregnancy length (weeks)',
    ylabel='Percentage difference',
    title='Percentage difference between first and other completed births by week'
);

### Biasing and unbiasing PMFs

Here's the example in the book showing operations we can perform with `Pmf` objects.

Suppose we have the following distribution of class sizes.

In [None]:
d = {
    7: 8,
    12: 8,
    17: 14,
    22: 4, 
    27: 6,
    32: 12,
    37: 8,
    42: 3,
    47: 2
}

In [None]:
pmf = PMF.from_dict(d)
pmf.as_dataframe()

This function computes the biased PMF we would get if we surveyed students and asked about the size of the classes they are in.

In [None]:
def bias_pmf(pmf: PMF) -> PMF:
    '''
    Returns the size-biased version of pmf.

    Each probability is multiplied by its own value (here, the class size)
    and the result is renormalized. The input pmf is left unchanged because
    all the work happens on a copy.
    '''
    new_pmf = pmf.copy()

    # only the values are needed here, not the probabilities
    for x in pmf.values:
        # multiply the probability using the class size
        new_pmf.mult(x, x)

    new_pmf.normalize()
    return new_pmf

In [None]:
biased_pmf = bias_pmf(pmf)
biased_pmf.as_dataframe()

In [None]:
# wide format: one row per class size, one column per distribution
# NOTE(review): `observed` is passed as a Series while `actual` is a plain
# array, so the frame presumably takes its index from that Series (the class
# sizes) rather than a 0..n-1 range; confirm this is intended
distributions = pd.DataFrame(dict(
    class_size=d.keys(),
    actual=pmf.series.values,
    observed=biased_pmf.series
))
distributions.head()

In [None]:
# now go from wide format to long
distributions_long = pd.melt(
    distributions,
    id_vars = ['class_size'],
    value_vars = ['actual', 'observed'],
    value_name = 'probability',
    var_name = 'distribution'
)
distributions_long

The following graph shows the difference between the actual and observed distributions.

In [None]:
p = sns.barplot(
    data=distributions_long,
    x = 'class_size',
    y = 'probability',
    hue = 'distribution'
)
p.set(
    xlabel = 'Class size',
    title = 'Comparing actual and observed (biased) distributions'
);

Here is another way you could plot the two distributions

In [None]:
g = sns.catplot(
    x='class_size',
    y='probability',
    row='distribution',
    data=distributions_long,
    kind ='bar',
    color = 'royalblue',
    height=4,
    # width is twice the height
    aspect=2
);
# g.set_axis_labels('Class size');
g.set(xlabel='Class size');

Another way to make the data frame

In [None]:
# build the long-format frame directly: the class sizes are repeated twice,
# once for the actual probabilities and once for the biased ones
class_sizes  = d.keys()
distributions = pd.DataFrame(dict(
    class_size = itertools.chain(class_sizes, class_sizes),
    probability = itertools.chain(pmf.series.values, biased_pmf.series.values),
    # one label per row: 'Actual' for the first half, 'Observed' for the second
    distribution = pd.Categorical(itertools.chain(
        np.repeat('Actual', len(class_sizes)),
        np.repeat('Observed', len(class_sizes))
    ))
))
distributions

We can make this a function

In [None]:
def create_paired_distributions(
    class_labels: List[int],
    group1: np.array,
    group2: np.array,
    group_labels: Tuple[str, str],
    class_label: str
) -> pd.DataFrame:
    '''
    Stacks two distributions over the same classes into one long-format
    data frame.

    Args:
        class_labels: the classes shared by both distributions
        group1: probabilities of the first distribution, one per class
        group2: probabilities of the second distribution, one per class
        group_labels: display names of the first and second distribution
        class_label: name of the column that receives the class labels

    Returns:
        A frame with the columns class_label, 'probability' and
        'distribution' (categorical); the rows of group1 come first,
        followed by the rows of group2.
    '''
    first_name, second_name = group_labels[0], group_labels[1]
    count = len(class_labels)

    # the classes appear twice, once per distribution
    stacked_classes = [*class_labels, *class_labels]
    stacked_probs = [*group1, *group2]
    stacked_names = [
        *np.repeat(first_name, count),
        *np.repeat(second_name, count),
    ]

    frame = pd.DataFrame({
        class_label: stacked_classes,
        'probability': stacked_probs,
        'distribution': pd.Categorical(stacked_names),
    })
    return frame
    

The observed mean is substantially higher than the actual.

In [None]:
print(f'Actual mean: {pmf.mean():0.2f}')
print(f'Observed mean: {biased_pmf.mean():0.2f}')

If we were only able to collect the biased sample, we could "unbias" it by applying the inverse operation.

In [None]:
def unbias_pmf(pmf: PMF) -> PMF:
    '''
    Undoes the size bias of pmf.

    Each probability is divided by its own value and the result is
    renormalized. The work happens on a copy, so pmf is not modified.
    '''
    corrected = pmf.copy()

    for size, _ in pmf.items():
        # inverse of the biasing step: scale by one over the value
        corrected.mult(size, 1 / size)

    corrected.normalize()
    return corrected

We can unbias the biased PMF:

In [None]:
unbiased = unbias_pmf(biased_pmf)
print(f'Unbiased mean: {unbiased.mean():0.2f}')

And plot the two distributions to confirm they are the same.

In [None]:
distributions = create_paired_distributions(
    class_sizes,
    pmf.series.values,
    unbiased.series.values,
    ('Actual', 'Unbiased',),
    'class_size'
)

In [None]:
p = sns.barplot(
    data=distributions,
    x = 'class_size',
    y = 'probability',
    hue = 'distribution'
)
p.set(
    xlabel = 'Class size',
    title = 'Comparing actual and unbiased distributions'
);

## Exercises

**Exercise:** Something like the class size paradox appears if you survey children and ask how many children are in their family. Families with many children are more likely to appear in your sample, and families with no children have no chance to be in the sample.

Use the NSFG respondent variable `numkdhh` to construct the actual distribution for the number of children under 18 in the respondents' households.

Now compute the biased distribution we would see if we surveyed the children and asked them how many children under 18 (including themselves) are in their household.

Plot the actual and biased distributions, and compute their means.

In [None]:
resp = nsfg.read_fem_resp()

In [None]:
pmf = PMF.from_seq(resp.numkdhh)
pmf.mean()

In [None]:
# normalized histogram of the number of children per respondent household
p = sns.histplot(resp.numkdhh, discrete=True, stat='probability')
p.set(
    xlabel = 'Number of children',
    title = "Number of children under 18 in the respondents' households"
);

In [None]:
# Solution

biased = bias_pmf(pmf)
biased.mean()

In [None]:
# stack the actual and the biased PMF into one long-format frame for plotting;
# the second group is the biased distribution, so it is labelled accordingly
distributions = create_paired_distributions(
    pmf.series.index.values,
    pmf.series.values,
    biased.series.values,
    ('Actual', 'Biased',),
    'number_of_children'
)
distributions

In [None]:
p = sns.barplot(
    data=distributions,
    x = 'number_of_children',
    y = 'probability',
    hue = 'distribution'
)
p.set(
    xlabel = 'Number of children',
    title = 'Comparing actual and biased distributions'
);

**Exercise:** I started this book with the question, "Are first babies more likely to be late?" To address it, I computed the difference in means between groups of babies, but I ignored the possibility that there might be a difference between first babies and others for the same woman.

To address this version of the question, select respondents who have at least two live births and compute pairwise differences. Does this formulation of the question yield a different result?

Hint: use `nsfg.make_preg_map`:

In [None]:
live = nsfg.read_live_fem_preg()

In [None]:
preg_map = nsfg.make_preg_map(live)

In [None]:
# for every respondent with at least two entries in preg_map, take the first
# two and record the difference in pregnancy length (second minus first)
# NOTE(review): presumably preg_map maps each caseid to the row labels of that
# respondent's live births in `live`; confirm against nsfg.make_preg_map
diffs = [
    np.diff(live.loc[indices[0:2]].prglngth)[0]
    for indices in preg_map.values()
    if len(indices) >= 2
]

In [None]:
PMF.from_seq(diffs).mean()

**Exercise:** In most foot races, everyone starts at the same time. If you are a fast runner, you usually pass a lot of people at the beginning of the race, but after a few miles everyone around you is going at the same speed.
When I ran a long-distance (209 miles) relay race for the first time, I noticed an odd phenomenon: when I overtook another runner, I was usually much faster, and when another runner overtook me, he was usually much faster.

At first I thought that the distribution of speeds might be bimodal; that is, there were many slow runners and many fast runners, but few at my speed.

Then I realized that I was the victim of a bias similar to the effect of class size. The race was unusual in two ways: it used a staggered start, so teams started at different times; also, many teams included runners at different levels of ability.

As a result, runners were spread out along the course with little relationship between speed and location. When I joined the race, the runners near me were (pretty much) a random sample of the runners in the race.

So where does the bias come from? During my time on the course, the chance of overtaking a runner, or being overtaken, is proportional to the difference in our speeds. I am more likely to catch a slow runner, and more likely to be caught by a fast runner. But runners at the same speed are unlikely to see each other.

Write a function called `observed_pmf` that takes a `PMF` representing the actual distribution of runners’ speeds, and the speed of a running observer, and returns a new `PMF` representing the distribution of runners’ speeds as seen by the observer.

To test your function, you can use the results from the James Joyce Ramble 10K in Dedham MA, with the pace of each runner converted to mph.

Compute the distribution of speeds you would observe if you ran a relay race at 7 mph with this group of runners.

In [None]:
df = pd.read_feather('data/relay_results.feather')
df.head()

In [None]:
p = sns.histplot(
    data=df,
    x="mph",
    binwidth=0.2,
    stat='probability'
);
p.set(
    xlabel = 'Speed (mph)',
    title = 'Distribution of race speeds'
);

In [None]:
# pmf = PMF.from_seq(np.round(df.mph, 1))
# NOTE(review): the histogram of this data frame reads the `mph` column;
# confirm that `speed` is also a column of relay_results.feather
pmf = PMF.from_seq(df.speed)

In [None]:
p = sns.barplot(
    x=pmf.values,
    y=pmf.probabilities,
    color='royalblue'
);
p.set_xticklabels(p.get_xticklabels(), rotation=45, horizontalalignment='right')
p.set(
    xlabel = 'Speed (mph)',
    title = 'Distribution of actual speeds'
);

In [None]:
# Solution

def observed_pmf(pmf: PMF, speed: float) -> PMF:
    """Returns the distribution of speeds as seen by a running observer.

    Every probability is weighted by the absolute difference between that
    runner's speed and the observer's speed, then the result is
    renormalized. The input distribution is not modified.

    Args:
        pmf: distribution of actual speeds
        speed: speed of the observing runner

    Returns:
        A new PMF object with the observed distribution
    """
    seen = pmf.copy()
    for runner_speed in seen.values:
        # runners moving at the observer's own speed get weight 0
        seen.mult(runner_speed, abs(runner_speed - speed))
    seen.normalize()
    return seen


In [None]:
biased = observed_pmf(pmf, 7)

In [None]:
p = sns.barplot(
    x=biased.values,
    y=biased.probabilities,
    color='royalblue'
)
p.set_xticklabels(p.get_xticklabels(), rotation=45, horizontalalignment='right')
p.set(
    xlabel = 'Speed (mph)',
    title = 'Distribution of observed (biased) speeds'
);