# Modelling Distributions

In [None]:
from typing import Tuple

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
import sys
sys.path.append('lib')

In [None]:
import nsfg
import babyboom
import population
import compstats
from cdf import Cdf

In [None]:
# plotting
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()

In [None]:
# worksheet
from IPython.core.pylabtools import figsize
figsize(11, 6)

In [None]:
# some colours
LIGHT_BLUE = '#348ABD'
PURPLE = '#A60628'
DARK_GREEN = '#467821'
colours = [LIGHT_BLUE, PURPLE, DARK_GREEN]

## Exponential distribution

Here's what the exponential CDF looks like with a range of parameters.

In [None]:
# Plot the exponential CDF for several values of the rate parameter lambda.
figsize(8, 6)
params = (0.5, 1, 2)
x = np.linspace(0, 3, 100)
for color, p in zip(colours, params):
    plt.plot(
        x,
        # scipy parameterises the exponential by scale = 1/lambda
        stats.expon.cdf(x, scale=1/p),
        color=color,
        # raw f-string: '\l' is not a valid escape sequence in a plain literal
        label=rf'$\lambda={p}$'
    )
plt.xlabel('x')
plt.ylabel('CDF')
plt.title('Exponential CDF')
plt.legend(loc='lower right');

Here's the distribution of interarrival times from a dataset of birth times.

In [None]:
df = babyboom.read_baby_boom()

In [None]:
df.head()

The interarrival times (called `diffs` later in this notebook) are the differences between consecutive birth times, computed here with `df.minutes.diff()`.

The following plots the CDF of these interarrival times.

In [None]:
ylabels = (
    '$CDF(x)$',
    '$1-CDF(x)$',
)
# axs is a 1x2 array of plot areas
fig, axs = plt.subplots(
    nrows=1,
    ncols=2,
    figsize=(13, 5,)
)
sns.ecdfplot(
    df.minutes.diff(),
    ax = axs[0]
)
sns.ecdfplot(
    df.minutes.diff(),
    ax = axs[1],
    complementary=True,
    # log scale on y axis only
    log_scale = (False, True,)
);
for i, ylabel in enumerate(ylabels):
    axs[i].set_xlabel('minutes')
    axs[i].set_ylabel(ylabel)
axs[1].set_ylim((0.01, 1,))
fig.suptitle('CDF of interarrival times (left) and CCDF on a log-y scale (right)');

Note what the CCDF looks like on a log-y scale.  A straight line is consistent with an exponential distribution.

If you plot the complementary CDF (CCDF) of a dataset that you think is exponential, you expect to see a function like:

$$
y \approx exp(-\lambda x)
$$

Taking the log of both sides yields

$$
log(y) \approx -\lambda x
$$

So on a log-y scale the CCDF is a straight line with slope $-\lambda$.

It is not exactly straight, which indicates that the exponential distribution is not a perfect model for this data. Most likely the underlying assumption—that a birth is equally likely at any time of day—is not exactly true. Nevertheless, it might be reasonable to model this dataset with an exponential distribution. With that simplification, we can summarize the distribution with a single parameter.

The parameter, $\lambda$, can be interpreted as a rate; that is, the number of events that occur, on average, in a unit of time. In this example, 44 babies are born in 24 hours, so the rate is $\lambda = 0.0306$ births per minute. The mean of an exponential distribution is $1/\lambda$, so the mean time between births is 32.7 minutes.

In [None]:
nminutes = np.ceil(df.minutes.iloc[-1])
nbirths = len(df)
print(f'{nbirths} births in {np.ceil(nminutes/60)} hours gives a rate of {nbirths/nminutes:0.3f} births per minute')

## Normal distribution

Here's what the normal CDF looks like with a range of parameters.

In [None]:
# Plot the normal CDF for several (mu, sigma) pairs.
mus = [1, 2, 3]
sigmas = [0.5, 0.4, 0.3]
x = np.linspace(-1, 4, 100)
for mu, sigma in zip(mus, sigmas):
    plt.plot(
        x,
        stats.norm.cdf(x, loc=mu, scale=sigma),
        # raw f-string: '\m' and '\s' are not valid escape sequences in a
        # plain literal; doubled braces still produce literal braces
        label=rf'$\mathcal{{N}}(\mu={mu}, \sigma={sigma})$'
    )
plt.xlabel('x')
plt.ylabel('CDF')
plt.title('CDF of normal distributions with a range of parameters')
plt.legend(loc='upper left');
    

I'll use a normal model to fit the distribution of birth weights from the NSFG.

In [None]:
preg = nsfg.read_live_fem_preg()
weights = preg.totalwgt_lb.dropna()

Here's the observed CDF and the model.  The model fits the data well except in the left tail.

In [None]:
stats.trim_mean(weights, proportiontocut=0.01)

We need to trim both the mean and the variance so we use `compstats`

In [None]:
mu, var = compstats.trimmed_mean_var(weights, p=0.01)

In [None]:
print(f'Mean: {mu:0.2f}, Var: {var:0.2f}')

In [None]:
len(Cdf.from_seq(weights).probs(weights))

In [None]:
x = np.arange(0, 17)
df = pd.DataFrame(dict(
    weight=x,
    data=Cdf.from_seq(weights).probs(x),
    model=stats.norm(loc=mu, scale=np.sqrt(var)).cdf(x)
))
df.head()

In [None]:
df_long = df.melt(
    id_vars = 'weight',
    value_vars = ['data', 'model'],
    value_name = 'CDF',
    var_name = 'Scenario'
)
df_long.head()

In [None]:
p = sns.lineplot(
    data = df_long,
    x = 'weight',
    y = 'CDF',
    hue = 'Scenario'
)
p.set(
    xlabel = 'Birth weight (lbs)',
    title = 'Birth weights'
);

## Normal Probability Plot

1. Sort the values in the sample.
2. From a standard normal distribution $\mathcal{N}(\mu = 0, \sigma = 1)$, generate a random sample with the same size as the sample, and sort it.
3. Plot the sorted values from the sample versus the random values.

A normal probability plot is a visual test for normality.  The following example shows that if the data are actually from a normal distribution, the plot is approximately straight.

In [None]:
def normal_qq(ys: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Generates data for a normal probability plot.

    ys: sequence of values (numpy array, list, ...); it is not modified

    returns: numpy arrays xs, ys where xs is a sorted random sample from the
    standard normal distribution with the same length as ys, and ys is a
    sorted copy of the input
    """
    # np.sort returns a sorted copy, so the caller's data is left untouched
    # and any sequence type comes back as a numpy array
    sorted_ys = np.sort(np.asarray(ys))
    xs = np.sort(np.random.normal(0, 1, len(sorted_ys)))
    return xs, sorted_ys

In [None]:
# start with a sample that is normal
n = 1000
sample = stats.norm(loc=0, scale=1).rvs(n)

In [None]:
xs, ys = normal_qq(sample)
p = sns.lineplot(
    x=xs,
    y=ys
);
p.set(
    xlabel = 'standard normal sample',
    ylabel = 'sample values',
    title = 'Normal probability plot',
    xlim = (-4, 4)
);

Here's the normal probability plot for birth weights, showing that the lightest babies are lighter than we expect from the normal model, and the heaviest babies are heavier.

In [None]:
def fit_line(xs: np.array, intercept, slope) -> np.array:
    """Evaluates the straight line y = intercept + slope * x at the given xs.

    No fitting happens here: the caller supplies the line's parameters.

    xs: numpy array of x values
    intercept: value of the line at x = 0
    slope: change in y per unit change in x

    returns: a numpy array
    """
    ys = slope * xs
    ys = ys + intercept
    return ys

In [None]:
std = np.sqrt(var)
xs, ys = normal_qq(weights.values)
# our fitted values are a line passing through the mean with a slope of sigma
fitted_ys = fit_line(xs, mu, std)

In [None]:
p = sns.lineplot(
    x=xs,
    y=ys,
    label='all live'
);
p.set(
    xlabel = 'standard normal sample',
    ylabel = 'Birth weight (lbs)',
    title = 'Normal probability plot',
    xlim = (-5, 4)
);
plt.plot(xs, fitted_ys, label='fitted');
plt.legend();

If we suspect that the deviation in the left tail is due to preterm babies, we can check by selecting only full term births.

In [None]:
term_weights = preg.query('prglngth > 36').totalwgt_lb.dropna()

Now the deviation in the left tail is almost gone, but the heaviest babies are still heavy.

In [None]:
term_xs, term_ys = normal_qq(term_weights.values)
xlims = np.array([-4, 4])
fitted_ys = fit_line(xlims, mu, std)
plt.plot(xs, ys, label = 'all live')
plt.plot(term_xs, term_ys, label = 'full term')
plt.plot(xlims, fitted_ys, label='fitted', linestyle='dashed');
plt.xlabel('standard deviation from the mean')
plt.ylabel('Birth weight (lbs)')
plt.title('Normal probability plot')
plt.legend(loc='upper left');

## Lognormal model

If the logarithms of a set of values have a normal distribution, the values have a lognormal distribution. The CDF of the lognormal distribution is the same as the CDF of the normal distribution, with $log \space x$ substituted for x.

$$
CDF_{lognormal}(x) = CDF_{normal}(log(x))
$$

As an example of a lognormal distribution, we'll look at adult weights from the BRFSS.

In [None]:
df = pd.read_feather('data/brfss.feather')

In [None]:
weights = df.weight.dropna()

In [None]:
weights.mean()

In [None]:
np.median(weights)

In [None]:
p = sns.kdeplot(
    x = weights,
    label = 'Adult weight'
)
p.set(
    xlabel = 'weight (kg)',
    ylabel = 'PDF',
    title = 'Estimated PDF of adult weight data from the BRFSS'
);
p.axvline(
    weights.mean(),
    color='darkred',
    linestyle='--',
    label = 'Mean',
    linewidth=0.8
);
p.axvline(
    np.median(weights),
    color='darkgreen',
    linestyle='--',
    label = 'Median',
    linewidth=0.8
);
p.legend(loc='upper right');

The distribution appears skewed to the right. Sure enough, the mean, 79.0, is bigger than the median, 77.3. The sample skewness is 1.1 and Pearson’s median skewness is 0.26.

The sign of the skewness coefficient indicates whether the distribution skews left or right, but other than that, they are hard to interpret. Sample skewness is less robust; that is, it is more susceptible to outliers. As a result it is less reliable when applied to skewed distributions, exactly when it would be most relevant.

Pearson’s median skewness is based on a computed mean and variance, so it is also susceptible to outliers, but since it does not depend on a third moment, it is somewhat more robust.

In [None]:
stats.skew(weights)

In [None]:
compstats.pearson_median_skewness(weights)

The following function estimates the parameters of a normal distribution and plots the data and a normal model.

In [None]:
def plot_normal_model(weights: np.array, label: str, title: str):
    """Plots the empirical CDF of a sample together with a normal model CDF.

    The model's location and scale are the mean and the square root of the
    variance returned by compstats.trimmed_mean_var.

    weights: sequence of values
    label: x axis label
    title: plot title
    """
    mu, sigma2 = compstats.trimmed_mean_var(weights)
    sigma = np.sqrt(sigma2)
    empirical = Cdf.from_seq(weights)

    ax = sns.lineplot(x=empirical.xs, y=empirical.ps, label='data')
    ax.set(xlabel=label, ylabel='CDF', title=title)

    # evaluate the model at the same x positions as the data
    model = stats.norm(loc=mu, scale=sigma)
    plt.plot(empirical.xs, model.cdf(empirical.xs), label='model')
    plt.legend(loc='lower right')

Here's the distribution of adult weights and a normal model, which is not a very good fit.

In [None]:
plot_normal_model(
    weights, 
    label='adult weight (kg)',
    title = 'Distribution of adult weights on a linear scale'
)

Here's the distribution of adult weight and a lognormal model, plotted on a log-x scale.  The model is a better fit for the data, although the heaviest people are heavier than the model expects.

In [None]:
plot_normal_model(
    np.log10(weights), 
    label='adult weight (log10 kg)',
    title = 'Distribution of adult weights on a log scale'
)

The following function generates a normal probability plot.

In [None]:
def normal_qq_plot(values: np.array, label: str, title: str):
    """Draws a normal probability plot of values with a reference line.

    values: numpy array of sample values
    label: y axis label
    title: plot title
    """
    z_range = (-5, 5)
    center, spread2 = compstats.trimmed_mean_var(values, p=0.01)
    spread = np.sqrt(spread2)
    zs, sorted_values = normal_qq(values)
    # reference line: intercept is the mean and slope is the square root of
    # the variance returned by compstats.trimmed_mean_var
    reference = fit_line(np.array(z_range), center, spread)

    plt.plot(zs, sorted_values, label='data')
    plt.plot(z_range, reference, label='fitted', linestyle='dashed')
    plt.xlabel('z')
    plt.xlim(z_range)
    plt.ylabel(label)
    plt.title(title)
    plt.legend(loc='upper left')

In [None]:
weights = weights[weights <= 200]

When we generate a normal probability plot with adult weights, we can see clearly that the data deviate from the model systematically.

In [None]:
normal_qq_plot(
    weights.values,
    'weights (kg)',
    'Adult weight (normal plot)'
)

If we make a normal probability plot with log weights, the model fits the data well except in the tails, where the heaviest people exceed expectations.

In [None]:
normal_qq_plot(
    np.log10(weights.values),
    'weights (log10 kg)',
    'Adult weight (log normal plot)'
)

In [None]:
weights.std()

## Pareto distribution

The Pareto distribution is named after the economist Vilfredo Pareto, who used it to describe the distribution of wealth. Since then, it has been used to describe phenomena in the natural and social sciences including sizes of cities and towns, sand particles and meteorites, forest fires and earthquakes. The CDF of the Pareto distribution is:

$$
CDF(x) = 1 - (\frac{x}{x_m})^{-\alpha}
$$

The parameters $x_m$ and $\alpha$ determine the location and shape of the distribution. $x_m$ is the minimum possible value.

Here's what the Pareto CDF looks like with a range of parameters.

In [None]:
xmin = 0.5
xs = np.linspace(0, 10, 100)
params = [2, 1, 0.5]
for alpha in params:
    plt.plot(
        xs,
        stats.pareto.cdf(xs, scale=xmin, b=alpha),
        label=f'a={alpha}'
    )
plt.xlabel('x')
plt.ylabel('CDF')
plt.ylim(0, 1)
plt.title('CDFs of Pareto distributions with different parameters.')
plt.legend(loc='lower right');

If you plot the CCDF of a sample from a Pareto distribution on a linear scale, you expect to see a function like:

$$
y \approx (\frac{x}{x_m})^{-\alpha}
$$

Taking the log of both sides

$$
log(y) \approx -\alpha(log(x) - log(x_m))
$$

so if you plot $log(y)$ versus $log(x)$, it should look like a straight line with slope $-\alpha$ and intercept $\alpha \space log(x_m)$.

The distribution of populations for cities and towns is sometimes said to be Pareto-like.

In [None]:
def pareto_cdf(xmin, alpha, low, high, n=50):
    """Evaluates a Pareto CDF on an evenly spaced grid.

    xmin: scale parameter of the distribution
    alpha: shape parameter of the distribution
    low: lower end of the grid; raised to xmin when it is smaller
    high: upper end of the grid
    n: number of grid points

    returns: numpy arrays (xs, ps)
    """
    # the grid never starts below xmin
    start = max(low, xmin)
    xs = np.linspace(start, high, n)
    model = stats.pareto(b=alpha, scale=xmin)
    return xs, model.cdf(xs)

In [None]:
pops = population.read_population()

Here's the distribution of population for cities and towns in the U.S., along with a Pareto model.  The model fits the data well in the tail.

In [None]:
# empirical CCDF of the populations on log-log axes
p = sns.ecdfplot(
    x = pops,
    complementary = True,
    log_scale = (True, True),
    label = 'data'
);

# Pareto model for comparison; its CCDF is 1 - CDF.
# NOTE(review): xmin and alpha are illustrative values meant to follow the
# tail of the data — adjust them if the line does not track the tail.
model_xs, model_ps = pareto_cdf(xmin=5000, alpha=1.4, low=0, high=1e7)
p.plot(model_xs, 1 - model_ps, label='model', color='0.8')

p.set(
    xlabel = 'log10 population',
    ylabel = 'CCDF',
    title = 'CCDFs of city and town populations, on a log-log scale'
);
# the labels above are only shown once a legend is drawn
p.legend();

The lognormal model might be a better fit for this data (as is often the case for things that are supposed to be Pareto).

Here's a normal probability plot for the log-populations.  The model fits the data well except in the right tail, where the biggest cities are bigger than expected.

In [None]:
normal_qq_plot(
    np.log10(pops.values),
    'log10 population',
    'Number of cities/towns (log10)'
)

## Random variates

When we have an analytic CDF, we can sometimes invert it to generate random values.  The following function generates values from an exponential distribution.

$$
p = 1 - e^{-\lambda x}
$$

$$
x = \frac{-log(1-p)}{\lambda}
$$

In [None]:
import random

def expovariate(lam):
    """Draws one value from an exponential distribution with rate lam.

    Uses inverse transform sampling: a uniform random p is pushed through
    the inverse of the exponential CDF, x = -log(1 - p) / lam.
    """
    survival = 1 - random.random()
    return -np.log(survival) / lam

We can test it by generating a sample.

In [None]:
t = [expovariate(lam=2) for _ in range(1000)]

And plotting the CCDF on a log-y scale.

In [None]:
p = sns.ecdfplot(
    x=t,
    complementary = True,
    # log-y scale
    log_scale = (False, True,)
);
p.set(
    xlabel = 'Exponential variate',
    ylabel = 'CCDF'
);

`stats` does this sort of thing much more efficiently

In [None]:
t = stats.expon.rvs(size=1000)

In [None]:
p = sns.ecdfplot(
    x=t,
    complementary = True,
    # log-y scale
    log_scale = (False, True,)
);
p.set(
    xlabel = 'Exponential variate',
    ylabel = 'CCDF'
);

A straight line is consistent with an exponential distribution.

As an exercise, write a function that generates a Pareto variate.

## Exercises

**Exercise:** In the BRFSS (see Section 5.4), the distribution of heights is roughly normal with parameters µ = 178 cm and σ = 7.7 cm for men, and µ = 163 cm and σ = 7.3 cm for women.

In order to join Blue Man Group, you have to be male between 5’10” and 6’1” (see http://bluemancasting.com). What percentage of the U.S. male population is in this range? Hint: use `scipy.stats.norm.cdf`.

`scipy.stats` contains objects that represent analytic distributions

In [None]:
df = pd.read_feather('data/brfss.feather')

In [None]:
df.dropna(subset=['weight'], inplace=True)

In [None]:
df.groupby('sex')['height'].aggregate(['mean', 'std'])

For example <tt>scipy.stats.norm</tt> represents a normal distribution.

In [None]:
mu = 178
sigma = 7.7
dist = stats.norm(loc=mu, scale=sigma)
type(dist)

A "frozen random variable" can compute its mean and standard deviation.

In [None]:
dist.mean(), dist.std()

It can also evaluate its CDF.  How many people are more than one standard deviation below the mean?  About 16%

In [None]:
dist.cdf(mu-sigma)

How many people are between 5'10" and 6'1"?

In [None]:
# Solution

low = dist.cdf(177.8)    # 5'10"
high = dist.cdf(185.4)   # 6'1"
print(f'Low: {low:0.2f}, High: {high:0.2f}, In between {high-low:0.2f}')

**Exercise:** To get a feel for the Pareto distribution, let’s see how different the world would be if the distribution of human height were Pareto. With the parameters $x_m = 1m$ and $\alpha = 1.7$, we get a distribution with a reasonable minimum, 1 m, and median, 1.5 m.

Plot this distribution. What is the mean human height in Pareto world? What fraction of the population is shorter than the mean? If there are 7 billion people in Pareto world, how many do we expect to be taller than 1 km? How tall do we expect the tallest person to be?

`scipy.stats.pareto` represents a pareto distribution.  In Pareto world, the distribution of human heights has parameters $\alpha=1.7$ and $x_{min}=1 \space meter$.  So the shortest person is 100 cm and the median is 150.

In [None]:
alpha = 1.7
# meter
xmin = 1  
dist = stats.pareto(b=alpha, scale=xmin)
dist.median()

What is the mean height in Pareto world?

In [None]:
dist.mean()

What fraction of people are shorter than the mean?

In [None]:
dist.cdf(dist.mean())

Out of 7 billion people, how many do we expect to be taller than 1 km?  You could use <tt>dist.cdf</tt> or <tt>dist.sf</tt>.

In [None]:
# Solution

(1 - dist.cdf(1000)) * 7e9, dist.sf(1000) * 7e9

How tall do we expect the tallest person to be?

In [None]:
# One way to solve this is to search for a height that we
# expect one person out of 7 billion to exceed.

# It comes in at roughly 600 kilometers.

dist.sf(600000) * 7e9            

In [None]:
# Another way is to use `ppf`, which evaluates the "percent point function", which
# is the inverse CDF.  So we can compute the height in meters that corresponds to
# the probability (1 - 1/7e9).

dist.ppf(1 - 1/7e9)

**Exercise:** The [Weibull](http://wikipedia.org/wiki/Weibull_distribution) distribution is a generalization of the exponential distribution that comes up in failure analysis. Its CDF is

$$
\mathrm{CDF}(x) = 1 - \exp[-(x / \lambda)^k]
$$ 

Can you find a transformation that makes a Weibull distribution look like a straight line? What do the slope and intercept of the line indicate?

Use `random.weibullvariate` to generate a sample from a Weibull distribution and use it to test your transformation.

Generate a sample from a Weibull distribution and plot it using a transform that makes a Weibull distribution look like a straight line.

In [None]:
sample = [random.weibullvariate(2, 1) for _ in range(1000)]

In [None]:
cdf = Cdf.from_seq(sample)

In [None]:
def transform_weibull(cdf: Cdf) -> Tuple[np.array, np.array]:
    '''
    Transforms a CDF so that a Weibull distribution gives a straight line
    when the result is plotted on log-log axes.

    For a Weibull CDF, -log(1 - CDF(x)) = (x / lambda)^k, so the log of the
    returned ps is linear in log(x) with slope k.

    returns: numpy arrays (xs, -log(1 - ps)), both without the last point
    '''
    # the last point of the CDF is dropped before taking log(1 - p)
    # (presumably because its probability is 1, which would give log(0))
    trimmed_xs = np.delete(cdf.xs, -1)
    trimmed_ps = np.delete(cdf.ps, -1)
    return trimmed_xs, -np.log(1 - trimmed_ps)

In [None]:
xs, ps = transform_weibull(cdf)

In [None]:
# -log(1 - CDF(x)) = (x / lambda)^k, so on log-log axes the points fall on a
# straight line with slope k and intercept -k * log(lambda); on linear axes
# the line is only straight in the special case k = 1
p = sns.lineplot(
    x=xs,
    y=ps
);
p.set(
    xscale = 'log',
    yscale = 'log',
    xlabel = 'Weibull variate',
    ylabel = r'$-\log(1 - CDF(x))$'
);

**Exercise:** For small values of `n`, we don’t expect an empirical distribution to fit an analytic distribution exactly. One way to evaluate the quality of fit is to generate a sample from an analytic distribution and see how well it matches the data.

For example, earlier we plotted the distribution of time between births and saw that it is approximately exponential. But the distribution is based on only 44 data points. To see whether the data might have come from an exponential distribution, generate 44 values from an exponential distribution with the same mean as the data, about 33 minutes between births.

Plot the distribution of the random values and compare it to the actual distribution. You can use random.expovariate to generate the values.

In [None]:
df = babyboom.read_baby_boom()
diffs = df.minutes.diff()
ccdf = Cdf.from_seq(diffs).complement()

In [None]:
n = len(diffs)
lam = 44/24/60

In [None]:
sample = np.array([random.expovariate(lam) for _ in range(n)])
model = Cdf.from_seq(sample).complement()

In [None]:
# only occasionally will 1/lam (the theoretical mean) and the mean of the sample be close
print(f'n={n}, 1/lambda={1/lam:0.4f}, Mean={sample.mean():0.2f}')

In [None]:
plt.plot(
    ccdf.xs,
    ccdf.ps,
    label = 'data'
)
plt.plot(
    model.xs,
    model.ps,
    label = 'model'
);
plt.xlabel('Time between births (minutes)')
plt.ylabel('CCDF')
plt.legend();