# Probability density functions

In [None]:
from typing import Tuple

In [None]:
import numpy as np
import pandas as pd
from scipy import stats

In [None]:
# plotting
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()

In [None]:
# worksheet
from IPython.core.pylabtools import figsize
figsize(11, 6)

In [None]:
import sys
sys.path.append('lib')

In [None]:
from cdf import Cdf
import brfss
import nsfg

I'll start with the data from the BRFSS again.

In [None]:
df = brfss.read_brfss()

Here are the mean and standard deviation of height (in cm) for each sex.

In [None]:
df.groupby('sex')['height'].aggregate(['mean', 'std'])

In [None]:
female_heights = df.query('sex == 2').height.dropna()

In [None]:
mean, std = female_heights.mean(), female_heights.std()
mean, std

In [None]:
def render_pdf(dist: stats.rv_continuous, nstd=3, npoints=100):
    """
    Evaluates the density of a distribution on an evenly spaced grid.

    The grid spans `nstd` standard deviations on either side of the mean,
    with the lower end rounded down and the upper end rounded up to whole
    numbers.

    dist: distribution object providing mean(), std() and pdf()
    nstd: number of standard deviations covered on each side of the mean
    npoints: number of grid points

    returns: tuple of (grid points, density at each grid point)
    """
    center = dist.mean()
    half_width = nstd * dist.std()
    lower = np.floor(center - half_width)
    upper = np.ceil(center + half_width)
    grid = np.linspace(lower, upper, npoints)
    return grid, dist.pdf(grid)

In [None]:
dist = stats.norm(loc=mean, scale=std)

In [None]:
xs, ys = render_pdf(dist)

In [None]:
# plot the normal density evaluated on the grid returned by render_pdf
plt.plot(
    xs,
    ys
);
plt.xlabel('x')
plt.ylabel('PDF');
# mark the mean of the distribution with a dashed vertical line
plt.axvline(dist.mean(), color='darkred', linestyle='--')
# fix the visible x-range
plt.xlim((140, 186,));

`stats.norm(loc=..., scale=...)` returns a frozen distribution object that represents the normal distribution with the given parameters.

`pdf` returns a probability density, which doesn't mean much by itself.

In [None]:
dist.pdf(mean + std)

Using a sample from the actual distribution, we can estimate the PDF using Kernel Density Estimation (KDE).

If you run this a few times, you'll see how much variation there is in the estimate.

In [None]:
# evaluate the model density on a grid
xs, ys = render_pdf(dist)
# draw 500 random values from the model; no random_state is passed, so the
# sample (and the estimate below) changes on every run
sample = dist.rvs(500)
# fit a Gaussian kernel density estimate to the sample
kde = stats.gaussian_kde(sample)
# you could check the ranges of the sample

In [None]:
# the model density on the grid
plt.plot(
    xs,
    ys,
    label = 'Normal'
);
# the kernel density estimate of the sample, evaluated on the same grid
plt.plot(
    xs,
    kde.evaluate(xs),
    label = 'sample KDE'
)
plt.xlabel('x')
plt.ylabel('PDF')
plt.title('A normal PDF of adult female height in the US, and the kernel density estimate of a sample with n=500')
plt.legend(loc='upper right');

## Moments

Raw moments are just averages of powers: the $k$th raw moment is the mean of the $k$th powers of the values.

In [None]:
def moment(xs: np.array, k: int) -> np.float64:
    """
    Computes the kth raw moment of a sample: the mean of the kth powers.

    The values are converted to floating point before being raised to the
    kth power, so integer input cannot silently wrap around on overflow and
    plain Python sequences are accepted as well as arrays and Series.

    xs: sequence of numbers (list, NumPy array or pandas Series)
    k: order of the moment

    returns: the kth raw moment of xs
    """
    values = np.asarray(xs, dtype=float)
    return np.mean(values ** k)

The first raw moment is the mean.  The other raw moments don't mean much.

In [None]:
for k in [1, 2, 3]:
    print(f'moment({k}) = {moment(female_heights, k):0.2f}')

In [None]:
def mean(xs: np.array):
    """
    Computes the mean of a sample as its first raw moment.

    xs: sequence of values

    returns: the first raw moment of xs
    """
    # NOTE: this definition rebinds the notebook-level name `mean`, which an
    # earlier cell assigns the mean female height (a number); once this cell
    # has run, `mean` refers to this function instead.
    return moment(xs, k=1)

mean(female_heights)

The central moments are averages of powers of deviations from the mean.

In [None]:
def central_moment(xs, k):
    """
    Computes the kth central moment of a sample.

    xs: sequence of values supporting element-wise arithmetic
    k: order of the moment

    returns: sum of the kth powers of the deviations from the mean,
             divided by the number of values
    """
    deviations = xs - moment(xs, 1)
    return np.sum(deviations ** k) / len(xs)

The first central moment is approximately 0.  The second central moment is the variance.

In [None]:
for k in [1, 2, 3]:
    print(f'central moment({k}) = {central_moment(female_heights, k):0.2f}')

In [None]:
def var(xs):
    """
    Computes the variance of a sample as its second central moment.

    xs: sequence of values

    returns: the second central moment of xs
    """
    return central_moment(xs, k=2)

var(female_heights)

The standardized moments are ratios of central moments, with powers chosen to make the dimensions cancel.

In [None]:
def standardized_moment(xs, k):
    """
    Computes the kth standardized moment of a sample.

    The kth central moment is divided by the kth power of the standard
    deviation (the square root of the second central moment).

    xs: sequence of values
    k: order of the moment

    returns: the kth standardized moment of xs
    """
    sigma = np.sqrt(central_moment(xs, 2))
    numerator = central_moment(xs, k)
    return numerator / sigma ** k

The third standardized moment is skewness.

In [None]:
for k in [1, 2, 3]:
    print(f'standardized moment({k}) = {standardized_moment(female_heights, k):0.2f}')

In [None]:
def skewness(xs):
    """
    Computes the skewness of a sample as its third standardized moment.

    xs: sequence of values

    returns: the third standardized moment of xs
    """
    return standardized_moment(xs, k=3)

skewness(female_heights)

Normally a negative skewness indicates that the distribution has a longer tail on the left.  In that case, the mean is usually less than the median.

But in this case the mean is greater than the median, which indicates skew to the right.

In [None]:
mean(female_heights), np.median(female_heights)

Because the skewness is based on the third moment, it is not robust; that is, it depends strongly on a few outliers.  Pearson's median skewness is more robust.

In [None]:
def pearson_median_skewness(xs: np.array) -> np.float64:
    """
    Computes Pearson's median skewness of a sample.

    This is three times the difference between the mean and the median,
    divided by the standard deviation (the square root of the second
    central moment).

    xs: sequence of values

    returns: Pearson's median skewness of xs
    """
    center = moment(xs, 1)
    spread = np.sqrt(central_moment(xs, 2))
    gap = center - np.median(xs)
    return 3 * gap / spread

Pearson's skewness is positive, indicating that the distribution of female heights is slightly skewed to the right.

In [None]:
pearson_median_skewness(female_heights)

## Birth weights

Let's look at the distribution of birth weights again.

In [None]:
df = nsfg.read_live_fem_preg()

In [None]:
birth_weights = df.totalwgt_lb.dropna()

Based on KDE, it looks like the distribution is skewed to the left.

In [None]:
type(kde)

In [None]:
def estimate_pdf(sample: np.array, sz=101) -> Tuple[np.ndarray, np.ndarray]:
    """
    Estimates the density of a sample with a Gaussian kernel density estimate.

    The estimate is evaluated on an evenly spaced grid running from the
    smallest to the largest value in the sample.

    sample: sequence of values
    sz: number of grid points

    returns: tuple of (grid points, estimated density at each grid point)
    """
    kde = stats.gaussian_kde(sample)
    # np.min/np.max find the range in one vectorized pass instead of
    # iterating over the sample element by element in Python
    xs = np.linspace(np.min(sample), np.max(sample), sz)
    return xs, kde.evaluate(xs)

In [None]:
xs, ys = estimate_pdf(birth_weights)

In [None]:
p = sns.lineplot(
    x=xs,
    y=ys
);
p.set(
    xlabel = 'Birth weight (lbs)',
    ylabel = 'PDF',
    title = 'KDE estimate for the density of birth weights'
);

The mean is less than the median, which is consistent with left skew.

In [None]:
birth_weights.mean(), np.median(birth_weights)

And both ways of computing skew are negative, which is consistent with left skew.

In [None]:
skewness(birth_weights), pearson_median_skewness(birth_weights)

## Adult weights

Now let's look at adult weights from the BRFSS.  The distribution looks skewed to the right.

In [None]:
df = brfss.read_brfss()

In [None]:
adult_weights = df.weight.dropna()

In [None]:
xs, ys = estimate_pdf(adult_weights)

In [None]:
p = sns.lineplot(
    x=xs,
    y=ys
);
p.set(
    xlabel = 'Adult weight (lbs)',
    ylabel = 'PDF',
    title = 'KDE estimate for the density of adult weights'
);

The mean is greater than the median, which is consistent with skew to the right.

In [None]:
adult_weights.mean(), np.median(adult_weights)

And both ways of computing skewness are positive.

In [None]:
skewness(adult_weights), pearson_median_skewness(adult_weights)

## Exercises

In [None]:
def interpolate_sample(df, log_upper=6.0):
    """Makes a sample of log10 household income.

    Assumes that log10 income is uniform in each range.

    df: DataFrame with columns income and freq, one row per income range
        in increasing order; the DataFrame is not modified
    log_upper: log10 of the assumed upper bound for the highest range

    returns: NumPy array of log10 household income
    """
    # compute the log10 of the upper bound for each range
    upper_bounds = np.log10(df['income'].to_numpy(dtype=float))
    if len(upper_bounds) == 0:
        return np.array([], dtype=float)

    # the lower bound of each range is the upper bound of the previous one;
    # the first range is assumed to start at log10 = 3.0
    lower_bounds = np.concatenate(([3.0], upper_bounds[:-1]))

    # plug in a value for the unknown upper bound of the highest range
    # (the last row, however many rows there are)
    upper_bounds[-1] = log_upper

    # use the freq column to generate the right number of values in each
    # range; np.linspace requires an integer count
    counts = df['freq'].to_numpy()
    arrays = [
        np.linspace(low, high, int(count))
        for low, high, count in zip(lower_bounds, upper_bounds, counts)
    ]

    # collect the arrays into a single sample
    return np.concatenate(arrays)


In [None]:
income_df = pd.read_feather('data/household_incomes.feather')

In [None]:
log_sample = interpolate_sample(income_df, log_upper=6.0)

In [None]:
log_cdf = Cdf.from_seq(log_sample)

In [None]:
p = sns.ecdfplot(
    log_sample
);
p.set(
    xlabel = 'Household income (log $)',
    ylabel = 'CDF'
);

In [None]:
sample = np.power(10, log_sample)

In [None]:
p = sns.ecdfplot(
    sample
);
p.set(
    xlabel = 'Household income',
    ylabel = 'CDF'
);

Compute the median, mean, skewness and Pearson’s skewness of the resulting sample. What fraction of households report a taxable income below the mean? How do the results depend on the assumed upper bound?

In [None]:
sample.mean(), np.median(sample)

In [None]:
skewness(sample), pearson_median_skewness(sample)

In [None]:
cdf = Cdf.from_seq(sample)

In [None]:
# Solution

# About 66% of the population makes less than the mean

cdf.prob(sample.mean())

All of this is based on an assumption that the highest income is one million dollars, but that's certainly not correct.  What happens to the skew if the upper bound is 10 million?

Without better information about the top of this distribution, we can't say much about the skewness of the distribution.