# Statistics (<font color=#0099ff>scipy.stats</font>)

## Random variables

In [None]:
from __future__ import print_function

from scipy import stats

from scipy.stats import norm

### Getting help

First of all, all distributions are accompanied by help functions. To obtain just some basic information, we print the relevant docstring: `print(stats.norm.__doc__)`.

In [None]:
print('bounds of distribution lower: %s, upper: %s' % (norm.a, norm.b))

In [None]:
dir(norm)

In [None]:
rv = norm()
dir(rv)  # reformatted

In [None]:
# Walk every name exposed by scipy.stats once and sort it into the list that
# matches the kind of distribution instance it is bound to.
dist_continu = []
dist_discrete = []
for attr_name in dir(stats):
    candidate = getattr(stats, attr_name)
    if isinstance(candidate, stats.rv_continuous):
        dist_continu.append(attr_name)
    if isinstance(candidate, stats.rv_discrete):
        dist_discrete.append(attr_name)
print('number of continuous distributions: %d' % len(dist_continu))

In [None]:
print('number of discrete distributions:   %d' % len(dist_discrete))

### Common methods

In [None]:
norm.cdf(0)

In [None]:
norm.cdf([-1., 0, 1])

In [None]:
import numpy as np
norm.cdf(np.array([-1., 0, 1]))

In [None]:
norm.mean(), norm.std(), norm.var()

In [None]:
norm.stats(moments="mv")

In [None]:
norm.ppf(0.5)

In [None]:
norm.rvs(size=3)

In [None]:
np.random.seed(1234)

In [None]:
norm.rvs(size=5, random_state=1234)

In [None]:
norm.rvs(5)

### Shifting and scaling

In [None]:
norm.stats(loc=3, scale=4, moments="mv")

In [None]:
from scipy.stats import expon
expon.mean(scale=3.)

In [None]:
from scipy.stats import uniform
uniform.cdf([0, 1, 2, 3, 4, 5], loc=1, scale=4)

In [None]:
np.mean(norm.rvs(5, size=500))

### Shape parameters

For instance, the gamma distribution with density

$$\gamma(x,a)=\frac{\lambda\big(\lambda x\big)^{a-1}}{\Gamma\big(a\big)}e^{-\lambda x},$$

requires the shape parameter $a$. Note that a given value of $\lambda$ is obtained by setting the `scale` keyword to $1/\lambda$.

In [None]:
from scipy.stats import gamma

gamma.numargs

In [None]:
gamma.shapes

In [None]:
gamma(1, scale=2.).stats(moments="mv")

In [None]:
gamma(a=1, scale=2.).stats(moments="mv")

### Freezing a distribution

In [None]:
rv = gamma(1, scale=2.)

In [None]:
rv.mean(), rv.std()

### Broadcasting

In [None]:
stats.t.isf([0.1, 0.05, 0.01], [[10], [11]])

In [None]:
stats.t.isf([0.1, 0.05, 0.01], 10)

In [None]:
stats.t.isf([0.1, 0.05, 0.01], 11)

In [None]:
stats.t.isf([0.1, 0.05, 0.01], [10, 11, 12])

### Specific points for discrete distributions

In [None]:
from scipy.stats import hypergeom
# Shape parameters handed positionally to hypergeom as (M, n, N).
M, n, N = 20, 7, 12

# Evaluation points 0, 2, 4, 6.
x = 2 * np.arange(4)
x

In [None]:
prb = hypergeom.cdf(x, M, n, N)
prb

In [None]:
hypergeom.ppf(prb, M, n, N)

## Building specific distributions

### Making a continuous distribution, i.e., subclassing `rv_continuous`

In [None]:
from scipy import stats
class deterministic_gen(stats.rv_continuous):
    """Continuous distribution whose CDF is a unit step at zero."""

    def _cdf(self, x):
        """Return 0.0 where ``x < 0`` and 1.0 everywhere else."""
        below_zero = np.less(x, 0)
        return np.where(below_zero, 0.0, 1.0)

    def _stats(self):
        """Return four zeros as the distribution's moment statistics."""
        return (0.0,) * 4

deterministic = deterministic_gen(name="deterministic")
deterministic.cdf(np.arange(-3, 3, 0.5))

In [None]:
deterministic.pdf(np.arange(-3, 3, 0.5))

In [None]:
from scipy.integrate import quad
quad(deterministic.pdf, -1e-1, 1e-1)

In [None]:
quad(deterministic.pdf, -1e-3, 1e-3)  # warning removed

### Subclassing `rv_discrete`

In [None]:
help(stats.rv_discrete)

In [None]:
# Discretise a truncated normal onto an integer grid.
npoints = 20   # one less than the number of integer support points
npointsh = npoints // 2
npointsf = float(npoints)
nbound = 4   # nominal bound of the truncated normal
normbound = nbound * (1 + 1/npointsf)   # bound actually passed to truncnorm

# The grid holds one point more than the support so that every support point
# gets a bin edge half a unit to its left and to its right.
grid = np.arange(-npointsh, npointsh+2, 1)
gridlimits = grid - 0.5   # bin edges on the integer scale, used later
gridlimitsnorm = gridlimits / npointsh * nbound   # same edges, truncnorm scale
grid = grid[:-1]   # drop the extra point: these are the support points
gridint = grid

# Probability of a support point = truncnorm mass between its two bin edges.
edge_cdf = stats.truncnorm.cdf(gridlimitsnorm, -normbound, normbound)
probs = np.diff(edge_cdf)

rounded_probs = np.round(probs, decimals=7)
normdiscrete = stats.rv_discrete(values=(gridint, rounded_probs),
                                 name='normdiscrete')

moments_mvsk = normdiscrete.stats(moments='mvsk')
print('mean = %6.4f, variance = %6.4f, skew = %6.4f, kurtosis = %6.4f' %
      moments_mvsk)

In [None]:
# Standard deviation of the discretised distribution.
nd_std = np.sqrt(normdiscrete.stats(moments='v'))

# Seed NumPy's global generator first so the sample is replicable.
n_sample = 500
np.random.seed(87655678)
rvs = normdiscrete.rvs(size=n_sample)

# Table with one row per support point: value, observed count, expected count.
f, l = np.histogram(rvs, bins=gridlimits)
expected_counts = probs * n_sample
sfreq = np.column_stack((gridint, f, expected_counts))
print(sfreq)

In [None]:
# Pool the five outermost bins on each side into a single tail bin before
# testing (presumably so the tail bins have non-negligible expected counts).
tail = 5
f2 = np.concatenate(([f[:tail].sum()], f[tail:-tail], [f[-tail:].sum()]))
p2 = np.concatenate(([probs[:tail].sum()], probs[tail:-tail],
                     [probs[-tail:].sum()]))
expected_pooled = p2 * n_sample
ch2, pval = stats.chisquare(f2, expected_pooled)

print('chisquare for normdiscrete: chi2 = %6.3f pvalue = %6.4f' % (ch2, pval))

## Analysing one sample

In [None]:
np.random.seed(282629734)
x = stats.t.rvs(10, size=1000)

### Descriptive statistics

In [None]:
print(x.min())   # equivalent to np.min(x)

In [None]:
print(x.max())   # equivalent to np.max(x)

In [None]:
print(x.mean())  # equivalent to np.mean(x)

In [None]:
print(x.var())   # equivalent to np.var(x)

In [None]:
# Theoretical moments of the t distribution with shape parameter 10.
theoretical = stats.t.stats(10, moments='mvsk')
m, v, s, k = theoretical

# Sample counterparts: size, (min, max), mean, variance, skew, kurtosis.
sample_summary = stats.describe(x)
n, (smin, smax), sm, sv, ss, sk = sample_summary

sstr = '%-14s mean = %6.4f, variance = %6.4f, skew = %6.4f, kurtosis = %6.4f'
print(sstr % ('distribution:', m, v, s, k))

In [None]:
print(sstr % ('sample:', sm, sv, ss, sk))

### T-test and KS-test

In [None]:
print('t-statistic = %6.3f pvalue = %6.4f' %  stats.ttest_1samp(x, m))

In [None]:
# Recompute the one-sample t-test by hand from the descriptive statistics.
std_err = np.sqrt(sv / float(n))   # standard error of the sample mean
tt = (sm - m) / std_err   # t-statistic for mean
dof = n - 1
pval = 2 * stats.t.sf(np.abs(tt), dof)   # two-sided pvalue = Prob(abs(t)>tt)
print('t-statistic = %6.3f pvalue = %6.4f' % (tt, pval))

In [None]:
print('KS-statistic D = %6.3f pvalue = %6.4f' % stats.kstest(x, 't', (10,)))

In [None]:
print('KS-statistic D = %6.3f pvalue = %6.4f' % stats.kstest(x, 'norm'))

In [None]:
d, pval = stats.kstest((x-x.mean())/x.std(), 'norm')
print('KS-statistic D = %6.3f pvalue = %6.4f' % (d, pval))

### Tails of the distribution

In [None]:
crit01, crit05, crit10 = stats.t.ppf([1-0.01, 1-0.05, 1-0.10], 10)
print('critical values from ppf at 1%%, 5%% and 10%% %8.4f %8.4f %8.4f' % (crit01, crit05, crit10))

In [None]:
print('critical values from isf at 1%%, 5%% and 10%% %8.4f %8.4f %8.4f' % tuple(stats.t.isf([0.01,0.05,0.10],10)))

In [None]:
# Percentage of the sample lying above each upper-tail critical value.
freq01, freq05, freq10 = [np.sum(x > threshold) / float(n) * 100
                          for threshold in (crit01, crit05, crit10)]
print('sample %%-frequency at 1%%, 5%% and 10%% tail %8.4f %8.4f %8.4f' % (freq01, freq05, freq10))

In [None]:
freq05l = np.sum(stats.t.rvs(10, size=10000) > crit05) / 10000.0 * 100
print('larger sample %%-frequency at 5%% tail %8.4f' % freq05l)

In [None]:
print('tail prob. of normal at 1%%, 5%% and 10%% %8.4f %8.4f %8.4f' %
      tuple(stats.norm.sf([crit01, crit05, crit10])*100))

In [None]:
quantiles = [0.0, 0.01, 0.05, 0.1, 1-0.10, 1-0.05, 1-0.01, 1.0]
crit = stats.t.ppf(quantiles, 10)
crit

In [None]:
n_sample = x.size
# Observed counts in the bins delimited by the t-distribution quantiles.
freqcount, _bin_edges = np.histogram(x, bins=crit)

# Under the t distribution the bin probabilities are just the gaps between
# the quantile levels, because the bin edges are that distribution's own
# quantiles; under the standard normal they come from its CDF at the edges.
tprob = np.diff(quantiles)
nprob = np.diff(stats.norm.cdf(crit))

expected_t = tprob * n_sample
expected_norm = nprob * n_sample
tch, tpval = stats.chisquare(freqcount, expected_t)
nch, npval = stats.chisquare(freqcount, expected_norm)
print('chisquare for t:      chi2 = %6.2f pvalue = %6.4f' % (tch, tpval))

In [None]:
print('chisquare for normal: chi2 = %6.2f pvalue = %6.4f' % (nch, npval))

In [None]:
# Fit both families to the sample, then freeze the fitted distributions.
tdof, tloc, tscale = stats.t.fit(x)
nloc, nscale = stats.norm.fit(x)
fitted_t = stats.t(tdof, loc=tloc, scale=tscale)
fitted_norm = stats.norm(loc=nloc, scale=nscale)

# Bin probabilities implied by each fitted distribution on the same edges.
tprob = np.diff(fitted_t.cdf(crit))
nprob = np.diff(fitted_norm.cdf(crit))

expected_t = tprob * n_sample
expected_norm = nprob * n_sample
tch, tpval = stats.chisquare(freqcount, expected_t)
nch, npval = stats.chisquare(freqcount, expected_norm)
print('chisquare for t:      chi2 = %6.2f pvalue = %6.4f' % (tch, tpval))

In [None]:
print('chisquare for normal: chi2 = %6.2f pvalue = %6.4f' % (nch, npval))

### Special tests for normal distributions

In [None]:
print('normal skewtest teststat = %6.3f pvalue = %6.4f' % stats.skewtest(x))

In [None]:
print('normal kurtosistest teststat = %6.3f pvalue = %6.4f' % stats.kurtosistest(x))

In [None]:
print('normaltest teststat = %6.3f pvalue = %6.4f' % stats.normaltest(x))

In [None]:
print('normaltest teststat = %6.3f pvalue = %6.4f' %
      stats.normaltest((x-x.mean())/x.std()))

In [None]:
print('normaltest teststat = %6.3f pvalue = %6.4f' %
      stats.normaltest(stats.t.rvs(10, size=100)))

In [None]:
print('normaltest teststat = %6.3f pvalue = %6.4f' %
             stats.normaltest(stats.norm.rvs(size=1000)))

## Comparing two samples

### Comparing means

In [None]:
rvs1 = stats.norm.rvs(loc=5, scale=10, size=500)
rvs2 = stats.norm.rvs(loc=5, scale=10, size=500)
stats.ttest_ind(rvs1, rvs2)

In [None]:
rvs3 = stats.norm.rvs(loc=8, scale=10, size=500)
stats.ttest_ind(rvs1, rvs3)

### Kolmogorov-Smirnov test for two samples (`ks_2samp`)

In [None]:
stats.ks_2samp(rvs1, rvs2)

In [None]:
stats.ks_2samp(rvs1, rvs3)

## Kernel density estimation

### Univariate estimation

In [None]:
from scipy import stats
import matplotlib.pyplot as plt

# Five hand-picked observations. The builtin ``float`` is used as dtype
# because the ``np.float`` alias was deprecated in NumPy 1.20 and removed in
# NumPy 1.24; both denote the same 64-bit float type.
x1 = np.array([-7, -5, 1, 4, 5], dtype=float)
kde1 = stats.gaussian_kde(x1)  # default bandwidth selection
kde2 = stats.gaussian_kde(x1, bw_method='silverman')

# Draw both kernel density estimates on top of the raw observations.
fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(x1, np.zeros(x1.shape), 'b+', ms=20)  # rug plot: one marker per observation at y=0
x_eval = np.linspace(-10, 10, num=200)  # evaluation grid; reused by the next plotting cell
ax.plot(x_eval, kde1(x_eval), 'k-', label="Scott's Rule")
ax.plot(x_eval, kde2(x_eval), 'r-', label="Silverman's Rule")

# NOTE(review): labels are attached to both curves but this cell never calls
# ax.legend(), so they are not displayed.
plt.show()

In [None]:
def my_kde_bandwidth(obj, fac=1./5):
    """Return Scott's Rule factor for `obj`, scaled by the constant `fac`.

    `obj` must expose ``n`` and ``d``; the unscaled factor is
    ``n ** (-1 / (d + 4))``.
    """
    exponent = -1. / (obj.d + 4)
    scott_factor = np.power(obj.n, exponent)
    return scott_factor * fac

# Plot a KDE of x1 whose bandwidth factor comes from a user-supplied callable.
fig = plt.figure()
ax = fig.add_subplot(111)

ax.plot(x1, np.zeros(x1.shape), 'b+', ms=20)  # rug plot: one marker per observation at y=0
# bw_method receives the function itself, so its default fac=1./5 applies.
kde3 = stats.gaussian_kde(x1, bw_method=my_kde_bandwidth)
# x_eval is the evaluation grid defined in the previous plotting cell.
ax.plot(x_eval, kde3(x_eval), 'g-', label="With smaller BW")

plt.show()

In [None]:
# Compare Scott's and Silverman's bandwidth rules against the true density,
# first on normal data (top panel), then on Student's t data (bottom panel).
np.random.seed(12456)  # fix the seed of NumPy's global generator
x1 = np.random.normal(size=200)  # random data, normal distribution
xs = np.linspace(x1.min()-1, x1.max()+1, 200)  # grid extending 1 unit past the data range

kde1 = stats.gaussian_kde(x1)
kde2 = stats.gaussian_kde(x1, bw_method='silverman')

fig = plt.figure(figsize=(8, 6))

# Top panel: normal sample.
ax1 = fig.add_subplot(211)
ax1.plot(x1, np.zeros(x1.shape), 'b+', ms=12)  # rug plot
ax1.plot(xs, kde1(xs), 'k-', label="Scott's Rule")
ax1.plot(xs, kde2(xs), 'b-', label="Silverman's Rule")
ax1.plot(xs, stats.norm.pdf(xs), 'r--', label="True PDF")

ax1.set_xlabel('x')
ax1.set_ylabel('Density')
# The title is set on the top axes only and describes both panels.
ax1.set_title("Normal (top) and Student's T$_{df=5}$ (bottom) distributions")
ax1.legend(loc=1)

x2 = stats.t.rvs(5, size=200)  # random data, T distribution
xs = np.linspace(x2.min() - 1, x2.max() + 1, 200)  # xs is rebound to a grid for the t sample

kde3 = stats.gaussian_kde(x2)
kde4 = stats.gaussian_kde(x2, bw_method='silverman')

# Bottom panel: Student's t sample.
ax2 = fig.add_subplot(212)
ax2.plot(x2, np.zeros(x2.shape), 'b+', ms=12)  # rug plot
ax2.plot(xs, kde3(xs), 'k-', label="Scott's Rule")
ax2.plot(xs, kde4(xs), 'b-', label="Silverman's Rule")
ax2.plot(xs, stats.t.pdf(xs, 5), 'r--', label="True PDF")

ax2.set_xlabel('x')
ax2.set_ylabel('Density')

# NOTE(review): only ax1 gets a legend; the labels on ax2 are never displayed.
plt.show()

In [None]:
from functools import partial

# Bimodal sample: 175 draws from normal(loc=-2, scale=1) followed by 50 draws
# from normal(loc=2, scale=0.2), concatenated into one array.
loc1, scale1, size1 = (-2, 1, 175)
loc2, scale2, size2 = (2, 0.2, 50)
x2 = np.concatenate([np.random.normal(loc=loc1, scale=scale1, size=size1),
                     np.random.normal(loc=loc2, scale=scale2, size=size2)])

x_eval = np.linspace(x2.min() - 1, x2.max() + 1, 500)  # grid extending 1 unit past the data range

# Four estimators: the two named rules plus Scott's factor scaled by 0.2 and
# by 0.5 (partial pins the fac argument of my_kde_bandwidth).
kde = stats.gaussian_kde(x2)
kde2 = stats.gaussian_kde(x2, bw_method='silverman')
kde3 = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=0.2))
kde4 = stats.gaussian_kde(x2, bw_method=partial(my_kde_bandwidth, fac=0.5))

# Reference density: each component pdf weighted by its share of the sample.
pdf = stats.norm.pdf
bimodal_pdf = pdf(x_eval, loc=loc1, scale=scale1) * float(size1) / x2.size + \
              pdf(x_eval, loc=loc2, scale=scale2) * float(size2) / x2.size

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)

ax.plot(x2, np.zeros(x2.shape), 'b+', ms=12)  # rug plot: one marker per observation at y=0
ax.plot(x_eval, kde(x_eval), 'k-', label="Scott's Rule")
ax.plot(x_eval, kde2(x_eval), 'b-', label="Silverman's Rule")
ax.plot(x_eval, kde3(x_eval), 'g-', label="Scott * 0.2")
ax.plot(x_eval, kde4(x_eval), 'c-', label="Scott * 0.5")
ax.plot(x_eval, bimodal_pdf, 'r--', label="Actual PDF")

ax.set_xlim([x_eval.min(), x_eval.max()])  # clip the x-axis to the evaluation grid
ax.legend(loc=2)
ax.set_xlabel('x')
ax.set_ylabel('Density')
plt.show()

### Multivariate estimation

In [None]:
def measure(n):
    """Return two coupled measurement series, each of length `n`.

    Two normal samples are drawn from NumPy's global generator, first one
    with the default scale and then one with ``scale=0.5``; the function
    returns their element-wise sum and difference, in that order.
    """
    base = np.random.normal(size=n)
    offset = np.random.normal(scale=0.5, size=n)
    total = base + offset
    delta = base - offset
    return total, delta

# Draw the coupled sample and record its bounding box.
m1, m2 = measure(2000)
xmin, xmax = m1.min(), m1.max()
ymin, ymax = m2.min(), m2.max()

# Evaluate a two-dimensional Gaussian KDE on a 100 x 100 grid that spans the
# bounding box of the sample.
X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
positions = np.vstack([X.ravel(), Y.ravel()])
values = np.vstack([m1, m2])
kernel = stats.gaussian_kde(values)
density = kernel.evaluate(positions)
Z = density.T.reshape(X.shape)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)

# Density as an image with the raw sample points drawn on top of it.
bounding_box = [xmin, xmax, ymin, ymax]
ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r, extent=bounding_box)
ax.plot(m1, m2, 'k.', markersize=2)

ax.set_xlim(bounding_box[:2])
ax.set_ylim(bounding_box[2:])

plt.show()

### Multiscale Graph Correlation (MGC)

In [None]:
plt.style.use('classic')
from scipy.stats import multiscale_graphcorr

def mgc_plot(x, y, sim_name, mgc_dict=None, only_viz=False,
             only_mgc=False):
    """Plot a simulated sample and/or its MGC local correlation map.

    Parameters
    ----------
    x, y : array_like
        Coordinates scattered in the simulation figure.
    sim_name : str
        Name placed in front of " Simulation" in the scatter-plot title.
    mgc_dict : dict, optional
        Mapping with the keys ``"mgc_map"`` (drawn as a heatmap) and
        ``"opt_scale"`` (pair of coordinates marked with a red cross).
        Required unless `only_viz` is true.
    only_viz : bool, optional
        If true, skip the local correlation map.
    only_mgc : bool, optional
        If true, skip the simulation scatter plot.

    Raises
    ------
    ValueError
        If the local correlation map is requested but `mgc_dict` is None.
    """
    # Fail early with a clear message instead of a TypeError raised when
    # None is subscripted further down.
    if not only_viz and mgc_dict is None:
        raise ValueError("mgc_dict is required unless only_viz=True")

    if not only_mgc:
        # simulation
        plt.figure(figsize=(8, 8))
        ax = plt.gca()
        ax.set_title(sim_name + " Simulation", fontsize=20)
        ax.scatter(x, y)
        ax.set_xlabel('X', fontsize=15)
        ax.set_ylabel('Y', fontsize=15)
        ax.axis('equal')
        ax.tick_params(axis="x", labelsize=15)
        ax.tick_params(axis="y", labelsize=15)
        plt.show()
    if not only_viz:
        # local correlation map
        plt.figure(figsize=(8, 8))
        ax = plt.gca()
        mgc_map = mgc_dict["mgc_map"]
        # draw heatmap
        ax.set_title("Local Correlation Map", fontsize=20)
        im = ax.imshow(mgc_map, cmap='YlGnBu')
        # colorbar
        cbar = ax.figure.colorbar(im, ax=ax)
        cbar.ax.set_ylabel("", rotation=-90, va="bottom")
        ax.invert_yaxis()
        # Hide the frame around the heatmap.
        for spine in ax.spines.values():
            spine.set_visible(False)
        # optimal scale
        opt_scale = mgc_dict["opt_scale"]
        ax.scatter(opt_scale[0], opt_scale[1],
                   marker='X', s=200, color='red')
        # other formatting
        # tick_params expects booleans here; the string "off" is truthy and
        # would leave the tick marks switched on.
        ax.tick_params(bottom=False, left=False)
        ax.set_xlabel('#Neighbors for X', fontsize=15)
        ax.set_ylabel('#Neighbors for Y', fontsize=15)
        ax.tick_params(axis="x", labelsize=15)
        ax.tick_params(axis="y", labelsize=15)
        # Axis limits are fixed at 100, the sample size used by the
        # simulations in this tutorial.
        ax.set_xlim(0, 100)
        ax.set_ylim(0, 100)
        plt.show()

# Linear relationship: 100 evenly spaced x values plus uniform noise scaled
# by 0.3, drawn from the seeded global generator.
np.random.seed(12345678)
x = np.linspace(-1, 1, num=100)
noise = np.random.random(x.size)
y = x + 0.3 * noise

mgc_plot(x, y, "Linear", only_viz=True)

In [None]:
stat, pvalue, mgc_dict = multiscale_graphcorr(x, y)
print("MGC test statistic: ", round(stat, 1))

In [None]:
print("P-value: ", round(pvalue, 1))

In [None]:
mgc_plot(x, y, "Linear", mgc_dict, only_mgc=True)

In [None]:
# Spiral relationship: a uniform draw serves as the radius and, multiplied by
# pi, as the angle; uniform noise scaled by 0.4 is added to y only.
np.random.seed(12345678)
unif = np.array(np.random.uniform(0, 5, size=100))
angle = np.pi * unif
x = unif * np.cos(angle)
noise = np.random.random(x.size)
y = unif * np.sin(angle) + 0.4 * noise

mgc_plot(x, y, "Spiral", only_viz=True)

In [None]:
stat, pvalue, mgc_dict = multiscale_graphcorr(x, y)
print("MGC test statistic: ", round(stat, 1))

In [None]:
print("P-value: ", round(pvalue, 1))

In [None]:
mgc_plot(x, y, "Spiral", mgc_dict, only_mgc=True)