<a href="https://colab.research.google.com/github/mikexcohen/Calculus_book/blob/main/exercises/ch18_statistics_exercises.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Calculus unraveled: Intuition, Proofs, and Python**
### Mike X Cohen (sincxpress.com)
#### https://github.com/mikexcohen/calculus_book
#### Code for Chapter 18 (Integration applications in statistics)

---

# About this code file:

### This notebook contains full code solutions to the exercises in this book chapter. There are many correct ways to solve the exercises; this notebook provides *a* solution, not *THE* solution. Please use this code as a starting point to continue exploring and experimenting with calculus concepts and visualizations.

## **Using the code without the book may lead to confusion or errors.**

#### This code was written in Google Colab. The notebook may require some modifications if you use a different IDE.

In [None]:
# import libraries and define global settings
import numpy as np
import sympy as sym
import scipy.integrate as spi
import matplotlib.pyplot as plt

# new in this chapter
import sympy.stats
import scipy.stats as stats

# global figure properties (chosen for publication-quality output)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg') # render inline figures in vector format

plt.rcParams['font.size']          = 14     # font size
plt.rcParams['savefig.dpi']        = 300    # resolution of saved figures
plt.rcParams['axes.titlelocation'] = 'left' # left-align the axis titles
plt.rcParams['axes.spines.right']  = False  # no right edge on the axis bounding box
plt.rcParams['axes.spines.top']    = False  # no top edge on the axis bounding box
plt.rcParams['lines.linewidth']    = 2      # thicker default lines

# Exercise 18.1: cdfs from pdfs

In [None]:
xx = np.linspace(-4,4,9001)
dx = xx[1]-xx[0]
sigma = 1

# pdf
pdf = 1/np.sqrt(2*np.pi*sigma**2) * np.exp( -xx**2/(2*sigma**2) )

# cdf via numpy
cdf_np = np.cumsum(pdf*dx)

# cdf via scipy.integrate
cdf_sp = spi.cumulative_simpson(pdf,dx=dx,initial=0)

# cdf via scipy.stats
cdf_st = stats.norm.cdf(xx)

# compare the three cdf estimates
_,axs = plt.subplots(1,2,figsize=(12,4))
axPdf,axCdf = axs
xlims = (xx[0],xx[-1])
sparse = slice(None,None,200) # every 200th sample, so the markers do not hide the line

# panel A: the pdf, scaled by dx
axPdf.plot(xx,pdf*dx,'k')
axPdf.set(xlim=xlims,xlabel='$x$',ylabel='Probability density',title=r'$\bf{A}$)  pdf')

# panel B: numpy estimate as a line, the other two as sparse markers on top
axCdf.plot(xx,cdf_np,'k',label='np.cumsum')
axCdf.plot(xx[sparse],cdf_sp[sparse],'o',color=[.5,.5,.5],markerfacecolor=[.9,.9,.9],label='spi.cumulative_')
axCdf.plot(xx[sparse],cdf_st[sparse],'x',color=[.5,.5,.5],markersize=8,label='stats.norm')
axCdf.set(xlim=xlims,xlabel='$x$',ylabel='Cumulative probability',title=r'$\bf{B}$)  cdfs')
axCdf.legend()

plt.tight_layout()
plt.savefig('stats_ex1.png')
plt.show()

# Exercise 18.2: non-analytic distribution

In [None]:
N = 2000

# Seeded random generator so that the simulated data -- and therefore the
# figure saved from them -- are identical on every Restart & Run All.
# Change (or remove) the seed to explore different random samples.
rng = np.random.default_rng(0)

# individual datasets (half of the total sample size each)
data1 = stats.logistic.rvs(size=N//2,random_state=rng)
data2 = stats.wald.rvs(2,size=N//2,random_state=rng) # wald has no shape parameter, so the 2 is `loc`

# combined dataset: interleave the two sources
# (even indices <- data1, odd indices <- data2)
data = np.zeros(N)
data[::2] = data1
data[1::2] = data2


# visualize
_,axs = plt.subplots(1,3,figsize=(14,4))

# panel A: the combined data in index order
axs[0].plot(data,'ko',alpha=.3)
axs[0].set(xlabel='Data index',ylabel='Data value',title=r'$\bf{A}$)  Simulated data')


colors = [ [.5,.5,.5],[.7,.7,.7],'k']
markers = [ 'o','^',None ]
labels = [ 'Data 1','Data 2','Full data' ]

for idx,D in enumerate((data1,data2,data)):

  # extract the histogram (empirical distribution)
  heights,bins = np.histogram(D,bins='auto')
  binCenters = (bins[:-1]+bins[1:]) / 2

  # estimate the pdf/cdf.
  # Normalize by the number of samples in *this* dataset (not by the size of
  # the combined dataset): the bin probabilities then sum to 1 and every
  # empirical cdf ends at 1, also for the two half-size datasets.
  pdfEst = heights / len(D)
  cdfEst = np.cumsum(pdfEst)

  # the label is attached to the line itself, so the legend cannot get out of sync
  axs[1].plot(binCenters,pdfEst,color=colors[idx],marker=markers[idx],markerfacecolor='w',label=labels[idx])
  axs[2].plot(binCenters,cdfEst,color=colors[idx],marker=markers[idx],markerfacecolor='w',label=labels[idx])


# axis properties are set once, after the loop; the x-axis limits come from
# the bins of the last dataset in the loop (the full data)
axs[1].set(xlim=bins[[0,-1]],xlabel='Data value',ylabel='Probability estimate',title=r'$\bf{B}$)  Histogram (estimated pdf)')
axs[2].set(xlim=bins[[0,-1]],xlabel='Data value',ylabel='Cumulative probability',title=r'$\bf{C}$)  Empirical cdf')

axs[1].legend()
axs[2].legend()

plt.tight_layout()
plt.savefig('stats_ex2.png')
plt.show()

# Exercise 18.3: Triangular pdf from cdf

In [None]:
# triangle parameters: lower bound, peak, and upper bound
a = 1
b = 3
d = 4

xx = np.linspace(a-1,d+1,901)

# regions of the piecewise cdf (piece 1, x<=a, is the default value of 0)
inLeftPiece  = (xx>a) & (xx<=b)  # piece 2
inRightPiece = (xx>b) & (xx<d)   # piece 3
pastUpperEnd = xx>=d             # piece 4

# pick the matching formula for each grid point
cdf = np.select(
    [ inLeftPiece, inRightPiece, pastUpperEnd ],
    [ (xx-a)**2 / ((d-a)*(b-a)),
      1 - ( (d-xx)**2 / ((d-a)*(d-b)) ),
      1. ],
    default=0. )

# plot the cdf
plt.figure(figsize=(10,4))
plt.plot(xx,cdf,'k')

# mark the cdf at the grid point closest to each of the three parameters
keyPoints = (a,b,d)
closestIdx = [ np.argmin(abs(xx-point)) for point in keyPoints ]
for point,idx in zip(keyPoints,closestIdx):
  plt.plot(point,cdf[idx],'ko',markersize=10,markerfacecolor=[.8,.8,.8])

ax = plt.gca()
ax.set(xlim=(xx[0],xx[-1]),xlabel='$x$',ylabel='Cumulative probability',title='Triangular cdf')
plt.show()

In [None]:
# IPython-only syntax (not plain Python): `??` displays the docstring and source code of stats.triang
stats.triang??

In [None]:
# pdf by differentiating the cdf: first differences, with a leading zero
# so that the result has the same length as xx
pdf = np.zeros(len(cdf))
pdf[1:] = np.diff(cdf)

# pdf via scipy (loc = left edge, scale = total width, c = peak position as a fraction of the width)
width = d-a
pdfS = stats.triang.pdf(xx,loc=a,c=(b-a)/width,scale=width)
pdfS = pdfS * (xx[1]-xx[0]) # needs to be scaled by dx


# and plot: the two pdf estimates first...
plt.figure(figsize=(9,3.5))
plt.plot(xx,pdf,'k',label=r'$p_d(x)$')
plt.plot(xx[::20],pdfS[::20],'kd',markerfacecolor=[.5,.5,.5],label=r'$p_s(x)$')

# ...then one marker per parameter: (x, y, marker format, face color, legend label)
paramMarkers = [
    ( a, 0,                         'ks', 'w',        r'$a=%s$' %a ),
    ( b, pdf[np.argmin(abs(xx-b))], 'ko', [.7,.7,.7], r'$b=%s$' %b ),
    ( d, 0,                         'k^', [.3,.3,.3], r'$d=%s$' %d ),
]
for xpos,ypos,fmt,face,lab in paramMarkers:
  plt.plot(xpos,ypos,fmt,markersize=12,markerfacecolor=face,label=lab)

plt.legend()
plt.gca().set(xlim=(xx[0],xx[-1]),xlabel='x',ylabel='Probability density')

plt.tight_layout()
plt.savefig('stats_ex3.png')
plt.show()

# Exercise 18.4: P-value calculations

In [None]:
zval = 2

# 1) p-value as the complement of the cdf
pval = 1 - stats.norm.cdf(zval)

# 2) sample the pdf from zval up to 50 (a finite stand-in for infinity)
N = 1000
zz = np.linspace(zval,50,N)
dx = zz[1]-zz[0]
pdf = stats.norm.pdf(zz)

# from pdf as a sum of pdf*dx
pFromPdf_np = (pdf*dx).sum()

# 3) from pdf using the trapezoid rule in scipy.integrate
pFromPdf_sp = spi.trapezoid(pdf,dx=dx)

# print the results (one line per method)
report = [
  f'From cdf     : {pval}',
  f'From np.sum  : {pFromPdf_np}',
  f'From spi.trap: {pFromPdf_sp}',
]
print('\n'.join(report))

In [None]:
# e.g., the standard-normal pdf at z=50, shown with 40 decimal places
format(stats.norm.pdf(50),'.40f')

# Exercise 18.5: Empirical vs. analytic moments

In [None]:
# analytic pdf, rescaled to a peak of 1
xx = np.linspace(-5,5,555)
pdf = stats.norm.pdf(xx)
pdf = pdf / pdf.max()

# empirical pdf estimate from random data:
# a 40-bin histogram, also rescaled to a peak of 1
sampsize = 5000
data = stats.norm.rvs(size=sampsize)
heights,bins = np.histogram(data,bins=40)
binCenters = .5 * (bins[1:]+bins[:-1])
estPdf = heights / heights.max()


## make the plot
fig = plt.figure(figsize=(6,4))
ax = fig.gca()

# dashed gray line for the analytic curve, solid black line for the histogram estimate
ax.plot(xx,pdf,'--',color=[.7,.7,.7],label='Analytic pdf')
ax.plot(binCenters,estPdf,'k',label='Empirical pdf')

ax.legend()
ax.set(xlim=(xx[0],xx[-1]),xlabel='x',ylabel='Probability (norm.)',title=r'$\bf{A}$)  Normal pdf')

plt.tight_layout()
plt.show()


## calculate moments and print table
# empirical moments from the sample, in the same order as the 'mvsk' request
# (mean, variance, skew, kurtosis)
momentsData = ( data.mean(),data.var(),stats.skew(data),stats.kurtosis(data) )
momentsPdf = stats.norm.stats(moments='mvsk')

# the js code below increases the font size to make the table match the figure better
# https://stackoverflow.com/questions/61957742/how-to-increase-font-size-of-google-colab-cell-output
from IPython.display import Javascript
display(Javascript('''for (rule of document.styleSheets[0].cssRules){if (rule.selectorText=='body') {rule.style.fontSize = '17px'; break}}'''))

# assemble the table rows, then print them in one go
tableRows = [
  f'Source:    Mean   :    Var.   :    Skew   :  Kurtosis',
  f'-----------------------------------------------------',
  f'  Data:   {momentsData[0]:5.2f}   :   {momentsData[1]:5.2f}   :   {momentsData[2]:5.2f}   :   {momentsData[3]:5.2f}',
  f'  pdf :   {momentsPdf[0]:5.2f}   :   {momentsPdf[1]:5.2f}   :   {momentsPdf[2]:5.2f}   :   {momentsPdf[3]:5.2f}',
]
print('\n'.join(tableRows))

In [None]:
# analytic pdf (Pareto with shape parameter b=5), rescaled to a peak of 1
xx = np.linspace(.5,5,555)
pdf = stats.pareto.pdf(xx,b=5)
pdf = pdf / pdf.max()

# empirical pdf estimate from random data:
# a 40-bin histogram, also rescaled to a peak of 1
sampsize = 5000
data = stats.pareto.rvs(b=5,size=sampsize)
heights,bins = np.histogram(data,bins=40)
binCenters = .5 * (bins[1:]+bins[:-1])
estPdf = heights / heights.max()


## make the plot
fig = plt.figure(figsize=(6,4))
ax = fig.gca()

# dashed gray line for the analytic curve, solid black line for the histogram estimate
ax.plot(xx,pdf,'--',color=[.7,.7,.7],label='Analytic pdf')
ax.plot(binCenters,estPdf,'k',label='Empirical pdf')

ax.legend()
ax.set(xlim=(xx[0],xx[-1]),xlabel='x',ylabel='Probability (norm.)',title=r'$\bf{B}$)  Pareto pdf')

plt.tight_layout()
plt.show()


## calculate moments and print table
# empirical moments from the sample, in the same order as the 'mvsk' request
# (mean, variance, skew, kurtosis)
momentsData = ( data.mean(),data.var(),stats.skew(data),stats.kurtosis(data) )
momentsPdf = stats.pareto.stats(b=5,moments='mvsk')

# enlarge the output font so that the table matches the figure better
display(Javascript('''for (rule of document.styleSheets[0].cssRules){if (rule.selectorText=='body') {rule.style.fontSize = '17px'; break}}'''))

# assemble the table rows, then print them in one go
tableRows = [
  f'Source:    Mean   :    Var.   :    Skew   :  Kurtosis',
  f'-----------------------------------------------------',
  f'  Data:   {momentsData[0]:5.2f}   :   {momentsData[1]:5.2f}   :   {momentsData[2]:5.2f}   :   {momentsData[3]:5.2f}',
  f'  pdf :   {momentsPdf[0]:5.2f}   :   {momentsPdf[1]:5.2f}   :   {momentsPdf[2]:5.2f}   :   {momentsPdf[3]:5.2f}',
]
print('\n'.join(tableRows))

# Exercise 18.6: areas

In [None]:
# area calculations
z1 = 1
z2 = 2

# standard-normal cdf at each boundary
cdf_z1 = stats.norm.cdf(z1)
cdf_z2 = stats.norm.cdf(z2)

p_z1z2 = cdf_z2 - cdf_z1   # area between z1 and z2
p_z2inf = 1 - cdf_z2       # upper-tail area from z2 onwards

print('\n'.join([
  f'Area between z = {z1} and z = {z2}: {p_z1z2*100:.2f}%',
  f'Area between z = {z2} and z = oo: {p_z2inf*100:.2f}%',
]))

In [None]:
# create the pdf (scaled by the grid spacing)
zz = np.linspace(-4,4,305)
dz = zz[1]-zz[0]
pdf = stats.norm.pdf(zz) * dz

# masks for the two shaded regions
inMiddle = (zz>=z1) & (zz<=z2)  # between z1 and z2
inTail = zz>=z2                 # from z2 to the end of the grid

# draw the figure
_,axs = plt.subplots(1,figsize=(10,3.5))

# the curve and the two boundaries
axs.plot(zz,pdf,'k',label='pdf')
axs.axvline(z1,color='k',linestyle='--',label=fr'$z_1$ = {z1}')
axs.axvline(z2,color=[.7,.7,.7],linestyle=':',label=fr'$z_2$ = {z2}')

# the shaded areas; their legend entries report the probabilities computed from the cdf
axs.fill_between(zz[inMiddle],pdf[inMiddle],color='k',alpha=.2,label=r'$A([z_1,z_2]) = %.2f \%% $' %(p_z1z2*100))
axs.fill_between(zz[inTail],pdf[inTail],color='k',alpha=.5,label=r'$A([z_2,\infty)) = %.2f \%% $' %(p_z2inf*100))

axs.legend()
axs.set(xlim=(zz[0],zz[-1]),xlabel='z',ylim=[0,1.1*np.max(pdf)],ylabel='Probability density')

plt.tight_layout()
plt.savefig('stats_ex6.png')
plt.show()