|<h2>Substack post:</h2>|<h1><a href="https://asdf" target="_blank">Confidence intervals (parts 1, 2, and 3)</a></h1>|
|-|:-:|
|<h2>Teacher:</h2>|<h1>Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h1>|

<br>

<i>Using the code without reading the post may lead to confusion or errors.</i>

In [None]:
# import libraries and define global settings
import numpy as np
import scipy.stats as stats
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
### Run this cell only if you're using "dark mode"

# render inline figures as SVG (higher-res than the default)
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

# the theme uses one background colour and one foreground colour
darkBG  = '#171717'
lightFG = '#DDE2F4'

# backgrounds of the figure and the axes
darkTheme = { key:darkBG for key in ('figure.facecolor','figure.edgecolor','axes.facecolor') }

# axis lines, labels, ticks, and all other text
darkTheme.update({ key:lightFG for key in ('axes.edgecolor','axes.labelcolor',
                                            'xtick.color','ytick.color','text.color') })

# hide the right/top spines and use bold titles and axis labels
darkTheme.update({
    'axes.spines.right': False,
    'axes.spines.top':   False,
    'axes.titleweight': 'bold',
    'axes.labelweight': 'bold',
})

plt.rcParams.update(darkTheme)

<h1>The code below is for <b>part 1</b> (interpretation and equation)</h1>
<h3>Scroll down for parts 2 and 3</h3>


In [None]:
# the data: one mean and one error-bar length per condition
condLabels = ['A','B','C','D']
means  = [6.6,7,5,0]
eSizes = [6,1,6,1]
xpos   = range(len(means))

# the plot: square markers with capped error bars and no connecting line
plt.figure(figsize=(4,3))
plt.errorbar(xpos,means,yerr=eSizes,marker='s',linestyle='None',capsize=5,
             color=[.9,.7,.7],markerfacecolor=(.9,.7,.9))

# dashed reference line at zero, drawn behind the data
plt.axhline(y=0,color=(.7,.7,.7),linestyle='--',zorder=-1)

plt.gca().set(xticks=xpos,xticklabels=condLabels,xlim=[-.5,3.5],
              xlabel='Condition',ylabel='Data value')

plt.tight_layout()
plt.show()

# Analytic confidence interval

In [None]:
# critical t-value for a two-sided confidence interval
conflevel = .95
n = 20

tailProb = (1-conflevel)/2  # probability mass left in each tail
dof = n-1                   # degrees of freedom

# isf = inverse survival function: the t-value with tailProb above it
tStar = stats.t.isf(tailProb,dof)
print(tStar)

In [None]:
# simulation parameters
mean = 2.3
stdev = 3.2
N = 48
conflevel = .95

# quantities shared by both computations
sem = stdev/np.sqrt(N)  # standard error of the mean
dof = N-1               # degrees of freedom

# confidence interval from formula: mean +/- t* x SEM
tStar = stats.t.isf((1-conflevel)/2,dof)
halfWidth = tStar*sem
conf_int_me = [ mean-halfWidth, mean+halfWidth ]

# confidence interval from scipy (takes the confidence level directly)
conf_int_sp = stats.t.interval(conflevel,dof,loc=mean,scale=sem)

# the two results should match
print(conf_int_me)
print(conf_int_sp)

# Confidence interval vs. standard deviation

In [None]:
# parameters
sampleSizes = [50,2000]
confLevel = .95

_,axs = plt.subplots(2,1,figsize=(8,5))


# loop over the two sample sizes (one panel per sample size)
for ax,N in zip(axs,sampleSizes):

  # generate a random sample of size N (normal with mean 0 and stdev 2)
  data = np.random.normal(0,2,N)

  # mean and standard deviation (ddof=1 for the sample estimate)
  mean = np.mean(data)
  stdev = np.std(data,ddof=1)

  # confidence interval of the mean, based on the standard error
  stderr = stdev / np.sqrt(len(data))
  conf_interval = stats.t.interval(confLevel, N-1, loc=mean, scale=stderr)


  # Plot the histogram
  ax.hist(data,bins='fd',color=[.7,.7,.9,.3])

  # Plot the mean
  ax.axvline(mean,color='w',linewidth=3,label='Mean')

  # one standard deviation on either side of the mean
  ax.axvline(mean-stdev,linestyle=':',color=(.7,.9,.7),linewidth=2,label='1 std')
  ax.axvline(mean+stdev,linestyle=':',color=(.7,.9,.7),linewidth=2)

  # Plot the confidence interval (the label follows confLevel, e.g. '95% CI')
  ax.axvline(conf_interval[0],linestyle='--',color=(.9,.3,.3),linewidth=2,label=f'{confLevel:.0%} CI')
  ax.axvline(conf_interval[1],linestyle='--',color=(.9,.3,.3),linewidth=2)

  ax.set(xlim=[-6,6],ylabel='Count',title=f'Sample size = {N}')
  ax.legend()

# only the bottom panel needs an x-axis label
axs[-1].set_xlabel('Data value')

plt.tight_layout()
# distinct file name: the sample-size sweep figure is also saved as 'ci.png',
# which would silently overwrite this figure when the notebook is run top to bottom
plt.savefig('ci_histograms.png',dpi=300)
plt.show()

In [None]:
# a range of sample sizes
confLevel = .95
sampleSizes = np.arange(5,2000,42)

# initialize output matrix
#   row 0: width of the mean +/- 1 stdev band (i.e., 2 x stdev)
#   row 1: width of the confidence interval
results = np.zeros((2,len(sampleSizes)))

# loop over all sample sizes
for idx,N in enumerate(sampleSizes):

  # random sample of size N (standard normal scaled by 2)
  data = 2*np.random.randn(N)

  # sample standard deviation and standard error of the mean
  stdev = np.std(data,ddof=1)
  stderr = stdev / np.sqrt(len(data))

  # lower and upper confidence bounds around the sample mean
  ciLow,ciHigh = stats.t.interval(confLevel, N-1, loc=np.mean(data), scale=stderr)

  # store both widths for this sample size
  results[:,idx] = 2*stdev, ciHigh-ciLow


# visualize! (one line per row of the results matrix)
plt.figure(figsize=(10,4))
lineSpecs = [ ('wo-',[.7,.9,.7],'Standard deviation'),
              ('ws-',[.9,.7,.7],'Confidence interval') ]
for widths,(fmt,faceColor,lineLabel) in zip(results,lineSpecs):
  plt.plot(sampleSizes,widths,fmt,linewidth=.4,markerfacecolor=faceColor,markersize=8,label=lineLabel)

plt.gca().set(xlabel='Sample size',ylabel='Stdev or C.I. (log)',yscale='log')

plt.legend()
plt.tight_layout()
plt.savefig('ci.png',dpi=300)
plt.show()

<h1>The code below is for <b>part 2</b> (bootstrapping methods)</h1>
<h3>Keep scrolling for part 3</h3>


In [None]:
# intro to resampling

S = [1,2,3,4]

# table header, followed by the original sample and its mean
print('    Sample    |  Mean')
print('----------------------')
print(f'{S}  |  {np.mean(S):.2f}')

# five bootstrap resamples, one table row each
nDemoResamples = 5
for _ in range(nDemoResamples):

  # draw len(S) values from S with replacement
  # note: replace=True is the default setting; it is written out to emphasize its importance.
  resample = np.random.choice(S,size=len(S),replace=True)

  # print the sorted resample and its mean
  print(f'{sorted(resample.tolist())}  |  {resample.mean():.2f}')

In [None]:
# demo in a larger dataset

# parameters of the lognormal distribution
samplesize = 500
mu = 0
sig = .2

# data, and the expected average of a lognormal: exp(mu + sig^2/2)
dataset = np.random.lognormal(mean=mu,sigma=sig,size=samplesize)
expMean = np.exp(mu+sig**2/2)

# sample statistics (we'll need these later)
samplemean = dataset.mean()
samplestd  = dataset.std(ddof=1)

# histogram of the data with both means marked
plt.figure(figsize=(10,3))
plt.hist(dataset,bins='fd',color=[.7,.7,.9],edgecolor='k',label='Data histogram')
plt.axvline(samplemean,color='g',linewidth=3,label='Sample mean')
plt.axvline(expMean,color='r',linestyle='--',linewidth=3,label='Expected mean')

plt.gca().set(title=f'Data histogram (N = {samplesize})',xlabel='Data value',ylabel='Count')

plt.legend()
plt.show()

In [None]:
# number of resamples
numBoots = 1000

# pre-allocate the vector of bootstrapped means
bootmeans = np.zeros(numBoots)

## now for bootstrapping
for booti in range(numBoots):

  # resample the dataset (np.random.choice samples with replacement by default)
  # and store the mean of that resample
  bootmeans[booti] = np.random.choice(dataset,size=samplesize).mean()


# Coding note: the explicit for-loop above is for procedural clarity. A list-comprehension is more compact:
#bootmeans = [np.mean(np.random.choice(dataset,samplesize)) for booti in range(numBoots)]


# empirical confidence interval (hard-coded to 95%): the 2.5th and 97.5th percentiles
confintB = np.percentile(bootmeans,[2.5,97.5])
confintB

In [None]:
# graph everything
fig,axs = plt.subplots(1,2,figsize=(10,3))

for a in axs:

  # the histogram(s): data + bootstrap means on the left, bootstrap means only on the right.
  # h always refers to a histogram drawn on *this* axis, so that ytop matches what is shown
  # (previously the right panel reused the peak of the left panel's data histogram).
  if a is axs[0]:
    h = a.hist(dataset,bins='fd',color=[.7,.7,.9,.3],label='Data histogram')
    a.hist(bootmeans,bins='fd',color=[.7,.3,.7,.5],label='Bootstrap means')
  else:
    h = a.hist(bootmeans,bins='fd',color=[.7,.3,.7,.5],label='Bootstrap means')
  ytop = np.max(h[0]) # convenient variable for histogram peak value

  # confidence interval area
  a.fill_between([confintB[0],confintB[1]],[0,0],[ytop,ytop],color=[.7,.9,.7,.2],label=f'{95}% CI region')

  # lines indicating the expected and the sample mean
  a.plot([expMean,expMean],[0,ytop*1.1],'r--',linewidth=2,label='Expected mean')
  a.plot([samplemean,samplemean],[0,ytop],'g',linewidth=2,label='Sample mean')

  # some more adjustments
  a.legend()
  a.set(yticks=[],xlabel='Data values',ylabel='Count (a.u.)')


# zoom the right panel to one CI width on either side of the CI
# (scalar width instead of np.diff, which would pass 1-element arrays as axis limits)
ciWidth = confintB[1]-confintB[0]
axs[1].set_xlim([confintB[0]-ciWidth,confintB[1]+ciWidth])
plt.tight_layout()
plt.show()

In [None]:
## compare against the analytic confidence interval

# compute confidence intervals (again, hard-coding to 95%)
# note: stats.t.interval takes the confidence level itself (.95), not the
# upper quantile (.975); passing .975 would produce a 97.5% interval.
confintA = stats.t.interval(.95,samplesize-1,
                           loc=samplemean,scale=samplestd/np.sqrt(samplesize))

# empirical = bootstrap percentiles; analytic = t-distribution formula
print(f'Empirical CI(95%) = ({confintB[0]:.3f},{confintB[1]:.3f})')
print(f'Analytic  CI(95%) = ({confintA[0]:.3f},{confintA[1]:.3f})')

In [None]:
# impact of the number of resamples on the CI widths
# 100 logarithmically spaced resample counts from 10 to 10,000 (truncated to integers)
bootstrap_sizes = np.logspace(np.log10(10),np.log10(10_000),100).astype(int)

ci_by_bootSize = np.zeros(len(bootstrap_sizes))

# run the experiment
# Cell-local names are used on purpose: reusing numBoots/bootmeans here would overwrite
# the values computed in the earlier bootstrapping cell, so re-running a cell that
# plots bootmeans would silently show different data.
for i,nResamples in enumerate(bootstrap_sizes):

  # bootstrap distribution of the mean for this number of resamples
  resampledMeans = [ np.mean(np.random.choice(dataset,samplesize)) for _ in range(nResamples) ]

  # width of the 95% percentile interval
  ciLow,ciHigh = np.percentile(resampledMeans,[2.5,97.5])
  ci_by_bootSize[i] = ciHigh-ciLow

In [None]:
plt.figure(figsize=(10,3))

# reference value: the CI width at the largest number of bootstrap samples
finalWidth = ci_by_bootSize[-1]

# colour each point by its distance from the reference, min-max scaled to [0,1]
color = np.abs(ci_by_bootSize - finalWidth)
color = (color-color.min())/(color.max()-color.min())

plt.scatter(bootstrap_sizes,ci_by_bootSize,marker='h',s=80,alpha=.7,
            edgecolor='w',linewidth=.3,c=mpl.cm.plasma_r(color))

# thin dashed line at the reference width, drawn behind the points
plt.axhline(y=finalWidth,color=(.7,.7,.7),linestyle='--',zorder=-1,linewidth=.5)

plt.gca().set(xscale='log',xlabel='Number of bootstrap samples',ylabel='Confidence interval width',
              title='Impact of bootstrap size on confidence interval')

plt.show()

### Empirical confidence interval bounds on the standard deviation

In [None]:
numBoots = 1000

samplestd = np.std(dataset,ddof=1)

# using list comprehension: the standard deviation of each bootstrap resample
bootstds = [ np.std(np.random.choice(dataset,samplesize),ddof=1) for booti in range(numBoots) ]

# find confidence intervals (hard-coded to 95%)
confintB = np.percentile(bootstds,[2.5,97.5])


# graph everything
fig,ax = plt.subplots(1,figsize=(8,4))

# keep the histogram output: its bin counts define the peak height of *this* plot
# (without the assignment, ytop would be taken from a histogram left over from another cell)
h = ax.hist(bootstds,bins='fd',color=[.7,.3,.7,.5],label='Bootstrap stdevs')
ytop = np.max(h[0]) # convenient variable for histogram peak value

# confidence interval area
ax.fill_between([confintB[0],confintB[1]],[0,0],[ytop,ytop],color=[.7,.9,.7,.2],label=f'{95}% CI region')

# line indicating the sample standard deviation
ax.plot([samplestd,samplestd],[0,ytop],'g',linewidth=2,label='Sample stdev.')

# some more adjustments: the x-axis spans one CI width on either side of the CI
# (scalar width instead of np.diff, which would pass 1-element arrays as axis limits)
ciWidth = confintB[1]-confintB[0]
ax.legend()
ax.set(yticks=[],xlabel='Standard deviation values',ylabel='Count (a.u.)',
       xlim=[confintB[0]-ciWidth,confintB[1]+ciWidth])

plt.tight_layout()
plt.show()

<h1>The code below is for <b>part 3</b> (explorations in real data)</h1>

In [None]:
# dataset reference: https://archive.ics.uci.edu/ml/datasets/Arrhythmia

# import data: only the first five columns are read, and they are named explicitly
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
colNames = ['age','sex','height','weight','qrs']
df = pd.read_csv(url,names=colNames,usecols=np.arange(len(colNames)))

# inspect: summary statistics per column
df.describe()

In [None]:
# 1) start from a copy of the original data matrix
df_z = df.copy()

# 2) z-score every column except 'sex' (that column keeps its original values)
for col in df_z.columns:
  if col=='sex':
    continue
  df_z[col] = (df[col] - df[col].mean()) / df[col].std(ddof=1)

# 3) remove extreme values in the original data
zThresh = 3.29 # p<.001
isExtreme = df_z.abs() > zThresh  # both tails

# the cleaned copy has NaN wherever the z-score exceeds the threshold
df_clean = df.copy()
df_clean[isExtreme] = np.nan

In [None]:
_,axs = plt.subplots(1,2,figsize=(12,3))

# left panel: raw data; right panel: data after extreme-value removal
panels = [ (df,'Box plots of raw data'), (df_clean,'Box plots of cleaned data') ]

for ax,(frame,panelTitle) in zip(axs,panels):
  sns.boxplot(data=frame,linecolor='w',linewidth=.5,ax=ax,
              flierprops={'markerfacecolor':'m','markeredgecolor':'w'})
  ax.set(xlabel='Variable',ylabel='Value',title=panelTitle)

plt.tight_layout()
plt.show()

In [None]:
# the methods mean(), std(), and count() in Pandas exclude NaN's.

def meanAndCI(series):
  """Return (mean, half-width of the 95% confidence interval of the mean) for a pandas Series."""
  n = series.count()
  halfWidth = stats.t.ppf(.975,n-1) * series.std(ddof=1)/np.sqrt(n)
  return series.mean(), halfWidth


# one row per variable; column 0 = original data, column 1 = cleaned data
confints = np.zeros((len(df.columns),2))

for i,col in enumerate(df.columns):

  # original data
  mean,ci = meanAndCI(df[col])
  print(f'{col:>6} initial: {mean:6.2f} +/- {ci:.2f}')
  confints[i,0] = ci

  # repeat for cleaned
  mean,ci = meanAndCI(df_clean[col])
  print(f'{col:>6} cleaned: {mean:6.2f} +/- {ci:.2f}\n')
  confints[i,1] = ci

In [None]:
plt.figure(figsize=(5,3))

# thin dashed connector between the pre- and post-cleaning value of each variable
for i,(pre,post) in enumerate(confints):
  plt.plot([i,i],[pre,post],'w--',linewidth=.5)

# one marker per variable: squares before cleaning, circles after
preClean,postClean = confints[:,0],confints[:,1]
plt.plot(preClean,'ks',markerfacecolor=[.9,.7,.7],markersize=12,label='Pre-clean')
plt.plot(postClean,'ko',markerfacecolor=[.7,.7,.9],markersize=12,label='Post-clean')

# label the x-axis with the variable names
plt.gca().set(xticks=np.arange(len(confints)),xticklabels=df.columns,
              xlabel='Variable',ylabel='Confidence interval',title='Impact of cleaning on confidence interval')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# sample correlation in the original data (off-diagonal entry of the 2x2 correlation matrix)
corrMatrix = df[['weight','qrs']].corr()
observedR = corrMatrix.loc['weight','qrs']

# scatter plot of the two variables, with the correlation in the title
plt.figure(figsize=(6,3))
plt.plot(df['weight'],df['qrs'],'wh',markerfacecolor=[.7,.9,.7,.5])
plt.gca().set(title=f'Correlation = {observedR:.3f}',
              xlabel='Weight',ylabel='QRS duration (ms)')

plt.show()

In [None]:
# pre-allocate a vector for the bootstrapped correlation coefficients
# (numBoots keeps the value set in the earlier bootstrapping cell)
bootrs = np.zeros(numBoots)

## now for bootstrapping
nRows = len(df)
for booti in range(numBoots):

  # resample whole rows with replacement, keeping the original sample size
  bootsample = df.sample(n=nRows,replace=True)

  # correlation between weight and QRS duration in this resample
  bootrs[booti] = bootsample[['weight','qrs']].corr().loc['weight','qrs']


# 95% confidence interval: 2.5th and 97.5th percentiles of the bootstrap distribution
confintB = np.percentile(bootrs,[2.5,97.5])
confintB

In [None]:
_,ax = plt.subplots(1,figsize=(6,3))

# histogram of the bootstrapped correlations; its tallest bin sets the height of the overlays
h = ax.hist(bootrs,bins='fd',color=[.7,.3,.7,.5],label='Bootstrap $r$')
binCounts = h[0]
ytop = np.max(binCounts)

# confidence interval area
ciLow,ciHigh = confintB[0],confintB[1]
ax.fill_between([ciLow,ciHigh],[0,0],[ytop,ytop],color=[.7,.9,.7,.2],label=f'{95}% CI region')

# vertical lines at r=0 (the null-hypothesis value) and at the observed correlation
ax.plot([0,0],[0,ytop*1.1],'r:',linewidth=2,label='H$_0$ value')
ax.plot([observedR,observedR],[0,ytop],'w--',linewidth=2,label='Observed correlation')

# some more adjustments
ax.legend(fontsize=9)
ax.set(xlim=[-.3,.5],yticks=[],xlabel='Data values',ylabel='Count (a.u.)')
plt.tight_layout()
plt.show()