In [9]:
import numpy as np
import pandas as pd
import scipy.stats as st

# Seed NumPy's global random state so the np.random.choice resampling
# further down draws the same bootstrap indices on every run.
np.random.seed(42)

def corr(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Return the sample correlation coefficient between ``x`` and ``y``.

    Both inputs are centered on their own mean; the result is the dot
    product of the centered vectors divided by the square root of the
    product of their sums of squares.
    """
    dx = x - x.mean()
    dy = y - y.mean()

    cross_product = np.dot(dx, dy)
    scale = np.sqrt(np.sum(dx**2) * np.sum(dy**2))

    return cross_product / scale

def corr2(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Return the row-by-row correlation between ``A`` and ``B``.

    Row ``i`` of ``A`` is paired with row ``i`` of ``B``; the output holds
    one coefficient per row.
    """
    # Center every row on its own mean.
    A_centered = A - A.mean(axis=1, keepdims=True)
    B_centered = B - B.mean(axis=1, keepdims=True)

    # Per-row cross products (numerator of the correlation).
    cross = np.einsum('ij,ij->i', A_centered, B_centered)

    # Per-row sums of squares (used in the denominator).
    sum_sq_A = np.einsum('ij,ij->i', A_centered, A_centered)
    sum_sq_B = np.einsum('ij,ij->i', B_centered, B_centered)

    return cross / np.sqrt(sum_sq_A*sum_sq_B)

def cis(boot: np.ndarray, theta_hat: float, se_boot: float, alpha: float = 0.05):
    """Build three ``1 - alpha`` bootstrap confidence intervals.

    Args:
        boot: Bootstrap replications of the statistic.
        theta_hat: Point estimate of the statistic.
        se_boot: Standard error used for the normal interval.
        alpha: Total tail probability (default 0.05).

    Returns:
        A tuple ``(normal_ci, percentile_ci, pivotal_ci)``, each of which
        is a ``(lower, upper)`` pair.
    """
    tail = alpha/2

    # Empirical tail quantiles of the bootstrap replications.
    lower_q = np.quantile(boot, tail)
    upper_q = np.quantile(boot, 1 - tail)

    # Normal interval: point estimate +/- z * standard error.
    margin = st.norm.ppf(1 - tail)*se_boot
    normal_ci = (theta_hat - margin, theta_hat + margin)

    # Percentile interval: the bootstrap quantiles themselves.
    percentile_ci = (lower_q, upper_q)

    # Pivotal interval: the quantiles reflected around the point estimate.
    doubled = 2*theta_hat
    pivotal_ci = (doubled - upper_q, doubled - lower_q)

    return normal_ci, percentile_ci, pivotal_ci

# Load the sample and pull the two columns out as NumPy arrays.
data = pd.read_csv('lsat-gpa.csv')
X, Y = data['LSAT'].to_numpy(), data['GPA'].to_numpy()

# Point estimate: correlation computed on the observed sample.
theta_hat = corr(X, Y)
print(theta_hat)

# Bootstrap: draw B index sets of size n with replacement, one per row,
# and use them to resample X and Y jointly.
B, n = 1000, len(X)
idx = np.random.choice(np.arange(n), size=(B, n), replace=True)
xx, yy = X[idx], Y[idx]

# One correlation per bootstrap resample.
theta_boot = corr2(xx, yy)

# Bootstrap standard error: spread of the replicated correlations.
se_boot = theta_boot.std()
print(se_boot)

# Print the normal, percentile and pivotal intervals, in that order.
for ci in cis(theta_boot, theta_hat, se_boot):
    print(ci)

0.5459189161795885
0.19260145776428217
(0.16842699559168312, 0.923410836767494)
(0.19331875643839896, 0.9387076980820954)
(0.15313013427708166, 0.8985190759207782)
