# Uncertainty

## Imports

In [1]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


from scipy import stats

In [2]:
df_happiness = (
    pd.read_csv('https://tinyurl.com/worldhappiness2018')
    .dropna()
    .rename(columns = {'happiness_score': 'happiness'})
    .filter(regex = '_sc|country|happ')
)

## Frequentist

In [3]:
model = smf.ols(
    'happiness ~ life_exp_sc + corrupt_sc + gdp_pc_sc',
    data = df_happiness
).fit()

model.conf_int()

model.get_prediction().summary_frame() # both 'confidence' and 'prediction' intervals

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,3.987671,0.132758,3.724521,4.250820,2.736966,5.238375
1,5.496638,0.104065,5.290363,5.702914,4.256653,6.736624
2,5.676520,0.087470,5.503139,5.849901,4.441580,6.911459
3,5.406585,0.107000,5.194492,5.618678,4.165618,6.647552
4,6.966640,0.126756,6.715389,7.217892,5.718385,8.214896
...,...,...,...,...,...,...
107,5.861256,0.077897,5.706850,6.015661,4.628837,7.093674
108,5.290368,0.147161,4.998669,5.582067,4.033347,6.547389
109,5.327998,0.083659,5.162170,5.493825,4.094096,6.561899
110,4.308105,0.101039,4.107828,4.508383,3.069103,5.547107


In [4]:
model.get_prediction().summary_frame()

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,3.987671,0.132758,3.724521,4.250820,2.736966,5.238375
1,5.496638,0.104065,5.290363,5.702914,4.256653,6.736624
2,5.676520,0.087470,5.503139,5.849901,4.441580,6.911459
3,5.406585,0.107000,5.194492,5.618678,4.165618,6.647552
4,6.966640,0.126756,6.715389,7.217892,5.718385,8.214896
...,...,...,...,...,...,...
107,5.861256,0.077897,5.706850,6.015661,4.628837,7.093674
108,5.290368,0.147161,4.998669,5.582067,4.033347,6.547389
109,5.327998,0.083659,5.162170,5.493825,4.094096,6.561899
110,4.308105,0.101039,4.107828,4.508383,3.069103,5.547107


By hand approach

In [5]:
X = model.model.exog

# get the prediction
y_hat = X @ model.params

# get the standard error
se = np.sqrt(np.diag(X @ model.cov_params() @ X.T))

# critical value for 95% confidence
cv = stats.t.ppf(0.975, model.df_resid)

# get the interval
pd.DataFrame({
    'prediction': y_hat,
    'lower': y_hat - cv * se,
    'upper': y_hat + cv * se
}).head()

Unnamed: 0,prediction,lower,upper
0,3.987671,3.724521,4.25082
1,5.496638,5.290363,5.702914
2,5.67652,5.503139,5.849901
3,5.406585,5.194492,5.618678
4,6.96664,6.715389,7.217892


In [6]:
# get the prediction interval
se_pred = np.sqrt(se**2 + model.mse_resid)

pd.DataFrame({
    'prediction': y_hat,
    'lower': y_hat - cv * se_pred,
    'upper': y_hat + cv * se_pred
}).head()

Unnamed: 0,prediction,lower,upper
0,3.987671,2.736966,5.238375
1,5.496638,4.256653,6.736624
2,5.67652,4.44158,6.911459
3,5.406585,4.165618,6.647552
4,6.96664,5.718385,8.214896


In [7]:
model.get_prediction().summary_frame().head()

Unnamed: 0,mean,mean_se,mean_ci_lower,mean_ci_upper,obs_ci_lower,obs_ci_upper
0,3.987671,0.132758,3.724521,4.25082,2.736966,5.238375
1,5.496638,0.104065,5.290363,5.702914,4.256653,6.736624
2,5.67652,0.08747,5.503139,5.849901,4.44158,6.911459
3,5.406585,0.107,5.194492,5.618678,4.165618,6.647552
4,6.96664,0.126756,6.715389,7.217892,5.718385,8.214896


## Monte Carlo

In [8]:
# we'll use the model from the previous section
model = smf.ols(
    'happiness ~ life_exp_sc + corrupt_sc + gdp_pc_sc',
    data = df_happiness
).fit()

def mc_predictions(model, nsim=2500, seed=42):
    np.random.seed(seed)

    params_est = model.params
    params = np.random.multivariate_normal(
        mean = params_est,
        cov = model.cov_params(),
        size = nsim
    )

    sigma = model.mse_resid**.5
    X = model.model.exog

    y_hat = X @ params.T + np.random.normal(scale = sigma, size = (X.shape[0], nsim))

    pred_int = np.quantile(y_hat, q = [.025, .975], axis = 1)

    return pred_int

our_mc = mc_predictions(model)

In [9]:
# Statsmodels Prediction Intervals
prediction_intervals = model.get_prediction().summary_frame()
statsmodels_lower = prediction_intervals['obs_ci_lower']
statsmodels_upper = prediction_intervals['obs_ci_upper']


pd.DataFrame({
    'observed_value': df_happiness['happiness'],
    'prediction': model.fittedvalues,
    'simulated_lower': our_mc[0],
    'simulated_upper': our_mc[1],
    'statsmodels_lower': statsmodels_lower,
    'statsmodels_upper': statsmodels_upper
}).round(3)

Unnamed: 0,observed_value,prediction,simulated_lower,simulated_upper,statsmodels_lower,statsmodels_upper
0,3.632,3.988,2.770,5.197,2.737,5.238
1,4.586,5.497,4.278,6.759,4.257,6.737
2,6.388,5.677,4.451,6.889,4.442,6.911
3,4.321,5.407,4.218,6.666,4.166,6.648
4,7.272,6.967,5.733,8.167,5.718,8.215
...,...,...,...,...,...,...
107,6.379,5.861,4.670,7.090,4.629,7.094
108,6.096,5.290,4.057,6.547,4.033,6.547
109,4.806,5.328,4.126,6.530,4.094,6.562
110,4.377,4.308,3.097,5.531,3.069,5.547


## Bootstrap

In [10]:
def bootstrap(X, y, nboot=100, seed=123):
    # add a column of 1s for the intercept
    X = np.c_[np.ones(X.shape[0]), X]
    N = X.shape[0]

    # initialize
    beta = np.empty((nboot, X.shape[1]))
    
    # beta = pd.DataFrame(beta, columns=['Intercept'] + list(cn))
    mse = np.empty(nboot)    

    # set seed
    np.random.seed(seed)

    for i in range(nboot):
        # sample with replacement
        idx = np.random.randint(0, N, N)
        Xi = X[idx, :]
        yi = y[idx]

        # estimate model
        model = LinearRegression(fit_intercept=False)
        mod = model.fit(Xi, yi)

        # save results
        beta[i, :] = mod.coef_
        mse[i] = np.sum((mod.predict(Xi) - yi)**2) / N

    # given mean estimates, calculate MSE
    y_hat = X @ beta.mean(axis=0)
    final_mse = np.sum((y - y_hat)**2) / N

    output = {
        'par': beta,
        'mse': mse,
        'final_mse': final_mse
    }

    return output

our_boot = bootstrap(
    X = df_happiness[['life_exp_sc', 'corrupt_sc', 'gdp_pc_sc']],
    y = df_happiness['happiness'],
    nboot = 1000
)

In [11]:
np.percentile(our_boot['par'], 2.5, axis=0)

array([ 5.34092479,  0.27665819, -0.29543964,  0.19114177])

In [12]:
pd.DataFrame({
    'param': ['Intercept', 'life_exp_sc', 'corrupt_sc', 'gdp_pc_sc'],
    'mean': our_boot['par'].mean(axis=0),
    'lwr': np.percentile(our_boot['par'], 2.5, axis=0),
    'upr': np.percentile(our_boot['par'], 97.5, axis=0)
})

Unnamed: 0,param,mean,lwr,upr
0,Intercept,5.451703,5.340925,5.572782
1,life_exp_sc,0.511917,0.276658,0.754842
2,corrupt_sc,-0.106482,-0.29544,0.080125
3,gdp_pc_sc,0.459829,0.191142,0.776553


## Bayesian

In [13]:
from scipy.stats import beta

pk = np.array([
    'goal','goal','goal','miss','miss',
    'goal','goal','miss','goal','goal'
])

# convert to numeric, arbitrarily picking goal=1, miss=0
N = len(pk)                     # sample size
n_goal = np.sum(pk == 'goal')   # number of pk made
n_miss = np.sum(pk == 'miss')   # number of those miss

# grid of potential theta values
theta = np.linspace(1 / (N + 1), N / (N + 1), 10)

### prior distribution
# beta prior with mean = .5, but fairly diffuse
# examine the prior
# theta = beta.rvs(5, 5, size = 1000)
# plt.hist(theta, bins = 20, color = 'lightblue')
p_theta = beta.pdf(theta, 5, 5)

# Normalize so that values sum to 1
p_theta = p_theta / np.sum(p_theta)

# likelihood (binomial)
p_data_given_theta = np.math.comb(N, n_goal) * theta**n_goal * (1 - theta)**n_miss

# posterior (combination of prior and likelihood)
# marginal probability of the data used for normalization
p_data = np.sum(p_data_given_theta * p_theta) 

p_theta_given_data = p_data_given_theta * p_theta / p_data  # Bayes theorem

# final estimate
theta_est = np.sum(theta * p_theta_given_data)
theta_est

  p_data_given_theta = np.math.comb(N, n_goal) * theta**n_goal * (1 - theta)**n_miss


0.599999996503221

## Conformal

In [14]:
def split_conformal(X, y, new_data, alpha = .05, calibration_split = .5):
    # Splitting the data into training and calibration sets
    X_train, X_cal, y_train, y_cal = train_test_split(
        X, 
        y,
        test_size = calibration_split,
        random_state = 123
    )

    N = X_train.shape[0]

    # Train the base model
    model = LinearRegression().fit(X_train, y_train)

    # Calculate residuals on calibration set
    cal_preds = model.predict(X_cal)
    residuals = np.abs(y_cal - cal_preds)

    # Sort residuals and find the quantile corresponding to (1-alpha)
    residuals = np.sort(residuals)

    # The correction here is useful for small sample sizes
    quantile  = np.quantile(residuals, (1 - alpha) * (N / (N + 1)))

    # Make predictions on new data and calculate prediction intervals
    preds = model.predict(new_data)
    lower_bounds = preds - quantile
    upper_bounds = preds + quantile

    # Return predictions and prediction intervals
    return {
        'cp_error': quantile,
        'preds': preds,
        'lower_bounds': lower_bounds,
        'upper_bounds': upper_bounds
    }


In [34]:
# split data
from sklearn.model_selection import train_test_split

X = df_happiness[['life_exp_sc', 'corrupt_sc', 'gdp_pc_sc']]
y = df_happiness['happiness']

X_train, X_test, y_train, y_test = train_test_split(
    df_happiness[['life_exp_sc', 'corrupt_sc', 'gdp_pc_sc']],
    df_happiness['happiness'],
    test_size = 0.5,
    random_state = 123
)

our_cp_error = split_conformal(
    X_train,
    y_train,
    X_test,
    alpha = .1
)

print(our_cp_error['cp_error'])

pd.DataFrame({
    'preds': our_cp_error['preds'],
    'lower_bounds': our_cp_error['lower_bounds'],
    'upper_bounds': our_cp_error['upper_bounds']
}).head()

1.0366496877019815


Unnamed: 0,preds,lower_bounds,upper_bounds
0,6.217809,5.181159,7.254459
1,4.269099,3.23245,5.305749
2,4.7596,3.72295,5.796249
3,5.227053,4.190403,6.263703
4,4.287107,3.250457,5.323756


In [35]:
from mapie.regression import MapieRegressor

model = MapieRegressor(LinearRegression(), method = 'base', random_state=123)
y_pred, y_pis = model.fit(X_train, y_train).predict(X_test, alpha = 0.1)

# take the first difference between upper and lower bounds,
# since it's constant for all predictions in this setting

cp_error = (y_pis[0, 1, 0] - y_pis[0, 0, 0]) / 2  

In [36]:
y_pis[:5].reshape(-1, 2)

array([[5.39702408, 7.54753509],
       [3.24052412, 5.39103513],
       [3.65681605, 5.80732707],
       [4.28406556, 6.43457658],
       [3.24132564, 5.39183665]])

In [37]:
our_cp_error['cp_error'], cp_error

(1.0366496877019815, 1.075255506898503)