# <center>Class 11: Modelling Probabilities</center>

In [None]:
import os
import sys
import warnings
from typing import List
import copy

import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import math

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import log_loss

from patsy import dmatrices
from stargazer.stargazer import Stargazer
from utils import lspline

import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

In [None]:
%matplotlib inline

## Data - Health

In [None]:
# Build the data path with os.path.join so the separators match your operating system.
path = os.path.join(os.pardir, 'data', 'share-health.csv')
path

In [None]:
# Load the CSV found at `path` into the working DataFrame.
df_health = pd.read_csv(path)

In [None]:
# Overview of columns, dtypes and non-null counts.
df_health.info()

In [None]:
# Peek at the first six rows.
df_health.head(6)

In [None]:
# Summary statistics of the raw years-of-education column.
df_health["eduyears_mod"].describe()

#### Feature Engineering & EDA

In [None]:
# "healthy" indicator derived from `sphus`:
#   1.0  if sphus is 1 or 2,
#   0.0  for any other value in (0, 5],
#   NaN  for everything else (out-of-range or missing sphus).
# The column is built as float in one step instead of writing NaN into an
# integer column afterwards, which relies on an implicit dtype upcast that
# pandas has deprecated.
df_health["healthy"] = np.select(
    [
        (df_health["sphus"] == 1) | (df_health["sphus"] == 2),
        (df_health["sphus"] > 0) & (df_health["sphus"] <= 5),
    ],
    [1.0, 0.0],
    default=np.nan,
)

In [None]:
# Number of missing values per column.
df_health.isna().sum()

In [None]:
# Drop every row that has a missing value in any column.
df_health = df_health.dropna()

In [None]:
# Row counts per survey wave, ordered by wave number.
df_health["wave"].value_counts().sort_index()

In [None]:
# Wave dummies: baseline marks wave 4 rows, endline marks wave 6 rows (1 if match, else 0).
df_health["baseline"] = (df_health["wave"] == 4).astype("int64")
df_health["endline"] = (df_health["wave"] == 6).astype("int64")

In [None]:
# How many rows are baseline (wave 4) observations?
df_health["baseline"].value_counts()

In [None]:
# How many rows are endline (wave 6) observations?
df_health["endline"].value_counts()

In [None]:
# Outcome: is the person healthy at endline?
# `temp` holds the 0/1 health status on endline rows and NaN on all other rows.
df_health["temp"] = np.where(
    df_health["endline"] == 1, np.where(df_health["healthy"] == 1, 1, 0), np.nan
)
# Broadcast the endline value to every row of the same person (`mergeid`).
# The string alias "max" skips NaN and avoids passing a NumPy callable
# (np.nanmax) to groupby.transform, which pandas has deprecated.
df_health["stayshealthy"] = df_health.groupby("mergeid")["temp"].transform("max")
df_health = df_health.drop("temp", axis=1)

In [None]:
# (rows, columns) after adding the outcome variable.
df_health.shape

In [None]:
# Distribution of the outcome variable.
df_health["stayshealthy"].value_counts()

In [None]:
# keep only rows whose endline health outcome is a valid 0 or 1 (i.e. non-missing)
df_health = df_health[df_health["stayshealthy"].isin([0, 1])]

In [None]:
# restrict to baseline rows; the endline outcome is already attached to them
df_health = df_health[df_health["baseline"] == 1]

In [None]:
# restrict to people aged 50 to 60 (both ends included) at baseline
df_health = df_health[df_health["age"].between(50, 60)]

In [None]:
# restrict to individuals who are healthy at baseline
df_health = df_health[df_health["healthy"] == 1]

In [None]:
# Turn the smoking variables into 0/1 indicators:
# the code 5 is recoded to 0, then only rows with a 0 or 1 are kept
# (any other code is treated as missing and dropped).
df_health["smoking"] = df_health["smoking"].replace(5, 0)
df_health = df_health[df_health["smoking"].isin([0, 1])]

df_health["ever_smoked"] = df_health["ever_smoked"].replace(5, 0)
df_health = df_health[df_health["ever_smoked"].isin([0, 1])]

In [None]:
# Exercise indicator from `br015`: 1 when the code is 1, 0 for any other
# positive code, NaN otherwise.
df_health["exerc"] = np.select(
    [
        df_health["br015"] == 1,
        (df_health["br015"] > 0) & (df_health["br015"] != 1),
    ],
    [1, 0],
    default=np.nan,
)
df_health["exerc"].value_counts()

In [None]:
# Negative BMI values are treated as missing.
df_health["bmi"] = df_health["bmi"].astype("float64").mask(df_health["bmi"] < 0)

df_health["bmi"].describe().round(2)

In [None]:
# One chained step:
#  - rename the income column to `income10`,
#  - `married` = 1 when mar_stat is 1 or 2, else 0,
#  - `eduyears` = eduyears_mod with negative values set to NaN,
#  - drop the raw eduyears_mod column.
df_health = (
    df_health.rename(columns={"income_pct_w4": "income10"})
    .assign(
        married=lambda d: np.where(d["mar_stat"].isin([1, 2]), 1, 0),
        eduyears=lambda d: np.where(d["eduyears_mod"] < 0, np.nan, d["eduyears_mod"]),
    )
    .drop(columns="eduyears_mod")
)

df_health["eduyears"].describe().round(2)

In [None]:
# Keep only rows where BMI, years of education and exercise are all observed.
df_health = df_health.dropna(subset=["bmi", "eduyears", "exerc"])

In [None]:
# Re-check columns, dtypes and non-null counts after the sample restrictions.
df_health.info()

In [None]:
# Descriptive statistics of the outcome and the main covariates,
# one variable per row, rounded to three decimals.
df_health.loc[
    :,
    [
        "stayshealthy",
        "smoking",
        "ever_smoked",
        "female",
        "age",
        "income10",
        "eduyears",
        "bmi",
        "exerc",
    ],
].describe().transpose().round(3)

In [None]:
# Number of observations at each value of income10, ordered by value.
df_health["income10"].value_counts().sort_index()

**Question**: What kind of metric is this above?

In [None]:
# Bar chart of the same frequency table; the trailing semicolon suppresses the text output of the plot call.
df_health.income10.value_counts().sort_index().plot(kind = 'bar', xlabel = 'income deciles', ylabel = 'frequency');

In [None]:
# Outcome counts by country: rows are countries, columns are the 0/1 outcome.
pd.crosstab(index=df_health["country"], columns=df_health["stayshealthy"])

## Simple Linear Probability Models (LPM)

**smokers vs non-smokers**

In [None]:
# Two simple linear probability models estimated by OLS with HC1 robust standard errors:
# lpm1 uses current smoking only; lpm2 adds the ever-smoked indicator.
lpm1 = smf.ols(formula="stayshealthy ~ smoking", data=df_health).fit(cov_type="HC1")
lpm2 = smf.ols(
    formula="stayshealthy ~ smoking + ever_smoked", data=df_health
).fit(cov_type="HC1")

In [None]:
# Store the predictions of the simplest LPM (lpm1); they are used later when comparing models.
df_health["pred_lpmbase"] = lpm1.predict() # we will need it for comparison

In [None]:
# Side-by-side regression table of the two simple LPMs.
stargazer = Stargazer([lpm1, lpm2])
# Show the intercept under the label "Constant".
stargazer.rename_covariates({"Intercept": "Constant"})
stargazer

In [None]:
# Predictions of lpm1 again, stored under a second name for the cross-tabulation below.
df_health["pred1"] = lpm1.predict()

# Tabulate each distinct predicted value against the smoking indicator.
pd.crosstab(index=df_health["pred1"], columns=df_health["smoking"])

**Question**: Why do we have only two kinds of predicted values?

In [None]:
# Outcome by smoking status, with row and column totals (margins).
pd.crosstab(index=df_health["stayshealthy"], columns=df_health["smoking"], margins = True)

In [None]:
# Scatter of the 0/1 outcome against the smoking dummy with a fitted regression line
# and no confidence band (ci = None).
sns.regplot(data = df_health, x = 'smoking', y = 'stayshealthy', ci = None, color = 'k')
plt.grid(linestyle = 'dotted')
# The regressor only takes the values 0 and 1, so show just those two ticks.
plt.xticks([0 ,1])
plt.xlabel('current smoker')
plt.ylabel('probablity of staying healthy');

**Education and income - non-parametric regressions**

In [None]:
# For every row: the number of rows that share its (eduyears, stayshealthy) combination.
# `len` is applied to the "smoking" column only to count the rows in each group.
df_health.groupby(["eduyears", "stayshealthy"])["smoking"].transform(len)

In [None]:
# Store the (eduyears, stayshealthy) group size of each row as `weight`.
df_health['weight'] = df_health.groupby(["eduyears", "stayshealthy"])["smoking"].transform(len)

In [None]:
# Inspect the weights for people with exactly 13 years of education.
df_health.loc[
    df_health["eduyears"] == 13,
    ["eduyears", "stayshealthy", "smoking", "weight"],
]

In [None]:
# Non-parametric (lowess) regression of the outcome on years of education;
# only the smoothed line is drawn (scatter = False, ci = None).
sns.regplot(
    data = df_health, x = 'eduyears', y = 'stayshealthy', 
    lowess = True, ci = None, scatter = False, color = 'k')
# Fix the y axis to the full probability range.
plt.ylim(0,1)
plt.grid(linestyle = 'dotted')
plt.xlabel('years in education')
plt.ylabel('probablity of staying healthy');

In [None]:
# Non-parametric (lowess) regression of the outcome on income10;
# only the smoothed line is drawn (scatter = False, ci = None).
sns.regplot(
    data = df_health, x = 'income10', y = 'stayshealthy', 
    lowess = True, ci = None, scatter = False, color = 'k')
# Fix the y axis to the full probability range.
plt.ylim(0,1)
# One tick for each integer from 1 to 10.
plt.xticks(range(1,11))
plt.grid(linestyle = 'dotted')
plt.xlabel('income decile')
plt.ylabel('probablity of staying healthy');

In [None]:
# Non-parametric (lowess) regression of the outcome on BMI;
# only the smoothed line is drawn (scatter = False, ci = None).
sns.regplot(
    data = df_health, x = 'bmi', y = 'stayshealthy', 
    lowess = True, ci = None, scatter = False, color = 'k')
# Fix the y axis to the full probability range.
plt.ylim(0,1)
plt.grid(linestyle = 'dotted')
plt.xlabel('BMI')
plt.ylabel('probablity of staying healthy');

## Logit & Probit

In [None]:
# Store country as a categorical so the model formulas treat it as a set of
# indicators rather than as a number.
df_health["country"] = df_health["country"].astype("category")

In [None]:
# Confirm the dtype change of `country`.
df_health.info()

In [None]:
# First five rows of the first four columns.
df_health.iloc[:5, :4]

#### Baseline LPM

In [None]:
# Extended linear probability model: smoking indicators, demographics,
# piecewise-linear splines in education (knots at 8 and 18 years) and
# BMI (knot at 35), income, exercise and country indicators.
# Standard errors are heteroskedasticity-robust (HC1), as for lpm1 and lpm2.
# The keyword has to be spelled `cov_type`; the previous spelling `covtype`
# was not recognised as the covariance option, so robust errors were not applied.
lpm3 = smf.ols(
    "stayshealthy ~ smoking + ever_smoked + female + age + lspline(eduyears,[8,18]) + \
                    income10 + lspline(bmi,[35]) + exerc + country",
    df_health,
).fit(cov_type="HC1")

In [None]:
# Regression table for the extended LPM.
stargazer = Stargazer([lpm3])
# List only these covariates, in this order; the country terms are not in the list
# and are summarised by the "Country indicators" line added below.
stargazer.covariate_order(
    [
        "smoking",
        "ever_smoked",
        "female",
        "age",
        "lspline(eduyears, [8, 18])[0]",
        "lspline(eduyears, [8, 18])[1]",
        "lspline(eduyears, [8, 18])[2]",
        "income10",
        "lspline(bmi, [35])[0]",
        "lspline(bmi, [35])[1]",
        "exerc",
    ]
)
stargazer.rename_covariates({"Intercept": "Constant"})
stargazer.add_line("Country indicators", ["Yes"])
stargazer

In [None]:
# Predicted probabilities from the extended LPM (lpm3).
df_health["pred_lpm"] = lpm3.predict()

In [None]:
# Histogram of the extended-LPM predictions as shares of observations.
# Bin edges run from 0.00 to 1.00 in steps of 0.02.
g = sns.histplot(
    data = df_health, x = 'pred_lpm', stat = 'probability', bins = [x/100 for x in range(0,102, 2)])
# aesthetics
ylabels = ['{:.0%}'.format(x) for x in g.get_yticks()] #  getting the y ticks and reformatting them as percent
g.set_yticklabels(ylabels)
xlabels = ['{:.0%}'.format(x) for x in g.get_xticks()] #  getting the x ticks and reformatting them as percent
g.set_xticklabels(xlabels)
plt.xlabel('predicted probabities')
plt.ylabel('frequency of prediction')
plt.grid(linestyle = 'dotted');

**Compare the subsamples of the lowest and highest predictions**

Discretize predictions into equal-sized buckets based on rank or based on sample quantiles. Use the `pd.qcut` [function](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html). 

In [None]:
# Split the LPM predictions into `cuts` quantile-based buckets labelled 1..cuts
# (1 = lowest predictions, cuts = highest).
cuts = 100
df_health["q100_pred_lpm"] = pd.qcut(df_health["pred_lpm"], q = cuts, labels = range(1, cuts + 1))

In [None]:
# Size of the lowest-prediction bucket (rows, columns).
df_health[df_health.q100_pred_lpm == 1].shape

In [None]:
# bottom 1 percent of predictions: covariate summary for bucket 1
df_health.loc[
    df_health["q100_pred_lpm"] == 1,
    ["smoking", "ever_smoked", "female", "age", "eduyears", "income10", "bmi", "exerc"],
].describe().transpose().round(2)

In [None]:
# top 1 percent of predictions: covariate summary for bucket 100
df_health.loc[
    df_health["q100_pred_lpm"] == 100,
    ["smoking", "ever_smoked", "female", "age", "eduyears", "income10", "bmi", "exerc"],
].describe().transpose().round(2)

#### Logit modelling

`patsy` is a Python package for describing statistical models (especially linear models, or models that have a linear component) and building design matrices. It is closely inspired by and compatible with the formula mini-language used in R and S.

Patsy’s goal is to become the standard high-level interface to describing statistical models in Python, regardless of what particular model or library is being used underneath.

- Allows data transformations to be specified using arbitrary Python code: instead of x, we can write log(x) or (x > 0)
- Gives a range of convenient options for coding categorical variables
- Provides basic built-in transformations
- Provides a language for easy-to-read specification of linear constraints
- Features a simple API for integration into statistical packages. 

In [None]:
# Build the outcome matrix `y` and the design matrix `X` from a formula.
# The right-hand side is the same set of regressors as in the extended LPM (lpm3).
y, X = dmatrices(
    "stayshealthy ~ smoking + ever_smoked + female + age + lspline(eduyears,[8,18]) + \
                 income10 + lspline(bmi,[35]) + exerc + country",
    df_health,
)

`dmatrices()` constructs two design matrices given a formula_like and data. By convention, the first matrix is the “outcome” or “y” data, and the second is the “predictor” or “x” data.

In [None]:
# What kind of object does dmatrices return for the predictors?
type(X)

In [None]:
# Display the design matrix.
X

In [None]:
# Names of the design-matrix columns as generated from the formula.
X.design_info.column_names

In [None]:
# First five rows of the design matrix as a plain NumPy array.
np.asarray(X)[0:5]

In [None]:
# Dimensions of the design matrix.
X.shape

In [None]:
# Display the outcome matrix.
y

In [None]:
# The outcome matrix as a plain NumPy array.
np.asarray(y)

In [None]:
# Dimensions of the outcome matrix.
y.shape

In [None]:
# Shape after flattening the outcome to one dimension.
y.ravel().shape

**type 1: Using a Generalized Linear Model (GLM)**

In [None]:
# Logit estimated as a binomial GLM with a logit link.
# `links.Logit` is the link class; the lowercase `links.logit` alias used
# before is deprecated in statsmodels.
logit = sm.GLM(y, X, family=sm.families.Binomial(link=sm.genmod.families.links.Logit()))
# `logit` is rebound from the model to its fitted results.
logit = logit.fit()

In [None]:
# Coefficient table of the fitted GLM.
print(logit.summary())

In [None]:
# Observations per country, ordered by country value.
df_health["country"].value_counts().sort_index()

**type 2: Using simple Logit**

Note: the difference between the two matters if probabilities are unbalanced. See this: https://stats.stackexchange.com/questions/245241/difference-between-logistic-regression-and-binomal-glm-with-logistic-link

In [None]:
# Same specification estimated with the dedicated Logit model class.
# Note: this rebinds the name `logit` (previously the GLM results) to the unfitted Logit model.
logit = sm.Logit(y, X)
logit_result = logit.fit()

In [None]:
# Coefficient table of the logit model.
print(logit_result.summary())

In [None]:
# Marginal effects of the logit model, computed with the statsmodels default settings.
logit_margef_results = logit_result.get_margeff()

In [None]:
# Table of the logit marginal effects.
print(logit_margef_results.summary())

**Question**: Which countries are meaningfully different from Austria? 

In [None]:
# The first two characters of `mergeid` are stored as the country code.
df_health['country_code'] = df_health["mergeid"].str[:2]

In [None]:
# Lookup table: each distinct (country, country_code) pair, sorted by country.
df_health[['country', 'country_code']].drop_duplicates().sort_values(by = 'country').reset_index(drop = True)

In [None]:
# Predicted probabilities from the logit model.
df_health["pred_logit"] = logit_result.predict()

**probit**

In [None]:
# Probit model on the same outcome and design matrix as the logit.
probit = sm.Probit(y, X)
probit_result = probit.fit()

In [None]:
# Coefficient table of the probit model.
print(probit_result.summary())

In [None]:
# Predicted probabilities from the probit model.
df_health["pred_probit"] = probit_result.predict()

In [None]:
# Marginal effects of the probit model (statsmodels default settings) and their summary table.
probit_margef_results = probit_result.get_margeff()
print(probit_margef_results.summary())

#### Comparing predicted probabilities

In [None]:
# Compare logit and probit predictions against the extended-LPM predictions.
fig, ax = plt.subplots(figsize = (6,6))

# data
# Plotting pred_lpm against itself draws the 45-degree reference line.
ax.plot(df_health.pred_lpm, df_health.pred_lpm, color = 'k', label = 'LPM baseline')
ax.scatter(df_health.pred_lpm, df_health.pred_logit, color = 'royalblue', s = 1, label = 'logit')
ax.scatter(df_health.pred_lpm, df_health.pred_probit, color = 'indianred', s = 1, label = 'probit')

# aesthetics
# Ticks every 10 percentage points on both axes, labelled as percentages.
ax.set_xticks(np.linspace(0,1, num = 11))
ax.set_xticklabels(['{:.0%}'.format(x) for x in np.linspace(0,1, num = 11)])
ax.set_yticks(np.linspace(0,1, num = 11))
ax.set_yticklabels(['{:.0%}'.format(y) for y in np.linspace(0,1, num = 11)])

# Colour each legend label like the series it describes.
plt.legend(labelcolor = ['k', 'royalblue', 'indianred'])
plt.grid(linestyle = 'dotted')
plt.xlabel('LPM prediction')
plt.ylabel('logit/probit prediction')
plt.title('LPM, logit, and probit regression results');

#### Distribution of predicted probabilities: a simple and an extended model

In [None]:
# Distribution of the simple-LPM predictions, split by the actual outcome (hue).
g = sns.histplot(data = df_health, x = 'pred_lpmbase', hue = 'stayshealthy',  stat = 'probability', palette = ['k', 'indianred'])

ylabels = ['{:.0%}'.format(x) for x in g.get_yticks()] #  getting the y ticks and reformatting them as percent
g.set_yticklabels(ylabels)
# Restrict the x axis before reading the x ticks, so the labels match the visible range.
plt.xlim(0.4,0.8)
xlabels = ['{:.0%}'.format(x) for x in g.get_xticks()] #  getting the x ticks and reformatting them as percent
g.set_xticklabels(xlabels)

plt.xlabel('predicted probability')
plt.ylabel('relative frequency')
plt.grid(linestyle = 'dotted')
# plt.legend(labelcolor = ['k', 'indianred'])
plt.title('Simple LPM model');

In [None]:
# Distribution of the extended-LPM predictions, split by the actual outcome (hue),
# in bins that are 5 percentage points wide.
g = sns.histplot(data = df_health, x = 'pred_lpm', hue = 'stayshealthy',  stat = 'probability', palette = ['k', 'indianred'], binwidth = 0.05)

ylabels = ['{:.0%}'.format(x) for x in g.get_yticks()] #  getting the y ticks and reformatting them as percent
g.set_yticklabels(ylabels)
xlabels = ['{:.0%}'.format(x) for x in g.get_xticks()] #  getting the x ticks and reformatting them as percent
g.set_xticklabels(xlabels)

plt.xlabel('predicted probability')
plt.ylabel('relative frequency')
plt.grid(linestyle = 'dotted')
plt.title('Extended LPM model');

#### Model comparison: mean & median

In [None]:
# Mean predicted probability of each model, separately for the two actual outcomes.
df_health.groupby("stayshealthy")[
    ["pred_lpmbase", "pred_lpm", "pred_logit", "pred_probit"]
].mean().round(3)

In [None]:
# Median predicted probability of each model, separately for the two actual outcomes.
df_health.groupby("stayshealthy")[
    ["pred_lpmbase", "pred_lpm", "pred_logit", "pred_probit"]
].median().round(3)

#### Model comparison: goodness of fit

In [None]:
# Goodness-of-fit table: one column per model (LPM, Logit, Probit), one row per statistic.
pd.DataFrame(
    {
        # R-squared: taken from the OLS results for the LPM, computed from the
        # predicted probabilities for logit and probit.
        "R-squared": [
            lpm3.rsquared,
            r2_score(df_health["stayshealthy"], df_health["pred_logit"]),
            r2_score(df_health["stayshealthy"], df_health["pred_probit"]),
        ],
        # Brier score: mean squared difference between outcome and predicted probability.
        "Brier-score": [
            mean_squared_error(df_health["stayshealthy"], df_health["pred_lpm"]),
            mean_squared_error(df_health["stayshealthy"], df_health["pred_logit"]),
            mean_squared_error(df_health["stayshealthy"], df_health["pred_probit"]),
        ],
        # Pseudo R-squared is only reported for the likelihood-based models, hence NaN for the LPM.
        "Pseudo R-squared": [np.nan, logit_result.prsquared, probit_result.prsquared],
        # The sign of log_loss is flipped, so values closer to zero mean a better fit.
        # NOTE(review): LPM predictions are not guaranteed to lie in [0, 1]; confirm how the
        # installed scikit-learn version handles such values in log_loss.
        "Log-loss": [
            -1 * log_loss(df_health["stayshealthy"], df_health["pred_lpm"]),
            -1 * log_loss(df_health["stayshealthy"], df_health["pred_logit"]),
            -1 * log_loss(df_health["stayshealthy"], df_health["pred_probit"]),
        ],
    },
    index=["LPM", "Logit", "Probit"],
).T.round(3)

#### Calibration plots

**Calibration plot components one-by-one**

In [None]:
# Settings for the step-by-step calibration example:
n_bins = 10  # number of probability bins
prob_var = 'pred_logit'  # column holding the predicted probabilities
actual_var = 'stayshealthy'  # column holding the actual 0/1 outcome

In [None]:
# Bin edges: n_bins equal-width bins covering the whole [0, 1] probability range.
# The previous formula only produced a 0-1 grid when n_bins was exactly 10;
# this one works for any number of bins and gives the same edges for n_bins = 10.
# Rounding removes floating-point noise such as 0.30000000000000004.
breaks = np.around(np.linspace(0, 1, num=n_bins + 1), decimals=10).tolist()
breaks

In [None]:
# Work on a copy so the helper column added below does not end up in df_health.
df_ = df_health.copy()

In [None]:
# Assign each prediction to a bin; intervals are closed on the right and the
# first interval also includes its left edge (include_lowest=True).
df_["prob_bin"] = pd.cut(df_[prob_var], breaks, right=True, include_lowest=True)

In [None]:
# Number of observations in each probability bin, in bin order.
df_["prob_bin"].value_counts().sort_index()

In [None]:
# First ten rows: actual outcome, predicted probability and the assigned bin.
df_[[actual_var, prob_var, 'prob_bin']].iloc[0:10]

***Beware with the tails of your calibration curve!!!***

predicted p > 90%

In [None]:
# Share of people who actually stay healthy among those with a logit prediction above 90%.
df_.loc[df_["pred_logit"] > 0.90, "stayshealthy"].mean()

predicted p > 92%

In [None]:
# Share of people who actually stay healthy among those with a logit prediction above 92%.
df_.loc[df_["pred_logit"] > 0.92, "stayshealthy"].mean()

In [None]:
# The individual observations with a logit prediction above 90%.
df_.loc[df_["pred_logit"] > 0.90, ['stayshealthy', 'pred_logit', 'prob_bin']]

In [None]:
# Per-bin summary: mean predicted probability, mean actual outcome and bin size.
# `observed=False` is stated explicitly so that empty bins stay in the table
# (n = 0, NaN means) regardless of the pandas version's default for
# categorical groupers.
df_binned_data = (
    df_.groupby("prob_bin", observed=False)
    .agg(
        mean_prob=(prob_var, "mean"),
        mean_actual=(actual_var, "mean"),
        n=(actual_var, "size"),
    )
    .reset_index()
)
df_binned_data

In [None]:
# Calibration curve built by hand from the binned data.
fig, ax = plt.subplots(figsize = (6,6))
# 45-degree line: where the curve would lie if mean predicted and mean actual values were equal.
ax.plot(breaks, breaks, color = 'k')
# One point per bin: mean predicted probability (x) against mean actual outcome (y).
ax.plot(df_binned_data.mean_prob, df_binned_data.mean_actual, marker = 'o', color = 'indianred')

# Ticks every 10 percentage points on both axes, labelled as percentages.
ax.set_xticks(np.linspace(0,1, num = 11))
ax.set_xticklabels(['{:.0%}'.format(x) for x in np.linspace(0,1, num = 11)])
ax.set_yticks(np.linspace(0,1, num = 11))
ax.set_yticklabels(['{:.0%}'.format(y) for y in np.linspace(0,1, num = 11)])
plt.grid(linestyle = "dotted")
plt.xlabel('predicted probability')
plt.ylabel('actual probability')
plt.title('Logit model calibration curve')
plt.show()

**Putting it all together: creating a function to produce the calibration plot**

In [None]:
def calibration_plot(
    df: pd.DataFrame,
    prob_var: str,
    actual_var: str,
    model_name: str = None,
    n_bins: int = 10,
    breaks: list = None):
    """Draw a calibration curve for predicted probabilities.

    Predictions are grouped into bins; for each bin the mean predicted
    probability is plotted against the mean actual outcome, together with
    a 45-degree reference line. The input DataFrame is left unchanged.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing the prediction and outcome columns.
    prob_var : str
        Name of the column with predicted probabilities.
    actual_var : str
        Name of the column with the actual outcome.
    model_name : str, optional
        If given, it is prepended to the plot title.
    n_bins : int, default 10
        Number of equal-width bins on [0, 1]; ignored when `breaks` is given.
    breaks : list, optional
        Explicit bin edges passed to `pd.cut`.

    Returns
    -------
    None
        The figure is shown with `plt.show()`.
    """
    if breaks is None:
        breaks = np.linspace(0, 1, n_bins + 1).tolist()

    # `assign` returns a new DataFrame, so the caller's frame does not get a
    # stray "prob_bin" column as a side effect of plotting.
    binned = df.assign(
        prob_bin=pd.cut(df[prob_var], breaks, right=True, include_lowest=True)
    )

    # observed=False keeps empty bins (as NaN rows) independently of the
    # pandas default for categorical groupers.
    df_binned_data = (
        binned.groupby("prob_bin", observed=False)
        .agg(
            mean_prob=(prob_var, "mean"),
            mean_actual=(actual_var, "mean"),
            n=(actual_var, "size"),
        )
        .reset_index()
    )

    fig, ax = plt.subplots(figsize=(6, 6))
    # 45-degree reference line.
    ax.plot(breaks, breaks, color='k')
    ax.plot(df_binned_data.mean_prob, df_binned_data.mean_actual, marker='o', color='indianred')

    # Ticks every 10 percentage points on both axes, labelled as percentages.
    ticks = np.linspace(0, 1, num=11)
    tick_labels = ['{:.0%}'.format(t) for t in ticks]
    ax.set_xticks(ticks)
    ax.set_xticklabels(tick_labels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(tick_labels)
    plt.grid(linestyle="dotted")
    plt.xlabel('predicted probability')
    plt.ylabel('actual probability')
    if model_name is None:
        plt.title('Calibration curve')
    else:
        plt.title(f'{model_name} calibration curve')
    plt.show()

    


In [None]:
# Calibration curve of the logit predictions, using 12 bins.
calibration_plot(
    df = df_health, 
    prob_var= 'pred_logit',
    actual_var = 'stayshealthy',
    model_name= 'Logit',
    n_bins= 12
)

In [None]:
# Calibration curve of the probit predictions, using 10 bins.
calibration_plot(
    df = df_health, 
    prob_var= 'pred_probit',
    actual_var = 'stayshealthy',
    model_name= 'Probit',
    n_bins= 10
)