In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
import scipy.stats as st
from sklearn import metrics

In [None]:
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

In [None]:
import warnings
# Install a global "ignore" filter: every warning category is silenced from
# this point on (presumably to keep the rendered cell output uncluttered).
# NOTE(review): this also hides numerical RuntimeWarnings (e.g. log of zero)
# and deprecation notices raised by later cells — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Global plot styling: seaborn's default theme, then a square 8 x 8 default
# figure size for every figure drawn below.
sns.set_theme()
plt.rc('figure', figsize=[8, 8])

In [None]:
# Read the chronic-condition data set (path is relative to the notebook's
# working directory) and display it.
chronic = pd.read_csv(filepath_or_buffer="../datasets/chronic.csv")
chronic

In [None]:
# Raw data: one point per row, Condition against Age.
sns.scatterplot(x="Age", y="Condition", data=chronic)
# plt.savefig("chronic_plot.png")

In [None]:
# Class balance: number of rows in each Condition group and the share of the
# data set that each group represents.
# `prop` is the mean of Condition, i.e. the share of rows with Condition == 1
# assuming a 0/1 coding; it is kept as a name for any later use.
prop = chronic.Condition.mean()
( chronic.groupby("Condition")
 .agg(Number = ("Condition","size"))
 # Derive each proportion from the group counts themselves. The previous
 # hand-ordered list [1-prop, prop] silently relied on there being exactly
 # two groups, sorted as 0 then 1.
 .assign(Proportion = lambda counts: counts["Number"] / counts["Number"].sum()) )

In [None]:
# One row per distinct Age: how many rows share that age (Number) and the
# mean of Condition among them (Rate).
cohorts = (
    chronic
    .groupby("Age", as_index=False)
    .agg(
        Number=("Condition", "size"),
        Rate=("Condition", "mean"),
    )
)
cohorts

In [None]:
# Per-age mean of Condition plotted against Age.
sns.scatterplot(x="Age", y="Rate", data=cohorts)
# plt.savefig("chronic_age_cohorts.png")

In [None]:
# Bin ages into groups centred on each multiple of ten (Age/10 rounded to the
# nearest whole number, times 10) and summarise Condition within each bin.
#
# Exact halves are rounded *up* explicitly. The built-in round() on a pandas
# Series rounds halves to the nearest even value, so ages ending in 5 would
# alternate between the lower and upper decade (25 -> 20 but 35 -> 40) and the
# bins would cover unequal age ranges.
chronic["Decade"] = 10*np.floor(chronic.Age / 10 + 0.5)
decades = (chronic.groupby("Decade", as_index=False)
           .agg(Total=("Condition","size"),        # rows in the bin
                Afflicted=("Condition","sum"),     # sum of Condition in the bin
                Rate=("Condition","mean")))        # mean of Condition in the bin
decades

In [None]:
# Per-decade mean of Condition plotted against the decade value.
sns.scatterplot(x="Decade", y="Rate", data=decades)
# plt.savefig("chronic_cohorts.png")

In [None]:
# Draw seaborn's logistic regression curve of Condition on Age (no scatter,
# no confidence band) and overlay the observed per-age rates (black dots)
# and per-decade rates (red circles).
(sns.regplot(data=chronic,
             x="Age", y="Condition",
             logistic=True, scatter=False,
             ci=None, line_kws={"lw": 4}))   # numeric line width, not the string "4"
plt.xlim(-5,105)
plt.ylabel("Rate")
# Recompute the per-age rates here instead of reading cohorts["Rate"]: a later
# cell rebinds `cohorts` to a table without a Rate column, so this cell would
# raise a KeyError if it were re-run after that one.
age_rates = chronic.groupby("Age")["Condition"].mean()
plt.plot( age_rates.index, age_rates.values,'.k')
plt.plot( decades["Decade"], decades["Rate"],'or',ms=6)
# plt.savefig("ages_decades.png")

In [None]:
# Logistic regression of Condition on Age via the statsmodels formula API;
# the cell displays the fitted coefficients.
chronic_model = smf.logit(formula="Condition ~ Age", data=chronic)
chronic_fit = chronic_model.fit()
chronic_fit.params

In [None]:
# Print the plain-text summary table of the fitted logit model.
print( chronic_fit.summary() )

In [None]:
# Store the fitted model's predicted probability for every row.
# The model was specified with a formula, so predict() is given a DataFrame
# carrying the named "Age" column. A bare Series forces statsmodels to guess
# whether it represents a column or a single row.
chronic["Probability"] = chronic_fit.predict( chronic[["Age"]] )
chronic

In [None]:
# Plot the logistic function sigma(z) = 1 / (1 + exp(-z)) on 1000 evenly
# spaced points between -5 and 5.
z = np.linspace(start=-5, stop=5, num=1000)
sigma = 1 / (1 + np.exp(-z))
plt.plot(z, sigma)
plt.ylabel(r"$\sigma(z)$")
plt.xlabel("z")
plt.title("The logistic function")
# plt.savefig("logit.png")

In [None]:
# Classify each row as 1 when its predicted probability exceeds 0.50, else 0.
# Note: this overwrites any existing "Class" column.
chronic["Class"] = chronic["Probability"].gt(0.50).astype(int)
chronic

In [None]:
# Confusion table: observed Condition (rows) against assigned Class (columns).
pd.crosstab(index=chronic.Condition, columns=chronic.Class)

In [None]:
# Same table with each row normalised to sum to 1 (rates within each observed Condition).
pd.crosstab(index=chronic.Condition, columns=chronic.Class, normalize='index')

In [None]:
# Fraction of rows whose assigned Class equals the observed Condition.
(chronic.Condition == chronic.Class).mean()

In [None]:
# Fraction of rows whose assigned Class differs from the observed Condition.
(chronic.Condition != chronic.Class).mean()

In [None]:
# Re-classify with a stricter cut-off: Class is 1 only when the predicted
# probability exceeds 0.80. This replaces the previous "Class" column.
chronic["Class"] = chronic["Probability"].gt(0.80).astype(int)
chronic

In [None]:
# Confusion table for the current Class column: observed Condition (rows) against Class (columns).
pd.crosstab(index=chronic.Condition, columns=chronic.Class)

In [None]:
# Row-normalised confusion table for the current Class column.
pd.crosstab(index=chronic.Condition, columns=chronic.Class, normalize='index')

In [None]:
# Share of rows classified in agreement with the observed Condition.
(chronic.Condition == chronic.Class).mean()

In [None]:
# Share of rows classified in disagreement with the observed Condition.
(chronic.Condition != chronic.Class).mean()

In [None]:
# Re-classify with a lenient cut-off: Class is 1 whenever the predicted
# probability exceeds 0.20. This replaces the previous "Class" column.
chronic["Class"] = chronic["Probability"].gt(0.20).astype(int)
chronic

In [None]:
# Confusion table for the current Class column: observed Condition (rows) against Class (columns).
pd.crosstab(index=chronic.Condition, columns=chronic.Class)

In [None]:
# Row-normalised confusion table for the current Class column.
pd.crosstab(index=chronic.Condition, columns=chronic.Class, normalize='index')

In [None]:
# Share of rows classified in agreement with the observed Condition.
(chronic.Condition == chronic.Class).mean()

In [None]:
# Share of rows classified in disagreement with the observed Condition.
(chronic.Condition != chronic.Class).mean()

In [None]:
# ROC curve of the predicted probabilities against the observed Condition,
# and the area under that curve.
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=chronic.Condition,
    y_score=chronic.Probability,
)
chronic_auc = metrics.auc(x=fpr, y=tpr)
chronic_auc

In [None]:
# ROC curve with the chance diagonal and the operating points of the three
# probability cut-offs tried in the classification cells (0.20, 0.50, 0.80).
#
# The operating points are computed from the data rather than typed in by
# hand (previously [0.735, 0.377, 0.09] / [0.975, 0.832, 0.349]), so they
# stay correct if the data or the fitted model changes.
marked_cutoffs = [0.20, 0.50, 0.80]
is_afflicted = chronic.Condition == 1   # assumes Condition is coded 0/1
# False positive rate: share of unafflicted rows classified as 1.
marked_fpr = [(chronic.Probability[~is_afflicted] > cutoff).mean() for cutoff in marked_cutoffs]
# True positive rate: share of afflicted rows classified as 1.
marked_tpr = [(chronic.Probability[is_afflicted] > cutoff).mean() for cutoff in marked_cutoffs]

plt.figure()
plt.plot(fpr, tpr, label='ROC curve    AUC: %0.2f' % chronic_auc)
plt.plot([0,1], [0,1], 'r--', label='Random classification')
plt.plot(marked_fpr, marked_tpr, 'ok')
plt.xlabel('False Positive Rate (1-Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC curve for chronic condition classifier')
plt.legend(loc="lower right")
# plt.savefig("chronic_roc.png")

In [None]:
# Rebuild the per-age table in the successes/failures layout used by the
# binomial GLM cells below. This rebinds `cohorts`; the new table has no
# "Rate" column (the same quantity is stored as "Percentage", a 0-1 mean).
cohorts = (
    chronic
    .groupby("Age", as_index=False)
    .agg(
        Number=("Condition", "size"),
        Successes=("Condition", "sum"),
        Percentage=("Condition", "mean"),
    )
)
# Failures = Number - Successes, placed right after the Successes column.
cohorts.insert(loc=3, column="Failures", value=cohorts["Number"] - cohorts["Successes"])
cohorts

In [None]:
# Add a leading constant column of ones ("Bias"), used as the intercept
# regressor by the GLM cells below.
# Guarded so the cell can be re-run: DataFrame.insert raises ValueError when
# the column already exists.
if "Bias" not in cohorts.columns:
    cohorts.insert(0,"Bias",1)
cohorts

In [None]:
# Intercept-only binomial GLM on the (Successes, Failures) counts per age.
cohorts_null_model = sm.GLM(
    endog=cohorts[["Successes", "Failures"]],
    exog=cohorts["Bias"],
    family=sm.families.Binomial(),
)
cohorts_null_fit = cohorts_null_model.fit()
print(cohorts_null_fit.summary())

In [None]:
# Per-cohort contribution to the deviance of the intercept-only model:
#   2 * [ s*log(p_obs / pi_null) + f*log((1 - p_obs) / (1 - pi_null)) ]
# with s = Successes, f = Failures and p_obs = Percentage.
null_odds = np.exp( cohorts_null_fit.params["Bias"] )   # fitted intercept is a log-odds
pi_null = null_odds / (1 + null_odds)                   # common fitted probability

# A cohort with zero successes (or zero failures) has p_obs = 0 (or 1), so the
# naive product is 0 * log(0) = NaN. That turns the whole row into NaN, and a
# later Series sum then silently skips the row, losing its other, non-zero
# term. The limit of x*log(x) at 0 is 0, so the ratio is replaced by 1
# (log 1 = 0) wherever the corresponding count is zero.
null_success_ratio = np.where(cohorts["Successes"] > 0,
                              cohorts["Percentage"]/pi_null, 1.0)
null_failure_ratio = np.where(cohorts["Failures"] > 0,
                              (1 - cohorts["Percentage"])/(1 - pi_null), 1.0)
cohorts["Null deviance"] = (2*( cohorts["Successes"]*np.log( null_success_ratio ) +
                               cohorts["Failures"]*np.log( null_failure_ratio ) ))
cohorts

In [None]:
# Total of the per-cohort null-deviance contributions.
cohorts["Null deviance"].sum()

In [None]:
# Binomial GLM on the (Successes, Failures) counts with an intercept and Age.
cohorts_model = sm.GLM(
    endog=cohorts[["Successes", "Failures"]],
    exog=cohorts[["Bias", "Age"]],
    family=sm.families.Binomial(),
)
cohorts_fit = cohorts_model.fit()
print(cohorts_fit.summary())

In [None]:
# Per-cohort contribution to the deviance of the Age model:
#   2 * [ s*log(p_obs / p_fit) + f*log((1 - p_obs) / (1 - p_fit)) ]
# with s = Successes, f = Failures, p_obs = Percentage and p_fit = Probability.
cohorts["Probability"] = cohorts_fit.predict()   # fitted probability per cohort

# A cohort with zero successes (or zero failures) has p_obs = 0 (or 1), so the
# naive product is 0 * log(0) = NaN. That turns the whole row into NaN, and a
# later Series sum then silently skips the row, losing its other, non-zero
# term. The limit of x*log(x) at 0 is 0, so the ratio is replaced by 1
# (log 1 = 0) wherever the corresponding count is zero.
fit_success_ratio = np.where(cohorts["Successes"] > 0,
                             cohorts["Percentage"]/cohorts["Probability"], 1.0)
fit_failure_ratio = np.where(cohorts["Failures"] > 0,
                             (1 - cohorts["Percentage"])/(1 - cohorts["Probability"]), 1.0)
cohorts["Deviance"] = (2*( cohorts["Successes"]*np.log( fit_success_ratio ) +
                cohorts["Failures"]*np.log( fit_failure_ratio ) ))
cohorts

In [None]:
# Total of the per-cohort deviance contributions of the Age model.
cohorts["Deviance"].sum()

In [None]:
# Drop in deviance from the intercept-only fit to the fit that adds Age
# (both values are read from the statsmodels results objects).
G2_cohorts = cohorts_null_fit.deviance - cohorts_fit.deviance
G2_cohorts

In [None]:
# Upper-tail chi-square probability of the deviance drop on 1 degree of freedom.
# sf() evaluates the tail directly; 1 - cdf() cancels to exactly 0.0 once the
# statistic is large enough for the cdf to round to 1.
st.chi2.sf( G2_cohorts, df=1 )

In [None]:
# Deviance of the Age model as reported by the fitted results object.
cohorts_fit.deviance

In [None]:
# Residual degrees of freedom of the Age model as reported by the results object.
cohorts_fit.df_resid

In [None]:
# Upper-tail chi-square probability of the model deviance on its residual
# degrees of freedom. sf() is used instead of 1 - cdf() so that small tail
# probabilities are not lost to cancellation.
st.chi2.sf( cohorts_fit.deviance, cohorts_fit.df_resid )

In [None]:
# Pearson chi-square statistic of the Age model as reported by the results object.
cohorts_fit.pearson_chi2

In [None]:
# Upper-tail chi-square probability of the Pearson statistic on the residual
# degrees of freedom. sf() is used instead of 1 - cdf() so that small tail
# probabilities are not lost to cancellation.
st.chi2.sf(cohorts_fit.pearson_chi2,  cohorts_fit.df_resid)

In [None]:
# Collect each cohort's age with the Pearson and deviance residuals reported
# by the fitted Age model.
cohort_residuals = pd.DataFrame(
    dict(
        Age=cohorts["Age"],
        Pearson=cohorts_fit.resid_pearson,
        Deviance=cohorts_fit.resid_deviance,
    )
)
cohort_residuals

In [None]:
# Side-by-side residual plots against Age.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)

# Left panel: Pearson residuals.
ax1.set_title("Pearson residuals")
ax1.plot(cohort_residuals.Age, cohort_residuals.Pearson, 'ok')
ax1.set_xlabel("Age")

# Right panel: deviance residuals.
ax2.set_title("Deviance residuals")
ax2.plot(cohort_residuals.Age, cohort_residuals.Deviance, 'ok')
ax2.set_xlabel("Age")
# plt.savefig("cohort_residuals.png")