In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Plot defaults: seaborn's default theme, with square 8x8 figures.
sns.set_theme()
plt.rcParams.update({"figure.figsize": [8, 8]})

In [None]:
africa = pd.read_csv("../datasets/africa.csv")
n_countries = len( africa )
n_countries

In [None]:
africa

In [None]:
# Raw scatter of the response (IMR) against the predictor (Literacy).
ax = sns.scatterplot(x="Literacy", y="IMR", data=africa)
ax.set_title("Infant mortality rates (IMR) versus Literacy for young African women")
# plt.savefig("literacy_imr.png")

In [None]:
x_bar = africa.Literacy.mean()
x_bar

In [None]:
y_bar = africa.IMR.mean()
y_bar

In [None]:
# Same scatter, split into four quadrants by dashed lines at the two means.
ax = sns.scatterplot(x="Literacy", y="IMR", data=africa)
ax.axvline(x_bar, linestyle='--')
ax.axhline(y_bar, linestyle='--')
ax.set_title("IMR vs Literacy: quadrants determined by averages")
# plt.savefig("quadrants.png")

In [None]:
# Correlation between Literacy and IMR (pandas' default method).
africa_corr = africa["Literacy"].corr(africa["IMR"])
africa_corr

In [None]:
ax = sns.scatterplot(x="Literacy", y="IMR", data=africa)
# Null model: predict the mean IMR regardless of Literacy.
ax.axhline(y_bar, color="black")
ax.set_title("IMR vs Literacy: Null model")
# plt.savefig("africa_null.png")

In [None]:
africa_tss = ((africa.IMR - y_bar)**2).sum()
africa_tss

In [None]:
africa.IMR.var()

In [None]:
(n_countries - 1)*africa.IMR.var()

In [None]:
s_x = africa.Literacy.std()
s_x

In [None]:
s_y = africa.IMR.std()
s_y

In [None]:
m_sd = -s_y / s_x 
m_sd

In [None]:
b_sd = y_bar - m_sd*x_bar
b_sd

In [None]:
ax = sns.scatterplot(x="Literacy", y="IMR", data=africa)
# SD line: through the point of averages with slope m_sd.
ax.axline((x_bar, y_bar), slope=m_sd, linestyle='--', color="black")
ax.set_title("IMR vs Literacy: SD line")
# plt.savefig("literacy_imr_sdline.png")

In [None]:
# africa = africa.assign( null_resid = africa.IMR - y_bar, sd_pred = b_sd + m_sd*africa.Literacy, sd_resid = africa.IMR - b_sd - m_sd*africa.Literacy)

In [None]:
m = africa_corr*(s_y/s_x)
m

In [None]:
b = y_bar - m*x_bar
b

In [None]:
ax = sns.scatterplot(x="Literacy", y="IMR", data=africa)
# Line of best fit: through the point of averages with slope m.
ax.axline((x_bar, y_bar), slope=m, color="black")
ax.set_title("IMR vs Literacy: Line of best fit")
# plt.savefig("literacy_imr_regression_line.png")

In [None]:
# Overlay both lines; they share the point of averages, marked with a black dot.
ax = sns.scatterplot(x="Literacy", y="IMR", data=africa)
ax.axline((x_bar, y_bar), slope=m, color="black")
ax.axline((x_bar, y_bar), slope=m_sd, linestyle='--', color="black")
ax.set_title("IMR vs Literacy: Regression line (solid) and SD line (dashed)")
ax.plot(x_bar, y_bar, 'ok')
# plt.savefig("literacy_imr_both_lines.png")

In [None]:
africa_model = smf.ols('IMR ~ Literacy', data = africa)

In [None]:
africa_fit = africa_model.fit()

In [None]:
africa_fit.params

In [None]:
africa_null = smf.ols('IMR ~ 1', data = africa)

In [None]:
africa_null = africa_null.fit()

In [None]:
africa_null.params

In [None]:
print( africa_fit.summary() )

In [None]:
africa_ssr = (africa_fit.resid**2).sum()
africa_ssr

In [None]:
africa_fit.ssr

In [None]:
africa_ssr / (n_countries - 2)

In [None]:
np.sqrt( africa_ssr / (n_countries - 2) )

In [None]:
africa_fit.scale

In [None]:
africa_rse = np.sqrt( africa_fit.scale )
africa_rse

In [None]:
africa_fit.rsquared

In [None]:
1 - (africa_ssr / africa_tss)

In [None]:
# Residuals against Literacy, with dotted reference lines at +/- 2 * africa_rse.
ax = sns.residplot(x="Literacy", y="IMR", data=africa, line_kws={"color": "black"})
two_rse = 2 * africa_rse
ax.axhline(two_rse, linestyle=":", color="black")
ax.axhline(-two_rse, linestyle=":", color="black")
ax.set_title("IMR versus Literacy residuals")
# plt.savefig("literacy_imr_residuals.png")

In [None]:
import scipy.stats as st

In [None]:
st.norm.ppf(.975 )

In [None]:
st.t.ppf(.975, df=45)

In [None]:
st.t.ppf(.995, df=45)

In [None]:
2*st.t.cdf(-5.26,df=45)

In [None]:
africa_fit.conf_int()

In [None]:
new = pd.DataFrame(data = {"Literacy":[50,80]})
africa_fit.predict( new )

In [None]:
africa_fit.get_prediction( new ).summary_frame()

In [None]:
# Scatter plus fitted line; seaborn adds its confidence band around the line.
ax = sns.regplot(x="Literacy", y="IMR", data=africa, line_kws={"color": "black"})
ax.set_title("IMR vs Literacy with confidence bands")
# plt.savefig("literacy_imr_with_ci.png")

In [None]:
x = africa.Literacy.sort_values(ignore_index=True)
endpts = africa_fit.get_prediction(x).summary_frame()

In [None]:
sns.regplot(data=africa, x="Literacy", y="IMR", line_kws={"color":"black"})
plt.plot(x, endpts.obs_ci_upper, ':', color="black")
plt.plot(x, endpts.obs_ci_lower, ':', color="black")
plt.title("IMR vs Literacy with confidence and prediction bands")
# plt.savefig("literacy_imr_ci_preds.png")