# 第 6 章　广义线性模型｜用 Python 动手学统计学

## 第 4 节　广义线性模型的评估

### 1. 环境准备

In [3]:
# 用于数值计算的库
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats

# 用于绘图的库
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()

# 用于估计统计模型的库 (部分版本会报出警告信息)
import statsmodels.formula.api as smf
import statsmodels.api as sm

# 设置浮点数打印精度
%precision 3
# 在 Jupyter Notebook 里显示图形
%matplotlib inline


In [4]:
# Load the example data set from the remote CSV file
url = "https://raw.github.com/pineapple-666/Learn-Statistics-with-Python/main/data/6-3-1-logistic-regression.csv"
test_result = pd.read_csv(url)

# Fit a binomial GLM with `result` as the response and `hours` as the
# explanatory variable (no link is passed, so the family's default is used)
mod_glm = smf.glm(
    formula="result ~ hours",
    data=test_result,
    family=sm.families.Binomial(),
).fit()

### 4. 皮尔逊残差

In [6]:
# Compute the Pearson residuals by hand

# Predicted success probability for each observation
pred = mod_glm.predict()
# Response variable (pass/fail result)
y = test_result.result

# Pearson residual: the raw residual (y - pred) scaled by sqrt(p * (1 - p)),
# the standard deviation implied by the predicted probability
pearson_resid = (y - pred) / np.sqrt(pred * (1 - pred))
# Keep the original (misspelled) name as an alias so existing references work
peason_resid = pearson_resid
pearson_resid.head(3)

Unnamed: 0,result
0,-0.102351
1,-0.102351
2,-0.102351


In [7]:
# Pearson residuals as provided by the fitted model (first three rows),
# for comparison with the manual calculation
mod_glm.resid_pearson.iloc[:3]

Unnamed: 0,0
0,-0.102351
1,-0.102351
2,-0.102351


In [9]:
# Sum of the squared Pearson residuals
mod_glm.resid_pearson.pow(2).sum()

np.float64(84.91138782569973)

In [10]:
# The fitted model exposes the Pearson chi-squared statistic directly;
# the same value also appears in the output of the summary function
mod_glm.pearson_chi2

np.float64(84.91138782569973)

### 9. 偏差残差

In [12]:
# Compute the deviance residuals by hand

# Predicted success probability for each observation
pred = mod_glm.predict()
# Response variable (pass/fail result)
y = test_result.result

# Difference from the log-likelihood of a perfect prediction (which is 0).
# logpmf evaluates the log-probability directly, avoiding the precision loss
# and possible underflow to -inf of taking np.log of an already-computed pmf.
resid_tmp = 0 - stats.binom.logpmf(k=y, n=1, p=pred)
# Deviance residual: signed square root of twice that difference
deviance_resid = np.sqrt(2 * resid_tmp) * np.sign(y - pred)
# Show the first rows
deviance_resid.head(3)

Unnamed: 0,result
0,-0.144369
1,-0.144369
2,-0.144369


In [13]:
# Deviance residuals as provided by the fitted model (first three rows),
# for comparison with the manual calculation
mod_glm.resid_deviance.iloc[:3]

Unnamed: 0,0
0,-0.144369
1,-0.144369
2,-0.144369


In [15]:
# Deviance: sum of the squared deviance residuals
mod_glm.resid_deviance.pow(2).sum()

np.float64(68.02788118117269)