



















# 1. 导入包

In [1]:
# -*- coding: utf-8 -*-
"""
Python 3.7.7
sklearn 0.23.1
使用岭回归根据多个因素预测医疗费用
岭回归使用L2正则化
"""

# 导入包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# 2. 导入数据集

In [2]:
# Load the insurance dataset from a CSV file given by a relative path
data = pd.read_csv(filepath_or_buffer='insurance.csv')

# 3. 数据预处理

## 3.1 检测缺失值

In [3]:
# Count the missing values in each column (isna is pandas' alias of isnull)
null_df = data.isna().sum()

## 3.2 标签编码&独热编码

In [4]:
# One-hot encode the categorical columns; drop_first omits the first level
# of each encoded column so the dummy columns are not perfectly collinear
data = pd.get_dummies(data=data, drop_first=True)

## 3.3 得到自变量和因变量

In [5]:
# Separate the target (charges) from the feature columns.
# NOTE: `data` is rebound to the features-only frame here, so this cell
# cannot be re-run without first re-running the cells that build `data`.
y = data['charges'].to_numpy()
data = data.drop(columns=['charges'])
x = data.to_numpy()

## 3.4 拆分训练集和测试集

In [6]:
# Hold out 20% of the samples as the test set; the fixed random_state
# makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

# 4. 构建不同参数的岭回归模型

## 4.1 模型1：构建岭回归模型（alpha = 20）

### 4.1.1 构建岭回归模型

In [7]:
# Ridge regression models with different regularization strengths
# Model 1: ridge regression with alpha = 20
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the notebook header pins sklearn 0.23.1, where it is still accepted.
regressor = Ridge(
    alpha=20,
    fit_intercept=True,
    normalize=True,
)
regressor.fit(x_train, y_train)

Ridge(alpha=20, normalize=True)

### 4.1.2 得到数学表达式

In [8]:
# Print the fitted model as a linear formula: one
# "<feature> * <coefficient> + " term per feature column, then the intercept
print('数学表达式是：\n Charges = ', end='')
for feature, weight in zip(data.columns, regressor.coef_):
    print('%s * %.2f + ' %(feature, weight), end='')
print(regressor.intercept_)

数学表达式是：
 Charges = age * 12.48 + bmi * 17.21 + children * 14.86 + sex_male * 60.23 + smoker_yes * 1121.22 + region_northwest * -34.52 + region_southeast * 61.62 + region_southwest * -33.53 + 11938.446490743021


### 4.1.3 预测测试集

In [9]:
# Predict charges for the held-out test set
y_pred = regressor.predict(X=x_test)

### 4.1.4 得到模型MSE

In [10]:
# Mean squared error of model 1 on the test set.
# The printed label reports alpha=20, the value model 1 is fitted with.
mse_score = mean_squared_error(y_test, y_pred)
print('alpha=20时，岭回归模型的MSE是：%.2f' %(mse_score)) # 138,769,173.13

alpha=1时，岭回归模型的MSE是：138769173.13


## 4.2 模型2：构建岭回归模型（alpha = 0.1）

In [11]:
# Model 2: ridge regression with alpha = 0.1
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the notebook header pins sklearn 0.23.1, where it is still accepted.
regressor = Ridge(
    alpha=0.1,
    fit_intercept=True,
    normalize=True,
)
regressor.fit(x_train, y_train)

Ridge(alpha=0.1, normalize=True)

In [12]:
# Print the fitted model as a linear formula: one
# "<feature> * <coefficient> + " term per feature column, then the intercept
print('数学表达式是：\n Charges = ', end='')
for feature, weight in zip(data.columns, regressor.coef_):
    print('%s * %.2f + ' %(feature, weight), end='')
print(regressor.intercept_)

数学表达式是：
 Charges = age * 234.53 + bmi * 291.63 + children * 361.72 + sex_male * -88.02 + smoker_yes * 21586.00 + region_northwest * -266.87 + region_southeast * -672.40 + region_southwest * -691.71 + -9237.600606458109


In [13]:
# Predict charges for the held-out test set
y_pred = regressor.predict(X=x_test)

In [14]:
# Mean squared error of model 2 on the test set
mse_score = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('alpha=0.1时，岭回归模型的MSE是：%.2f' % mse_score)  # 36,841,099.27

alpha=0.1时，岭回归模型的MSE是：36841099.27


## 4.3 模型3：构建岭回归模型（alpha = 0.01）

In [15]:
# Model 3: ridge regression with alpha = 0.01
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the notebook header pins sklearn 0.23.1, where it is still accepted.
regressor = Ridge(
    alpha=0.01,
    fit_intercept=True,
    normalize=True,
)
regressor.fit(x_train, y_train)

Ridge(alpha=0.01, normalize=True)

In [16]:
# Print the fitted model as a linear formula: one
# "<feature> * <coefficient> + " term per feature column, then the intercept
print('数学表达式是：\n Charges = ', end='')
for feature, weight in zip(data.columns, regressor.coef_):
    print('%s * %.2f + ' %(feature, weight), end='')
print(regressor.intercept_)

数学表达式是：
 Charges = age * 255.00 + bmi * 318.27 + children * 402.86 + sex_male * -223.99 + smoker_yes * 23546.28 + region_northwest * -377.66 + region_southeast * -992.59 + region_southwest * -875.29 + -11075.028462288014


In [17]:
# Predict charges for the held-out test set
y_pred = regressor.predict(X=x_test)

In [18]:
# Mean squared error of model 3 on the test set
mse_score = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('alpha=0.01时，岭回归模型的MSE是：%.2f' % mse_score)  # 35,539,055.33

alpha=0.01时，岭回归模型的MSE是：35539055.33


## 4.4 模型4：构建岭回归模型（alpha = 0.0001）

In [19]:
# Model 4: ridge regression with alpha = 0.0001
# NOTE: `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2;
# the notebook header pins sklearn 0.23.1, where it is still accepted.
regressor = Ridge(
    alpha=0.0001,
    fit_intercept=True,
    normalize=True,
)
regressor.fit(x_train, y_train)

Ridge(alpha=0.0001, normalize=True)

In [20]:
# Print the fitted model as a linear formula: one
# "<feature> * <coefficient> + " term per feature column, then the intercept
print('数学表达式是：\n Charges = ', end='')
for feature, weight in zip(data.columns, regressor.coef_):
    print('%s * %.2f + ' %(feature, weight), end='')
print(regressor.intercept_)

数学表达式是：
 Charges = age * 257.47 + bmi * 321.59 + children * 408.01 + sex_male * -241.97 + smoker_yes * 23784.06 + region_northwest * -395.90 + region_southeast * -1037.90 + region_southwest * -902.75 + -11295.364555495733


In [21]:
# Predict charges for the held-out test set
y_pred = regressor.predict(X=x_test)

In [22]:
# Mean squared error of model 4 on the test set
mse_score = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('alpha=0.0001时，岭回归模型的MSE是：%.2f' % mse_score)  # 35,479,846.30

alpha=0.0001时，岭回归模型的MSE是：35479846.30


#### 结论：
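由上面4个模型可见，正则化强度 alpha 对岭回归模型的性能影响很大：alpha 取 20、0.1、0.01、0.0001 时，测试集 MSE 依次为 138769173.13、36841099.27、35539055.33、35479846.30，即在本数据集上 alpha 越小，测试误差越低。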