# 過度擬合與regularization
### 程式修改自 [Regularization of Linear Models with SKLearn](https://medium.com/coinmonks/regularization-of-linear-models-with-sklearn-f88633a93a2)

## 載入相關套件

In [1]:
import pandas as pd
import numpy as np

## 載入房價資料集

In [3]:
# 載入訓練資料
from sklearn.model_selection import train_test_split

# Read the training CSV, using its 'ID' column as the row index
train_df = pd.read_csv('./data/train.csv', index_col='ID')

# 'medv' is the regression target; every other column is a feature
y = train_df['medv']
X = train_df.drop(columns='medv')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [4]:
# Display the (rows, columns) shapes of the training and test feature sets
X_train.shape, X_test.shape

((233, 13), (100, 13))

## 模型訓練與評分

In [5]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Fit an ordinary least-squares baseline on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained)
lr_model = LinearRegression().fit(X_train, y_train)

# Report the model's score on the training split and on the held-out split
print(f'訓練結果評分: {lr_model.score(X_train, y_train)}')
print(f'測試結果評分: {lr_model.score(X_test, y_test)}')

# Predictions for the held-out split
y_pred = lr_model.predict(X_test)

訓練結果評分: 0.7268827869293253
測試結果評分: 0.7254687959254533


## 生成新特徵，為舊特徵的平方

In [6]:
# Rebuild the feature matrix and target from the raw training frame.
# drop() returns a new DataFrame, so adding columns below leaves train_df untouched.
y = train_df['medv']
X = train_df.drop(columns='medv')

# Original feature columns, in the order their squared copies are appended
base_features = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat',
]

# Append '<feature>_2' = feature ** 2 for every original feature.
# One loop replaces thirteen hand-copied assignments, so the naming rule
# and the transformation live in a single place.
for feature in base_features:
    X[f'{feature}_2'] = X[feature] ** 2

# Split the widened feature matrix: 30% held out for testing, seed 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [7]:
# Display the (rows, columns) shapes of the training and test feature sets
X_train.shape, X_test.shape

((233, 26), (100, 26))

## 模型訓練與評分

In [11]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

# Pipeline stages, applied in order:
# standardize -> degree-2 polynomial expansion -> unregularized linear regression
steps = [('scalar', StandardScaler()),
         ('poly', PolynomialFeatures(degree=2)),
         ('model', LinearRegression())]

# Assemble and fit in one expression (fit() returns the fitted pipeline itself)
pipeline = Pipeline(steps=steps).fit(X_train, y_train)

# Compare the score on the training split with the score on the held-out split
print(f'訓練結果: {pipeline.score(X_train, y_train)}')
print(f'測試結果: {pipeline.score(X_test, y_test)}')

訓練結果: 1.0
測試結果: -59.95522078093653


## L2 Regularization or Ridge Regression

In [12]:
# Same preprocessing stages as the unregularized pipeline, but the final
# estimator is Ridge (L2 penalty) with alpha=10
steps = [('scalar', StandardScaler()),
         ('poly', PolynomialFeatures(degree=2)),
         ('model', Ridge(alpha=10, fit_intercept=True))]

# Assemble and fit in one expression (fit() returns the fitted pipeline itself)
ridge_pipe = Pipeline(steps=steps).fit(X_train, y_train)

# Coefficient of determination on the training split and on the held-out split
print(f'訓練判定係數: {ridge_pipe.score(X_train, y_train)}')
print(f'測試判定係數: {ridge_pipe.score(X_test, y_test)}')

訓練判定係數: 0.9411030494647765
測試判定係數: 0.8158674422432348


## L1 Regularization or Lasso Regression

In [13]:
# Same preprocessing stages as the unregularized pipeline, but the final
# estimator is Lasso (L1 penalty) with alpha=0.3
steps = [('scalar', StandardScaler()),
         ('poly', PolynomialFeatures(degree=2)),
         ('model', Lasso(alpha=0.3, fit_intercept=True))]

# Assemble and fit in one expression (fit() returns the fitted pipeline itself)
lasso_pipe = Pipeline(steps=steps).fit(X_train, y_train)

# Coefficient of determination on the training split and on the held-out split
print(f'訓練判定係數: {lasso_pipe.score(X_train, y_train)}')
print(f'測試判定係數: {lasso_pipe.score(X_test, y_test)}')

訓練判定係數: 0.8525646297860277
測試判定係數: 0.8367938135279831
