In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, RidgeClassifierCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_diabetes

In [2]:
data = load_diabetes()

print(data['data'].shape, data['target'].shape) ## 10개의 변수(features)가 있는 다중선형회귀 문제

(442, 10) (442,)


In [3]:
print(data['DESCR'])

.. _diabetes_dataset:

Diabetes dataset
----------------

Ten baseline variables, age, sex, body mass index, average blood
pressure, and six blood serum measurements were obtained for each of n =
442 diabetes patients, as well as the response of interest, a
quantitative measure of disease progression one year after baseline.

**Data Set Characteristics:**

  :Number of Instances: 442

  :Number of Attributes: First 10 columns are numeric predictive values

  :Target: Column 11 is a quantitative measure of disease progression one year after baseline

  :Attribute Information:
      - age     age in years
      - sex
      - bmi     body mass index
      - bp      average blood pressure
      - s1      tc, total serum cholesterol
      - s2      ldl, low-density lipoproteins
      - s3      hdl, high-density lipoproteins
      - s4      tch, total cholesterol / HDL
      - s5      ltg, possibly log of serum triglycerides level
      - s6      glu, blood sugar level

Note: Each of these 1

In [4]:
data = pd.read_csv("https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt", sep='\t')

In [5]:
data.describe()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,48.5181,1.468326,26.375792,94.647014,189.140271,115.43914,49.788462,4.070249,4.641411,91.260181,152.133484
std,13.109028,0.499561,4.418122,13.831283,34.608052,30.413081,12.934202,1.29045,0.522391,11.496335,77.093005
min,19.0,1.0,18.0,62.0,97.0,41.6,22.0,2.0,3.2581,58.0,25.0
25%,38.25,1.0,23.2,84.0,164.25,96.05,40.25,3.0,4.2767,83.25,87.0
50%,50.0,1.0,25.7,93.0,186.0,113.0,48.0,4.0,4.62005,91.0,140.5
75%,59.0,2.0,29.275,105.0,209.75,134.5,57.75,5.0,4.9972,98.0,211.5
max,79.0,2.0,42.2,133.0,301.0,242.4,99.0,9.09,6.107,124.0,346.0


In [6]:
# X's & Y Split
Y = data['Y']
X = data.drop(columns=['Y']) 
X = pd.get_dummies(X, columns=['SEX'])

In [7]:
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))

>>>> # of Train data : 309
>>>> # of valid data : 133


In [8]:
results = LinearRegression().fit(X.iloc[train_idx], Y.iloc[train_idx])

In [9]:
import scipy
from sklearn import metrics

def sse(clf, X, y):
    """Calculate the standard squared error of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The standard squared error of the model.
    """
    y_hat = clf.predict(X)
    sse = np.sum((y_hat - y) ** 2)
    return sse / X.shape[0]


def adj_r2_score(clf, X, y):
    """Calculate the adjusted :math:`R^2` of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The adjusted :math:`R^2` of the model.
    """
    n = X.shape[0]  # Number of observations
    p = X.shape[1]  # Number of features
    r_squared = metrics.r2_score(y, clf.predict(X))
    return 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))


def coef_se(clf, X, y):
    """Calculate standard error for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of standard errors for the beta coefficients.
    """
    n = X.shape[0]
    X1 = np.hstack((np.ones((n, 1)), np.matrix(X)))
    se_matrix = scipy.linalg.sqrtm(
        metrics.mean_squared_error(y, clf.predict(X)) *
        np.linalg.inv(X1.T * X1)
    )
    return np.diagonal(se_matrix)


def coef_tval(clf, X, y):
    """Calculate t-statistic for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of t-statistic values.
    """
    a = np.array(clf.intercept_ / coef_se(clf, X, y)[0])
    b = np.array(clf.coef_ / coef_se(clf, X, y)[1:])
    return np.append(a, b)


def coef_pval(clf, X, y):
    """Calculate p-values for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of p-values.
    """
    n = X.shape[0]
    t = coef_tval(clf, X, y)
    p = 2 * (1 - scipy.stats.t.cdf(abs(t), n - 1))
    return p

def summary(clf, X, y, xlabels=None):
    """
    Output summary statistics for a fitted regression model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    xlabels : list, tuple
        The labels for the predictors.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')
    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))
    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    try:
        coef_df['Estimate'] = np.concatenate(
            (np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6)))
    except Exception as e:
        coef_df['Estimate'] = np.concatenate(
            (
                np.round(np.array([clf.intercept_]), 6),
                np.round((clf.coef_), 6)
            ), axis = 1
    )[0,:]
    coef_df['Std. Error'] = np.round(coef_se(clf, X, y), 6)
    coef_df['t value'] = np.round(coef_tval(clf, X, y), 4)
    coef_df['p value'] = np.round(coef_pval(clf, X, y), 6)
    # Output results
    print('Coefficients:')
    print(coef_df.to_string(index=True))
    print('---')
    print('R-squared:  {0:.6f},    Adjusted R-squared:  {1:.6f},    MSE: {2:.1f}'.format(
        metrics.r2_score(y, clf.predict(X)), adj_r2_score(clf, X, y), sse(clf, X, y)))

In [10]:
summary(results, X.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept -353.422717  3.804036e+08-4.000000e-06j -0.0000-0.0000j  0.999999
AGE          -0.241046  1.995540e-01+2.787600e-02j -1.1848+0.1655j  0.233720
BMI           5.364734  1.269539e+00+0.000000e+00j  4.2257-0.0000j  0.000044
BP            0.973515  3.311130e-01+3.762900e-02j  2.9026-0.3299j  0.004101
S1           -1.128987  2.891240e-01+7.867900e-02j -3.6356+0.9894j  0.000247
S2            0.935342  3.622840e-01+2.493000e-03j  2.5817-0.0178j  0.010922
S3            0.295834  4.192620e-01+1.264800e-02j  0.7050-0.0213j  0.481875
S4            2.577375  1.011893e+01-2.100000e-05j  0.2547+0.0000j  0.799345
S5           72.840272  2.201149e+01-1.900000e-05j  3.3092+0.0000j  0.001206
S6            0.292290  4.722150e-01-2.509200e-02j  0.6172+0.0328j  0.537571
SEX_1        10.444984  3.804036e+08+1.270000e-04j  0.0000-0.0000j  1.000000
SEX_2       -10.444984  3.804036e+08+8.000000e-05j -0.0000+0.0

In [11]:
# Scaling
scaler = MinMaxScaler().fit(X.iloc[train_idx])
X_scal = scaler.transform(X)
X_scal = pd.DataFrame(X_scal, columns=X.columns)

In [12]:
# Linear Regression
results = LinearRegression().fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
summary(results, X_scal.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X_scal.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept   -2.765884  3.804036e+08+1.320801e+00j -0.0000+0.0000j  1.000000
AGE         -14.462769  2.343980e+01+2.360180e-01j -0.6170+0.0062j  0.538306
BMI         127.144195  3.172412e+01+1.216340e-01j  4.0077-0.0154j  0.000102
BP           67.172560  2.793551e+01+2.432630e-01j  2.4044-0.0209j  0.017584
S1         -230.313267  1.635382e+02+2.217170e-01j -1.4083+0.0019j  0.161389
S2          162.188278  1.148588e+02+5.174060e-01j  1.4120-0.0064j  0.160288
S3           22.483360  7.294744e+01-1.333060e-01j  0.3082+0.0006j  0.758406
S4           16.185916  5.709145e+01+1.649440e-01j  0.2835-0.0008j  0.777232
S5          207.514650  5.019652e+01+8.953900e-02j  4.1340-0.0074j  0.000063
S6           19.291168  3.394613e+01+2.599510e-01j  0.5683-0.0044j  0.570816
SEX_1        10.444984  3.804036e+08+1.014508e+00j  0.0000-0.0000j  1.000000
SEX_2       -10.444984  3.804036e+08+1.692700e-02j -0.0000+0.0

[Ridge Regression]
 - Hyperparameter Tuning using for Loop
 - Hyperparameter Tuning using GridSearchCV

[Ridge Regression Parameters]
   - Package : https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
   - alpha : L2-norm Penalty Term 
     - alpha : 0 일 때, Just Linear Regression
   - fit_intercept : Centering to zero
     - 베타0를 0로 보내는 것 (베타0는 상수이기 때문에)
   - max_iter : Maximum number of interation
     - Loss Function의 Ridge Penalty Term은 Closed Form 값이기는 하지만 값을 찾아 나감
     - Penalty Term : (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_2

In [13]:
penelty = [0.00001, 0.00005, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1, 10]

# Using For Loop !! 
# Ridge Regression
# select alpha by checking R2, MSE, RMSE
for a in penelty:
    model = Ridge(alpha=a).fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
    score = model.score(X_scal.iloc[valid_idx], Y.iloc[valid_idx])
    pred_y = model.predict(X_scal.iloc[valid_idx])
    mse = mean_squared_error(Y.iloc[valid_idx], pred_y)
    print("Alpha:{0:.5f}, R2:{1:.7f}, MSE:{2:.7f}, RMSE:{3:.7f}".format(a, score, mse, np.sqrt(mse))) 

Alpha:0.00001, R2:0.5301655, MSE:3084.6095744, RMSE:55.5392616
Alpha:0.00005, R2:0.5301672, MSE:3084.5988322, RMSE:55.5391648
Alpha:0.00010, R2:0.5301692, MSE:3084.5854446, RMSE:55.5390443
Alpha:0.00100, R2:0.5302048, MSE:3084.3519225, RMSE:55.5369420
Alpha:0.01000, R2:0.5304637, MSE:3082.6522133, RMSE:55.5216373
Alpha:0.10000, R2:0.5304511, MSE:3082.7343410, RMSE:55.5223769
Alpha:0.30000, R2:0.5294946, MSE:3089.0142040, RMSE:55.5789007
Alpha:0.50000, R2:0.5285641, MSE:3095.1232917, RMSE:55.6338323
Alpha:0.60000, R2:0.5280578, MSE:3098.4477026, RMSE:55.6637018
Alpha:0.70000, R2:0.5275205, MSE:3101.9751863, RMSE:55.6953785
Alpha:0.90000, R2:0.5263592, MSE:3109.5989675, RMSE:55.7637783
Alpha:1.00000, R2:0.5257398, MSE:3113.6657269, RMSE:55.8002305
Alpha:10.00000, R2:0.4513724, MSE:3601.9109235, RMSE:60.0159222


In [14]:
model_best = Ridge(alpha=0.01).fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
summary(model_best, X_scal.iloc[valid_idx], Y.iloc[valid_idx], xlabels = X_scal.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept    3.351510  3.802827e+08+0.000000e+00j  0.0000-0.0000j  1.000000
AGE         -14.279970  2.355360e+01+0.000000e+00j -0.6063+0.0000j  0.545373
BMI         127.331685  3.172437e+01+0.000000e+00j  4.0137-0.0000j  0.000100
BP           67.279481  2.797563e+01+0.000000e+00j  2.4049-0.0000j  0.017563
S1         -203.277222  1.635086e+02+0.000000e+00j -1.2432+0.0000j  0.215990
S2          141.209566  1.149112e+02+0.000000e+00j  1.2289-0.0000j  0.221311
S3           10.619292  7.291556e+01+0.000000e+00j  0.1456-0.0000j  0.884429
S4           13.868910  5.708001e+01+0.000000e+00j  0.2430-0.0000j  0.808403
S5          198.003880  5.019348e+01+0.000000e+00j  3.9448-0.0000j  0.000129
S6           19.251667  3.401033e+01-0.000000e+00j  0.5661+0.0000j  0.572318
SEX_1        10.473795  3.802827e+08+0.000000e+00j  0.0000-0.0000j  1.000000
SEX_2       -10.473795  3.802827e+08-0.000000e+00j -0.0000-0.0

In [15]:
# Using GridSearchCV
ridge_cv=RidgeCV(alphas=penelty, cv=5)
model = ridge_cv.fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
print("Best Alpha:{0:.5f}, R2:{1:.4f}".format(model.alpha_, model.best_score_))

Best Alpha:0.90000, R2:0.4419


In [16]:
# GridSearchCV Result
model_best = Ridge(alpha=model.alpha_).fit(X_scal.iloc[train_idx], Y.iloc[train_idx])
score = model_best.score(X_scal.iloc[valid_idx], Y.iloc[valid_idx])
pred_y = model_best.predict(X_scal.iloc[valid_idx])
mse = np.sqrt(mean_squared_error(Y.iloc[valid_idx], pred_y))
print("Alpha:{0:.5f}, R2:{1:.7f}, MSE:{2:.7f}, RMSE:{3:.7f}".format(0.01, score, mse, np.sqrt(mse)))
summary(model_best, X_scal.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X_scal.columns)

Alpha:0.01000, R2:0.5263592, MSE:55.7637783, RMSE:7.4675149
Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept   42.643650  3.819412e+08+1.464058e+00j  0.0000-0.0000j  1.000000
AGE         -10.492753  2.368453e+01+1.956490e-01j -0.4430+0.0037j  0.658486
BMI         121.677019  3.185811e+01+1.285190e-01j  3.8193-0.0154j  0.000205
BP           66.649695  2.808212e+01+2.461690e-01j  2.3732-0.0208j  0.019072
S1          -21.091765  1.642219e+02+2.280180e-01j -0.1284+0.0002j  0.898001
S2           -5.450923  1.153438e+02+5.460120e-01j -0.0473+0.0002j  0.962379
S3          -55.208206  7.324488e+01-1.457720e-01j -0.7537-0.0015j  0.452344
S4           17.858217  5.734113e+01+1.642880e-01j  0.3114-0.0009j  0.755960
S5          119.080753  5.042164e+01+8.524500e-02j  2.3617-0.0040j  0.019654
S6           22.603065  3.409545e+01+2.738940e-01j  0.6629-0.0053j  0.508542
SEX_1        10.489062  3.819412e+08+1.233104e+00j  0.0000-0.0000j  1.000000
SE