In [1]:
import scipy
import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler ## min max scaling은 데이터에 노이즈(이상치)가 없을 때 좋음. standard scaling은 0을 중심으로 scaling
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt", sep='\t')

In [3]:
data.describe()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
count,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0,442.0
mean,48.5181,1.468326,26.375792,94.647014,189.140271,115.43914,49.788462,4.070249,4.641411,91.260181,152.133484
std,13.109028,0.499561,4.418122,13.831283,34.608052,30.413081,12.934202,1.29045,0.522391,11.496335,77.093005
min,19.0,1.0,18.0,62.0,97.0,41.6,22.0,2.0,3.2581,58.0,25.0
25%,38.25,1.0,23.2,84.0,164.25,96.05,40.25,3.0,4.2767,83.25,87.0
50%,50.0,1.0,25.7,93.0,186.0,113.0,48.0,4.0,4.62005,91.0,140.5
75%,59.0,2.0,29.275,105.0,209.75,134.5,57.75,5.0,4.9972,98.0,211.5
max,79.0,2.0,42.2,133.0,301.0,242.4,99.0,9.09,6.107,124.0,346.0


In [4]:
Y = data['Y']
X = data.drop(columns=['Y'])
X = pd.get_dummies(X, columns=['SEX']) ## 성별은 categorical 변수인데 1과 2로 되어 있으면 2가 더 큰 변수로 간주된다. 따라서 man, woman으로 컬럼을 분할하고 해당하는 성별은 1 그렇지 않으면 0으로 구성.

In [5]:
X.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_1,SEX_2
0,59,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,0,1
1,48,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,1,0
2,72,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,0,1
3,24,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,1,0
4,50,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,1,0


In [6]:
print(X.shape)
print(Y.shape)

(442, 11)
(442,)


In [7]:
"""
좋지 못한 코드.
이렇게 데이터셋을 분할하게 되면 전체 데이터셋, train, test를 모두 메모리에 점유하게 되므로 데이터셋이 큰 경우 손해.
"""
train, test = train_test_split(data, test_size=0.3, random_state=42, shuffle=True)
print(train.shape, test.shape)

del(train)
del(test)

(309, 11) (133, 11)


In [8]:
## 인덱스를 추출해서 데이터셋을 분할하는 것이 훨씬 효과적

idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))

>>>> # of Train data : 309
>>>> # of valid data : 133


In [9]:
results = LinearRegression().fit(X.iloc[train_idx], Y.iloc[train_idx])

In [10]:
def sse(clf, X, y):
    """Calculate the standard squared error of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The standard squared error of the model.
    """
    y_hat = clf.predict(X)
    sse = np.sum((y_hat - y) ** 2)
    return sse / X.shape[0]


def adj_r2_score(clf, X, y):
    """Calculate the adjusted :math:`R^2` of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The adjusted :math:`R^2` of the model.
    """
    n = X.shape[0]  # Number of observations
    p = X.shape[1]  # Number of features
    r_squared = metrics.r2_score(y, clf.predict(X))
    return 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))


def coef_se(clf, X, y):
    """Calculate standard error for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of standard errors for the beta coefficients.
    """
    n = X.shape[0]
    X1 = np.hstack((np.ones((n, 1)), np.matrix(X)))
    se_matrix = scipy.linalg.sqrtm(
        metrics.mean_squared_error(y, clf.predict(X)) *
        np.linalg.inv(X1.T * X1)
    )
    return np.diagonal(se_matrix)


def coef_tval(clf, X, y):
    """Calculate t-statistic for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of t-statistic values.
    """
    a = np.array(clf.intercept_ / coef_se(clf, X, y)[0])
    b = np.array(clf.coef_ / coef_se(clf, X, y)[1:])
    return np.append(a, b)


def coef_pval(clf, X, y):
    """Calculate p-values for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of p-values.
    """
    n = X.shape[0]
    t = coef_tval(clf, X, y)
    p = 2 * (1 - scipy.stats.t.cdf(abs(t), n - 1))
    return p

def summary(clf, X, y, xlabels=None):
    """
    Output summary statistics for a fitted regression model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    xlabels : list, tuple
        The labels for the predictors.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')
    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))
    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    try:
        coef_df['Estimate'] = np.concatenate(
            (np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6)))
    except Exception as e:
        coef_df['Estimate'] = np.concatenate(
            (
                np.round(np.array([clf.intercept_]), 6),
                np.round((clf.coef_), 6)
            ), axis = 1
    )[0,:]
    coef_df['Std. Error'] = np.round(coef_se(clf, X, y), 6)
    coef_df['t value'] = np.round(coef_tval(clf, X, y), 4)
    coef_df['p value'] = np.round(coef_pval(clf, X, y), 6)
    # Output results
    print('Coefficients:')
    print(coef_df.to_string(index=True))
    print('---')
    print('R-squared:  {0:.6f},    Adjusted R-squared:  {1:.6f},    MSE: {2:.1f}'.format(
        metrics.r2_score(y, clf.predict(X)), adj_r2_score(clf, X, y), sse(clf, X, y)))

결과를 해석하는 순서  
1. 모델의 R square 점수를 보고 정성적으로 모델이 신뢰할 수준인지 판단.
2. p-value(0.05) 보다 낮은 확률의 feature를 선별 -> [BMI, BP, S1, S2, S5]
3. scaling이 적용되지 않았기 때문에 x_i가 1단위 증가했을 때 y에 미치는 영향이 어느정도인지 파악만 가능하고, feature간 상대적 중요도 비교는 불가능.

In [11]:
summary(results, X.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept -353.422717  3.804036e+08-4.000000e-06j -0.0000-0.0000j  0.999999
AGE          -0.241046  1.995540e-01+2.787600e-02j -1.1848+0.1655j  0.233720
BMI           5.364734  1.269539e+00+0.000000e+00j  4.2257-0.0000j  0.000044
BP            0.973515  3.311130e-01+3.762900e-02j  2.9026-0.3299j  0.004101
S1           -1.128987  2.891240e-01+7.867900e-02j -3.6356+0.9894j  0.000247
S2            0.935342  3.622840e-01+2.493000e-03j  2.5817-0.0178j  0.010922
S3            0.295834  4.192620e-01+1.264800e-02j  0.7050-0.0213j  0.481875
S4            2.577375  1.011893e+01-2.100000e-05j  0.2547+0.0000j  0.799345
S5           72.840272  2.201149e+01-1.900000e-05j  3.3092+0.0000j  0.001206
S6            0.292290  4.722150e-01-2.509200e-02j  0.6172+0.0328j  0.537571
SEX_1        10.444984  3.804036e+08+1.270000e-04j  0.0000-0.0000j  1.000000
SEX_2       -10.444984  3.804036e+08+8.000000e-05j -0.0000+0.0

In [12]:
## Scaling을 할 때는 전체 데이터셋으로 하는 것이 아니라, train set만으로 scaling을 해야한다.

scaler = MinMaxScaler().fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [13]:
X_scaled.head()

Unnamed: 0,AGE,BMI,BP,S1,S2,S3,S4,S5,S6,SEX_1,SEX_2
0,0.666667,0.57384,0.565217,0.294118,0.297578,0.197368,0.318471,0.562217,0.439394,0.0,1.0
1,0.483333,0.130802,0.362319,0.421569,0.355248,0.618421,0.159236,0.222437,0.166667,1.0,0.0
2,0.883333,0.506329,0.449275,0.289216,0.299885,0.236842,0.318471,0.496578,0.409091,0.0,1.0
3,0.083333,0.28692,0.318841,0.495098,0.517878,0.223684,0.477707,0.572923,0.469697,1.0,0.0
4,0.516667,0.189873,0.565217,0.465686,0.483276,0.381579,0.318471,0.362385,0.333333,1.0,0.0


이번에는 스케일링을 적용했기 때문에 feature마다 beta값의 크기를 기준으로 상대적인 중요도를 비교 할 수 있음.    
1. S5
2. BMI
3. BP

In [14]:
results = LinearRegression().fit(X_scaled.iloc[train_idx], Y.iloc[train_idx])
summary(results, X_scaled.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X_scaled.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept   -2.765884  3.804036e+08+1.320801e+00j -0.0000+0.0000j  1.000000
AGE         -14.462769  2.343980e+01+2.360180e-01j -0.6170+0.0062j  0.538306
BMI         127.144195  3.172412e+01+1.216340e-01j  4.0077-0.0154j  0.000102
BP           67.172560  2.793551e+01+2.432630e-01j  2.4044-0.0209j  0.017584
S1         -230.313267  1.635382e+02+2.217170e-01j -1.4083+0.0019j  0.161389
S2          162.188278  1.148588e+02+5.174060e-01j  1.4120-0.0064j  0.160288
S3           22.483360  7.294744e+01-1.333060e-01j  0.3082+0.0006j  0.758406
S4           16.185916  5.709145e+01+1.649440e-01j  0.2835-0.0008j  0.777232
S5          207.514650  5.019652e+01+8.953900e-02j  4.1340-0.0074j  0.000063
S6           19.291168  3.394613e+01+2.599510e-01j  0.5683-0.0044j  0.570816
SEX_1        10.444984  3.804036e+08+1.014508e+00j  0.0000-0.0000j  1.000000
SEX_2       -10.444984  3.804036e+08+1.692700e-02j -0.0000+0.0