In [1]:
import scipy
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt', sep='\t')

In [3]:
Y = data['Y']
X = data.drop(columns=['Y']) 
X = pd.get_dummies(X, columns=['SEX'])

In [4]:
idx = list(range(X.shape[0]))
train_idx, valid_idx = train_test_split(idx, test_size=0.3, random_state=2023)
print(">>>> # of Train data : {}".format(len(train_idx)))
print(">>>> # of valid data : {}".format(len(valid_idx)))

>>>> # of Train data : 309
>>>> # of valid data : 133


In [5]:
# Scaling
scaler = MinMaxScaler().fit(X.iloc[train_idx])
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [6]:
def sse(clf, X, y):
    """Calculate the standard squared error of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The standard squared error of the model.
    """
    y_hat = clf.predict(X)
    sse = np.sum((y_hat - y) ** 2)
    return sse / X.shape[0]


def adj_r2_score(clf, X, y):
    """Calculate the adjusted :math:`R^2` of the model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    float
        The adjusted :math:`R^2` of the model.
    """
    n = X.shape[0]  # Number of observations
    p = X.shape[1]  # Number of features
    r_squared = metrics.r2_score(y, clf.predict(X))
    return 1 - (1 - r_squared) * ((n - 1) / (n - p - 1))


def coef_se(clf, X, y):
    """Calculate standard error for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of standard errors for the beta coefficients.
    """
    n = X.shape[0]
    X1 = np.hstack((np.ones((n, 1)), np.matrix(X)))
    se_matrix = scipy.linalg.sqrtm(
        metrics.mean_squared_error(y, clf.predict(X)) *
        np.linalg.inv(X1.T * X1)
    )
    return np.diagonal(se_matrix)


def coef_tval(clf, X, y):
    """Calculate t-statistic for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of t-statistic values.
    """
    a = np.array(clf.intercept_ / coef_se(clf, X, y)[0])
    b = np.array(clf.coef_ / coef_se(clf, X, y)[1:])
    return np.append(a, b)


def coef_pval(clf, X, y):
    """Calculate p-values for beta coefficients.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    Returns
    -------
    numpy.ndarray
        An array of p-values.
    """
    n = X.shape[0]
    t = coef_tval(clf, X, y)
    p = 2 * (1 - scipy.stats.t.cdf(abs(t), n - 1))
    return p

def summary(clf, X, y, xlabels=None):
    """
    Output summary statistics for a fitted regression model.
    Parameters
    ----------
    clf : sklearn.linear_model
        A scikit-learn linear model classifier with a `predict()` method.
    X : numpy.ndarray
        Training data used to fit the classifier.
    y : numpy.ndarray
        Target training values, of shape = [n_samples].
    xlabels : list, tuple
        The labels for the predictors.
    """
    # Check and/or make xlabels
    ncols = X.shape[1]
    if xlabels is None:
        xlabels = np.array(
            ['x{0}'.format(i) for i in range(1, ncols + 1)], dtype='str')
    elif isinstance(xlabels, (tuple, list)):
        xlabels = np.array(xlabels, dtype='str')
    # Make sure dims of xlabels matches dims of X
    if xlabels.shape[0] != ncols:
        raise AssertionError(
            "Dimension of xlabels {0} does not match "
            "X {1}.".format(xlabels.shape, X.shape))
    # Create data frame of coefficient estimates and associated stats
    coef_df = pd.DataFrame(
        index=['_intercept'] + list(xlabels),
        columns=['Estimate', 'Std. Error', 't value', 'p value']
    )
    try:
        coef_df['Estimate'] = np.concatenate(
            (np.round(np.array([clf.intercept_]), 6), np.round((clf.coef_), 6)))
    except Exception as e:
        coef_df['Estimate'] = np.concatenate(
            (
                np.round(np.array([clf.intercept_]), 6),
                np.round((clf.coef_), 6)
            ), axis = 1
    )[0,:]
    coef_df['Std. Error'] = np.round(coef_se(clf, X, y), 6)
    coef_df['t value'] = np.round(coef_tval(clf, X, y), 4)
    coef_df['p value'] = np.round(coef_pval(clf, X, y), 6)
    # Output results
    print('Coefficients:')
    print(coef_df.to_string(index=True))
    print('---')
    print('R-squared:  {0:.6f},    Adjusted R-squared:  {1:.6f},    MSE: {2:.1f}'.format(
        metrics.r2_score(y, clf.predict(X)), adj_r2_score(clf, X, y), sse(clf, X, y)))

In [7]:
penalties = [0.00001, 0.00005, 0.0001, 0.001, 0.01, 0.1, 0.3, 0.5, 0.6, 0.7, 0.9, 1, 10]

for penalty in penalties:
    model = Lasso(alpha=penalty).fit(X_scaled.iloc[train_idx], Y.iloc[train_idx])
    score = model.score(X_scaled.iloc[valid_idx], Y.iloc[valid_idx]) ## R square
    pred_y = model.predict(X_scaled.iloc[valid_idx])
    mse = mean_squared_error(Y.iloc[valid_idx], pred_y)
    print(f"Alpah : {penalty:.5f}, R2 : {score:.7f}, MSE : {mse:.7f}, RMSE : {np.sqrt(mse):.7f}")

Alpah : 0.00001, R2 : 0.5301656, MSE : 3084.6092464, RMSE : 55.5392586
Alpah : 0.00005, R2 : 0.5301674, MSE : 3084.5972157, RMSE : 55.5391503
Alpah : 0.00010, R2 : 0.5301697, MSE : 3084.5822667, RMSE : 55.5390157
Alpah : 0.00100, R2 : 0.5302081, MSE : 3084.3301897, RMSE : 55.5367463
Alpah : 0.01000, R2 : 0.5304264, MSE : 3082.8971348, RMSE : 55.5238429
Alpah : 0.10000, R2 : 0.5281775, MSE : 3097.6617244, RMSE : 55.6566413
Alpah : 0.30000, R2 : 0.5210059, MSE : 3144.7454151, RMSE : 56.0780297
Alpah : 0.50000, R2 : 0.5131125, MSE : 3196.5679788, RMSE : 56.5381993
Alpah : 0.60000, R2 : 0.5092452, MSE : 3221.9578939, RMSE : 56.7622929
Alpah : 0.70000, R2 : 0.5050718, MSE : 3249.3577616, RMSE : 57.0031382
Alpah : 0.90000, R2 : 0.4958220, MSE : 3310.0854310, RMSE : 57.5333419
Alpah : 1.00000, R2 : 0.4906431, MSE : 3344.0866656, RMSE : 57.8280785
Alpah : 10.00000, R2 : -0.0256145, MSE : 6733.4784067, RMSE : 82.0577748


In [8]:
model_best = Lasso(alpha=0.02).fit(X_scaled.iloc[train_idx], Y.iloc[train_idx])
summary(model_best, X_scaled.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X.columns)

Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept    4.144976  3.802265e+08+3.448469e+00j  0.0000-0.0000j  1.000000
AGE         -13.359132  2.367189e+01+3.358180e-01j -0.5642+0.0080j  0.573515
BMI         127.876677  3.180436e+01+1.855780e-01j  4.0206-0.0235j  0.000097
BP           66.897382  2.810967e+01+3.986340e-01j  2.3794-0.0337j  0.018761
S1         -153.025383  1.636532e+02+3.289490e-01j -0.9351+0.0019j  0.351466
S2          102.155001  1.151253e+02+8.750340e-01j  0.8873-0.0067j  0.376525
S3          -10.583686  7.280668e+01-2.029050e-01j -0.1454-0.0004j  0.884643
S4            9.263867  5.721106e+01+2.310020e-01j  0.1619-0.0007j  0.871614
S5          181.017864  5.026443e+01+1.280330e-01j  3.6013-0.0092j  0.000447
S6           18.390761  3.405064e+01+5.075420e-01j  0.5400-0.0080j  0.590079
SEX_1        20.777166  3.802265e+08+1.747474e+00j  0.0000-0.0000j  1.000000
SEX_2        -0.000000  3.802265e+08+2.824800e-01j -0.0000+0.0

In [9]:
lasso_cv=LassoCV(alphas=penalties, cv=5)
model = lasso_cv.fit(X_scaled.iloc[train_idx], Y.iloc[train_idx])
print(f"Best Alpha : {model.alpha_:.7f}")

Best Alpha : 0.3000000


In [10]:
model_best = Lasso(alpha=model.alpha_).fit(X_scaled.iloc[train_idx], Y.iloc[train_idx])

score = model_best.score(X_scaled.iloc[valid_idx], Y.iloc[valid_idx])
pred_y = model_best.predict(X_scaled.iloc[valid_idx])
mse = mean_squared_error(Y.iloc[valid_idx], pred_y)

print(f"Alpha:{model.alpha_:.7f}, R2:{score:.7f}, MSE:{mse:.7f}, RMSE:{np.sqrt(mse):.7f}")
summary(model_best, X_scaled.iloc[valid_idx], Y.iloc[valid_idx], xlabels=X.columns)

Alpha:0.3000000, R2:0.5210059, MSE:3144.7454151, RMSE:56.0780297
Coefficients:
              Estimate                  Std. Error         t value   p value
_intercept   35.476784  3.840936e+08-0.000000e+00j  0.0000+0.0000j  1.000000
AGE          -3.436299  2.353316e+01-0.000000e+00j -0.1460-0.0000j  0.884129
BMI         127.199861  3.189525e+01-0.000000e+00j  3.9881+0.0000j  0.000110
BP           60.468162  2.792128e+01-0.000000e+00j  2.1657+0.0000j  0.032134
S1           -6.083815  1.648463e+02-0.000000e+00j -0.0369-0.0000j  0.970616
S2           -0.000000  1.161079e+02-0.000000e+00j -0.0000+0.0000j  1.000000
S3          -62.922734  7.335835e+01-0.000000e+00j -0.8577-0.0000j  0.392588
S4            0.000000  5.756817e+01-0.000000e+00j  0.0000+0.0000j  1.000000
S5          124.200563  5.029221e+01-0.000000e+00j  2.4696+0.0000j  0.014804
S6            8.099838  3.395800e+01+0.000000e+00j  0.2385-0.0000j  0.811844
SEX_1        18.108153  3.840936e+08-0.000000e+00j  0.0000+0.0000j  1.0000