In [38]:
from datasets import diabetes_data

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA
from pygam import LinearGAM, s, f

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

from tools import polynomial_features

# Data preparation

In [25]:
original_X, original_y, train_X, train_y, test_X, test_y = diabetes_data()

## Polynomial features

In [26]:
original_X_2 = polynomial_features(original_X, 2)
train_X_2 = polynomial_features(train_X, 2)
test_X_2 = polynomial_features(test_X, 2)

scaler = StandardScaler().fit(original_X_2)

original_X_2 = scaler.transform(original_X_2)
train_X_2 = scaler.transform(train_X_2)
test_X_2 = scaler.transform(test_X_2)

## Feature subset

In [27]:
feature_subset = ['sex', 'bmi', 'bp', 's1', 's3', 's5']

## PCA

In [28]:
pca = PCA(n_components=6)
pca = pca.fit(original_X)

transformed_train_X = pca.transform(train_X)
transformed_test_X = pca.transform(test_X)

# Models

In [43]:
experiments = {
    "LinearRegression": {
        "model": LinearRegression(),
        "train_X": train_X,
        "test_X": test_X,
    },
    "PolynomialRegression": {
        "model": LinearRegression(),
        "train_X": train_X_2,
        "test_X": test_X_2,
    },
    "KNNRegression": {
        "model": KNeighborsRegressor(n_neighbors=19),
        "train_X": train_X,
        "test_X": test_X,
    },
    "SubsetLinearRegression": {
        "model": LinearRegression(),
        "train_X": train_X[feature_subset],
        "test_X": test_X[feature_subset],
    },
    "RidgeRegression": {
        "model": Ridge(alpha=0.17),
        "train_X": train_X,
        "test_X": test_X,
    },
    "LassoRegression": {
        "model": Lasso(alpha=0.08),
        "train_X": train_X,
        "test_X": test_X,
    },
    "PCARegression": {
        "model": LinearRegression(),
        "train_X": transformed_train_X,
        "test_X": transformed_test_X,
    },
    "PartialLeastSquares": {
        "model": PLSRegression(n_components=3, scale=False),
        "train_X": train_X,
        "test_X": test_X,
    },
    "GeneralAdditiveModel": {
        "model": LinearGAM(
            s(0) + s(1) + s(2) + s(3) + s(4) + s(5) + s(6) + s(7) + s(8) + s(9)
        ),
        "train_X": train_X,
        "test_X": test_X,
    },
}

In [44]:
columns = ['train_mse', 'test_mse', 'cross_val_r2']
model_names = []
results = []

for model_name, experiment in experiments.items():
    train_X = experiment['train_X']
    test_X = experiment['test_X']
    
    model = experiment['model'].fit(train_X, train_y)

    train_error = mean_squared_error(train_y, model.predict(train_X))
    test_error = mean_squared_error(test_y, model.predict(test_X))

    try:
        cross_val = cross_val_score(model, train_X, train_y).mean()
    except:
        cross_val = np.NaN

    model_names.append(model_name)
    results.append([train_error, test_error, cross_val])

result_df = pd.DataFrame(results, columns = columns, index=model_names)

In [45]:
result_df.sort_values('test_mse')

Unnamed: 0,train_mse,test_mse,cross_val_r2
PartialLeastSquares,2880.38381,2944.668885,0.450786
LassoRegression,2890.518725,2945.684103,0.44961
PolynomialRegression,2674.453037,2951.387782,0.455072
LinearRegression,2859.85815,2972.251895,0.449792
RidgeRegression,2931.753568,2976.831589,0.44528
SubsetLinearRegression,2880.171026,2990.036674,0.456047
PCARegression,2911.425866,2997.235609,0.450161
KNNRegression,2893.306152,3391.859163,0.414914
GeneralAdditiveModel,2017.605344,3585.22038,
