In [12]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
import numpy as np

# Loading Built-In Set

- Loading `diabetes` dataset

In [13]:
diabetes: np.ndarray = load_diabetes()

- Splitting data into training set and test set

In [14]:
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, random_state=55)

In [15]:
# compute R^2 for the Lasso model using default parameters
from sklearn.linear_model import Lasso
lasso = Lasso().fit(X_train, y_train)
print(f"Training set score: {lasso.score(X_train, y_train)}")
print(f"Test set score: {lasso.score(X_test, y_test)}")

Training set score: 0.3355506896161845
Test set score: 0.37969961797249985


In [16]:
# number of feature this model uses
print(f"Number of features used: {np.sum(lasso.coef_ != 0)}")

Number of features used: 3


In [20]:
# names of features
print(f"Features used: {diabetes.feature_names}")

Features used: ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


# Loading Diabetes from File

- Importing data from file 
- Each sample is separated by `	` hence it is set as the `delimiter`
- The header (first row) is skipped as it is not data

In [38]:
diabetes = np.genfromtxt('diabetes.data', delimiter='	', skip_header=1)

In [39]:
print(diabetes)

[[ 59.       2.      32.1    ...   4.8598  87.     151.    ]
 [ 48.       1.      21.6    ...   3.8918  69.      75.    ]
 [ 72.       2.      30.5    ...   4.6728  85.     141.    ]
 ...
 [ 60.       2.      24.9    ...   4.1271  95.     132.    ]
 [ 36.       1.      30.     ...   5.1299  85.     220.    ]
 [ 36.       1.      19.6    ...   4.5951  92.      57.    ]]


- Separating features `X` and labels `y`

In [41]:
X = diabetes[:, :-1]
y = diabetes[:, -1]

- Splitting data into training sets and test sets

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=55)

## Lasso

In [47]:
# compute R^2 for the Lasso model using default parameters
lasso = Lasso().fit(X_train, y_train)

In [48]:
print(f"Training set score: {lasso.score(X_train, y_train)}")
print(f"Test set score: {lasso.score(X_test, y_test)}")

Training set score: 0.4990579290275038
Test set score: 0.5300107053694756


## StandardScalar

In [49]:
# process data using StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train)
X_test_scaled = scaler.fit(X_test)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [50]:
print(f"Training set score: {lasso.score(X_train_scaled, y_train)}")
print(f"Test set score: {lasso.score(X_test_scaled, y_test)}")

Training set score: -21.557637682219355
Test set score: -20.099274177977286
