In [None]:
import numpy as np

# Load the data into pandas

In [None]:
from sklearn import datasets
# Fetch the diabetes dataset bundle and keep only its 'frame' entry, so `df`
# is bound once instead of first to the bundle and then to the frame.
df = datasets.load_diabetes(as_frame=True)['frame']
df

# Pandas indexing demo

### Square brackets []

### .loc[]

# Extract inputs and outputs

In [None]:
# Names of the input columns; the output column is 'target'.
features = [
    'age', 'sex', 'bmi', 'bp',
    's1', 's2', 's3', 's4', 's5', 's6',
]
# Convert to plain arrays: X holds the selected feature columns, y the target.
X = df.loc[:, features].values
y = df.loc[:, 'target'].values

# Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7265,
)

## Seaborn plots
`pip install seaborn`

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Correlation pair plot

In [None]:
# Grid of pairwise plots over the columns of df.
sns.pairplot(data=df)

### Correlation heat map

In [None]:
# Draw the magnitude of the pairwise correlations on an explicit 20x20 axes,
# annotating every cell with its value.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(df.corr().abs(), annot=True, cmap='gray', ax=ax)

# Linear regression

In [None]:
from sklearn.linear_model import LinearRegression 

# Fit on the training split; fit() returns the estimator, so construction
# and fitting can be chained.
model = LinearRegression().fit(Xtrain, ytrain)

# Show the fitted intercept together with the coefficient vector.
model.intercept_, model.coef_

# Test

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Predict on the held-out split and score the predictions.
ypred = model.predict(Xtest)
R2, MSE = r2_score(ytest, ypred), mean_squared_error(ytest, ypred)

R2, MSE

# Linear regression with normalized data and a pipeline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Two-step estimator: standardize the inputs, then fit a linear regression.
# The step names 'scaler' and 'linreg' are used later to retrieve the fitted steps.
pipeline = Pipeline(steps=[('scaler', StandardScaler()), ('linreg', LinearRegression())])

pipeline.fit(Xtrain, ytrain)

### Test the pipeline

In [None]:
# Score the fitted pipeline on the held-out split.
ypred = pipeline.predict(Xtest)
R2, MSE = r2_score(ytest, ypred), mean_squared_error(ytest, ypred)

R2, MSE

### Extract pipeline parameters

In [None]:
# Look up the fitted scaling step by its name in the pipeline.
scaler = pipeline['scaler']

# Show the scaler's fitted mean_ and scale_ attributes.
scaler.mean_, scaler.scale_

In [None]:
# Look up the fitted regression step by its name in the pipeline.
linreg = pipeline['linreg']

# Show the fitted intercept and coefficients of the regression step.
linreg.intercept_, linreg.coef_

# Linear regression with `statsmodels`
`pip install statsmodels`

In [None]:
import statsmodels.api as sm

# Fit OLS with an explicit intercept column.
# add_constant defaults to has_constant='skip', which leaves the data unchanged
# when it already contains a constant column; 'add' always prepends the
# intercept, so the design matrix always has one more column than Xtrain.
smmodel = sm.OLS(ytrain, sm.add_constant(Xtrain, has_constant='add')).fit()

In [None]:
# Display the summary of the fitted OLS results.
smmodel.summary()

In [None]:
# Build the test design matrix with the same layout as the training one.
# add_constant defaults to has_constant='skip', which omits the intercept
# column whenever the data already contains a constant column (possible on a
# small test split). 'add' always prepends it, so the column count matches
# the fitted parameters.
ypred = smmodel.predict(sm.add_constant(Xtest, has_constant='add'))

R2 = r2_score(ytest, ypred)
MSE = mean_squared_error(ytest, ypred)

R2, MSE

# Regularization

### LASSO

In [None]:
from sklearn.linear_model import Lasso

# Sweep the LASSO penalty over a log-spaced grid, recording the fitted
# parameters and the test-set MSE for each penalty value.
# NOTE: this rebinds `model` and `ypred`, which earlier cells also assign.
n = 100  # number of penalty values to try
# One row per penalty; column 0 is the intercept, the rest are the feature
# coefficients. The width is derived from the data instead of a literal 11.
coef = np.empty((n, Xtrain.shape[1] + 1))
test_MSE = np.empty(n)
lmbdas = np.logspace(-2, 0.5, n)

for i, lmbda in enumerate(lmbdas):
    model = Lasso(alpha=lmbda).fit(Xtrain, ytrain)
    ypred = model.predict(Xtest)
    test_MSE[i] = mean_squared_error(ytest, ypred)
    coef[i, 0] = model.intercept_
    coef[i, 1:] = model.coef_

In [None]:
# Two stacked panels sharing the (log-scaled) penalty axis:
# top = coefficient paths (intercept column excluded), bottom = test MSE.
fig, ax = plt.subplots(figsize=(10,8),ncols=1,nrows=2,sharex=True)

ax[0].semilogx(lmbdas,coef[:,1:],linewidth=2)
ax[0].grid()
# Place the legend to the right of the axes, one entry per feature name.
ax[0].legend(features,loc=(1.04,-0.5),fontsize=18)
ax[1].semilogx(lmbdas, test_MSE,linewidth=2)
ax[1].grid()
ax[1].set_ylabel('MSE',fontsize=20)
# Raw string: '\l' is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning; the resulting label text is unchanged.
ax[1].set_xlabel(r'$\lambda$',fontsize=24)
ax[1].set_ylim(3000,5000)

### Ridge regression

In [None]:
from sklearn.linear_model import Ridge

# Sweep the ridge penalty over a log-spaced grid, recording the fitted
# parameters and the test-set MSE for each penalty value.
# NOTE: this rebinds `model`, `ypred`, `coef`, `test_MSE` and `lmbdas`,
# which earlier cells also assign.
n = 100  # number of penalty values to try
# One row per penalty; column 0 is the intercept, the rest are the feature
# coefficients. The width is derived from the data instead of a literal 11.
coef = np.empty((n, Xtrain.shape[1] + 1))
test_MSE = np.empty(n)
lmbdas = np.logspace(-3, 1, n)

for i, lmbda in enumerate(lmbdas):
    model = Ridge(alpha=lmbda).fit(Xtrain, ytrain)
    ypred = model.predict(Xtest)
    test_MSE[i] = mean_squared_error(ytest, ypred)
    coef[i, 0] = model.intercept_
    coef[i, 1:] = model.coef_

In [None]:
# Two stacked panels sharing the (log-scaled) penalty axis:
# top = coefficient paths (intercept column excluded), bottom = test MSE.
fig, ax = plt.subplots(figsize=(10,8),ncols=1,nrows=2,sharex=True)

ax[0].semilogx(lmbdas,coef[:,1:],linewidth=2)
ax[0].grid()
# Place the legend to the right of the axes, one entry per feature name.
ax[0].legend(features,loc=(1.04,-0.5),fontsize=18)
ax[1].semilogx(lmbdas, test_MSE,linewidth=2)
ax[1].grid()
ax[1].set_ylabel('MSE',fontsize=20)
# Raw string: '\l' is not a valid escape sequence, so the non-raw literal
# triggers an invalid-escape warning; the resulting label text is unchanged.
ax[1].set_xlabel(r'$\lambda$',fontsize=24)
ax[1].set_ylim(3000,4000)