In [1]:
import numpy as np

# PA

## Q1

In [2]:
def compute_explained_variance(y_true, y_pred):
    var_y_true = np.var(y_true)
    var_y_diff = np.var(y_true - y_pred)
    return 1 - (var_y_diff/var_y_true)


y_true = np.array([7, 4, 9, 4])
y_pred = np.array([8, 7, 12, 5])
compute_explained_variance(y_true, y_pred)

0.7777777777777778

## Q2

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import validation_curve

from sklearn.pipeline import Pipeline

X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

sgd_pipeline = Pipeline([
    ('scaling', StandardScaler(with_mean=True, with_std=True)),
    ('SGD_reg', SGDRegressor(loss='squared_error', penalty='l1', alpha=0.01, random_state=1))
])

sgd_pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaling', StandardScaler()),
                ('SGD_reg',
                 SGDRegressor(alpha=0.01, penalty='l1', random_state=1))])

In [4]:
sgd_pipeline.score(X_test, y_test)

0.5951040704728554

# GA

## Q1

In [5]:
from sklearn.model_selection import RepeatedKFold

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])

def k_fold_cv(X):
    rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=1)
    return rkf.split(X)

for train_index, test_index in k_fold_cv(X):
     print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [0 1] TEST: [2 3]
TRAIN: [2 3] TEST: [0 1]
TRAIN: [1 3] TEST: [0 2]
TRAIN: [0 2] TEST: [1 3]


## Q2, 3, 4

In [19]:
from sklearn.model_selection import GridSearchCV

X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

sgd_grid_pipeline = Pipeline([
    ('scaling', StandardScaler(with_mean=True, with_std=True)),
    ('SGD', SGDRegressor(random_state=1))
])

param_grid ={
    'SGD__loss' : ('squared_error', 'huber'),
    'SGD__penalty': ('l1', 'l2'),
    'SGD__alpha': (0.1, 0.01, 0.001), 
    'SGD__max_iter': (1000, 2000, 5000),
}

sgd_grid_search = GridSearchCV(
    sgd_grid_pipeline,
    param_grid=param_grid,
    n_jobs=2, 
    cv=4
)

sgd_grid_search.fit(X_train, y_train)

GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('scaling', StandardScaler()),
                                       ('SGD', SGDRegressor(random_state=1))]),
             n_jobs=2,
             param_grid={'SGD__alpha': (0.1, 0.01, 0.001),
                         'SGD__loss': ('squared_error', 'huber'),
                         'SGD__max_iter': (1000, 2000, 5000),
                         'SGD__penalty': ('l1', 'l2')})

### Q3, 4

In [20]:
sgd_grid_search.best_params_

{'SGD__alpha': 0.01,
 'SGD__loss': 'squared_error',
 'SGD__max_iter': 1000,
 'SGD__penalty': 'l1'}

### Q2

In [21]:
loss = sgd_grid_search.best_params_['SGD__loss']
penalty = sgd_grid_search.best_params_['SGD__penalty']
alpha = sgd_grid_search.best_params_['SGD__alpha']
max_iter = sgd_grid_search.best_params_['SGD__max_iter']


sgd_pipeline = Pipeline([
    ('scaling', StandardScaler(with_mean=True, with_std=True)),
    ('SGD_reg', SGDRegressor(random_state=1, loss=loss, penalty=penalty, max_iter=max_iter, alpha=alpha, ))
])

sgd_pipeline.fit(X_train, y_train)

Pipeline(steps=[('scaling', StandardScaler()),
                ('SGD_reg',
                 SGDRegressor(alpha=0.01, penalty='l1', random_state=1))])

In [22]:
sgd_pipeline.score(X_test, y_test)

0.5951040704728554

## Q5, 6

In [24]:
from sklearn.linear_model import Ridge

X, y = fetch_california_housing(as_frame=True, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

ridge_grid_pipeline = Pipeline([
    ('scaling', StandardScaler(with_mean=True, with_std=True)),
    ('ridge', Ridge(random_state=1))
])

param_grid ={
    'ridge__fit_intercept' : (True, False),
    'ridge__penalty': ('l1', 'l2'),
    'ridge__alpha': (0.5,0.1,0.05,0.01,0.005,0.001), 
    'ridge__max_iter': (1000, 2000, 5000),
}

ridge_grid_search = GridSearchCV(
    ridge_grid_pipeline,
    param_grid=param_grid,
    n_jobs=2, 
    cv=4
)

ridge_grid_search.fit(X_train, y_train)

ValueError: Invalid parameter loss for estimator Ridge(alpha=0.5, random_state=1). Check the list of available parameters with `estimator.get_params().keys()`.

In [25]:
?Ridge