<a href="https://colab.research.google.com/github/miraclehimself/Advanced_ML/blob/main/Polynomial_Regression_week3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AML 22/23 S2W3 Webinar

## Preamble: Importing/Setting Up Packages

In [None]:
!pip install --upgrade scikit-learn==1.2.0 --user

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Global plot styling for the whole notebook
# (see https://seaborn.pydata.org/tutorial/aesthetics.html):
# 'ticks' style, 'talk' context with fonts scaled to 80%, and an
# 8x6 default figure size.
sns.set(
    context='talk',
    style='ticks',
    font_scale=0.8,
    rc={'figure.figsize': (8, 6)},
)

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, ParameterGrid

In [None]:
from functools import partial
from sklearn.metrics import mean_squared_error

# `rmse(y_true, y_pred)` -> root mean squared error.
#
# `mean_squared_error(..., squared=False)` is deprecated as of scikit-learn 1.4
# and removed in 1.6, where `root_mean_squared_error` replaces it. Prefer the
# dedicated function when the installed release provides it, and fall back to
# the original partial on older releases (such as the 1.2.0 pinned at the top
# of this notebook). Both variants compute the same value.
try:
    from sklearn.metrics import root_mean_squared_error as rmse
except ImportError:
    rmse = partial(mean_squared_error, squared=False)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

In [None]:
from sklearn.preprocessing import PolynomialFeatures

## Inspecting/Loading Data

In [None]:
# Load the Advertising data set straight from GitHub. The first CSV column
# only holds a running row id, so it is used as the index instead of being
# kept as a feature.
adv = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/gerberl/6G7V0017_2223/main/datasets/Advertising.csv',
    index_col=0,
)
adv.head()

Unnamed: 0,TV,radio,newspaper,sales
1,230.1,37.8,69.2,22.1
2,44.5,39.3,45.1,10.4
3,17.2,45.9,69.3,9.3
4,151.5,41.3,58.5,18.5
5,180.8,10.8,58.4,12.9


## Preparing Data for ML

In [None]:
# Target is the 'sales' column; every other column is a predictor.
y = adv['sales']
X = adv.drop(columns=['sales'])

In [None]:
# Hold out a quarter of the rows as the test set; the fixed seed makes the
# split (and therefore every score below) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=20230217,
)

In [None]:
X_train.shape, y_train.shape

((150, 3), (150,))

In [None]:
y_test.shape

(50,)

## A Pipeline for One Polynomial Regression

In [None]:
# Degree-2 polynomial ridge regression:
#   standardise -> expand to polynomial terms (no constant column) -> Ridge.
# The step names matter: the grid search later addresses hyper-parameters as
# `polynomial__degree` and `regr__alpha`.
regr_poly2 = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures(degree=2, include_bias=False)),
    ('regr', Ridge(alpha=0.1)),
])
# Have the transformer steps emit pandas DataFrames (`set_output` returns the
# pipeline itself, so re-binding keeps the same object).
regr_poly2 = regr_poly2.set_output(transform='pandas')

In [None]:
regr_poly2

In [None]:
regr_poly2.fit(X_train, y_train)

In [None]:
X_test.describe()

Unnamed: 0,TV,radio,newspaper
count,50.0,50.0,50.0
mean,135.054,22.758,33.276
std,88.022022,14.294056,24.37901
min,5.4,0.3,0.3
25%,66.525,8.825,12.95
50%,115.45,23.45,26.0
75%,215.275,34.9,50.275
max,287.6,47.8,114.0


In [None]:
# Predict sales for a single hypothetical budget close to the test-set means
# (TV=135, radio=23, newspaper=33). The row is wrapped in a DataFrame that
# reuses the feature column names.
regr_poly2.predict(
    pd.DataFrame([[135, 23, 33]], columns=X_test.columns)
)

array([14.07154277])

In [None]:
rmse(
    y_test,
    regr_poly2.predict(X_test)
)

0.5827008231142041

In [None]:
# Plain (linear) ridge regression for comparison: same scaler and penalty as
# `regr_poly2`, but without the polynomial expansion step.
regr_ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('regr', Ridge(alpha=0.1)),
])
regr_ridge = regr_ridge.set_output(transform='pandas')

In [None]:
regr_ridge.fit(X_train, y_train)

In [None]:
rmse(
    y_test,
    regr_ridge.predict(X_test)
)

1.593719727255435

In [None]:
# Cross-validated RMSE of the degree-2 model on the training data only.
# scikit-learn scorers follow "greater is better", so the RMSE scorer returns
# negated values; flip the sign to get ordinary (positive) RMSE per fold.
scores = -cross_val_score(
    estimator=regr_poly2,
    X=X_train,
    y=y_train,
    scoring='neg_root_mean_squared_error',
)
scores.mean(), scores.std()

(0.6542407120945619, 0.20483500688120376)

## Grid-Searching Degree and Alpha

In [None]:
np.linspace(1, 5, 5, dtype='int')

array([1, 2, 3, 4, 5])

In [None]:
# Hyper-parameter grid, keyed as <pipeline step>__<parameter>:
#   - ridge penalty `alpha`: 7 log-spaced values from 1e-3 to 1e3
#   - polynomial `degree`:   the integers 1 to 5
param_grid = {
    'regr__alpha': np.logspace(-3, 3, num=7),
    'polynomial__degree': np.arange(1, 6, dtype='int'),
}

In [None]:
print(regr_poly2)

Pipeline(steps=[('scaler', StandardScaler()),
                ('polynomial', PolynomialFeatures(include_bias=False)),
                ('regr', Ridge(alpha=0.1))])


In [None]:
list(ParameterGrid(param_grid))

In [None]:
# Exhaustive search over `param_grid`, starting from the degree-2 pipeline.
# Scored with negated RMSE (higher is better); training scores are kept as
# well so that train and validation error can be compared per candidate.
grid = GridSearchCV(
    estimator=regr_poly2,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    return_train_score=True,
)

In [None]:
grid.fit(X_train, y_train)

In [None]:
results = pd.DataFrame(grid.cv_results_)
results.columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_polynomial__degree', 'param_regr__alpha', 'params',
       'split0_test_score', 'split1_test_score', 'split2_test_score',
       'split3_test_score', 'split4_test_score', 'mean_test_score',
       'std_test_score', 'rank_test_score', 'split0_train_score',
       'split1_train_score', 'split2_train_score', 'split3_train_score',
       'split4_train_score', 'mean_train_score', 'std_train_score'],
      dtype='object')

In [None]:
# Compact leaderboard: the two searched hyper-parameters next to the mean/std
# of validation and training scores, best-ranked candidate first.
# Scores are negated RMSE, so values closer to zero are better.
(
    results
    .loc[:, [
        'param_polynomial__degree',
        'param_regr__alpha',
        'mean_test_score',
        'std_test_score',
        'mean_train_score',
        'std_train_score',
        'rank_test_score',
    ]]
    .sort_values(by='rank_test_score')
)

Unnamed: 0,param_polynomial__degree,param_regr__alpha,mean_test_score,std_test_score,mean_train_score,std_train_score,rank_test_score
14,3,0.001,-0.568737,0.175953,-0.45206,0.043244,1
15,3,0.01,-0.568743,0.176107,-0.452061,0.043244,2
16,3,0.1,-0.569049,0.177652,-0.452184,0.043233,3
17,3,1.0,-0.591656,0.190626,-0.462418,0.042263,4
9,2,0.1,-0.654241,0.204835,-0.612576,0.056953,5
8,2,0.01,-0.654308,0.204271,-0.612562,0.056953,6
7,2,0.001,-0.654316,0.204214,-0.612562,0.056954,7
10,2,1.0,-0.655115,0.209996,-0.613943,0.056856,8
21,4,0.001,-0.763865,0.21823,-0.341363,0.037623,9
22,4,0.01,-0.764888,0.219616,-0.341365,0.037623,10


In [None]:
best_model = grid.best_estimator_

In [None]:
rmse(
    y_test,
    best_model.predict(X_test)
)

0.557783757842344

In [None]:
from sklearn.dummy import DummyRegressor

In [None]:
# Linear baseline built with the same three steps as the polynomial model.
# With degree=1 and no bias column, the polynomial step adds no new terms,
# so this is effectively plain ridge regression on the scaled features.
regr_baseline = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures(degree=1, include_bias=False)),
    ('regr', Ridge(alpha=0.1)),
])
regr_baseline = regr_baseline.set_output(transform='pandas')

In [None]:
regr_baseline.fit(X_train, y_train)

In [None]:
rmse(
    y_test,
    regr_baseline.predict(X_test)
)

1.593719727255435

In [None]:
# Naive reference model: a single-step pipeline whose regressor uses the
# 'median' strategy, giving a floor that any real model should beat.
regr_dummy = Pipeline([
    ('regr', DummyRegressor(strategy='median')),
])
regr_dummy = regr_dummy.set_output(transform='pandas')

In [None]:
regr_dummy.fit(X_train, y_train)

In [None]:
rmse(
    y_test,
    regr_dummy.predict(X_test)
)

4.955330463248642