## Code for figures

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, Normalizer, FunctionTransformer, PowerTransformer
from sklearn.linear_model import SGDRegressor, Ridge
from sklearn.model_selection import train_test_split, cross_val_score


In [None]:
# Seeded generator so the figures come out the same on every run
rng = np.random.default_rng(seed=123456)
x = rng.normal(5, 2, 1000)

# Two rescalings of the same sample:
#   standardized -> subtract the mean, divide by the standard deviation
#   normalized   -> min-max rescale onto [0, 1]
standardized = (x - x.mean()) / x.std()
normalized = (x - x.min()) / (x.max() - x.min())

# One histogram per version of the data, side by side
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
panels = [
    ("Original data", x),
    ("Standardized", standardized),
    ("Normalized", normalized),
]
for ax, (title, values) in zip(axes, panels):
    ax.hist(values, bins=50)
    ax.set_title(title)

plt.savefig("../../static/img/05-scaling.png")

## Fake data to demonstrate scaling and nonlinear transforms

In [None]:
# Fake count data (example from Intro to Machine Learning with Python):
# draw standard-normal latent values, then sample Poisson counts whose
# rate is 10 * exp(latent value)
X_org = rng.normal(size=(1000, 3))
poisson_rate = 10 * np.exp(X_org)
X = rng.poisson(poisson_rate)

# Histogram of the second count feature (column index 1)
second_feature = X[:, 1]
plt.hist(second_feature, bins=50)
plt.xlabel("Times per day checking D2L")
plt.ylabel("Number of days")

plt.savefig("../../static/img/05-counts.png")

In [None]:
# Draw one random weight per latent feature and build the target y as a
# linear combination of the *latent* values X_org (not of the counts X)
w = rng.normal(size=3)
y = np.dot(X_org, w)

# Target plotted against the second count feature
plt.scatter(X[:, 1], y, alpha=0.3)

In [None]:
# Standard train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

# Baseline: cross-validate an SGD regressor directly on the raw counts.
# Note the scores are computed on the full X, y rather than on the split.
model = SGDRegressor()
baseline_scores = cross_val_score(model, X, y)
baseline_scores

In [None]:
# Put preprocessing in front of the same model and cross-validate again.
# The log1p step is left commented out so it can be toggled on to compare.
steps = [
    # FunctionTransformer(np.log1p), # log + 1 
    StandardScaler(),
    model,
]
pipeline = make_pipeline(*steps)

scaled_scores = cross_val_score(pipeline, X, y)
scaled_scores