## Code for figures

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import (
    StandardScaler,
    Normalizer,
    FunctionTransformer,
    PowerTransformer,
    OneHotEncoder,
    OrdinalEncoder,
)
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import make_column_transformer
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Seeded generator so the figure is reproducible
rng = np.random.default_rng(seed=123456)
x = rng.normal(5, 2, 1000)

# Pair every panel title with the data it should show
standardized = (x - x.mean()) / x.std()
normalized = (x - x.min()) / (x.max() - x.min())
panels = [
    ("Original data", x),
    ("Standardized", standardized),
    ("Normalized", normalized),
]

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, (title, values) in zip(axes, panels):
    ax.hist(values, bins=50)
    ax.set_title(title)

plt.savefig("../../static/img/05-scaling.png")

## Fake data to demonstrate scaling and nonlinear transforms

In [None]:
# Fake count data, taken from "Intro to Machine Learning with Python":
# Gaussian samples are exponentiated and scaled to give the Poisson rates
X_org = rng.normal(size=(1000, 3))
X = rng.poisson(10 * np.exp(X_org))

# Histogram of the second feature only
plt.hist(X[:, 1], bins=50)
plt.ylabel("Number of days")
plt.xlabel("Times per day checking D2L")

plt.savefig("../../static/img/05-counts.png")

## Side comment on the central limit theorem
Sums (or means) of independent and identically distributed random variables with finite variance converge to a normal distribution as the number of samples increases.

As soon as one of those criteria is missing, you can't count on it anymore!

In [None]:
# CLT version: every entry of X_c is the mean of 50 Poisson draws sharing one rate
n_means = len(X)  # one sample mean per row of X, instead of a hard-coded 1000
draws_per_mean = 50
rate = X[:, 1].mean()  # loop-invariant, so compute it once

# One row per sample mean; averaging along the columns replaces the Python loop
X_c = rng.poisson(rate, size=(n_means, draws_per_mean)).mean(axis=1)

plt.hist(X_c, bins=50)

plt.xlabel("Average times per day checking D2L")
plt.ylabel("Number of days")

## Back to the fake data
Create a fake (scalar) output using the "counts" data X (which is actually a matrix of 3 features) as input.

Note that the output isn't generated from the count data directly; instead, it is a linearly weighted combination of the normally distributed random samples that were used to set the rate parameters of the Poisson distribution.

In [None]:
# Re-seed first: repeated runs of this cell would otherwise give different weights
rng = np.random.default_rng(seed=42)

# Random weights (one per feature) define a fake target y that is linear in the
# underlying Gaussian samples X_org, not in the observed counts X
w = rng.normal(size=3)
y = X_org @ w

# the usual split; the model only ever sees the counts X as features
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [None]:
# One scatter panel per feature, all sharing the target axis
fig, axes = plt.subplots(1, 3, figsize=(12, 4), sharey=True)
for i, ax in enumerate(axes):
    ax.scatter(X_train[:, i], y_train, alpha=0.3)
    ax.set_xlabel(f"$x_{i}$")
axes[0].set_ylabel("$y$")

In [None]:
# train a regression model on the raw data using stochastic gradient descent
# SGD shuffles the training data between epochs, so pin random_state to keep the
# scores reproducible, matching the seeded rng and train_test_split in this notebook
model = SGDRegressor(random_state=0)
cross_val_score(model, X_train, y_train)

In [None]:
# Put the preprocessing in front of the model as a single pipeline
preprocessing_steps = [
    # FunctionTransformer(np.log1p), # log + 1
    StandardScaler(),
]
pipeline = make_pipeline(*preprocessing_steps, model)

cross_val_score(pipeline, X_train, y_train)

In [None]:
# Fit on the training split and compare predictions against the true targets
pipeline.fit(X_train, y_train)
y_est = pipeline.predict(X_train)
mse = np.mean((y_train - y_est) ** 2)

fig, ax = plt.subplots()
ax.scatter(y_est, y_train)
ax.set_xlabel(r"Predicted $\hat{y}$")
ax.set_ylabel("True $y$")

# Anchor the label in axes coordinates (bottom-right corner) so it stays visible
# whatever range the predictions happen to cover
ax.text(
    0.95,
    0.05,
    f"MSE = {mse:.2f}",
    transform=ax.transAxes,
    ha="right",
    va="bottom",
)

## Mixed data types: Categorical and numeric


In [None]:
# Revisit the OKCupid data (same CSV as the categorical-features notebook)
profiles_csv = "../04_categorical/profiles_revised.csv"
df = pd.read_csv(profiles_csv)

In [None]:
# Target: job is "other" or "rather not say"
# isin is the vectorized membership test; missing jobs come out as False
df["mystery_job"] = df["job"].isin(["other", "rather not say"])

# Keep the job distribution as the cell's last expression so it is actually
# rendered (a bare expression in the middle of a cell is silently discarded)
df["job"].value_counts()

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes
df.info()

In [None]:
# select features to use in the model
numeric_features = ["age", "height", "income"]
cat_features = ["drinks", "education", "sex"]
model_features = numeric_features + cat_features

# For now, drop every row with a missing value in one of the selected features
# We'll look at imputation later
df = df.dropna(subset=model_features)

X = df[model_features]
y = df["mystery_job"].astype(float)

# split! Stratify on the target so both splits keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=df["mystery_job"], random_state=12345
)

# count how many in each class
for labels in (y_train, y_test):
    print(labels.value_counts())

In [None]:
# Define the trickier encoders
# "drinks" is ordinal: the levels are listed in the order they get encoded
drink_levels = [
    "not at all",
    "rarely",
    "socially",
    "often",
    "very often",
    "desperately",
]
drink_enc = OrdinalEncoder(
    categories=[drink_levels],
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# This took a while mucking around to figure out the right magic between df/series
def split_edu(df: pd.DataFrame) -> pd.Series:
    """Split each "education" entry into a list of space-separated tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string-valued "education" column.

    Returns
    -------
    pandas.Series
        The "education" column with every string replaced by its list of
        tokens; missing values stay missing.

    Notes
    -----
    The result is returned without being assigned back to ``df``, so the
    caller's frame is left untouched and the function can be applied to the
    same data more than once.
    """
    return df["education"].str.split(" ")

# Tokenize the education strings, then hash the tokens into 16 features
edu_tokenizer = FunctionTransformer(split_edu, validate=False)
edu_hasher = FeatureHasher(n_features=16, input_type="string")
edu_enc = make_pipeline(edu_tokenizer, edu_hasher)


In [None]:
# Build the preprocessing pipeline: one (transformer, columns) pair per feature group
column_steps = [
    (PowerTransformer(method="yeo-johnson"), numeric_features),
    (OneHotEncoder(handle_unknown="ignore"), ["sex"]),
    (drink_enc, ["drinks"]),
    (edu_enc, ["education"]),
]
preprocessor = make_column_transformer(*column_steps)  # .set_output(transform="pandas")
preprocessor

In [None]:
# Fit the preprocessor on the training split and keep the transformed features
X_pro = preprocessor.fit_transform(X_train)

In [None]:
# only works with set_output(transform="pandas")
# pd.plotting.scatter_matrix(X_pro, figsize=(10, 10))

In [None]:
# Now add on a model!
# SGDClassifier shuffles the data while training, so pin random_state to make the
# cross-validation scores reproducible (same seed as the train/test split above)
pipeline = make_pipeline(
    preprocessor,
    SGDClassifier(random_state=12345),
)

cross_val_score(pipeline, X_train, y_train)