In [1]:
# %load general_settings.py
import glob
import os
import shutil
import subprocess
import sys
import warnings
from array import array
from collections import defaultdict, namedtuple
from copy import copy
from functools import partial, singledispatch
from itertools import chain, combinations, product
from pathlib import Path
from time import strftime
import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import scipy.stats as stats
import seaborn as sns
import shap
from colorama import Fore, Style
from IPython.core.display import HTML, display_html
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
from sklearn.base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin

ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None

# Colorama settings.
CLR = (Style.BRIGHT + Fore.BLACK) if ON_KAGGLE else (Style.BRIGHT + Fore.WHITE)
RED = Style.BRIGHT + Fore.RED
BLUE = Style.BRIGHT + Fore.BLUE
CYAN = Style.BRIGHT + Fore.CYAN
RESET = Style.RESET_ALL

# Plots colors.
FONT_COLOR = "#4A4B52"
BACKGROUND_COLOR = "#FFFCFA"

# Data Frame color theme.
CELL_HOVER = {  # for row hover use <tr> instead of <td>
    "selector": "td:hover",
    "props": "background-color: #FFFCFA",
}
TEXT_HIGHLIGHT = {
    "selector": "td",
    "props": "color: #4A4B52; font-weight: bold",
}
INDEX_NAMES = {
    "selector": ".index_name",
    "props": "font-weight: normal; background-color: #FFFCFA; color: #4A4B52;",
}
HEADERS = {
    "selector": "th:not(.index_name)",
    "props": "font-weight: normal; background-color: #FFFCFA; color: #4A4B52;",
}
DF_STYLE = (INDEX_NAMES, HEADERS, TEXT_HIGHLIGHT)
DF_CMAP = sns.light_palette("#BAB8B8", as_cmap=True)

HTML_STYLE = """
    <style>
    code {
        background: rgba(42, 53, 125, 0.10) !important;
        border-radius: 4px !important;
    }
    a {
        color: rgba(123, 171, 237, 1.0) !important;
    }
    ol.numbered-list {
    counter-reset: item;
    }
    ol.numbered-list li {
    display: block;
    }
    ol.numbered-list li:before {
    content: counters(item, '.') '. ';
    counter-increment: item;
    }
    </style>
"""

# Utility functions.
def download_from_kaggle(expr, directory=None, /) -> None:
    if directory is None:
        directory = Path("data")
    if not isinstance(directory, Path):
        raise TypeError("The `directory` argument must be `Path` instance!")
    match expr:
        case ["kaggle", _, "download", *args] if args:
            directory.parent.mkdir(parents=True, exist_ok=True)
            filename = args[-1].split("/")[-1] + ".zip"
            if not (directory / filename).is_file():
                subprocess.run(expr)
                shutil.unpack_archive(filename, directory)
                shutil.move(filename, directory)
        case _:
            raise SyntaxError("Invalid expression!")


def get_interpolated_colors(color1, color2, /, num_colors=2):
    """Return `num_colors` interpolated beetwen `color1` and `color2`.
    Arguments need to be HEX."""

    def interpolate(color1, color2, t):
        r1, g1, b1 = int(color1[1:3], 16), int(color1[3:5], 16), int(color1[5:7], 16)
        r2, g2, b2 = int(color2[1:3], 16), int(color2[3:5], 16), int(color2[5:7], 16)
        r = int(r1 + (r2 - r1) * t)
        g = int(g1 + (g2 - g1) * t)
        b = int(b1 + (b2 - b1) * t)
        return f"#{r:02X}{g:02X}{b:02X}"

    return [interpolate(color1, color2, k / (num_colors + 1)) for k in range(1, num_colors + 1)]


def get_pretty_frame(frame, /, gradient=False, formatter=None, precision=3, repr_html=False):
    stylish_frame = frame.style.set_table_styles(DF_STYLE).format(
        formatter=formatter, precision=precision
    )
    if gradient:
        stylish_frame = stylish_frame.background_gradient(DF_CMAP)  # type: ignore
    if repr_html:
        stylish_frame = stylish_frame.set_table_attributes("style='display:inline'")._repr_html_()
    return stylish_frame


def numeric_descr(frame, /):
    return (
        frame.describe(percentiles=(0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99))
        .T.drop("count", axis=1)
        .rename(columns=str.title)
    )


def missing_unique_vals_summary(frame, /):
    missing_vals = frame.isna().sum()
    missing_vals_ratio = missing_vals / len(frame)
    unique_vals = frame.apply(lambda col: len(col.unique()))
    most_freq_count = frame.apply(lambda col: col.value_counts().iloc[0])
    most_freq_val = frame.mode().iloc[:1].T.squeeze()
    unique_ratio = unique_vals / len(frame)
    freq_count_ratio = most_freq_count / len(frame)

    return pd.DataFrame(
        {
            "Dtype": frame.dtypes,
            "MissingValues": missing_vals,
            "MissingValuesRatio": missing_vals_ratio,
            "UniqueValues": unique_vals,
            "UniqueValuesRatio": unique_ratio,
            "MostFreqValue": most_freq_val,
            "MostFreqValueCount": most_freq_count,
            "MostFreqValueCountRatio": freq_count_ratio,
        }
    )


def check_categories_alignment(frame1, frame2, /):
    print(CLR + "The same categories in training and test datasets?\n")
    cat_features = frame2.select_dtypes(include="object").columns.to_list()

    for feature in cat_features:
        frame1_unique = set(frame1[feature].unique())
        frame2_unique = set(frame2[feature].unique())
        same = np.all(frame1_unique == frame2_unique)
        print(CLR + f"{feature:25s}", BLUE + f"{same}")


def get_n_rows_and_axes(n_features, n_cols, /):
    n_rows = int(np.ceil(n_features / n_cols))
    current_col = range(1, n_cols + 1)
    current_row = range(1, n_rows + 1)
    return n_rows, list(product(current_row, current_col))


def get_distributions_figure(feature_names, frame1, frame2, /, **kwargs):
    histnorm = kwargs.get("histnorm", "probability density")
    train_color = kwargs.get("train_color", "blue")
    test_color = kwargs.get("test_color", "red")
    n_cols = kwargs.get("n_cols", 3)
    n_rows, axes = get_n_rows_and_axes(len(feature_names), n_cols)

    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        y_title=histnorm.title(),
        horizontal_spacing=kwargs.get("horizontal_spacing", 0.1),
        vertical_spacing=kwargs.get("vertical_spacing", 0.1),
    )
    fig.update_annotations(font_size=kwargs.get("annotations_font_size", 14))

    for frame, color, name in zip((frame1, frame2), (train_color, test_color), ("Train", "Test")):
        if frame is None:  # Test dataset may not exist.
            break

        for k, (var, (row, col)) in enumerate(zip(feature_names, axes), start=1):
            # density, bins = np.histogram(frame[var].dropna(), density=True)
            fig.add_histogram(
                x=frame[var],
                histnorm=histnorm,
                marker_color=color,
                marker_line_width=0,
                opacity=0.75,
                name=name,
                legendgroup=name,
                showlegend=k == 1,
                row=row,
                col=col,
            )
            fig.update_xaxes(title_text=var, row=row, col=col)

    fig.update_xaxes(
        tickfont_size=8, showgrid=False, titlefont_size=8, titlefont_family="Arial Black"
    )
    fig.update_yaxes(tickfont_size=8, showgrid=False)

    fig.update_layout(
        width=840,
        height=kwargs.get("height", 640),
        title=kwargs.get("title", "Distributions"),
        font_color=FONT_COLOR,
        title_font_size=18,
        plot_bgcolor=BACKGROUND_COLOR,
        paper_bgcolor=BACKGROUND_COLOR,
        bargap=kwargs.get("bargap", 0),
        bargroupgap=kwargs.get("bargroupgap", 0),
        legend=dict(yanchor="bottom", xanchor="right", y=1, x=1, orientation="h", title=""),
    )
    return fig


# Html highlight. Must be included at the end of all imports!
HTML(HTML_STYLE)


In [2]:
import numpy as np
from sklearn.datasets import load_iris


iris = load_iris(as_frame=True)

X = iris.data.to_numpy()  # type: ignore
y = iris.target.to_numpy()  # type: ignore

X = np.c_[np.ones(len(X)), X]  # With bias term.


In [3]:
test_ratio = 0.2
valid_ratio = 0.2
size = len(X)
test_size = int(size * test_ratio)
valid_size = int(size * valid_ratio)
train_size = size - test_size - valid_size

np.random.seed(42)
ids = np.random.permutation(size)

X_train = X[ids[:train_size]]
y_train = y[ids[:train_size]]

X_valid = X[ids[train_size:-test_size]]
y_valid = y[ids[train_size:-test_size]]

X_test = X[ids[-test_size:]]
y_test = y[ids[-test_size:]]


In [4]:
def to_one_hot(y):
    return np.diag(np.ones(y.max() + 1))[y]


In [5]:
y_train[:5]


array([1, 0, 2, 1, 1])

In [6]:
to_one_hot(y_train[:5])


array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [7]:
Y_train = to_one_hot(y_train)
Y_valid = to_one_hot(y_valid)
Y_test = to_one_hot(y_test)


In [8]:
mean = X_train[:, 1:].mean(axis=0)  # Without bias term.
std = X_train[:, 1:].mean(axis=0)

X_train[:, 1:] = (X_train[:, 1:] - mean) / std
X_valid[:, 1:] = (X_valid[:, 1:] - mean) / std
X_test[:, 1:] = (X_test[:, 1:] - mean) / std


In [9]:
def softmax(logits):
    exps = np.exp(logits)
    exp_sums = exps.sum(axis=1, keepdims=True)
    return exps / exp_sums


In [10]:
n_inputs = X_train.shape[1]
n_outputs = len(np.unique(y_train))


## Default Batch Gradient Descent

In [11]:
eta = 0.5
eps = 1e-9
n_epochs = 1000
m = len(X_train)

np.random.seed(42)
Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    errors = softmax(X_train @ Theta) - Y_train
    grads = 1 / m * X_train.T @ errors
    Theta = Theta - eta * grads

    if epoch % 100 == 0:
        Y_proba_valid = softmax(X_valid @ Theta)
        xentropy_losses = -(Y_valid * np.log(np.clip(Y_proba_valid, eps, 1 - eps)))
        mean_loss = xentropy_losses.sum(axis=1).mean()
        print(f"Epoch: {epoch:3d} - Valid Loss: {mean_loss}")


Epoch:   0 - Valid Loss: 2.1069489136021304
Epoch: 100 - Valid Loss: 0.30879084881854263
Epoch: 200 - Valid Loss: 0.24408622171429012
Epoch: 300 - Valid Loss: 0.2180997345309118
Epoch: 400 - Valid Loss: 0.203850681806548
Epoch: 500 - Valid Loss: 0.1946614583857917
Epoch: 600 - Valid Loss: 0.18809465830682998
Epoch: 700 - Valid Loss: 0.18305240341590498
Epoch: 800 - Valid Loss: 0.17896927278172073
Epoch: 900 - Valid Loss: 0.17552567687462337


In [12]:
Theta


array([[ 0.47265049,  3.07489532, -2.54140741],
       [ 0.43242377, -0.04647024,  0.668786  ],
       [ 3.33127519, -0.2266624 , -1.22743964],
       [-5.27752905,  0.66376905,  4.2271726 ],
       [-6.85188537, -2.37669375,  5.83234332]])

In [13]:
Y_proba = softmax(X_valid @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_valid == y_pred).mean()
accuracy


0.9333333333333333

## L2 Regularization

In [85]:
eta = 0.5
eps = 1e-9
alpha = 1e-2  # L2 regularization strength.
n_epochs = 1000
m = len(X_train)

np.random.seed(42)
Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    errors = softmax(X_train @ Theta) - Y_train
    grads = 1 / m * X_train.T @ errors
    grads += np.r_[np.zeros((1, n_outputs)), alpha / m * Theta[1:]]  # Plus L2 term.
    Theta = Theta - eta * grads

    if epoch % 100 == 0:
        Y_proba_valid = softmax(X_valid @ Theta)
        xentropy_loss = -(Y_valid * np.log(np.clip(Y_proba_valid, eps, 1 - eps)))
        l2_loss = 2 * alpha / m * (Theta[1:] ** 2).sum()
        total_loss = xentropy_loss.sum(axis=1).mean() + l2_loss
        print(f"Epoch: {epoch:3d} - Valid Loss: {total_loss:.5f}")


Epoch:   0 - Valid Loss: 2.10948
Epoch: 100 - Valid Loss: 0.31671
Epoch: 200 - Valid Loss: 0.25671
Epoch: 300 - Valid Loss: 0.23449
Epoch: 400 - Valid Loss: 0.22341
Epoch: 500 - Valid Loss: 0.21697
Epoch: 600 - Valid Loss: 0.21284
Epoch: 700 - Valid Loss: 0.21000
Epoch: 800 - Valid Loss: 0.20794
Epoch: 900 - Valid Loss: 0.20635


In [86]:
Theta


array([[ 0.47765381,  3.01578941, -2.48730483],
       [ 0.38368129, -0.04079102,  0.65484904],
       [ 3.21336681, -0.24351742, -1.19412241],
       [-5.15268403,  0.64795731,  4.13903126],
       [-6.66748418, -2.29159497,  5.74638269]])

In [87]:
Y_proba = softmax(X_valid @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_valid == y_pred).mean()
accuracy


0.9333333333333333

## Early Stopping

In [101]:
eta = 0.5
eps = 1e-9
alpha = 3e-2  # L2 regularization strength.
n_epochs = 1000
m = len(X_train)
best_loss = np.inf

np.random.seed(42)
Theta = np.random.randn(n_inputs, n_outputs)

for epoch in range(n_epochs):
    Y_train_proba = softmax(X_train @ Theta)
    Y_valid_proba = softmax(X_valid @ Theta)
    xentropy_loss = -(Y_valid * np.log(np.clip(Y_valid_proba, eps, 1 - eps)))
    l2_loss = 2 * alpha / m * (Theta[1:] ** 2).sum()
    total_loss = xentropy_loss.sum(axis=1).mean() + l2_loss

    errors = Y_train_proba - Y_train
    grads = 1 / m * X_train.T @ errors
    grads += np.r_[np.zeros((1, n_outputs)), alpha / m * Theta[1:]]
    Theta = Theta - eta * grads

    if epoch % 100 == 0:
        print(f"Epoch: {epoch:3d} - Valid Loss: {total_loss:.5f}")

    if total_loss < best_loss:
        best_loss = total_loss
    else:
        print(f"Epoch: {epoch:3d} - Valid Loss: {total_loss:.5f} - Early Stopping")
        break


Epoch:   0 - Valid Loss: 2.47371
Epoch: 100 - Valid Loss: 0.33325
Epoch: 200 - Valid Loss: 0.28143
Epoch: 300 - Valid Loss: 0.26592
Epoch: 400 - Valid Loss: 0.26035
Epoch: 500 - Valid Loss: 0.25854
Epoch: 563 - Valid Loss: 0.25832 - Early Stopping


In [102]:
Theta


array([[ 0.37983996,  2.48490524, -1.85860681],
       [ 0.4652836 , -0.11644288,  0.61126296],
       [ 2.82416639, -0.40012038, -0.71530102],
       [-4.31627247,  0.71349348,  3.25087791],
       [-5.68788957, -1.96749677,  4.56387565]])

In [103]:
Y_proba = softmax(X_valid @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_valid == y_pred).mean()
accuracy


0.9333333333333333

In [104]:
Y_proba = softmax(X_test @ Theta)
y_pred = Y_proba.argmax(axis=1)

accuracy = (y_test == y_pred).mean()
accuracy


0.9666666666666667

In [5]:
from sklearn.datasets import fetch_california_housing


In [4]:
# %load tensorflow_imports.py
import tensorflow as tf
import tensorflow_datasets as tfds
from keras import layers
from tensorflow import keras

K = keras.backend
AUTO = tf.data.AUTOTUNE


In [2]:
keras.utils.set_random_seed(42)

X = tf.random.uniform((1, 224, 224, 3), seed=42)


In [4]:
conv = layers.Conv2D(filters=64, kernel_size=5)
conv(X).shape


TensorShape([1, 220, 220, 64])

In [7]:
conv.get_weights()[0].shape


(5, 5, 3, 64)

In [35]:
from functools import partial


In [36]:
DefaultInceptionConv2D = partial(
    layers.Conv2D,
    strides=1,
    padding="same",
    kernel_initializer="he_normal",
)


class InceptionConv2D(layers.Layer):
    def __init__(self, filters, kernel_size=1, **kwargs):
        super().__init__(**kwargs)
        self.conv = [
            DefaultInceptionConv2D(filters=filters, kernel_size=kernel_size),
            layers.BatchNormalization(),
            layers.ReLU(),
        ]

    def call(self, inputs):
        X = inputs
        for layer in self.conv:
            X = layer(X)
        return X


class InceptionModule(layers.Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.concat = layers.Concatenate()
        self.stack1 = [InceptionConv2D(filters=64)]
        self.stack2 = [InceptionConv2D(filters=96), InceptionConv2D(filters=128, kernel_size=3)]
        self.stack3 = [InceptionConv2D(filters=12), InceptionConv2D(filters=32, kernel_size=5)]
        self.stack4 = [
            layers.MaxPool2D(pool_size=3, strides=1, padding="same"),
            layers.BatchNormalization(),
            InceptionConv2D(filters=32),
        ]

    def call(self, inputs):
        X1, X2, X3, X4 = inputs, inputs, inputs, inputs
        for layer in self.stack1:
            X1 = layer(X1)
        for layer in self.stack2:
            X2 = layer(X2)
        for layer in self.stack3:
            X3 = layer(X3)
        for layer in self.stack4:
            X4 = layer(X4)
        return self.concat([X1, X2, X3, X4])


In [38]:
IM = InceptionModule()

print(X.shape)
print(IM(X).shape)


(1, 224, 224, 3)
(1, 224, 224, 256)


In [42]:
layers.GlobalAvgPool2D()(IM(X))


<tf.Tensor: shape=(1, 256), dtype=float32, numpy=
array([[1.04704154e+00, 0.00000000e+00, 1.02873731e+00, 1.08971167e+00,
        3.28178331e-02, 8.06148589e-01, 0.00000000e+00, 1.30096745e+00,
        6.03694953e-02, 7.28407549e-03, 1.54893792e-06, 1.62363723e-02,
        2.32358992e-01, 1.35494880e-02, 2.20537841e-01, 4.39143144e-02,
        4.37835604e-03, 8.11324775e-01, 8.68748873e-02, 4.83216614e-01,
        7.37776518e-01, 1.28926439e-02, 1.38001710e-01, 2.54548155e-03,
        0.00000000e+00, 1.61289394e-01, 5.68381548e-01, 1.74389005e-01,
        1.17599433e-02, 1.58854318e-03, 3.03267734e-03, 1.52783124e-02,
        0.00000000e+00, 0.00000000e+00, 3.46079230e-01, 2.28877202e-01,
        0.00000000e+00, 1.35189676e+00, 4.35059309e-01, 7.71518171e-01,
        4.43152249e-01, 5.97636700e-01, 9.23033506e-02, 1.01532332e-05,
        5.78622147e-03, 0.00000000e+00, 5.74101567e-01, 8.85211170e-01,
        2.45120645e-01, 3.84467281e-02, 8.34716391e-03, 6.47285521e-01,
        0.0000

In [2]:
s = np.arange(10)
s

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
dataset = keras.utils.timeseries_dataset_from_array(
    s,
    targets=s[4:],
    sequence_length=4,
    batch_size=2,
)
list(dataset)

[(<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
  array([[0, 1, 2, 3],
         [1, 2, 3, 4]])>,
  <tf.Tensor: shape=(2,), dtype=int32, numpy=array([4, 5])>),
 (<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
  array([[2, 3, 4, 5],
         [3, 4, 5, 6]])>,
  <tf.Tensor: shape=(2,), dtype=int32, numpy=array([6, 7])>),
 (<tf.Tensor: shape=(2, 4), dtype=int32, numpy=
  array([[4, 5, 6, 7],
         [5, 6, 7, 8]])>,
  <tf.Tensor: shape=(2,), dtype=int32, numpy=array([8, 9])>)]

In [22]:
dataset = tf.data.Dataset.range(6).window(4, shift=1, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(4))

for element in dataset:
    print(f"{element}")


[0 1 2 3]
[1 2 3 4]
[2 3 4 5]


In [23]:
def to_windows(dataset, length):
    dataset = dataset.window(length, shift=1, drop_remainder=True)
    return dataset.flat_map(lambda window: window.batch(length))


In [26]:
dataset = to_windows(tf.data.Dataset.range(6), 4)
dataset = dataset.map(lambda window: (window[:-1], window[-1]))

list(dataset.batch(2))

[(<tf.Tensor: shape=(2, 3), dtype=int64, numpy=
  array([[0, 1, 2],
         [1, 2, 3]], dtype=int64)>,
  <tf.Tensor: shape=(2,), dtype=int64, numpy=array([3, 4], dtype=int64)>),
 (<tf.Tensor: shape=(1, 3), dtype=int64, numpy=array([[2, 3, 4]], dtype=int64)>,
  <tf.Tensor: shape=(1,), dtype=int64, numpy=array([5], dtype=int64)>)]

In [37]:
dataset = to_windows(tf.data.Dataset.range(7), 3)
list(dataset)

[<tf.Tensor: shape=(3,), dtype=int64, numpy=array([0, 1, 2], dtype=int64)>,
 <tf.Tensor: shape=(3,), dtype=int64, numpy=array([1, 2, 3], dtype=int64)>,
 <tf.Tensor: shape=(3,), dtype=int64, numpy=array([2, 3, 4], dtype=int64)>,
 <tf.Tensor: shape=(3,), dtype=int64, numpy=array([3, 4, 5], dtype=int64)>,
 <tf.Tensor: shape=(3,), dtype=int64, numpy=array([4, 5, 6], dtype=int64)>]

In [38]:
dataset = to_windows(dataset, 4)
list(dataset)

[<tf.Tensor: shape=(4, 3), dtype=int64, numpy=
 array([[0, 1, 2],
        [1, 2, 3],
        [2, 3, 4],
        [3, 4, 5]], dtype=int64)>,
 <tf.Tensor: shape=(4, 3), dtype=int64, numpy=
 array([[1, 2, 3],
        [2, 3, 4],
        [3, 4, 5],
        [4, 5, 6]], dtype=int64)>]

In [39]:
dataset = dataset.map(lambda s: (s[:, 0], s[:, 1:]))
list(dataset)

[(<tf.Tensor: shape=(4,), dtype=int64, numpy=array([0, 1, 2, 3], dtype=int64)>,
  <tf.Tensor: shape=(4, 2), dtype=int64, numpy=
  array([[1, 2],
         [2, 3],
         [3, 4],
         [4, 5]], dtype=int64)>),
 (<tf.Tensor: shape=(4,), dtype=int64, numpy=array([1, 2, 3, 4], dtype=int64)>,
  <tf.Tensor: shape=(4, 2), dtype=int64, numpy=
  array([[2, 3],
         [3, 4],
         [4, 5],
         [5, 6]], dtype=int64)>)]