# Neural Networks

Нейронная сеть - система из "нейронов":

<img src="https://turingbotsoftware.com/blog/wp-content/uploads/2020/08/1024px-Neural_network.svg_-1.png" alt="drawing" width="500"/>

Таким образом работает скрытый нейрон:

<img src="https://nishantmunjal.com/wp-content/uploads/2024/07/activation-fun.webp" alt="drawing" width="500"/>

То есть математическое выражение вида:

$$
y = f\left(b+\sum_{i=1}^{n}x_i\cdot w_i\right)=f\left(\mathbf{w}^T\mathbf{x}+b\right)
$$

Учить нейронную сеть — значит прогонять через неё обучающую выборку (всю или только часть), сравнивать предсказанные значения классов с исходными метками классов (вычисляя функцию потерь $L$), а затем на основе вычисленного отклонения обновлять веса нейронной сети.

Обновление весов происходит с использованием backpropagation (здесь $\delta$ - ошибка со слоя, который при прямом проходе идёт за данным, для последнего слоя это будет значение функции потерь):

$$
\begin{aligned}
 w_i
 && = &&&w_i + \delta\cdot\frac{\partial y}{\partial w_i} &= \\
 && = &&&w_i + \delta\cdot\frac{\partial f(\Sigma(w_1, \dots, w_n, x_1,\dots, x_n, b))}{\partial w_i}&=\\
 && = &&&w_i + \delta\cdot\frac{df}{d\Sigma}\cdot\frac{\partial \Sigma}{\partial w_i}&=\\
 && = &&&w_i + \delta\cdot\frac{df}{d\Sigma}\cdot x_i
\end{aligned}
$$

В матричной записи:


$$
\begin{aligned}
 \mathbf{w}
 && = &&&\mathbf{w} + \delta\cdot\frac{\partial y}{\partial \mathbf{w}} &= \\
 && = &&&\mathbf{w} + \delta\cdot\frac{\partial f(\Sigma(\mathbf{w}, \mathbf{x}, b))}{\partial \mathbf{w}}&=\\
 && = &&&\mathbf{w} + \delta\cdot\frac{df}{d\Sigma}\cdot\frac{\partial \Sigma}{\partial \mathbf{w}}&=\\
 && = &&&\mathbf{w} + \delta\cdot\frac{df}{d\Sigma}\cdot \frac{d(\mathbf{w}^T\mathbf{x}+b)}{d\mathbf{w}}&=\\
 && = &&&\mathbf{w} + \delta\cdot\frac{df}{d\Sigma}\cdot
 \left(
  \frac{d\mathbf{w}^T}{d\mathbf{w}}\mathbf{x} +
  \mathbf{w}^T\frac{d\mathbf{x}}{d\mathbf{w}} +
  \frac{db}{d\mathbf{w}}
 \right)\\
\end{aligned}
$$

Вспоминая матричное дифференцирование

$$
\frac{da}{d\mathbf{x}}=
\begin{bmatrix} 
    \frac{da}{dx_{1}} \\
    \frac{da}{dx_{2}} \\
    \vdots            \\
    \frac{da}{dx_{n}}
\end{bmatrix}
\qquad
\frac{d\mathbf{x}}{d\mathbf{y}}=
\begin{bmatrix} 
    \frac{dx_{1}}{dy_{1}} & \frac{dx_{1}}{dy_{2}} & \dots  & \frac{dx_{1}}{dy_{n}} \\
    \frac{dx_{2}}{dy_{1}} & \frac{dx_{2}}{dy_{2}} & \dots  & \frac{dx_{2}}{dy_{n}} \\
    \vdots                & \vdots                & \ddots & \vdots \\
    \frac{dx_{n}}{dy_{1}} & \frac{dx_{n}}{dy_{2}} & \dots  & \frac{dx_{n}}{dy_{n}}
\end{bmatrix}
\qquad
\frac{d\mathbf{x}^T}{d\mathbf{x}}=\frac{d\mathbf{x}}{d\mathbf{x}}=I
$$

Получаем, что

$$
\mathbf{w} = \mathbf{w} + \delta\cdot\frac{df}{d\Sigma}\cdot
 \left(I\mathbf{x} + \mathbf{w}^T\mathbb{0} + \mathbf{0}\right)
 =\mathbf{w} + \delta\cdot\frac{df}{d\Sigma}\cdot\mathbf{x}
$$

Аналогично для bias:

$$
b = b + \delta\cdot\frac{\partial y}{\partial b} = b + \delta\cdot\frac{\partial f(\Sigma(\mathbf{w}, \mathbf{x},b))}{\partial b}=b + \delta\cdot\frac{df}{d\Sigma}\cdot\frac{\partial \Sigma}{\partial b}=b + \delta\cdot\frac{df}{d\Sigma}
$$

Если хотим считать сразу для нескольких нейронов и/или нескольких $x$, то в выражениях получим тензоры, а не матрицы

### Data preparation

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import functools
import itertools
import typing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import f1_score

In [None]:
# Load the training features and the matching labels from the local dataset folder.
X_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./car_accidents/X_train.csv",
        "./car_accidents/y_train.csv",
    )
)

In [None]:
# Columns removed from the feature frame and from the label frame before modelling.
useless_x_columns = ["Id", "accident_index", "generic_make_model"]
useless_y_columns = ["Id"]

# Both frames are modified in place.
X_train.drop(columns=useless_x_columns, inplace=True)
y_train.drop(columns=useless_y_columns, inplace=True)

In [None]:
def construct_missing_values_frame(
    frame:         pd.DataFrame,
    digits:        int,
    drop_leq_than: float = 0.0,
    eps:           float = 1e-6
) -> pd.DataFrame:
    """Summarise how often the ``-1`` missing-value marker occurs per column.

    Args:
        frame: Table to inspect; a cell counts as missing when it equals ``-1``.
        digits: Number of decimal places the percentage is rounded to.
        drop_leq_than: Columns whose rounded missing percentage is below
            ``drop_leq_than + eps`` are left out of the report.
        eps: Tolerance added to ``drop_leq_than`` before the comparison.

    Returns:
        A frame indexed by column name with ``missing_count`` and
        ``missing_%`` columns, ordered by descending missing count.
    """
    total_rows = frame.shape[0]
    cutoff = drop_leq_than + eps

    counts = frame.eq(-1).sum().sort_values(ascending=False)
    percents = counts.apply(lambda count: round(count / total_rows * 100, digits))

    reported = percents[percents >= cutoff].index
    report = pd.concat([counts[reported], percents[reported]], axis=1)
    # The two unnamed Series become columns 0 and 1 after concatenation.
    return report.rename(columns={0: "missing_count", 1: "missing_%"})

In [None]:
# Per-column count and percentage (1 decimal) of cells holding the -1 marker.
construct_missing_values_frame(X_train, digits=1, drop_leq_than=0.0, eps=1e-6)

In [None]:
# Feature columns removed from X_train in place.
sparse_columns = [
    "junction_control",
    "second_road_number",
    "engine_capacity_cc",
    "age_of_vehicle",
    "casualty_distance_banding",
    "casualty_home_area_type",
]
X_train.drop(columns=sparse_columns, inplace=True)

In [None]:
# Same -1 marker report, recomputed on the remaining columns.
construct_missing_values_frame(X_train, digits=1, drop_leq_than=0.0, eps=1e-6)

In [None]:
def drop_missing_values_rows(
    x: pd.DataFrame,
    y: pd.DataFrame
) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Remove every row of ``x`` that contains the ``-1`` marker, and the same rows of ``y``.

    Args:
        x: Feature table; a cell counts as missing when it equals ``-1``.
        y: Label table, row-aligned with ``x`` (it is filtered with the mask
            computed from ``x``, so both must share the same index).

    Returns:
        ``(x_updated, y_updated)``: the filtered copies, each with a fresh
        0..n-1 index. The inputs themselves are not modified.
    """
    # One vectorised pass instead of a per-column mask list folded with reduce;
    # this also copes with a frame that has no columns (nothing is dropped).
    has_missing = x.eq(-1).any(axis=1)
    complete_rows = ~has_missing

    x_updated = x[complete_rows].reset_index(drop=True)
    y_updated = y[complete_rows].reset_index(drop=True)
    return (x_updated, y_updated)

In [None]:
# Keep only the rows without a -1 marker, in both the features and the labels.
X_train, y_train = drop_missing_values_rows(x=X_train, y=y_train)

In [None]:
# Same -1 marker report, recomputed after the row filtering.
construct_missing_values_frame(X_train, digits=1, drop_leq_than=0.0, eps=1e-6)

In [None]:
# Columns whose values are replaced by integer codes.
text_columns = ["local_authority_highway"]

# A single encoder is refit for each column, so after the loop ``le`` only
# holds the classes of the last column processed.
le = LabelEncoder()
for text_column in text_columns:
    encoded_values = le.fit_transform(X_train[text_column])
    X_train[text_column] = encoded_values

In [None]:
def find_correlating_columns(
    frame:     pd.DataFrame,
    threshold: float,
    eps:       float = 1e-6
) -> pd.DataFrame:
    """List every pair of distinct columns of ``frame`` that is strongly correlated.

    Args:
        frame: Table whose pairwise column correlations (``DataFrame.corr``)
            are examined.
        threshold: A pair is reported when its absolute correlation is at
            least ``threshold + eps``.
        eps: Tolerance added to ``threshold`` before the comparison.

    Returns:
        A frame with ``column_1``, ``column_2`` and ``correlation`` columns,
        one row per unordered pair (names sorted within the pair), ordered by
        descending absolute correlation.
    """
    # Use the frame that was passed in, not a notebook-level global.
    corr_pairs = frame.corr().abs().unstack().sort_values(ascending=False)
    high_corr_pairs = corr_pairs[corr_pairs >= threshold + eps]

    # A dict keyed by the sorted pair removes the mirrored (a, b) / (b, a)
    # entries wherever they land in the sort order, and keeps first-seen order.
    strongest = {}
    for (first, second), correlation in high_corr_pairs.items():
        # Skip the diagonal by name rather than by value, so two different
        # columns with correlation 1 are still reported.
        if first == second:
            continue
        strongest.setdefault(tuple(sorted((first, second))), correlation)

    return pd.DataFrame(
        [(first, second, correlation)
         for (first, second), correlation in strongest.items()],
        columns=["column_1", "column_2", "correlation"],
    )

In [None]:
# Column pairs whose absolute correlation exceeds 0.8.
find_correlating_columns(X_train, threshold=0.8)

In [None]:
# Feature columns removed from X_train in place.
correlating_columns = ["age_band_of_casualty", "pedestrian_movement"]
X_train.drop(columns=correlating_columns, inplace=True)

### Naive Bayes

In [None]:
from sklearn.naive_bayes import CategoricalNB

In [None]:
# Columns left out of the Naive Bayes feature set.
numeric_columns = ["age_of_casualty"]

# Independent copies, so the shared X_train / y_train stay untouched.
X_train_nb = X_train.drop(columns=numeric_columns)
y_train_nb = y_train.copy()

In [None]:
# Classifier picked up by stratified_cv below through the global name ``clf``.
clf = CategoricalNB()

In [None]:
def stratified_cv(
    x:         pd.DataFrame,
    y:         pd.DataFrame,
    k_folds:   int,
    estimator: typing.Any = None
) -> np.ndarray:
    """Cross-validate a classifier after discarding rows with rare feature values.

    Columns are visited in order; for each one, rows whose value occurs fewer
    than ``k_folds`` times among the rows still kept are removed from both
    ``x`` and ``y``. The remaining rows are scored with ``cross_val_score``
    using ``k_folds`` folds and the ``f1_macro`` metric.

    Args:
        x: Feature table.
        y: Label table, row-aligned with ``x`` (it is filtered with masks
            computed from ``x``, so both must share the same index).
        k_folds: Number of folds, and the minimum number of occurrences a
            feature value needs for its rows to be kept.
        estimator: Model to evaluate. When omitted, the module-level ``clf``
            is used, which keeps existing three-argument calls working.

    Returns:
        The array returned by ``cross_val_score``.
    """
    # Resolve the model once, up front: ``clf`` is rebound several times in
    # this file, so relying on it implicitly ties the result to execution order.
    model = clf if estimator is None else estimator

    stratified_x = x
    stratified_y = y
    for col in x.columns:
        value_counts = stratified_x[col].value_counts()
        rare_values = value_counts[value_counts < k_folds].index
        keep_rows = ~stratified_x[col].isin(rare_values)
        # Boolean indexing returns new frames, so the caller's data is untouched.
        stratified_x = stratified_x[keep_rows]
        stratified_y = stratified_y[keep_rows]

    stratified_x = stratified_x.reset_index(drop=True)
    stratified_y = stratified_y.reset_index(drop=True)

    return cross_val_score(
        model,
        stratified_x,
        stratified_y.values.flatten(),
        cv=k_folds,
        scoring='f1_macro')
In [None]:
# 5-fold macro-F1 scores for the Naive Bayes feature set.
stratified_cv(X_train_nb, y_train_nb, k_folds=5)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Features as an independent copy; labels flattened to a 1-D array.
X_train_rf = X_train.copy()
y_train_rf = y_train.to_numpy().flatten()

In [None]:
# Rebinds the global ``clf`` to a random forest trained with 8 parallel jobs.
# NOTE(review): no random_state is passed, so scores presumably vary between
# runs even though the train/test split below is seeded — confirm this is intended.
clf = RandomForestClassifier(n_jobs=8)

In [None]:
# Seeded 80/20 split, stratified on the label frame.
(
    sX_train_rf,
    sXt_train_rf,
    sy_train_rf,
    syt_train_rf,
) = train_test_split(
    X_train_rf,
    y_train_rf,
    train_size=0.8,
    stratify=y_train,
    random_state=1,
)

# Fit on the 80% part only; the rest is held out for scoring.
clf.fit(sX_train_rf, sy_train_rf)

In [None]:
# Predict labels for the held-out part of the split.
y_pred = clf.predict(sXt_train_rf)

In [None]:
# Macro-averaged F1 of the random forest on the held-out rows.
f1_score(y_true=syt_train_rf, y_pred=y_pred, average="macro")

### Gradient Boosting

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Features as an independent copy; labels flattened to a 1-D array.
X_train_gb = X_train.copy()
y_train_gb = y_train.to_numpy().flatten()

In [None]:
# Rebinds the global ``clf`` to a multi-class CatBoost model.
# NOTE(review): learning_rate=0.9 looks unusually large for gradient
# boosting — confirm it is intentional.
clf = CatBoostClassifier(iterations=500,
                         depth=10,
                         learning_rate=0.9,
                         loss_function='MultiClass',
                         metric_period=50)

In [None]:
# Seeded 80/20 split, stratified on the label frame.
(
    sX_train_gb,
    sXt_train_gb,
    sy_train_gb,
    syt_train_gb,
) = train_test_split(
    X_train_gb,
    y_train_gb,
    train_size=0.8,
    stratify=y_train,
    random_state=1,
)

# Fit on the 80% part only; the rest is held out for scoring.
clf.fit(sX_train_gb, sy_train_gb)

In [None]:
# Predict labels for the held-out part of the split.
# presumably predict() returns a column-shaped array here, hence the flatten — confirm.
y_pred = clf.predict(sXt_train_gb).flatten()

In [None]:
# Macro-averaged F1 of the boosted model on the held-out rows.
f1_score(y_true=syt_train_gb, y_pred=y_pred, average="macro")

### Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Features as an independent copy of the shared frame.
X_train_nn = X_train.copy()
# One indicator column per distinct "casualty_severity" value (one-hot labels).
y_train_nn = pd.get_dummies(y_train["casualty_severity"])

In [None]:
# Three hidden ReLU layers (16, 12, 16 units) followed by a softmax layer with
# one unit per one-hot label column.
clf = Sequential([
    Dense(16, input_shape=(X_train_nn.shape[1],), activation="relu"),
    Dense(12, activation="relu"),
    Dense(16, activation="relu"),
    Dense(y_train_nn.shape[1], activation="softmax"),
])

clf.compile(optimizer="adam", loss="categorical_crossentropy")

In [None]:
# Seeded 80/20 split, stratified on the original (not one-hot) label frame.
(
    sX_train_nn,
    sXt_train_nn,
    sy_train_nn,
    syt_train_nn,
) = train_test_split(
    X_train_nn,
    y_train_nn,
    train_size=0.8,
    stratify=y_train,
    random_state=1,
)

# Train for 10 epochs on the 80% part only.
clf.fit(x=sX_train_nn, y=sy_train_nn, epochs=10)


In [None]:
# For each held-out row take the index of the largest softmax output, i.e. the
# position of the predicted class among the one-hot label columns.
y_pred = clf.predict(sXt_train_nn).argmax(axis=-1)

In [None]:
# ``y_pred`` holds column positions (argmax of the softmax output), so the true
# one-hot rows are reduced to column positions the same way. This stays correct
# for any label values, not only when the labels are exactly 1..K.
f1_score(syt_train_nn.to_numpy().argmax(axis=1), y_pred, average="macro")