📝 **Author:** Amirhossein Heydari - 📧 **Email:** amirhosseinheydari78@gmail.com - 📍 **Linktree:** [linktr.ee/mr_pylin](https://linktr.ee/mr_pylin)

---

# Dependencies

In [16]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from torch import nn, optim

In [17]:
# fixed random seed, used as the default `seed` of the models defined in this notebook
seed: int = 42

# Utility Function to Plot Decision Regions

In [18]:
def plot_decision_regions(X: np.ndarray, y: np.ndarray, classifier, resolution: float = 0.01) -> None:
    """
    Draw a classifier's decision regions together with the labelled samples.

    The plot is drawn on the current matplotlib axes; nothing is returned and
    `plt.show()` is left to the caller.

    Args:
        X: feature matrix; column 0 is used for the x-axis and column 1 for the y-axis.
        y: class labels, one per row of `X`.
        classifier: object with a `predict` method; it is called once with an
            array of shape (n_grid_points, 2).
        resolution: spacing between neighbouring grid points along both axes.
    """
    # one marker/color per class (two entries -> at most two distinct labels)
    class_markers = ("o", "s")
    class_colors = ("red", "blue")
    region_cmap = ListedColormap(class_colors)

    # grid limits: the data range padded by one unit on every side
    feature_1, feature_2 = X[:, 0], X[:, 1]
    x1_min, x1_max = feature_1.min() - 1, feature_1.max() + 1
    x2_min, x2_max = feature_2.min() - 1, feature_2.max() + 1

    # evaluate the classifier on every point of a dense grid
    x1_ticks = np.arange(x1_min, x1_max, resolution)
    x2_ticks = np.arange(x2_min, x2_max, resolution)
    xx1, xx2 = np.meshgrid(x1_ticks, x2_ticks)
    grid_points = np.vstack((xx1.ravel(), xx2.ravel())).T
    grid_labels = classifier.predict(grid_points).reshape(xx1.shape)

    # shade the regions and clip the view to the grid
    plt.contourf(xx1, xx2, grid_labels, alpha=0.2, cmap=region_cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # overlay the samples, one scatter call per class
    for idx, cl in enumerate(np.unique(y)):
        members = y == cl
        plt.scatter(
            x=X[members, 0],
            y=X[members, 1],
            alpha=0.8,
            c=class_colors[idx],
            marker=class_markers[idx],
            label=f"Class {cl}",
            edgecolor="black",
        )

# Load Iris Dataset

<figure style="text-align: center;">
    <img src="../assets/images/third_party/01_08.png" alt="01_08.png" style="width: 50%;">
    <figcaption style="text-align: center;">©️ Image: <a href= "https://github.com/rasbt/machine-learning-book/blob/main/ch01/figures/01_08.png">Machine Learning with PyTorch and Scikit-Learn</a></figcaption>
</figure>

In [None]:
# download the iris CSV and parse it into a pandas data-frame
iris_url = r"https://raw.githubusercontent.com/mr-pylin/datasets/refs/heads/main/data/tabular-data/iris/dataset.csv"
iris_df = pd.read_csv(iris_url, encoding="utf-8")

# preview the first rows
iris_df.head()

In [None]:
# the labels are read from the last column of the data-frame
label_column = iris_df.iloc[:, -1]

# distinct label values
unique_classes = label_column.unique()

# how many rows carry each label
num_data_per_class = label_column.value_counts()

# log
print(f"Unique labels: {unique_classes}")
print(f"Number of data per label: {num_data_per_class}")

In [None]:
# keep only the rows of the two species used for binary classification
species_column = iris_df.iloc[:, -1]
wanted_species = ["Iris-setosa", "Iris-versicolor"]
filtered_iris_df = iris_df[species_column.isin(wanted_species)]

# keep sepal length (first column), petal length (third column) and the label (last column)
filtered_iris_df = filtered_iris_df.iloc[:, [0, 2, -1]]

# features: the two measurement columns; labels: the remaining column, flattened to 1-D
X = filtered_iris_df.iloc[:, [0, 1]].values
y = filtered_iris_df.iloc[:, [2]].values.squeeze()

# encode labels as integers : {'Iris-setosa':0, 'Iris-versicolor':1}
y = np.where(y == "Iris-setosa", 0, 1)

# log
for array_name, array in (("X", X), ("y", y)):
    print(f"{array_name}.shape : {array.shape}")
    print(f"{array_name}.dtype : {array.dtype}")

In [None]:
# plot data
# select each class by its encoded label (0 = setosa, 1 = versicolor) instead of by
# row position, so the plot stays correct even if the rows are not grouped by species
# or the classes do not contain exactly 50 samples each
setosa_mask = y == 0
versicolor_mask = y == 1
plt.scatter(X[setosa_mask, 0], X[setosa_mask, 1], color="red", marker="o", label="Iris-setosa:0")
plt.scatter(X[versicolor_mask, 0], X[versicolor_mask, 1], color="blue", marker="s", label="Iris-versicolor:1")
plt.xlabel("Sepal length [cm]")
plt.ylabel("Petal length [cm]")
plt.legend()
plt.show()

# Perceptron
   - The Perceptron is a type of **binary linear classifier** introduced by [**Frank Rosenblatt**](https://en.wikipedia.org/wiki/Frank_Rosenblatt) in 1958.
   - It's the simplest form of a neural network, consisting of a single layer of input neurons connected to a single output neuron.

🔬 **Formulations**:
      $$
      \hat{y} = \begin{cases} 
      1 & \text{if} \; \sum w_i x_i + b \ge 0 \\
      0 & \text{otherwise}
      \end{cases}
      $$

  - $w_i$: Weights for input $x_i$
  - $b$: Bias term

📝 **Paper**: [THE PERCEPTRON: A PROBABILISTIC MODEL FOR INFORMATION STORAGE AND ORGANIZATION IN THE BRAIN](https://www.ling.upenn.edu/courses/cogs501/Rosenblatt1958.pdf)

<figure style="text-align: center;">
    <img src="../assets/images/original/perceptron/perceptron-2.svg" alt="perceptron-2.svg" style="width: 80%;">
    <figcaption style="text-align: center;">A Perceptron</figcaption>
</figure>

---

**Training Rule (Rosenblatt's Rule)**:
$$
w_{i+1} \leftarrow w_i + \eta \cdot (y_{\text{true}} - y_{\text{pred}}) \cdot x_i
$$
$$
b_{i+1} \leftarrow b_i + \eta \cdot (y_{\text{true}} - y_{\text{pred}})
$$
   - $\eta$: Learning rate
   - $y_{true}$: Actual class label
   - $y_{pred}$: Predicted class label

✍️ **Notes**:
   - The original perceptron algorithm, as described in **Frank Rosenblatt**'s paper, differs from the implementation in this notebook in several ways:
     - The labels are typically $-1$ and $+1$, instead of $0$ and $+1$.
     - It did not use a **learning rate** hyperparameter.
     - The threshold function in the original perceptron is a **sign** function rather than **step** function.
     - Bias is referred to as $\theta$, not $b$. $\theta$ played the same role as the bias term but in the **threshold function**.
   -  The **gradient descent** approach is **not** directly applicable to the **learning rule** of the **original perceptron** algorithm.
   - The following code is adapted from the book *[Machine Learning with PyTorch and Scikit-Learn](https://github.com/rasbt/machine-learning-book)* with **modifications** made to fit the requirements of this analysis.

In [23]:
class Perceptron:
    """
    Single-neuron binary classifier trained with the perceptron learning rule.

    Labels are expected to be 0 / 1, matching the values produced by `predict`.

    Attributes set by `fit`:
        w_: weight vector, one entry per feature.
        b_: bias, stored as a 1-element array.
        errors: number of weight updates performed in each epoch.
    """

    def __init__(self, eta: float = 0.01, epochs: int = 50, seed: int = seed) -> None:
        """
        Store the hyperparameters; no training happens here.

        Args:
            eta: learning rate that scales every update.
            epochs: number of passes over the training set.
            seed: seed for the random generator that initializes `w_` and `b_`.
        """
        self.eta = eta
        self.epochs = epochs
        self.seed = seed

    def fit(self, X: np.ndarray, y: np.ndarray) -> "Perceptron":
        """
        Train on the samples one at a time for `epochs` passes.

        Args:
            X: feature matrix of shape (n_samples, n_features).
            y: target labels, one per row of `X`.

        Returns:
            The fitted instance itself, so calls can be chained
            (same convention as `AdaLiNe.fit`).
        """
        # initialize weights and bias following a normal distribution with a deterministic seed
        rng = np.random.default_rng(seed=self.seed)
        self.w_ = rng.normal(loc=0, scale=0.01, size=X.shape[1])
        self.b_ = rng.normal(loc=0, scale=0.01, size=1)

        # to collect errors per epoch
        self.errors = []

        # train loop
        for _ in range(self.epochs):
            errors = 0
            for x, y_true in zip(X, y):

                # output of the perceptron for the current sample
                y_pred = self.predict(x)

                # update w_ and b_ (the step is zero when the prediction is already correct)
                update_step = self.eta * (y_true - y_pred)
                self.w_ += update_step * x
                self.b_ += update_step

                # count number of updates in the current epoch
                if update_step != 0:
                    errors += 1

            self.errors.append(errors)

        return self

    def predict(self, x: np.ndarray) -> np.ndarray:
        """
        Apply the unit step function to the net input.

        Args:
            x: a single sample of shape (n_features,) or a batch of shape
                (n_samples, n_features).

        Returns:
            Array of 1 where `x . w_ + b_ >= 0` and 0 elsewhere.
        """
        output = np.dot(x, self.w_) + self.b_
        unit_function = np.where(output >= 0, 1, 0)
        return unit_function

In [24]:
# initialize a perceptron
# create the model: learning rate 0.1, 10 passes over the data
perceptron = Perceptron(epochs=10, eta=0.1)

# train it on the two selected iris features
perceptron.fit(X=X, y=y)

In [None]:
# plot errors per epoch
plt.plot(perceptron.errors, marker="o")
# one tick per recorded epoch; derived from the data so it stays correct
# when the perceptron is trained with a different number of epochs
plt.xticks(range(len(perceptron.errors)))
plt.xlabel("Epochs")
plt.ylabel("Number of updates")
plt.show()

In [None]:
# draw the regions predicted by the trained perceptron, with the samples on top
plot_decision_regions(X=X, y=y, classifier=perceptron)

# axis names follow the two selected features
plt.xlabel("Sepal length [cm]")
plt.ylabel("Petal length [cm]")

plt.legend()
plt.show()

# Adaptive Linear Neurons (AdaLiNe)
   - Adaline was developed by [Bernard Widrow](https://en.wikipedia.org/wiki/Bernard_Widrow) and [Ted Hoff](https://en.wikipedia.org/wiki/Marcian_Hoff) in 1960.
   - It is a refinement of the perceptron, but it uses a different cost function and activation.

<figure style="text-align: center;">
    <img src="../assets/images/original/perceptron/adaline.svg" alt="adaline.svg" style="width: 80%;">
    <figcaption style="text-align: center;">Adaptive Linear Neurons</figcaption>
</figure>

---

**Training Rule (Widrow-Hoff Rule)**:
   - Mean Squared Error (MSE) Loss:
   $$
   L(\mathbf{w}, b) = \frac{1}{2n} \sum_{i=1}^{n} \left( y^{(i)} - \mathbf{w}^T \mathbf{x}^{(i)} - b \right)^2
   $$
   - Weights update:
   $$
   \Delta \mathbf{w} = -\eta \nabla_{w}L(\mathbf{w}, b) = \eta \frac{1}{n} \sum_{i=1}^{n} \left( y^{(i)} - \mathbf{w}^T \mathbf{x}^{(i)} - b \right) \mathbf{x}^{(i)}
   $$
   $$
   \mathbf{w_{i+1}} \leftarrow \mathbf{w_i} + \Delta \mathbf{w}
   $$
   - Bias update:
   $$
   \Delta b = -\eta \nabla_{b}L(\mathbf{w}, b) = \eta \frac{1}{n} \sum_{i=1}^{n} \left( y^{(i)} - \mathbf{w}^T \mathbf{x}^{(i)} - b \right)
   $$
   $$
   b_{i+1} \leftarrow b_i + \Delta b
   $$

✍️ Note:
   - The following code is adapted from the book *[Machine Learning with PyTorch and Scikit-Learn](https://github.com/rasbt/machine-learning-book)* with **modifications** made to fit the requirements of this analysis.

In [27]:
class AdaLiNe:
    """
    Adaptive linear neuron trained with full-batch gradient descent on the MSE loss.

    Attributes set by `fit`:
        w_: weight vector, one entry per feature.
        b_: bias, stored as a 1-element array.
        losses_: halved mean squared error of each epoch, measured before
            that epoch's update is applied.
    """

    def __init__(self, eta: float = 0.01, epochs: int = 50, seed: int = seed) -> None:
        """
        Store the hyperparameters; no training happens here.

        Args:
            eta: learning rate that scales every update.
            epochs: number of gradient steps (each uses the whole training set).
            seed: seed for the random generator that initializes `w_` and `b_`.
        """
        self.eta = eta
        self.epochs = epochs
        self.seed = seed

    def fit(self, X: np.ndarray, y: np.ndarray) -> "AdaLiNe":
        """
        Run `epochs` gradient steps computed over all samples at once.

        Args:
            X: feature matrix of shape (n_samples, n_features).
            y: target values, one per row of `X`.

        Returns:
            The fitted instance itself, so calls can be chained.
        """
        # initialize weights and bias following a normal distribution with a deterministic seed
        rng = np.random.default_rng(seed=self.seed)
        self.w_ = rng.normal(loc=0, scale=0.01, size=X.shape[1])
        self.b_ = rng.normal(loc=0, scale=0.01, size=1)

        # to collect losses per epoch
        self.losses_ = []

        # training loop
        for _ in range(self.epochs):

            # output of the adaline (before passing to threshold function)
            net_input = self.net_input(X)
            output = self.activation(net_input)

            # update w_ and b_ with the batch-averaged error
            errors = y - output
            self.w_ += self.eta * X.T.dot(errors) / X.shape[0]
            self.b_ += self.eta * errors.mean()

            # calculate loss function (MSE in this case)
            loss = (errors**2).mean() / 2
            self.losses_.append(loss)

        return self

    def net_input(self, X: np.ndarray) -> np.ndarray:
        """Return the weighted sum `X . w_ + b_`."""
        return np.dot(X, self.w_) + self.b_

    def activation(self, X: np.ndarray) -> np.ndarray:
        """Identity activation: return the input unchanged."""
        return X

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return 1 where the activated net input is >= 0.5 and 0 elsewhere."""
        return np.where(self.activation(self.net_input(X)) >= 0.5, 1, 0)

In [28]:
num_epochs = 20

# two models trained on the raw features, differing only in learning rate
adaline_1 = AdaLiNe(eta=0.1, epochs=num_epochs).fit(X, y)
adaline_2 = AdaLiNe(eta=0.002, epochs=num_epochs).fit(X, y)

# third model trained on standardized features (each column: mean=0 and std=1)
column_mean = X.mean(axis=0)
column_std = X.std(axis=0)
X_std = (X - column_mean) / column_std
adaline_3 = AdaLiNe(eta=0.5, epochs=num_epochs).fit(X_std, y)

In [None]:
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 4))

# one entry per panel, left to right: (model, values to draw, title, y-axis label)
# only the first panel is drawn on a log10 scale
loss_panels = [
    (
        adaline_1,
        np.log10(adaline_1.losses_),
        f"Adaline - Learning rate {adaline_1.eta}",
        "log(Mean squared error)",
    ),
    (
        adaline_2,
        adaline_2.losses_,
        f"Adaline - Learning rate {adaline_2.eta}",
        "Mean squared error",
    ),
    (
        adaline_3,
        adaline_3.losses_,
        f"Adaline - Learning rate {adaline_3.eta} + Standardize Input",
        "Mean squared error",
    ),
]

for panel_ax, (adaline, loss_curve, panel_title, panel_ylabel) in zip(ax, loss_panels):
    # epochs are numbered from 1 on the x-axis
    epoch_numbers = range(1, len(adaline.losses_) + 1)
    panel_ax.plot(epoch_numbers, loss_curve, marker="o")
    panel_ax.set(
        title=panel_title,
        xlabel="Epochs",
        ylabel=panel_ylabel,
        xticks=range(num_epochs + 1),
    )

plt.show()

In [None]:
# plot decision region
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 6), layout="compressed")
plt.sca(axes[0])
plot_decision_regions(X, y, adaline_1)
plt.title("Adaline - Learning rate 0.1")
plt.legend()
plt.sca(axes[1])
plot_decision_regions(X, y, adaline_2)
plt.title("Adaline - Learning rate 0.0001")
plt.legend()
plt.sca(axes[2])
plot_decision_regions(X_std, y, adaline_3)
plt.title("Adaline - Learning rate 0.5 + Standardize")
plt.legend()
plt.show()