# Dependencies

In [1]:
import torch
from torch.nn import L1Loss, MSELoss, BCELoss, CrossEntropyLoss

import matplotlib.pyplot as plt

# Linear & Logistic regression
<div style="display: flex; margin-top: 50px;">
    <div style="width: 15%;">
        <table style="margin-left: auto; margin-right: auto;">
            <caption>Dataset (linear regression)</caption>
            <tr>
                <th>#</th>
                <th><span style="color: cyan;">x<sub>1</sub></span></th>
                <th><span style="color: #FF9999;">y</span></th>
            </tr>
            <tr>
                <th><span style="color: #99DD00">1</span></th>
                <td>1</td>
                <td>2</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">2</span></th>
                <td>2</td>
                <td>4</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">3</span></th>
                <td>3</td>
                <td>6</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">4</span></th>
                <td>4</td>
                <td>8</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">5</span></th>
                <td>5</td>
                <td>10</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">6</span></th>
                <td>6</td>
                <td>12</td>
            </tr>
        </table>
        <br>
        <br>
        <table style="margin-left: auto; margin-right: auto;">
            <caption>Dataset (logistic regression)</caption>
            <tr>
                <th>#</th>
                <th><span style="color: cyan;">x<sub>1</sub></span></th>
                <th><span style="color: #FF9999;">y</span></th>
            </tr>
            <tr>
                <th><span style="color: #99DD00">1</span></th>
                <td>1</td>
                <td>0</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">2</span></th>
                <td>2</td>
                <td>0</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">3</span></th>
                <td>3</td>
                <td>0</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">4</span></th>
                <td>4</td>
                <td>1</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">5</span></th>
                <td>5</td>
                <td>1</td>
            </tr>
            <tr>
                <th><span style="color: #99DD00">6</span></th>
                <td>6</td>
                <td>1</td>
            </tr>
        </table>
    </div>
    <div style="width: 85%;">
        <figure>
            <img src="./resources/images/linear-regression.svg" alt="Linear regression model diagram" style="width: 100%;">
            <figcaption>Linear Regression Model</figcaption>
        </figure>
        <br>
        <figure>
            <img src="./resources/images/logistic-regression.svg" alt="Logistic regression model diagram" style="width: 100%;">
            <figcaption>Logistic Regression Model</figcaption>
        </figure>
    </div>
</div>

# Loss Function
   - Regression tasks
      - Mean Squared Error (MSE) Loss
         -  $L(\hat{y}, y) = \frac{1}{N} \sum_{i=1}^{N} (\hat{y}_i - y_i)^2$
      - Mean Absolute Error (MAE) Loss
         - $L(\hat{y}, y) = \frac{1}{N} \sum_{i=1}^{N} |\hat{y}_i - y_i|$
      - ...
   - Classification tasks
      - Binary Classification
         - Binary Cross-Entropy (BCE) Loss
            - $L(\hat{y}, y) = -\frac{1}{N} \sum_{i=1}^{N} [y_{i} \log(\hat{y}_{i}) + (1 - y_{i}) \log(1 - \hat{y}_{i})]$
         - ...
      - Multiclass Classification
         - Cross-Entropy Loss (Log Loss)
            - $L(\hat{y}, y) = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{C} y_{ij} \log(\hat{y}_{ij})$
         - ...

## BCELoss vs MSELoss
   1. Sensitivity to Probabilities [Logarithmic Scale]
      - BCE Loss operates on a logarithmic scale
      - BCELoss is more sensitive to the amount of error `[it grows faster as the distance between y_true & y_pred increases]`
   2. Robustness to Outliers
      - BCELoss is typically more robust to outliers

In [2]:
# Three binary-classification samples, all labelled as the negative class (0).
y_true = torch.tensor([[0], [0], [0]], dtype=torch.float32)

# Raw model scores; the sigmoid squashes them into probabilities in (0, 1).
output = torch.tensor([[0], [1.09864], [10]], dtype=torch.float32)
y_pred = torch.sigmoid(output)

# Build each criterion twice: element-wise (reduction='none') and with the
# default reduction, so both the per-sample and the aggregated loss are shown.
mse_per_sample, mse_reduced = MSELoss(reduction='none'), MSELoss()
bce_per_sample, bce_reduced = BCELoss(reduction='none'), BCELoss()

mse_1 = mse_per_sample(y_pred, y_true).squeeze()
mse_2 = mse_reduced(y_pred, y_true)
bce_1 = bce_per_sample(y_pred, y_true).squeeze()
bce_2 = bce_reduced(y_pred, y_true)

# log: targets and predictions first, then the two losses side by side
report_lines = (
    f"y_true: {y_true.squeeze()}",
    f"y_pred: {y_pred.squeeze()}",
    '-' * 50,
    f"MSELoss [per sample]: {mse_1}",
    f"MSELoss             : {mse_2:.5f}",
    f"BCELoss [per sample]: {bce_1}",
    f"BCELoss             : {bce_2:.5f}",
)
for report_line in report_lines:
    print(report_line)

y_true: tensor([0., 0., 0.])
y_pred: tensor([0.5000, 0.7500, 1.0000])
--------------------------------------------------
MSELoss [per sample]: tensor([0.2500, 0.5625, 0.9999])
MSELoss             : 0.60414
BCELoss [per sample]: tensor([0.6931, 1.3863, 9.9996])
BCELoss             : 4.02635


In [None]:
# plot: sweep the prediction across (0, 1) against a fixed target of 0 and
# compare how the per-sample BCE and MSE losses grow.
logits = torch.linspace(-10, +10, 100).reshape(-1, 1)
y_pred = torch.sigmoid(logits)
y_true = torch.zeros(size=(100, 1))

# reduction='none' keeps one loss value per prediction, i.e. one point per x.
bce_loss = BCELoss(reduction='none')(y_pred, y_true)
mse_loss = MSELoss(reduction='none')(y_pred, y_true)

for curve, curve_label in ((bce_loss, 'BCELoss'), (mse_loss, 'MSELoss')):
    plt.plot(y_pred, curve, label=curve_label)

# Title reports the constant target and the (rounded) range of predictions.
target_value = y_true[0, 0]
pred_low, pred_high = y_pred.min().round(), y_pred.max().round()
plt.title(f"y_true = {target_value}   |   {pred_low} <= y_pred <= {pred_high}")
plt.xlabel("y_pred")
plt.ylabel("Loss")
plt.legend()
plt.show()