In [1]:
import numpy as np

from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
x, y = load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

print(x_train_scaled.shape, y_train.shape)
print(x_test_scaled.shape, y_test.shape)

(353, 10) (353,)
(89, 10) (89,)


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim

## regression: loss function and metric
mse_loss = nn.MSELoss()     

def r2_score(y_pred, y_true):
    mean_y = torch.mean(y_true)
    ss_tot = torch.sum((y_true - mean_y)**2)
    ss_res = torch.sum((y_true - y_pred)**2)  
    return 1 - (ss_res / ss_tot) 

# Hyperparameters
n_epochs = 50000
learning_rate = 0.1

input_dim = 10
hidden_dim = 100
output_dim = 1

# Data
x = torch.tensor(x_train_scaled).float()
y = torch.tensor(y_train).float().view(-1, 1)
print(x.shape, y.shape)

torch.Size([353, 10]) torch.Size([353, 1])


### [Method-1] Manual Backpropagation + Manual Update

In [None]:
## Method - 1
## Model
torch.manual_seed(42)
w1 = torch.randn(input_dim, hidden_dim).requires_grad_(False)
b1 = torch.zeros(hidden_dim).requires_grad_(False)
w2 = torch.randn(hidden_dim, output_dim).requires_grad_(False)
b2 = torch.zeros(output_dim).requires_grad_(False)

for epoch in range(1, n_epochs + 1):
    # Forward propagation
    z1 = torch.mm(x, w1) + b1
    a1 = torch.sigmoid(z1)
    z2 = torch.mm(a1, w2) + b2
    a2 = z2     ## Identity activation

    y_pred = a2
    loss = mse_loss(y_pred, y)
    score = r2_score(y_pred, y)

    # Backward progapation
    grad_a2 = 2 * (a2 - y) / y.shape[0]
    grad_z2 = grad_a2
    grad_w2 = torch.mm(a1.t(), grad_z2)
    grad_b2 = torch.sum(grad_z2, dim=0)
    
    grad_a1 = torch.mm(grad_z2, w2.t())
    grad_z1 = a1 * (1 - a1) * grad_a1
    grad_w1 = torch.mm(x.t(), grad_z1)
    grad_b1 = torch.sum(grad_z1, dim=0)

    # Update weights and biases
    w1 -= learning_rate * grad_w1
    b1 -= learning_rate * grad_b1
    w2 -= learning_rate * grad_w2
    b2 -= learning_rate * grad_b2

    if epoch % (n_epochs // 10) == 0:
        print(f"[{epoch}/{n_epochs}] loss: {loss.item():.4f} score: {score:.4f}")

[5000/50000] loss: 1779.0012 score: 0.7072
[10000/50000] loss: 1635.9414 score: 0.7308
[15000/50000] loss: 1564.4283 score: 0.7425
[20000/50000] loss: 1514.8424 score: 0.7507
[25000/50000] loss: 1503.7875 score: 0.7525
[30000/50000] loss: 1497.9303 score: 0.7535
[35000/50000] loss: 1496.4808 score: 0.7537
[40000/50000] loss: 1496.4729 score: 0.7537
[45000/50000] loss: 1491.8660 score: 0.7545
[50000/50000] loss: 1489.6545 score: 0.7548


### [Method-2] torch.autograd.grad() + Manual Update

In [None]:
## Method - 2
## Model
torch.manual_seed(42)
w1 = torch.randn(input_dim, hidden_dim).requires_grad_(True)
b1 = torch.zeros(hidden_dim).requires_grad_(True)
w2 = torch.randn(hidden_dim, output_dim).requires_grad_(True)
b2 = torch.zeros(output_dim).requires_grad_(True)

for epoch in range(1, n_epochs + 1):
    # Forward propagation
    z1 = torch.mm(x, w1) + b1
    a1 = torch.sigmoid(z1)
    z2 = torch.mm(a1, w2) + b2
    a2 = z2     ## Identity activation

    y_pred = a2
    loss = mse_loss(y_pred, y)
    score = r2_score(y_pred, y)

    # Backward progapation
    grads = torch.autograd.grad(loss, [w1, b1, w2, b2], create_graph=True)

    # Update weights and biases
    with torch.no_grad():
        w1 -= learning_rate * grads[0]
        b1 -= learning_rate * grads[1]
        w2 -= learning_rate * grads[2]
        b2 -= learning_rate * grads[3]
        
    if epoch % (n_epochs // 10) == 0:
        print(f"[{epoch}/{n_epochs}] loss: {loss.item():.4f} score: {score:.4f}")

[5000/50000] loss: 1842.9210 score: 0.6967
[10000/50000] loss: 1635.1206 score: 0.7309
[15000/50000] loss: 1584.1705 score: 0.7393
[20000/50000] loss: 1552.9069 score: 0.7444
[25000/50000] loss: 1542.6190 score: 0.7461
[30000/50000] loss: 1494.4867 score: 0.7541
[35000/50000] loss: 1491.6666 score: 0.7545
[40000/50000] loss: 1486.2555 score: 0.7554
[45000/50000] loss: 1463.4923 score: 0.7592
[50000/50000] loss: 1445.4012 score: 0.7621


### [Method-3] loss.backward() + Manual Update

In [5]:
## Model - 3
torch.manual_seed(42)
w1 = torch.randn(input_dim, hidden_dim).requires_grad_(True)
b1 = torch.zeros(hidden_dim).requires_grad_(True)
w2 = torch.randn(hidden_dim, output_dim).requires_grad_(True)
b2 = torch.zeros(output_dim).requires_grad_(True)

for epoch in range(1, n_epochs + 1):
    # Forward propagation
    z1 = torch.mm(x, w1) + b1
    a1 = torch.sigmoid(z1)
    z2 = torch.mm(a1, w2) + b2
    a2 = z2     ## Identity activation

    y_pred = a2
    loss = mse_loss(y_pred, y)
    score = r2_score(y_pred, y)

    # Backward progapation
    loss.backward()

    # Update weights and biases
    with torch.no_grad():    
        w1 -= learning_rate * w1.grad
        b1 -= learning_rate * b1.grad
        w2 -= learning_rate * w2.grad
        b2 -= learning_rate * b2.grad

        w1.grad.zero_()
        b1.grad.zero_()
        w2.grad.zero_()
        b2.grad.zero_()

    if epoch % (n_epochs // 10) == 0:
        print(f"[{epoch}/{n_epochs}] loss: {loss.item():.4f} score: {score:.4f}")

[5000/50000] loss: 1842.9210 score: 0.6967
[10000/50000] loss: 1635.1206 score: 0.7309
[15000/50000] loss: 1584.1705 score: 0.7393
[20000/50000] loss: 1552.9069 score: 0.7444
[25000/50000] loss: 1542.6190 score: 0.7461
[30000/50000] loss: 1494.4867 score: 0.7541
[35000/50000] loss: 1491.6666 score: 0.7545
[40000/50000] loss: 1486.2555 score: 0.7554
[45000/50000] loss: 1463.4923 score: 0.7592
[50000/50000] loss: 1445.4012 score: 0.7621


### [Method-4] loss.backward() + Optimizer

In [6]:
## Method - 4
## Model
torch.manual_seed(42)
w1 = torch.randn(input_dim, hidden_dim).requires_grad_(True)
b1 = torch.zeros(hidden_dim).requires_grad_(True)
w2 = torch.randn(hidden_dim, output_dim).requires_grad_(True)
b2 = torch.zeros(output_dim).requires_grad_(True)

optimizer = optim.SGD([w1, b1, w2, b2], lr=0.1)

for epoch in range(1, n_epochs + 1):
    # Forward propagation
    lin1 = torch.mm(x, w1) + b1
    act1 = torch.sigmoid(lin1)
    out = torch.mm(act1, w2) + b2
    y_pred = out

    loss = mse_loss(y_pred, y)
    score = r2_score(y_pred, y)

    # Backward progapation
    loss.backward()

    # Update weights and biases
    optimizer.step()
    optimizer.zero_grad()

    if epoch % (n_epochs // 10) == 0:
        print(f"[{epoch}/{n_epochs}] loss: {loss.item():.4f} score: {score:.4f}")

[5000/50000] loss: 1803.7753 score: 0.7032
[10000/50000] loss: 1682.4138 score: 0.7231
[15000/50000] loss: 1582.1388 score: 0.7396
[20000/50000] loss: 1579.2837 score: 0.7401
[25000/50000] loss: 1562.5255 score: 0.7429
[30000/50000] loss: 1557.1533 score: 0.7437
[35000/50000] loss: 1556.9713 score: 0.7438
[40000/50000] loss: 1556.1246 score: 0.7439
[45000/50000] loss: 1556.5000 score: 0.7438
[50000/50000] loss: 1529.8906 score: 0.7482


## [Method-5] nn.Module + loss.backward() + Optimizer

In [7]:
## Method - 5
## Model
# model = nn.Sequential(
#     nn.Linear(input_dim, hidden_dim),
#     nn.Sigmoid(),
#     nn.Linear(hidden_dim, output_dim),
# )
# optimizer = optim.Adam(model.parameters(), lr=0.001)

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        
        ## initialization
        torch.nn.init.normal_(self.linear1.weight)
        torch.nn.init.normal_(self.linear2.weight)

    def forward(self, x):
        x = self.linear1(x)
        x = torch.sigmoid(x)
        x = self.linear2(x)  
        return x

model = MLP(input_dim, hidden_dim, output_dim)
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(1, n_epochs + 1):
    # Forward propagation
    y_pred = model(x)
    loss = mse_loss(y_pred, y)
    score = r2_score(y_pred, y)

    # Backward progapation
    loss.backward()

    # Update weights and biases
    optimizer.step()
    optimizer.zero_grad()

    if epoch % (n_epochs // 10) == 0:
        print(f"[{epoch}/{n_epochs}] loss: {loss.item():.4f} score: {score:.4f}")

[5000/50000] loss: 2177.4368 score: 0.6417
[10000/50000] loss: 2098.9351 score: 0.6546
[15000/50000] loss: 1947.5015 score: 0.6795
[20000/50000] loss: 1906.4879 score: 0.6862
[25000/50000] loss: 1844.3549 score: 0.6965
[30000/50000] loss: 1824.8132 score: 0.6997
[35000/50000] loss: 1805.3464 score: 0.7029
[40000/50000] loss: 1804.2379 score: 0.7031
[45000/50000] loss: 1786.3413 score: 0.7060
[50000/50000] loss: 1781.1143 score: 0.7069
