In [3]:
import torch
import matplotlib.pyplot as plt
import torch

from utils.TD import TD_SGD, TD_Adam

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import numpy as np

seed = 42
torch.manual_seed(seed)

<torch._C.Generator at 0x179bb459af0>

In [9]:
housedata = torch.tensor(np.loadtxt('data\\readyhousedata.txt', delimiter=','), dtype=torch.float32)

X = housedata[:, :-1]
y = housedata[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

reg = LinearRegression() 
reg.fit(X_train, y_train)

weights = torch.tensor(reg.coef_)
intercept = torch.tensor(reg.intercept_)

print("Weights learned using LR:\n", weights)
print("Intercept:", intercept)

score = reg.score(X_test, y_test)
print("Model R^2 score:", score)

Weights learned using LR:
 tensor([-10.9937,   5.0058,   0.6358,   2.4487, -10.3119,  17.5176,   1.0312,
        -17.7757,   7.3409,  -6.4276,  -8.7997,   3.0356, -20.5194])
Intercept: tensor(29.0443)
Model R^2 score: 0.7730569243431091


In [7]:
def mini_batch_sgd(
        X: torch.Tensor, 
        y: torch.Tensor, 
        learning_rate: float, 
        n_iter: int, 
        batch_size: int,
        epsilon: float = 0, 
    ) -> torch.Tensor:
    """
    Performs Mini-Batch Stochastic Gradient Descent (SGD) for linear regression with shuffling.
    
    Parameters:
    X (torch.Tensor): Feature matrix (n_samples, n_features).
    y (torch.Tensor): Target vector (n_samples,).
    learning_rate (float): Step size for updating weights.
    n_iter (int): Number of iterations (epochs).
    epsilon (float): Convergence threshold.
    batch_size (int): Size of the mini-batches.
    
    Returns:
    torch.Tensor: The learned weights.
    """
    n_samples, n_features = X.shape
    # Add bias term to the feature matrix
    X_bias = torch.cat([torch.ones(n_samples, 1), X], dim=1)  # Adds a column of ones for the bias term

    # Initialize weights to zeros
    weights = torch.zeros(n_features + 1)

    for epoch in range(int(n_iter)):
        indices = torch.randperm(n_samples)
        X_bias_shuffled = X_bias[indices]
        y_shuffled = y[indices]

        for i in range(0, n_samples, batch_size):
            X_batch = X_bias_shuffled[i:i + batch_size]
            y_batch = y_shuffled[i:i + batch_size]
            
            # Initialize gradients to zero
            gradient = torch.zeros_like(weights)
            
            # Compute the gradient over the mini-batch
            for j in range(X_batch.shape[0]):
                # Prediction
                prediction = torch.dot(X_batch[j], weights)
                error = y_batch[j] - prediction
                
                # Update gradient
                gradient += -2 * X_batch[j] * error

            # Update the weights
            weights -= learning_rate * gradient / batch_size

        # Check for convergence (if gradient is small enough)
        if torch.norm(gradient) < epsilon:
            print(f"Converged after {epoch + 1} epochs")
            break

    return weights

# Hyperparameters
learning_rate = 0.01
n_iter = 1e4  # Number of epochs
epsilon = 1e-6  # Convergence threshold
batch_size = 16  # Mini-batch size

# Run SGD manually
weights_sgd = mini_batch_sgd(X_train, y_train, learning_rate, n_iter, epsilon=epsilon, batch_size=batch_size)

# Print the learned weights
print("Weights learned using SGD:\n", weights_sgd[1:])

Weights learned using SGD:
 tensor([-10.9912,   5.0124,   0.6570,   2.4577, -10.2984,  17.5350,   1.0759,
        -17.7649,   7.3631,  -6.4073,  -8.7644,   3.0875, -20.5076])


In [11]:
num_samples = X_train.shape[0]
P = torch.ones((num_samples, num_samples)) / num_samples # Equal probability to move to any state

alpha = 0.01  # Learning rate
gamma = 0   # Discount factor
num_iterations = 1e5  # Number of iterations
epsilon = 1e-9

td_sgd = TD_SGD(
    n_iter=num_iterations,
    P=P,
    link=lambda x : x,
    inv_link=lambda x : x,
    gamma=gamma,
    alpha=alpha,
    epsilon=epsilon,
    random_state=seed,
)

td_sgd.fit(X_train, y_train)

w_hat_house = td_sgd.weights
bias_house = td_sgd.bias

print("Weights learned using TD SGD:\n", w_hat_house)

Weights learned using TD SGD:
 tensor([-1.0654e+01,  4.9181e+00,  9.4121e-03,  1.4318e+00, -1.0087e+01,
         1.7143e+01,  8.6331e-01, -1.7416e+01,  7.2050e+00, -6.5871e+00,
        -8.0881e+00,  2.8847e+00, -2.0169e+01])


In [12]:
alpha = 0.05  # Learning rate
gamma = 0   # Discount factor
num_iterations = 1e5  # Number of iterations
epsilon = 1e-9

td_adam = TD_Adam(
    n_iter=num_iterations,
    P=P,
    link=lambda x : x,
    inv_link=lambda x : x,
    gamma=gamma,
    alpha=alpha,
    epsilon=epsilon,
    random_state=seed,
)

td_adam.fit(X_train, y_train)

print("Weights learned using TD Adam:\n", td_adam.weights)

Ending optimization early at iteration 85489
Weights learned using TD Adam:
 tensor([-12.1528,   4.4889,   0.4160,   1.6978,  -9.6249,  16.9506,   1.4479,
        -18.0856,   7.0949,  -6.5179,  -8.8375,   2.6827, -21.3809])


In [15]:
pred_TD_sgd = td_sgd.predict(X_test)
pred_TD_adam = td_adam.predict(X_test)
pred_L2 = reg.predict(X_test)
pred_sgd = torch.matmul(X_test, weights_sgd[1:]) + weights_sgd[0]

rmse_TD_sgd = td_sgd.rmse(X_test, y_test)
rmse_TD_adam = td_adam.rmse(X_test, y_test)
rmse_L2 = torch.sqrt(torch.tensor(mean_squared_error(y_test, pred_L2)))
rmse_sgd = torch.sqrt(torch.tensor(mean_squared_error(y_test, pred_sgd)))

print(f"RMSE on the test set using TD SGD: {rmse_TD_sgd}")
print(f"RMSE on the test set using TD Adam: {rmse_TD_adam}")
print(f"RMSE on the test set using L2 Regression: {rmse_L2}")
print(f"RMSE on the test set using SGD: {rmse_sgd}")
print("---------------")
print(f"Norm of difference in weights for L2 and TD SGD: {torch.norm(weights - td_sgd.weights, 2)}")
print(f"Norm of difference in weights for sgd and TD SGD: {torch.norm(weights_sgd[1:] - td_sgd.weights, 2)}")
print(f"Norm of difference in weights for L2 and TD Adam: {torch.norm(weights - td_adam.weights, 2)}")
print(f"Norm of difference in weights for sgd and TD Adam: {torch.norm(weights_sgd[1:] - td_adam.weights, 2)}")


RMSE on the test set using TD SGD: 4.25275182723999
RMSE on the test set using TD Adam: 4.218838214874268
RMSE on the test set using L2 Regression: 4.154654026031494
RMSE on the test set using SGD: 4.178890705108643
---------------
Norm of difference in weights for L2 and TD SGD: 1.6105726957321167
Norm of difference in weights for sgd and TD SGD: 1.6218323707580566
Norm of difference in weights for L2 and TD Adam: 2.0549962520599365
Norm of difference in weights for sgd and TD Adam: 2.076366424560547
