In [None]:
import random

class LinearRegression:
    """Single-feature linear regression (y = w*x + b) trained one sample at a time.

    Attributes:
        learning_rate: Step size applied to every gradient update.
        weight: Scalar slope, initialised with ``random.random()``.
        bias: Scalar intercept, initialised with ``random.random()``.
    """

    def __init__(self, learning_rate=0.01):
        """Store the learning rate and draw random starting parameters."""
        self.learning_rate = learning_rate
        self.weight = random.random()
        self.bias = random.random()

    def forward(self, x):
        """Return the prediction ``weight * x + bias`` for input ``x``."""
        return self.weight * x + self.bias

    def compute_loss(self, x, y_true):
        """Return the squared error ``(prediction - y_true) ** 2`` for one sample.

        Averaging this value over a dataset gives the mean squared error.
        """
        return (self.forward(x) - y_true) ** 2

    def backward(self, x, y_true):
        """Return ``(dw, db)``, the gradients of the single-sample squared error.

        Applying the chain rule to L = (w*x + b - y)**2 gives
        dL/dw = 2 * (w*x + b - y) * x and dL/db = 2 * (w*x + b - y).
        """
        error = self.forward(x) - y_true
        dw = 2 * error * x
        db = 2 * error
        return dw, db

    def train_step(self, x, y_true):
        """Take one gradient-descent step on the sample ``(x, y_true)``."""
        dw, db = self.backward(x, y_true)
        self.weight -= self.learning_rate * dw
        self.bias -= self.learning_rate * db

    def train(self, X, y, epochs=100):
        """Run per-sample gradient descent over the paired data for ``epochs`` passes.

        Args:
            X: Iterable of scalar inputs.
            y: Iterable of scalar targets, paired with ``X`` by position.
            epochs: Number of full passes over the data.

        Every 10th epoch the average post-update loss of that pass is printed.
        Pairs are formed with ``zip``, so surplus items in the longer input are
        ignored. If there are no pairs the call returns without doing anything.
        """
        # Materialise the pairs once so the average below divides by the number
        # of samples actually processed (not len(X)) and so one-shot iterables
        # can be reused on every epoch.
        samples = list(zip(X, y))
        if not samples:
            return

        for epoch in range(epochs):
            total_loss = 0
            for x_i, y_i in samples:
                self.train_step(x_i, y_i)
                # Loss is measured after the update for this sample.
                total_loss += self.compute_loss(x_i, y_i)

            if epoch % 10 == 0:
                print(f"Epoch {epoch}, Loss: {total_loss / len(samples)}")

# Demo: fit the scalar model to noisy samples of y = 2x + 1
if __name__ == "__main__":
    # Inputs 0..9; each target is the true line plus uniform noise in [-0.5, 0.5]
    X = list(range(10))
    y = [2*x_i + 1 + random.uniform(-0.5, 0.5) for x_i in X]

    # Small learning rate keeps the per-sample updates stable for x up to 9
    model = LinearRegression(learning_rate=0.001)
    model.train(X, y, epochs=100)

    # Report the learned slope and intercept
    print(
        "\nFinal parameters:",
        f"Weight: {model.weight:.4f}",
        f"Bias: {model.bias:.4f}",
        sep="\n",
    )

    # Spot-check a single prediction
    test_x = 5
    prediction = model.forward(test_x)
    print(f"\nPrediction for x={test_x}: {prediction:.4f}")

Epoch 0, Loss: 36.757080029575256
Epoch 10, Loss: 0.19977622757534674
Epoch 20, Loss: 0.18659541133428664
Epoch 30, Loss: 0.17534905316806532
Epoch 40, Loss: 0.16528948538440674
Epoch 50, Loss: 0.15628952113809388
Epoch 60, Loss: 0.14823631434145768
Epoch 70, Loss: 0.14102910792648632
Epoch 80, Loss: 0.134577933743953
Epoch 90, Loss: 0.12880245359361447

Final parameters:
Weight: 2.0399
Bias: 0.6807

Prediction for x=5: 10.8804


Let me break down how we derive these gradient formulas step by step:

First, let's write out our equations:

Prediction: ŷ = wx + b
Loss function (squared error for a single sample; the MSE is its average over all samples): L = (ŷ - y)² = (wx + b - y)²


For gradient descent, we need ∂L/∂w and ∂L/∂b. We'll use the chain rule.
For weight (w):

Using chain rule on L = (wx + b - y)²
∂L/∂w = 2(wx + b - y) * ∂(wx + b - y)/∂w
∂(wx + b - y)/∂w = x
Therefore: ∂L/∂w = 2(wx + b - y)x
In code: dw = 2 * error * x where error = (wx + b - y)


For bias (b):

Using chain rule again on L = (wx + b - y)²
∂L/∂b = 2(wx + b - y) * ∂(wx + b - y)/∂b
∂(wx + b - y)/∂b = 1
Therefore: ∂L/∂b = 2(wx + b - y)
In code: db = 2 * error where error = (wx + b - y)

In [None]:
import numpy as np

class LinearRegression:
    """Multi-feature linear regression trained with full-batch gradient descent.

    The model is y = Xw + b. ``weights`` is a column vector of shape
    (n_features, 1) and ``bias`` is a scalar; both start at values drawn from
    ``np.random.rand``.
    """

    def __init__(self, n_features=3, learning_rate=0.01):
        """Create a model for inputs with ``n_features`` columns."""
        self.learning_rate = learning_rate
        self.weights = np.random.rand(n_features, 1)
        self.bias = np.random.rand()

    @staticmethod
    def _as_matrix(X):
        """Return X as an array, promoting a 1-D sample to shape (1, n_features)."""
        X = np.asarray(X)
        # reshape(1, -1): one row, column count inferred from the data,
        # e.g. [0.5, 0.3, 0.8] -> [[0.5, 0.3, 0.8]].
        return X.reshape(1, -1) if X.ndim == 1 else X

    @staticmethod
    def _as_column(y_true):
        """Return y_true as an array, promoting a 1-D vector to shape (n_samples, 1)."""
        y_true = np.asarray(y_true)
        # Without this, (n, 1) - (n,) would broadcast to an (n, n) matrix.
        return y_true.reshape(-1, 1) if y_true.ndim == 1 else y_true

    def forward(self, X):
        """Return predictions Xw + b with shape (n_samples, 1).

        X: shape (n_samples, n_features), or (n_features,) for a single sample.
        """
        return np.dot(self._as_matrix(X), self.weights) + self.bias

    def compute_loss(self, X, y_true):
        """Return the mean squared error between the predictions and y_true."""
        residual = self.forward(X) - self._as_column(y_true)
        return np.mean(residual ** 2)

    def backward(self, X, y_true):
        """Return ``(dw, db)``, the gradients of the loss from ``compute_loss``.

        For L = (1/m) * sum((Xw + b - y)**2):
            dL/dw = (2/m) * X^T (Xw + b - y)   -> shape (n_features, 1)
            dL/db = (2/m) * sum(Xw + b - y)    -> scalar
        The factor 2 comes from differentiating the square; it keeps these
        gradients consistent with the loss value that is reported.
        """
        X = self._as_matrix(X)
        y_true = self._as_column(y_true)

        error = self.forward(X) - y_true
        n_samples = X.shape[0]

        dw = (2 / n_samples) * np.dot(X.T, error)
        db = 2 * np.mean(error)

        return dw, db

    def train_step(self, X, y_true):
        """Apply one gradient-descent update computed from all rows of X."""
        dw, db = self.backward(X, y_true)
        self.weights -= self.learning_rate * dw
        self.bias -= self.learning_rate * db

    def train(self, X, y, epochs=100):
        """Run ``epochs`` full-batch updates, printing the loss every 10th epoch.

        X and y may be arrays or array-likes (e.g. nested lists); they are
        converted to arrays once before the loop.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        for epoch in range(epochs):
            self.train_step(X, y)

            # The loss is evaluated after the update of this epoch.
            if epoch % 10 == 0:
                loss = self.compute_loss(X, y)
                print(f"Epoch {epoch}, Loss: {loss:.6f}")

# Demo: recover known weights and bias from noisy synthetic data
if __name__ == "__main__":
    np.random.seed(42)  # fixed seed so every run draws the same data
    n_samples = 100

    # Feature matrix: one row per sample, 3 columns
    X = np.random.rand(n_samples, 3)

    # Ground-truth parameters used to generate the targets
    true_weights = np.array([[2.0], [-1.5], [3.0]])
    true_bias = 1.0

    # Targets: y = Xw + b plus Gaussian noise with standard deviation 0.5
    noise = np.random.normal(0, 0.5, (n_samples, 1))
    y = np.dot(X, true_weights) + true_bias + noise

    # Fit the model with full-batch gradient descent
    model = LinearRegression(n_features=3, learning_rate=0.1)
    model.train(X, y, epochs=100)

    # Report the learned parameters
    print(f"\nFinal parameters:")
    print(f"Weights:\n{model.weights.flatten()}")
    print(f"Bias: {model.bias:.4f}")

    # Compare one prediction with the noise-free value from the true parameters
    test_X = np.array([0.5, 0.3, 0.8])
    prediction = model.forward(test_X)[0, 0]
    expected = np.dot(test_X[np.newaxis, :], true_weights) + true_bias
    print(f"\nTest input: {test_X}")
    print(f"Prediction: {prediction:.4f}")
    print(f"Expected (without noise): {expected[0, 0]:.4f}")

This line is calculating the gradient of the loss function with respect to the weights (the derivative of the loss with respect to each weight parameter). Let me break it down:
dw = (1/X.shape[0]) * np.dot(X.T, error)

X.shape[0] gives the number of samples in your dataset (the number of rows in X).
1/X.shape[0] is the scaling factor that averages the gradient across all samples.
X.T is the transpose of the feature matrix X. If X is shape (n_samples, n_features), then X.T becomes shape (n_features, n_samples).
error is the difference between predicted values and actual values: y_pred - y_true. This has shape (n_samples, 1).
np.dot(X.T, error) performs matrix multiplication between the transposed feature matrix and the error vector. The resulting shape is (n_features, 1), which gives you the gradient for each weight parameter.

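Mathematically, this operation is computing:
(1/m) * X^T * (Xw + b - y)
which equals dL/dw for the MSE loss up to a constant factor of 2 (the full derivation appears further below).
Where:

m is the number of samples
X^T is the transpose of X
(Xw + b - y) is the prediction error y_pred - y_true, with b the bias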

This gradient tells you how much each weight contributes to the error and in which direction to adjust each weight to reduce the loss. Multiplying by (1/m) averages the gradient across all training examples.
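In the context of linear regression with MSE loss, the exact analytical gradient is (2/m) * X^T * (Xw + b - y). The expression above, with its 1/m factor, is that gradient without the constant 2; it has the same direction, so the omission only rescales the effective learning rate. The gradient itself points in the direction of steepest ascent of the loss, which is why gradient descent subtracts it from the weights.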

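Derivation of the weight gradient from the MSE loss. The bias b does not depend on w, so write r = y - b; then y_pred - y_true = Xw + b - y = Xw - r.

L = (1/m) * Σ(y_pred - y_true)²

L = (1/m) * (Xw + b - y)^T * (Xw + b - y)
L = (1/m) * [(Xw)^T - r^T] * [Xw - r]
L = (1/m) * [w^T * X^T - r^T] * [Xw - r]
L = (1/m) * [w^T * X^T * Xw - w^T * X^T * r - r^T * Xw + r^T * r]

∂L/∂w = (1/m) * [2*X^T*X*w - X^T*r - X^T*r + 0]
      = (1/m) * [2*X^T*X*w - 2*X^T*r]
      = (2/m) * X^T*(Xw - r)
      = (2/m) * X^T*(Xw + b - y)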

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

class LinearRegression(nn.Module):
    """Linear model y = Xw + b wrapping a single ``nn.Linear`` layer."""

    def __init__(self, n_features=3):
        """Build the layer mapping ``n_features`` inputs to one output."""
        super().__init__()
        # nn.Linear holds both the weight matrix and the bias term.
        self.linear = nn.Linear(in_features=n_features, out_features=1)

    def forward(self, X):
        """Return the layer's output for the batch ``X``."""
        predictions = self.linear(X)
        return predictions

# Synthetic dataset (seeded so every run draws the same values)
torch.manual_seed(42)
n_samples = 100
n_features = 3

# Feature matrix: one row per sample, one column per feature
X = torch.rand(n_samples, n_features)

# Ground-truth parameters used to generate the targets
true_weights = torch.tensor([[2.0], [-1.5], [3.0]])
true_bias = 1.0

# Targets: y = Xw + b plus Gaussian noise scaled by 0.5
noise = torch.randn(n_samples, 1) * 0.5
y = X @ true_weights + true_bias + noise

# Model, MSE objective, and plain SGD over all model parameters
model = LinearRegression(n_features=n_features)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Full-batch training: each epoch is one forward/backward pass and one update
epochs = 100
for epoch in range(epochs):
    optimizer.zero_grad()  # clear gradients left over from the previous epoch
    predictions = model(X)
    loss = criterion(predictions, y)
    loss.backward()  # populate .grad on every parameter
    optimizer.step()  # apply the SGD update

    # The logged loss was computed before this epoch's update
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.6f}")

# Report the learned parameters
learned_weights = model.linear.weight.data.flatten().tolist()
learned_bias = model.linear.bias.item()
print("\nFinal parameters:")
print(f"Weights: {learned_weights}")
print(f"Bias: {learned_bias:.4f}")

# Compare one prediction with the noise-free value from the true parameters
test_X = torch.tensor([[0.5, 0.3, 0.8]])
prediction = model(test_X).item()
expected = (test_X @ true_weights + true_bias).item()
print(f"\nTest input: {test_X.flatten().tolist()}")
print(f"Prediction: {prediction:.4f}")
print(f"Expected (without noise): {expected:.4f}")
