In [None]:
import random

class LinearRegression:
    """Single-feature linear model ``y = weight * x + bias`` fitted by SGD.

    Parameters are updated one sample at a time, using the gradient of the
    per-sample squared error ``(y_pred - y_true) ** 2``.
    """

    def __init__(self, learning_rate=0.01):
        """Start from random parameters drawn with ``random.random()``.

        Args:
            learning_rate: Step size multiplied into every gradient update.
        """
        # Bias is drawn before weight; the order matters for anyone who
        # seeds the global ``random`` module to get a repeatable start.
        self.bias = random.random()
        self.weight = random.random()
        self.learning_rate = learning_rate

    def forward(self, x):
        """Return the model's prediction ``x * weight + bias`` for input x."""
        return x * self.weight + self.bias

    def compute_loss(self, x, y_true):
        """Return the squared error of the prediction for one sample.

        Args:
            x: Input value.
            y_true: Target value for ``x``.
        """
        y_pred = self.forward(x)
        loss = (y_pred - y_true) ** 2
        return loss

    def backward(self, x, y_true):
        """Take one gradient-descent step on a single ``(x, y_true)`` sample.

        With ``error = y_pred - y_true`` the squared-error gradients are
        ``dL/dw = 2 * error * x`` and ``dL/db = 2 * error``. ``weight`` and
        ``bias`` are updated in place; nothing is returned.
        """
        y_pred = self.forward(x)
        error = y_pred - y_true
        dw = 2 * error * x
        db = 2 * error

        # The gradient points in the direction of steepest increase of the
        # loss, so to minimise it we step in the opposite direction.
        self.weight = self.weight - dw * self.learning_rate
        self.bias = self.bias - db * self.learning_rate

    def train(self, X, y, epochs=1000, log_every=100):
        """Run per-sample gradient descent over the data for several epochs.

        Args:
            X: Iterable of input values.
            y: Iterable of target values, paired with ``X`` position by
                position.
            epochs: Number of full passes over the data.
            log_every: Print the accumulated epoch loss whenever
                ``epoch % log_every == 0``. A falsy value (``0`` or
                ``None``) disables printing.

        Raises:
            ValueError: If ``X`` and ``y`` both support ``len()`` and their
                lengths differ.
        """
        # zip() stops at the shorter input, which would silently drop
        # samples; reject mismatched lengths whenever they can be measured.
        try:
            n_inputs, n_targets = len(X), len(y)
        except TypeError:
            # Unsized iterables cannot be checked up front; pair them as-is.
            pass
        else:
            if n_inputs != n_targets:
                raise ValueError(
                    f"X and y must have the same length, "
                    f"got {n_inputs} and {n_targets}"
                )

        for epoch in range(epochs):
            # Sum of per-sample losses, each measured just before that
            # sample's update is applied.
            loss = 0
            for x_i, y_i in zip(X, y):
                loss += self.compute_loss(x_i, y_i)
                self.backward(x_i, y_i)
            if log_every and epoch % log_every == 0:
                print('cur loss=', loss)



# Example usage: fit the model to noisy samples of y = 2x + 1 and report
# the learned parameters plus one prediction.
if __name__ == "__main__":
    # Generate synthetic data: ten integer inputs with uniform noise in
    # [-0.5, 0.5] added to each target.
    X = list(range(10))
    y = [2*x + 1 + random.uniform(-0.5, 0.5) for x in X]  # y = 2x + 1 + noise

    # Create and train model
    model = LinearRegression(learning_rate=0.0001)
    model.train(X, y, epochs=1000)

    # Print final parameters (plain string: nothing to interpolate here)
    print("\nFinal parameters:")
    print(f"Weight: {model.weight:.4f}")
    print(f"Bias: {model.bias:.4f}")

    # Test predictions
    test_x = 5
    prediction = model.forward(test_x)
    print(f"\nPrediction for x={test_x}: {prediction:.4f}")

cur loss= 1127.4268297392841
cur loss= 0.6667051248802688
cur loss= 0.6540229707238138
cur loss= 0.6509225577995307
cur loss= 0.6481547128434395
cur loss= 0.6456830120566202
cur loss= 0.6434757796659846
cur loss= 0.641504729198289
cur loss= 0.6397445989737351
cur loss= 0.6381728286808036

Final parameters:
Weight: 2.0082
Bias: 1.0203

Prediction for x=5: 11.0615


Let me break down how we derive these gradient formulas step by step:

First, let's write out our equations:

Prediction: ŷ = wx + b
Loss function (squared error for a single sample; MSE is the mean of this over all samples): L = (ŷ - y)² = (wx + b - y)²


For gradient descent, we need ∂L/∂w and ∂L/∂b. We'll use the chain rule.
For weight (w):

Using chain rule on L = (wx + b - y)²
∂L/∂w = 2(wx + b - y) * ∂(wx + b - y)/∂w
∂(wx + b - y)/∂w = x
Therefore: ∂L/∂w = 2(wx + b - y)x
In code: dw = 2 * error * x, where error = y_pred - y_true = (wx + b - y)


For bias (b):

Using chain rule again on L = (wx + b - y)²
∂L/∂b = 2(wx + b - y) * ∂(wx + b - y)/∂b
∂(wx + b - y)/∂b = 1
Therefore: ∂L/∂b = 2(wx + b - y)
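In code: db = 2 * error, where error = y_pred - y_true = (wx + b - y)

Finally, each parameter is moved a small step against its gradient, scaled by the learning rate η:
w ← w - η * ∂L/∂w
b ← b - η * ∂L/∂b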