In [None]:
import re
from collections import Counter

class LinearRegressionTextClassifier:
    """Linear regression over bag-of-words counts, used as a text classifier.

    Every distinct training word gets one weight. The raw prediction for a
    text is ``bias + sum(weights[word] * count)`` and the class label is that
    value rounded to an integer. Training is per-example gradient descent on
    the squared error.

    Attributes:
        learning_rate: Step size applied to every gradient update.
        epochs: Number of full passes over the training data.
        weights: Mapping of word -> learned weight.
        bias: Learned intercept term.
        vocab: Set of every word seen by ``build_vocab``.
    """

    # Compiled once at class creation instead of being looked up on every call.
    _WORD_PATTERN = re.compile(r'\b[a-z]+\b')

    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = {}
        self.bias = 0
        self.vocab = set()

    def preprocess(self, text):
        """Lowercase ``text`` and return its list of a-z word tokens.

        Only runs of ASCII letters delimited by word boundaries are kept, so
        punctuation is discarded and tokens that mix letters with digits or
        underscores produce no match.
        """
        return self._WORD_PATTERN.findall(text.lower())

    def vectorize(self, text):
        """Return a ``Counter`` mapping each token of ``text`` to its count.

        The result is sparse and is not filtered against the vocabulary;
        callers decide how to treat words without a weight.
        """
        return Counter(self.preprocess(text))

    def build_vocab(self, texts):
        """Add every token found in ``texts`` to ``self.vocab``."""
        for text in texts:
            self.vocab.update(self.preprocess(text))

    def fit(self, X_train, y_train):
        """Train the model from scratch using per-example gradient descent.

        Args:
            X_train: Iterable of training texts.
            y_train: Iterable of numeric targets, one per text.

        Raises:
            ValueError: If ``X_train`` and ``y_train`` differ in length.

        Prints the summed squared error of every 100th epoch.
        """
        # Materialize both inputs: they are traversed more than once, which
        # would silently train on nothing for one-shot iterators.
        texts = list(X_train)
        targets = list(y_train)
        if len(texts) != len(targets):
            raise ValueError(
                "X_train and y_train must have the same length "
                f"(got {len(texts)} and {len(targets)})"
            )

        self.build_vocab(texts)

        # Reset all learned parameters so repeated calls to fit() do not start
        # from the bias left behind by a previous run.
        self.weights = dict.fromkeys(self.vocab, 0.0)
        self.bias = 0

        # Tokenizing is independent of the epoch, so do it once up front.
        samples = [(self.vectorize(text), y) for text, y in zip(texts, targets)]

        for epoch in range(self.epochs):
            total_loss = 0

            for features, y in samples:
                y_pred = self.bias  # Start with bias

                # Compute weighted sum
                for word, count in features.items():
                    y_pred += self.weights[word] * count

                # Compute error
                error = y_pred - y
                total_loss += error ** 2

                # Gradient of the squared error w.r.t. a weight is
                # 2 * error * count; the constant 2 is folded into the
                # learning rate.
                for word, count in features.items():
                    self.weights[word] -= self.learning_rate * error * count

                # Same gradient for the bias, whose "count" is always 1.
                self.bias -= self.learning_rate * error

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {total_loss}")

    def predict(self, X_test):
        """Return one integer label per text in ``X_test``.

        Words that have no learned weight contribute nothing. The raw
        regression output is passed through the built-in ``round``, which
        rounds exact halves to the nearest even integer; the result is not
        clamped, so labels outside the training targets are possible.
        """
        predictions = []
        for text in X_test:
            y_pred = self.bias
            for word, count in self.vectorize(text).items():
                y_pred += self.weights.get(word, 0) * count  # Default 0 if word not seen

            predictions.append(round(y_pred))  # Convert to class label
        return predictions


# Toy corpus for a two-class problem (text -> binary label).
X_train = [
    "this is technology news",
    "latest AI model released",
    "new pet food launched",
    "dog health tips",
]
# Labels are aligned with X_train by position: 1 = Tech, 0 = Pet.
y_train = [1, 1, 0, 0]

# Unseen sentences to classify after training.
X_test = [
    "AI is changing the world",
    "best dog food brands",
]

# Fit with the default hyperparameters, then label the held-out sentences.
model = LinearRegressionTextClassifier()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

print("Predictions:", predictions)

Epoch 0, Loss: 1.980884139401
Epoch 100, Loss: 0.0005045594603809715
Epoch 200, Loss: 5.160036868090747e-07
Epoch 300, Loss: 6.827768543094441e-10
Epoch 400, Loss: 9.594638115615289e-13
Epoch 500, Loss: 1.3629372898843925e-15
Epoch 600, Loss: 1.939014718070554e-18
Epoch 700, Loss: 2.7586962643200962e-21
Epoch 800, Loss: 3.924380744815483e-24
Epoch 900, Loss: 5.5868831829822894e-27
Predictions: [1, 0]


Let's break down the math:

First, our prediction is computed as:

ŷ = bias + Σ(weight_i * count_i) for each word i
This is like the wx + b from simple linear regression, but now w is a vector with one weight per vocabulary word and x is the vector of word counts for the text.


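We use the squared error of a single training example as the loss (averaging it over the dataset would give the Mean Squared Error; the code updates after every example and only sums the loss per epoch for reporting):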

L = (ŷ - y)²


For each weight w_i corresponding to word i:

∂L/∂w_i = 2(ŷ - y) * ∂(ŷ)/∂w_i
∂(ŷ)/∂w_i = count_i (how many times word i appears)
Therefore: ∂L/∂w_i = 2(ŷ - y) * count_i
In code: error * count (note: the 2 is absorbed into learning rate)


For bias:

∂L/∂b = 2(ŷ - y) * ∂(ŷ)/∂b
∂(ŷ)/∂b = 1
Therefore: ∂L/∂b = 2(ŷ - y)
In code: error (again, the 2 is absorbed into the learning rate)



The intuition:

Words that appear more often in a text (higher count) contribute more to the gradient
Whenever an example has a non-zero error, the weight of each word in it is adjusted in proportion to how many times that word appears
The error term (ŷ - y) determines the direction of the update
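If the prediction is too high (error > 0), the bias and the weights of the words in that example decrease
If the prediction is too low (error < 0), the bias and the weights of the words in that example increase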

This is essentially doing linear regression but with a high-dimensional sparse feature vector (bag of words) instead of a single feature.