In [None]:
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from scipy.special import softmax
import random

# -------------------------
# Toy dataset (unlabeled docs)
# -------------------------
docs = [
    "The puck went into the goal",
    "He scored a goal in hockey",
    "The company profit increased",
    "New investment boosted profit",
    "Hockey players use a puck",
    "Financial analysts predict growth",
]

# Classes
labels = ["hockey", "finance"]

# -------------------------
# Step 1: Feature extraction
# -------------------------
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)
vocab = vectorizer.get_feature_names_out()
N, D = X.shape
print("Vocabulary:", vocab)

# -------------------------
# Step 2: Labeled features (domain knowledge)
# -------------------------
# Format: feature -> reference distribution over classes
# Example: puck strongly => hockey, profit strongly => finance
labeled_features = {
    "puck": np.array([0.95, 0.05]),      # mostly hockey
    "goal": np.array([0.9, 0.1]),
    "profit": np.array([0.05, 0.95]),    # mostly finance
}

# -------------------------
# Step 3: GE-FL loss function
# -------------------------
def ge_loss(weights, X, labeled_features, sigma=1.0):
    """
    GE-FL objective: sum of KL divergences for labeled features
    + Gaussian prior on weights
    """
    W = weights.reshape(D, len(labels))  # weight matrix (features x classes)
    loss = 0.0

    # For each labeled feature, compute model’s predicted distribution
    for feat, ref_dist in labeled_features.items():
        if feat not in vocab:
            continue
        j = np.where(vocab == feat)[0][0]  # feature index
        mask = X[:, j].toarray().ravel() > 0
        if mask.sum() == 0:
            continue

        # Subset of docs containing the feature
        X_feat = X[mask]
        scores = X_feat @ W
        probs = softmax(scores, axis=1)
        avg_pred = probs.mean(axis=0)  # model’s expected distribution

        # KL divergence between ref_dist and avg_pred
        kl = np.sum(ref_dist * (np.log(ref_dist + 1e-9) - np.log(avg_pred + 1e-9)))
        loss += kl

    # Gaussian prior on weights
    loss += np.sum(W**2) / (2 * sigma**2)
    return loss

# -------------------------
# Step 4: Training with gradient descent
# -------------------------
def train_gefl(X, labeled_features, vocab, lr=0.5, epochs=200):
    W = np.random.randn(X.shape[1], len(labels)) * 0.01

    for epoch in range(epochs):
        # Simple gradient approximation (finite difference, for clarity)
        loss = ge_loss(W, X, labeled_features)
        grad = np.zeros_like(W)
        eps = 1e-4
        for i in range(W.shape[0]):
            for j in range(W.shape[1]):
                W[i, j] += eps
                l2 = ge_loss(W, X, labeled_features)
                W[i, j] -= eps
                grad[i, j] = (l2 - loss) / eps
        W -= lr * grad
        if epoch % 50 == 0:
            print(f"Epoch {epoch}, Loss={loss:.4f}")
    return W

W = train_gefl(X, labeled_features, vocab)

# -------------------------
# Step 5: Predictions
# -------------------------
def predict(doc):
    vec = vectorizer.transform([doc])
    scores = vec @ W
    probs = softmax(scores, axis=1)[0]
    return {c: round(p, 3) for c, p in zip(labels, probs)}

print("\nPredictions:")
print("Doc: 'Hockey match with a puck' =>", predict("Hockey match with a puck"))
print("Doc: 'Company profit report' =>", predict("Company profit report"))

In [25]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from scipy.special import softmax

# -------------------------
# Step 1: Load dataset
# -------------------------
categories = ['rec.sport.hockey', 'sci.electronics']
newsgroups = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes')
)
docs = newsgroups.data
labels = categories
print(f"Loaded {len(docs)} documents from categories: {labels}")

# -------------------------
# Step 2: Feature extraction
# -------------------------
vectorizer = CountVectorizer(max_features=2000, stop_words='english')
X = vectorizer.fit_transform(docs)
vocab = vectorizer.get_feature_names_out()
N, D = X.shape
print("Vocabulary size:", D)

# -------------------------
# Step 3: Labeled features (domain knowledge)
# -------------------------
labeled_features = {
    "puck": np.array([0.95, 0.05]),
    "goal": np.array([0.9, 0.1]),
    "hockey": np.array([0.95, 0.05]),
    "team": np.array([0.8, 0.2]),
    "stick": np.array([0.9, 0.1]),
    "circuit": np.array([0.1, 0.9]),
    "voltage": np.array([0.05, 0.95]),
    "chip": np.array([0.05, 0.95]),
    "board": np.array([0.2, 0.8]),
    "resistor": np.array([0.05, 0.95]),
}

# -------------------------
# Step 4: GE-FL loss + gradient
# -------------------------
def ge_loss_and_grad(W, X, labeled_features, sigma=1.0):
    W = W.reshape(D, len(labels))
    loss = 0.0
    grad = np.zeros_like(W)

    for feat, ref_dist in labeled_features.items():
        if feat not in vocab:
            continue

        j = np.where(vocab == feat)[0][0]
        mask = X[:, j].toarray().ravel() > 0
        if mask.sum() == 0:
            continue

        X_feat = X[mask].toarray()  # convert to dense to simplify
        scores = X_feat @ W  # (num_docs_with_feat, num_classes)
        probs = softmax(scores, axis=1)
        avg_pred = probs.mean(axis=0)

        # KL Divergence (ref || pred)
        kl = np.sum(ref_dist * (np.log(ref_dist + 1e-9) - np.log(avg_pred + 1e-9)))
        loss += kl

        # Gradient (approximation)
        diff = (avg_pred - ref_dist)[None, :]  # shape (1, num_classes)
        grad_contrib = (X_feat.T @ np.tile(diff, (X_feat.shape[0], 1))) / X_feat.shape[0]
        grad += grad_contrib

    # Gaussian prior
    loss += np.sum(W**2) / (2 * sigma**2)
    grad += W / (sigma**2)

    return loss, grad

# -------------------------
# Step 5: Training (gradient descent)
# -------------------------
def train_gefl(X, labeled_features, lr=0.1, epochs=100):
    W = np.random.randn(X.shape[1], len(labels)) * 0.01
    for epoch in range(epochs):
        loss, grad = ge_loss_and_grad(W, X, labeled_features)
        W -= lr * grad
        if epoch % 10 == 0:
            print(f"Epoch {epoch}, Loss={loss:.4f}")
    return W

W = train_gefl(X, labeled_features, lr=0.1, epochs=100)

# -------------------------
# Step 6: Prediction
# -------------------------
def predict(doc):
    vec = vectorizer.transform([doc])
    scores = vec @ W
    probs = softmax(scores, axis=1)[0]
    return {c: round(p, 3) for c, p in zip(labels, probs)}

# -------------------------
# Step 7: Example predictions
# -------------------------
print("\nPredictions:")
print("Doc: 'The hockey puck hit the goal' =>", predict("The hockey puck hit the goal"))
print("Doc: 'The voltage in the circuit is high' =>", predict("The voltage in the circuit is high"))


Loaded 1191 documents from categories: ['rec.sport.hockey', 'sci.electronics']
Vocabulary size: 2000
Epoch 0, Loss=4.2806
Epoch 10, Loss=2.3769
Epoch 20, Loss=2.3725
Epoch 30, Loss=2.5510
Epoch 40, Loss=2.5296
Epoch 50, Loss=2.4130
Epoch 60, Loss=2.4343
Epoch 70, Loss=2.5680
Epoch 80, Loss=2.5284
Epoch 90, Loss=2.4141

Predictions:
Doc: 'The hockey puck hit the goal' => {'rec.sport.hockey': np.float64(0.836), 'sci.electronics': np.float64(0.164)}
Doc: 'The voltage in the circuit is high' => {'rec.sport.hockey': np.float64(0.281), 'sci.electronics': np.float64(0.719)}
