In [None]:
import numpy as np
import scipy.sparse as sp
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.semi_supervised import SelfTrainingClassifier

# =========================
# 1. Load 20 Newsgroups dataset
# =========================
categories = [
    'sci.space', 'comp.graphics', 'rec.sport.baseball', 'talk.politics.mideast'
]  # smaller subset for clarity and speed

data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
X_text = data.data
y = data.target

# =========================
# 2. TF-IDF Vectorization
# =========================
# Unigrams + bigrams, capped at 8000 features.
# NOTE(review): the vectorizer is fitted on every document, including the ones
# that end up in the test split below, so the test set influences the
# vocabulary/IDF weights (no labels are involved). Confirm this is acceptable
# for the comparison being made.
vectorizer = TfidfVectorizer(max_features=8000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(X_text)

# =========================
# 3. Split labeled and unlabeled data
# =========================
# First split: 20% of the documents form the labeled pool, 80% are treated as
# unlabeled (their true labels in y_unlabeled are not used for training).
X_labeled, X_unlabeled, y_labeled, y_unlabeled = train_test_split(
    X, y, test_size=0.80, random_state=42, stratify=y
)

# Second split: hold out 25% of the labeled pool as the test set, so only
# 75% of the pool (15% of all documents) is used as labeled training data.
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.25, random_state=42, stratify=y_labeled
)

# Combine labeled + unlabeled rows; -1 marks a sample as unlabeled for
# SelfTrainingClassifier. The matrices are stacked in sparse form instead of
# being densified with .toarray(), which keeps memory proportional to the
# number of non-zero TF-IDF entries.
y_train_full = np.concatenate([y_train, np.full(y_unlabeled.shape, -1, dtype=int)])
X_train_full = sp.vstack([X_train, X_unlabeled], format='csr')

print(f"Total docs: {X.shape[0]}")
print(f"Labeled docs: {X_train.shape[0]}")
print(f"Unlabeled docs: {X_unlabeled.shape[0]}")
print(f"Test docs: {X_test.shape[0]}")

# =========================
# 4. Dimensionality Reduction + Classifier Pipeline
# =========================
def make_svd_logreg():
    """Return a fresh, unfitted TruncatedSVD (LSA) -> LogisticRegression pipeline.

    A new pipeline is built on every call so the semi-supervised model and the
    supervised baseline never share an estimator instance.
    """
    return make_pipeline(
        TruncatedSVD(n_components=300, random_state=42),
        LogisticRegression(max_iter=2000, solver='lbfgs', n_jobs=-1),
    )


# Logistic regression (probability output helps SelfTraining)
base_clf = make_svd_logreg()
svd = base_clf.steps[0][1]  # kept so existing references to `svd` still resolve

# =========================
# 5. Semi-supervised training
# =========================
# Lower threshold -> more aggressive pseudo-labeling
semi_supervised_model = SelfTrainingClassifier(base_clf, threshold=0.6, max_iter=15, verbose=True)
semi_supervised_model.fit(X_train_full, y_train_full)

# =========================
# 6. Evaluation
# =========================
y_pred = semi_supervised_model.predict(X_test)
semi_acc = accuracy_score(y_test, y_pred)
print(f"\nâœ… Semi-supervised model accuracy: {semi_acc:.4f}")

# =========================
# 7. Baseline (supervised only)
# =========================
# Same architecture, trained on the labeled rows only, evaluated on the same
# test split so the two accuracies are directly comparable.
supervised_model = make_svd_logreg()
supervised_model.fit(X_train, y_train)
y_pred_sup = supervised_model.predict(X_test)
sup_acc = accuracy_score(y_test, y_pred_sup)
print(f"ðŸ§  Supervised-only accuracy (small labeled set): {sup_acc:.4f}")

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# Load dataset
# -----------------------------
categories = ['sci.space', 'comp.graphics', 'rec.sport.baseball', 'talk.politics.mideast']
data = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))
X_text, y, target_names = data.data, data.target, data.target_names

# -----------------------------
# TF-IDF representation
# -----------------------------
vectorizer = TfidfVectorizer(max_features=8000, stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(X_text)
feature_names = np.array(vectorizer.get_feature_names_out())

# -----------------------------
# Define labeled features (weak supervision)
# -----------------------------
# Manually define a few indicative words for each class
labeled_keywords = {
    "sci.space": ["space", "nasa", "orbit", "rocket", "planet"],
    "comp.graphics": ["graphics", "image", "3d", "software", "animation"],
    "rec.sport.baseball": ["baseball", "pitcher", "team", "game", "league"],
    "talk.politics.mideast": ["israel", "arab", "war", "palestinian", "peace"]
}

# Map TF-IDF column index -> class name. Keywords that did not make it into
# the vectorizer vocabulary are skipped.
vocabulary = vectorizer.vocabulary_
keyword_to_class = {
    vocabulary[word]: cls
    for cls, words in labeled_keywords.items()
    for word in words
    if word in vocabulary
}

# -----------------------------
# Generate pseudo-labels using feature presence
# -----------------------------
# keyword_votes[i, k] = number of distinct class-k keywords present in doc i.
n_docs = X.shape[0]
keyword_votes = np.zeros((n_docs, len(target_names)), dtype=int)
for class_idx, class_name in enumerate(target_names):
    class_cols = [col for col, cls in keyword_to_class.items() if cls == class_name]
    if class_cols:
        present = X[:, class_cols] != 0
        keyword_votes[:, class_idx] = np.asarray(present.sum(axis=1)).ravel()

# Majority vote over keyword classes. argmax resolves ties to the lowest class
# index, so the pseudo-labels are reproducible from run to run (picking the max
# over a set of class-name strings depends on hash ordering). Documents with
# no keyword hit stay at -1.
has_keyword = keyword_votes.sum(axis=1) > 0
pseudo_labels = np.where(has_keyword, keyword_votes.argmax(axis=1), -1)

# Keep only pseudo-labeled examples
mask = pseudo_labels != -1
X_weak, y_weak = X[mask], pseudo_labels[mask]
print(f"Generated pseudo-labeled samples: {X_weak.shape[0]} out of {X.shape[0]}")

# -----------------------------
# Train supervised model on pseudo-labeled data
# -----------------------------
clf_weak = LogisticRegression(max_iter=2000, solver='lbfgs')
clf_weak.fit(X_weak, y_weak)

# Evaluate on real labels.
# NOTE: this scores every document, including the pseudo-labeled ones the
# classifier was trained on, so it is not a held-out estimate.
y_pred_weak = clf_weak.predict(X)
accuracy_weak = accuracy_score(y, y_pred_weak)
print(f"ðŸ§© Weak Supervision Accuracy: {accuracy_weak:.4f}")



Total docs: 3894
Labeled docs: 583
Unlabeled docs: 3116
Test docs: 195
End of iteration 1, added 501 new labels.
End of iteration 2, added 591 new labels.
End of iteration 3, added 339 new labels.
End of iteration 4, added 148 new labels.
End of iteration 5, added 60 new labels.
End of iteration 6, added 23 new labels.
End of iteration 7, added 23 new labels.
End of iteration 8, added 12 new labels.
End of iteration 9, added 6 new labels.
End of iteration 10, added 5 new labels.
End of iteration 11, added 6 new labels.
End of iteration 12, added 6 new labels.
End of iteration 13, added 3 new labels.
End of iteration 14, added 1 new labels.
End of iteration 15, added 3 new labels.

âœ… Semi-supervised model accuracy: 0.7949
ðŸ§  Supervised-only accuracy (small labeled set): 0.8205
Generated pseudo-labeled samples: 1921 out of 3894
ðŸ§© Weak Supervision Accuracy: 0.8570
