In [None]:
import requests
import tarfile
import os

# Source archive and local destinations (absolute paths under /content).
url = "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
output_path = "/content/news20.tar.gz"
extract_path = "/content/20_newsgroups"

# Seconds to wait for the server to connect / send the next bytes. Without a
# timeout, requests waits indefinitely on a stalled connection.
DOWNLOAD_TIMEOUT_S = 60

print(f"Downloading {url}...")
try:
    # Stream the archive to disk in 8 KiB chunks instead of holding it in memory.
    # The `with` block releases the connection even if writing fails midway.
    with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT_S) as response:
        response.raise_for_status()
        with open(output_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    print("Download complete.")

    print(f"Extracting {output_path} to {extract_path}...")
    with tarfile.open(output_path, 'r:gz') as tar:
        # The archive arrives over plain HTTP, so treat it as untrusted: the
        # "data" filter rejects members that would land outside extract_path
        # (absolute paths, "..", escaping links). Interpreters without
        # extraction-filter support fall back to the unfiltered call.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_path, filter="data")
        else:
            tar.extractall(path=extract_path)
    print("Extraction complete.")

    print("\nExtracted directories:")
    print(os.listdir(extract_path))

except requests.exceptions.RequestException as e:
    # Network-level failures: connection errors, timeouts, HTTP error statuses.
    print(f"Error during download: {e}")
except tarfile.TarError as e:
    # Corrupt archive, or a member rejected by the extraction filter.
    print(f"Error during extraction: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

Downloading http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz...
Download complete.
Extracting /content/news20.tar.gz to /content/20_newsgroups...


  tar.extractall(path=extract_path)


Extraction complete.

Extracted directories:
['20_newsgroup']


In [None]:
import os

# The archive unpacks into a single wrapper folder ("20_newsgroup"), so the
# per-category directories live one level below the extraction root. Pointing
# at the root itself finds no documents.
data_dir = os.path.join("/content/20_newsgroups", "20_newsgroup")
newsgroups = []  # category label for each document, parallel to `texts`
texts = []       # raw text of each document

# os.listdir returns entries in arbitrary order; sorting keeps the corpus order
# (and any seeded split made from it later) the same on every machine.
for category in sorted(os.listdir(data_dir)):
    category_path = os.path.join(data_dir, category)
    if os.path.isdir(category_path):
        for filename in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, filename)
            if os.path.isfile(file_path):
                # Explicit encoding so decoding does not depend on the locale;
                # bytes that are not valid UTF-8 are dropped.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    texts.append(f.read())
                    newsgroups.append(category)

print(f"Loaded {len(texts)} documents.")
print(f"Number of newsgroup categories: {len(set(newsgroups))}")
print("First 10 newsgroups:", newsgroups[:10])

Loaded 0 documents.
Number of newsgroup categories: 0
First 10 newsgroups: []


In [None]:
# Inspect what sits directly under the extraction root.
extracted_entries = os.listdir("/content/20_newsgroups")
print(extracted_entries)

['20_newsgroup']


In [None]:
# Category folders live inside the "20_newsgroup" wrapper directory.
data_dir = "/content/20_newsgroups/20_newsgroup"
newsgroups = []  # category label for each document, parallel to `texts`
texts = []       # raw text of each document

# os.listdir returns entries in arbitrary order; sorting keeps the corpus order
# (and any seeded split made from it later) the same on every machine.
for category in sorted(os.listdir(data_dir)):
    category_path = os.path.join(data_dir, category)
    if os.path.isdir(category_path):
        for filename in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, filename)
            if os.path.isfile(file_path):
                # Explicit encoding so decoding does not depend on the locale;
                # bytes that are not valid UTF-8 are dropped.
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    texts.append(f.read())
                    newsgroups.append(category)

print(f"Loaded {len(texts)} documents.")
print(f"Number of newsgroup categories: {len(set(newsgroups))}")
print("First 10 newsgroups:", newsgroups[:10])

Loaded 19997 documents.
Number of newsgroup categories: 20
First 10 newsgroups: ['comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics', 'comp.graphics']


In [None]:
# Distinct category names in alphabetical order. Iterating a set of strings
# directly can yield a different order on every kernel start, which makes the
# printed listing (and anything indexing into it) unstable.
unique_newsgroups = sorted(set(newsgroups))
print("Unique newsgroup categories:", unique_newsgroups)

Unique newsgroup categories: ['rec.autos', 'talk.religion.misc', 'talk.politics.misc', 'comp.sys.mac.hardware', 'sci.crypt', 'talk.politics.guns', 'alt.atheism', 'sci.med', 'talk.politics.mideast', 'misc.forsale', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'soc.religion.christian', 'sci.electronics', 'comp.os.ms-windows.misc', 'rec.sport.baseball', 'comp.graphics', 'sci.space', 'rec.sport.hockey', 'rec.motorcycles']


In [None]:
# The two categories kept for the binary classification task.
newsgroup_1 = 'sci.space'
newsgroup_2 = 'rec.autos'

# Walk documents and labels together, keeping only pairs whose label is one of
# the two chosen categories; original corpus order is preserved.
selected_groups = (newsgroup_1, newsgroup_2)
selected_pairs = [
    (doc, group)
    for doc, group in zip(texts, newsgroups)
    if group in selected_groups
]

texts_two_newsgroups = [doc for doc, _ in selected_pairs]
newsgroups_two_newsgroups = [group for _, group in selected_pairs]

print(f"Loaded {len(texts_two_newsgroups)} documents for {newsgroup_1} and {newsgroup_2}.")

Loaded 2000 documents for sci.space and rec.autos.


In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 10% of the two-category corpus as the test set,
# stratified so both categories keep their proportions.
X_train_initial, X_test, y_train_initial, y_test = train_test_split(
    texts_two_newsgroups,
    newsgroups_two_newsgroups,
    test_size=0.1,
    random_state=42,
    stratify=newsgroups_two_newsgroups,
)

# Stage 2: treat 75% of the remaining training pool as "unlabeled".
# y_train_unlabeled holds the true labels of that pool.
X_train_labeled, X_train_unlabeled, y_train_labeled, y_train_unlabeled = train_test_split(
    X_train_initial,
    y_train_initial,
    test_size=0.75,
    random_state=42,
    stratify=y_train_initial,
)

print(
    f"Initial training set size: {len(X_train_initial)}",
    f"Test set size: {len(X_test)}",
    f"Labeled training set size: {len(X_train_labeled)}",
    f"Unlabeled training set size: {len(X_train_unlabeled)}",
    sep="\n",
)

Initial training set size: 1800
Test set size: 200
Labeled training set size: 450
Unlabeled training set size: 1350


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vocabulary and IDF weights are learned from the labeled training documents
# only; the unlabeled pool and the test set are projected onto that vocabulary.
vectorizer = TfidfVectorizer()
X_train_labeled_tfidf = vectorizer.fit_transform(X_train_labeled)
X_train_unlabeled_tfidf, X_test_tfidf = (
    vectorizer.transform(docs) for docs in (X_train_unlabeled, X_test)
)

print(
    f"Shape of Labeled Training TF-IDF matrix: {X_train_labeled_tfidf.shape}",
    f"Shape of Unlabeled Training TF-IDF matrix: {X_train_unlabeled_tfidf.shape}",
    f"Shape of Test TF-IDF matrix: {X_test_tfidf.shape}",
    sep="\n",
)

Shape of Labeled Training TF-IDF matrix: (450, 15544)
Shape of Unlabeled Training TF-IDF matrix: (1350, 15544)
Shape of Test TF-IDF matrix: (200, 15544)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelSpreading
import numpy as np

# Final classifier; trained below on true labels plus propagated pseudo-labels.
lr_model = LogisticRegression(max_iter=1000)

# Graph-based propagation used to pseudo-label the unlabeled pool.
# NOTE(review): TF-IDF rows are presumably L2-normalised (the vectorizer's
# default), which bounds pairwise squared distances by 2; with gamma=0.5 the RBF
# affinities would then all fall in a narrow band and the graph carries little
# class signal. Check the pseudo-label quality against y_train_unlabeled and
# consider tuning gamma or the kernel — TODO confirm.
label_spreading_model = LabelSpreading(kernel='rbf', gamma=0.5)

n_unlabeled = len(y_train_unlabeled)

# Dense feature matrix with labeled rows first, then unlabeled rows. It is built
# once and reused for both the propagation step and the final classifier.
X_train_combined_tfidf = np.vstack((X_train_labeled_tfidf.toarray(), X_train_unlabeled_tfidf.toarray()))

# Known labels followed by a -1 sentinel for every unlabeled document. The object
# dtype keeps the sentinel an integer instead of coercing it to the string '-1'.
y_train_semi_supervised = np.concatenate((
    np.asarray(y_train_labeled, dtype=object),
    np.full(n_unlabeled, -1, dtype=object),
))

# Integer encoding of the classes: label -> index into the sorted unique labels.
unique_labels = np.unique(y_train_labeled)
label_mapping = {label: i for i, label in enumerate(unique_labels)}

# Numeric targets for LabelSpreading, where -1 marks a sample as unlabeled.
# Encoded directly from the known labels, so an unexpected label raises a
# KeyError instead of being silently treated as "unlabeled".
y_train_semi_supervised_numeric = np.concatenate((
    np.array([label_mapping[label] for label in y_train_labeled], dtype=int),
    np.full(n_unlabeled, -1, dtype=int),
))

label_spreading_model.fit(X_train_combined_tfidf, y_train_semi_supervised_numeric)

predicted_labels_numeric = label_spreading_model.predict(X_train_combined_tfidf)

# Decode class indices back to newsgroup names. The fitted model only knows the
# classes present in label_mapping, so every prediction indexes unique_labels
# and no placeholder class can leak into the training targets.
inverse_label_mapping = {i: label for label, i in label_mapping.items()}
predicted_labels = unique_labels[predicted_labels_numeric]

# Rows after the labeled block are the pseudo-labels for the unlabeled pool.
predicted_unlabeled_labels = predicted_labels[len(y_train_labeled):]

# Same rows, in the same order, as the propagation input: reuse the matrix
# rather than materialising a second dense copy. True labels are kept for the
# labeled block; propagated labels are used for the rest.
X_train_final = X_train_combined_tfidf
y_train_final = np.concatenate((y_train_labeled, predicted_unlabeled_labels))

lr_model.fit(X_train_final, y_train_final)

print("Semi-supervised model trained.")

Semi-supervised model trained.


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the semi-supervised logistic regression on the held-out test documents,
# passing the test features as a dense array.
X_test_dense = X_test_tfidf.toarray()
y_pred = lr_model.predict(X_test_dense)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)

# Confusion matrix of true labels against predicted labels.
confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(confusion)

Accuracy: 0.6000

Classification Report:
              precision    recall  f1-score   support

   rec.autos       1.00      0.20      0.33       100
   sci.space       0.56      1.00      0.71       100

    accuracy                           0.60       200
   macro avg       0.78      0.60      0.52       200
weighted avg       0.78      0.60      0.52       200


Confusion Matrix:
[[ 20  80]
 [  0 100]]


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Supervised baseline: multinomial Naive Bayes fitted on the labeled subset only
# (no pseudo-labels). fit() returns the estimator itself, so construction and
# training are chained.
nb_model = MultinomialNB().fit(X_train_labeled_tfidf, y_train_labeled)

print("Naive Bayes model trained on labeled data.")

Naive Bayes model trained on labeled data.


In [None]:
from sklearn.metrics import accuracy_score, classification_report

def evaluate_on_test(model_name, model, X_eval):
    """Print test accuracy and a classification report for a fitted model.

    Args:
        model_name: Human-readable name used as the prefix of the printed headings.
        model: Fitted estimator exposing ``predict``.
        X_eval: Test features, in whatever representation is passed to
            ``model.predict``.

    Returns:
        Tuple ``(y_pred, accuracy)``: the predicted labels and their accuracy
        against the notebook-level ``y_test``.
    """
    y_pred = model.predict(X_eval)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")

    print(f"\n{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))
    return y_pred, accuracy


# Semi-supervised logistic regression, evaluated on dense test features.
y_pred_lr, accuracy_lr = evaluate_on_test("Logistic Regression", lr_model, X_test_tfidf.toarray())

# Supervised Naive Bayes baseline, evaluated on the sparse TF-IDF matrix directly.
y_pred_nb, accuracy_nb = evaluate_on_test("Naive Bayes", nb_model, X_test_tfidf)

Logistic Regression Accuracy: 0.6000

Logistic Regression Classification Report:
              precision    recall  f1-score   support

   rec.autos       1.00      0.20      0.33       100
   sci.space       0.56      1.00      0.71       100

    accuracy                           0.60       200
   macro avg       0.78      0.60      0.52       200
weighted avg       0.78      0.60      0.52       200

Naive Bayes Accuracy: 0.9850

Naive Bayes Classification Report:
              precision    recall  f1-score   support

   rec.autos       1.00      0.97      0.98       100
   sci.space       0.97      1.00      0.99       100

    accuracy                           0.98       200
   macro avg       0.99      0.98      0.98       200
weighted avg       0.99      0.98      0.98       200

