Exp5 NLP basics with Built in stuff on colab


In [4]:
# =====================================
#NLP Classifier: Tech vs Sports News
# =====================================

# Step 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 2: Load two relevant categories from 20 Newsgroups
categories = ['comp.sys.mac.hardware', 'rec.sport.baseball']

news_data = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=0,
    remove=('headers', 'footers', 'quotes')
)

print(f" Loaded {len(news_data.data)} documents across categories:")
print(f" Categories: {categories}")
print("\nSample article (first 300 characters):\n")
print(news_data.data[0][:300], "...\n")

# Step 3: Preprocess function
def clean_text(text):
    """Lowercase, remove short tokens and non-alphabetic content."""
    return ' '.join(word.lower() for word in text.split() if word.isalpha() and len(word) > 2)

# Check preprocessing on a sample
print("ðŸ§¹ Cleaned version of sample:")
print(clean_text(news_data.data[0])[:300], "...\n")

# Step 4: Vectorize using TF-IDF
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    preprocessor=clean_text
)
X = vectorizer.fit_transform(news_data.data)
y = news_data.target

# Step 5: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
print(f"Data split: {X_train.shape[0]} training, {X_test.shape[0]} testing samples")

# Step 6: Train a Naive Bayes model
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Step 7: Evaluate the model
acc = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {acc:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=categories))

# Step 8: Show top keywords for each class
def display_top_words(clf, feature_names, labels, n=10):
    for i, label in enumerate(labels):
        top_ids = np.argsort(clf.feature_log_prob_[i])[-n:]
        top_terms = feature_names[top_ids]
        print(f"\n Top {n} words for category '{label}':")
        print(", ".join(top_terms))

feature_names = np.array(vectorizer.get_feature_names_out())
display_top_words(classifier, feature_names, categories)

# Step 9: Predict new articles
new_articles = [
    "The MacBook Pro M2 chip offers great performance for developers and creative professionals.",
    "The Yankees scored a grand slam in the bottom of the ninth inning to win the game."
]

new_vectors = vectorizer.transform(new_articles)
new_preds = classifier.predict(new_vectors)

print("\nNew Article Predictions:")
for text, pred in zip(new_articles, new_preds):
    print(f" Text: {text[:60]}...")
    print(f" Predicted Topic: {categories[pred]}\n")

print("NLP classification workflow completed.")


 Loaded 1957 documents across categories:
 Categories: ['comp.sys.mac.hardware', 'rec.sport.baseball']

Sample article (first 300 characters):

As a Philly fan as as a Penna. baseball fan, I'm anxious to see the
Penna. series.  Anyone know when it starts and where the first games
will be played?

This is (I think) always good baseball (to me); and the Pirates are
also off to a good start. ...

ðŸ§¹ Cleaned version of sample:
philly fan baseball anxious see the anyone know when starts and where the first games will this always good baseball and the pirates are also off good ...

Data split: 1369 training, 588 testing samples

Model Accuracy: 0.9541

Detailed Classification Report:
                       precision    recall  f1-score   support

comp.sys.mac.hardware       0.96      0.95      0.95       291
   rec.sport.baseball       0.95      0.96      0.95       297

             accuracy                           0.95       588
            macro avg       0.95      0.95      0.95     

viva questions
