# Naive Bayes for Text Classification

In [1]:
# Import libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [6]:
# Load a four-category slice of the 20 Newsgroups corpus (kept small for clarity).
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Both splits are requested with identical settings: the same category list,
# and scikit-learn is asked to strip headers, footers and quotes from each post.
# NOTE(review): the saved run of this cell ended with
#   TypeError: TarFile.extractall() got an unexpected keyword argument 'filter'
# Presumably the installed scikit-learn passes `filter=` to tarfile on a Python
# build that predates extraction filters -- confirm and align the versions.
shared_fetch_kwargs = dict(
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_train = fetch_20newsgroups(subset='train', **shared_fetch_kwargs)
newsgroups_test = fetch_20newsgroups(subset='test', **shared_fetch_kwargs)

# Report the size of each split, then preview the first 500 characters
# of the first training document.
print("Training samples:", len(newsgroups_train.data))
print("Test samples:", len(newsgroups_test.data))
print("\nExample text:\n", newsgroups_train.data[0][:500])


TypeError: TarFile.extractall() got an unexpected keyword argument 'filter'

In [None]:
# Text-classification pipeline: raw text -> word counts -> TF-IDF -> Multinomial Naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),       # turn each document into word counts
    ('tfidf', TfidfTransformer()),     # re-weight the counts as TF-IDF
    ('clf', MultinomialNB()),          # Naive Bayes classifier on the weighted features
])

# Fit the whole pipeline on the training split.
text_clf.fit(newsgroups_train.data, y=newsgroups_train.target)

# Predict labels for the held-out test split; keep the true labels under one
# name so every metric below is computed against the same array.
y_test = newsgroups_test.target
y_pred = text_clf.predict(newsgroups_test.data)

# Overall accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

# Per-class metrics, labelled with the newsgroup names.
print("\nClassification Report:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred,
        target_names=newsgroups_test.target_names,
    )
)

# Confusion matrix of true vs. predicted labels.
print("\nConfusion Matrix:")
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

# Classify a single new sentence with the fitted pipeline.
sample_text = ["I believe religion and science can coexist without conflict."]
predicted = text_clf.predict(sample_text)
print(f"\nPredicted category: {newsgroups_train.target_names[predicted[0]]}")
