In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Load dataset
newsgroups = fetch_20newsgroups(subset='all', categories=['rec.sport.baseball', 'sci.space'], shuffle=True, random_state=42)

In [6]:
len(newsgroups.target)

1981

In [10]:
newsgroups.data[0]

'From: mss@netcom.com (Mark Singer)\nSubject: Re: Young Catchers\nArticle-I.D.: netcom.mssC52qMx.768\nOrganization: Netcom Online Communications Services (408-241-9760 login: guest)\nLines: 86\n\nIn article <7975@blue.cis.pitt.edu> genetic+@pitt.edu (David M. Tate) writes:\n>mss@netcom.com (Mark Singer) said:\n>>\n>>We know that very, very few players at this age make much of an impact\n>>in the bigs, especially when they haven\'t even played AAA ball.  \n>\n>Yes.  But this is *irrelevant*.  You\'re talking about averages, when we\n>have lots of information about THIS PLAYER IN PARTICULAR to base our\n>decisions on.\n\nDo you really have *that* much information on him?  Really?\n\n>Why isn\'t Lopez likely to hit that well?  He hit that well last year (after\n>adjusting his stats for park and league and such); he hit better (on an\n>absolute scale) than Olson or Berryhill did.  By a lot.\n\nI don\'t know.  You tell me.  What percentage of players reach or \nexceed their MLE\'s *in their

In [14]:
newsgroups.target[0]

0

In [11]:
X = newsgroups.data
y = newsgroups.target

In [13]:
print(len(X), len(y))

1981 1981


In [15]:
# Create a DataFrame for easy manipulation
df = pd.DataFrame({'text': X, 'label': y})
df.head()

Unnamed: 0,text,label
0,From: mss@netcom.com (Mark Singer)\nSubject: R...,0
1,From: cuz@chaos.cs.brandeis.edu (Cousin It)\nS...,0
2,From: J019800@LMSC5.IS.LMSC.LOCKHEED.COM\nSubj...,0
3,From: tedward@cs.cornell.edu (Edward [Ted] Fis...,0
4,From: snichols@adobe.com (Sherri Nichols)\nSub...,0


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [17]:
# Transform the text data to feature vectors
X_vect = vectorizer.fit_transform(df['text'])

In [18]:
# Labels
y = df['label']

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [21]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_vect, y, test_size=0.3, random_state=42)

In [23]:
X_train.shape

(1386, 27120)

In [24]:
X_test.shape

(595, 27120)

In [25]:
# Initialize and train the classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

In [26]:
from sklearn.metrics import accuracy_score, classification_report

In [27]:
# Predict on the test set
y_pred = clf.predict(X_test)

In [28]:
# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=newsgroups.target_names)

In [29]:
print(f'Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)

Accuracy: 0.9966
Classification Report:
                    precision    recall  f1-score   support

rec.sport.baseball       0.99      1.00      1.00       286
         sci.space       1.00      0.99      1.00       309

          accuracy                           1.00       595
         macro avg       1.00      1.00      1.00       595
      weighted avg       1.00      1.00      1.00       595



In [30]:
def predict_category(text):
    """
    Predict the category of a given text using the trained classifier.
    """
    text_vec = vectorizer.transform([text])
    prediction = clf.predict(text_vec)
    return newsgroups.target_names[prediction[0]]

# Example usage
sample_text = "NASA announced the discovery of new exoplanets."
predicted_category = predict_category(sample_text)
print(f'The predicted category is: {predicted_category}')

The predicted category is: sci.space
