In [7]:
import joblib
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Preprocess the data
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply PCA to reduce dimensionality
pca = PCA(n_components=100)  # Adjust the number of components as needed
X_train_pca = pca.fit_transform(X_train.toarray())
X_test_pca = pca.transform(X_test.toarray())

# Train a RandomForestClassifier on PCA-transformed data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_pca, y_train)

# Evaluate the model
y_pred_train = model.predict(X_train_pca)
y_pred_test = model.predict(X_test_pca)

print("Accuracy on training set:", accuracy_score(y_train, y_pred_train))
print("Accuracy on test set:", accuracy_score(y_test, y_pred_test))

# Save the trained model
joblib.dump(model, 'news_group_model_with_pca.joblib')


Accuracy on training set: 0.9703502255240116
Accuracy on test set: 0.5665782493368701


['news_group_model_with_pca.joblib']