In [5]:
import joblib
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
X = newsgroups.data
y = newsgroups.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess the data
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Apply PCA to reduce dimensionality
pca = PCA(n_components=100)  # Adjust the number of components as needed
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train a RandomForestClassifier on PCA-transformed data
model = RandomForestClassifier(random_state=42)
model.fit(X_train_pca, y_train)

# Evaluate the model
y_pred_train = model.predict(X_train_pca)
y_pred_test = model.predict(X_test_pca)

print("Accuracy on training set:", accuracy_score(y_train, y_pred_train))
print("Accuracy on test set:", accuracy_score(y_test, y_pred_test))

# Save the trained model
path = "model\pca\\"
joblib.dump(pca, path+'pca_model.joblib')
joblib.dump(vectorizer, path+'\\tfidf_vectorizer.joblib')

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred_test)
classification_rep = classification_report(
    y_test, y_pred_train, target_names=newsgroups.target_names)

# Print the results
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)


AttributeError: 'list' object has no attribute 'toarray'

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Extract the classification report data
report_data = classification_rep.split('\n')[2:-5]
labels = []
precision = []
recall = []
f1_score = []

for row in report_data:
    row_data = row.split()
    labels.append(row_data[0])
    precision.append(float(row_data[1]))
    recall.append(float(row_data[2]))
    f1_score.append(float(row_data[3]))

# Plotting precision, recall, and F1-score
plt.figure(figsize=(10, 6))
sns.set(style="whitegrid")
sns.barplot(x=labels, y=precision, color='blue', alpha=0.7, label='Precision')
sns.barplot(x=labels, y=recall, color='green', alpha=0.7, label='Recall')
sns.barplot(x=labels, y=f1_score, color='orange', alpha=0.7, label='F1-score')
plt.xticks(rotation=45)
plt.xlabel('Labels')
plt.ylabel('Score')
plt.title('Classification Report')
plt.legend()
plt.show()


NameError: name 'classification_rep' is not defined