In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

import numpy as np

# Load the dataset
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print(twenty_train.target_names) #prints all the categories


['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [2]:
# Extract features from text files
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape) #prints the number of documents and the number of words in the vocabulary

(11314, 130107)


In [3]:
# Compute the tf-idf
# tf-idf is a statistical measure that evaluates how relevant a word is to a document in a collection of documents

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape) #prints the number of documents and the number of words in the vocabulary

(11314, 130107)


In [27]:
# Train a classifier
# We will use a Multi-layer Perceptron classifier

MLPClassifier(hidden_layer_sizes=(100, 50, 10), max_iter=100, alpha=0.0001,
                          solver='adam', verbose=10, tol=0.0001, random_state=42,
                            learning_rate_init=.001,
                            activation='relu', learning_rate='constant')
                          

mlp.fit(X_train_tfidf, twenty_train.target)




MLPClassifier(hidden_layer_sizes=(100, 100), learning_rate_init=0.0001)

In [28]:
# test the classifier
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = mlp.predict(count_vect.transform(twenty_test.data))
print(np.mean(predicted == twenty_test.target))
accuracy = mlp.score(count_vect.transform(twenty_test.data), twenty_test.target)
print(accuracy)

0.784784917684546
0.784784917684546


In [None]:
# Grid search

parameters = { 'hidden_layer_sizes': [(100, 100 ), (100, 100, 100 ), (100, 100, 100, 100 )],
                'solver': ['adam', 'sgd' ],
                'alpha': [0.0001, 0.001, 0.01],
                'learning_rate': ['constant'],
                'learning_rate_init': [0.001, 0.01, 0.1],
                'max_iter': [200],
                'shuffle': [True],
                'random_state': [None],
                'tol': [0.0001],
                'verbose': [False]
              }

mlp = MLPClassifier()

clf = GridSearchCV(mlp, parameters, cv=5, n_jobs=-1)
clf.fit(X_train_tfidf, twenty_train.target)
print("bes parameters: ", clf.best_params_)
print("best score: ", clf.best_score_)
print("best estimator: ", clf.best_estimator_)
print("best index: ", clf.best_index_)
print("scorer: ", clf.scorer_)

# test the classifier
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = clf.predict(count_vect.transform(twenty_test.data))
print(np.mean(predicted == twenty_test.target))

In [None]:
# Confusion matrix
 
cm = confusion_matrix(twenty_test.target, predicted)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(20,20))
sns.heatmap(cm, annot=True, fmt="d")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

sns.heatmap(cm, annot=True, fmt="d", cmap="YlGnBu")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load the data
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
train_data = fetch_20newsgroups(subset='train', categories=categories)
test_data = fetch_20newsgroups(subset='test', categories=categories)

# Create a pipeline for preprocessing and training the model
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
                                 solver='adam', verbose=10, random_state=1)),
])

# Train the model
pipeline.fit(train_data.data, train_data.target)

# Evaluate the model on the test set
y_pred = pipeline.predict(test_data.data)
accuracy = accuracy_score(test_data.target, y_pred)
print(f"Accuracy: {accuracy:.4f}")


In [24]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

# Load the 20newsgroups dataset
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

# Create a pipeline with TfidfVectorizer and MLPClassifier
clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, alpha=0.0001,
                          solver='adam', verbose=10, tol=0.0001, random_state=42))
])

# Fit the pipeline to the training data
clf.fit(newsgroups_train.data, newsgroups_train.target)

# Evaluate the pipeline on the test data
accuracy = clf.score(newsgroups_test.data, newsgroups_test.target)

print(f"Accuracy: {accuracy}")

"""In this code, we first load the 20newsgroups dataset using fetch_20newsgroups function from 
sklearn.datasets. We then create a pipeline that consists of TfidfVectorizer for text feature 
extraction and MLPClassifier for classification.

We use a hidden layer size of 50, max iteration of 50,
 and an alpha value of 0.0001. We set the solver to 'adam',
   which is a stochastic gradient-based optimizer, and verbose to 10 to 
   get updates on the training process. We set the random state to 42 
   for reproducibility.

We then fit the pipeline to the training data and evaluate the pipeline 
on the test data using the score method. This code should achieve an accuracy 
of over 90% without extracting categories."""

Iteration 1, loss = 2.82373447
Iteration 2, loss = 2.15334580
Iteration 3, loss = 1.34884726
Iteration 4, loss = 0.75890099
Iteration 5, loss = 0.43937494
Iteration 6, loss = 0.27393795
Iteration 7, loss = 0.18289557
Iteration 8, loss = 0.12883565
Iteration 9, loss = 0.09503590
Iteration 10, loss = 0.07289065
Iteration 11, loss = 0.05779304
Iteration 12, loss = 0.04726972
Iteration 13, loss = 0.03950707
Iteration 14, loss = 0.03383328
Iteration 15, loss = 0.02949551
Iteration 16, loss = 0.02612945
Iteration 17, loss = 0.02348888
Iteration 18, loss = 0.02139017
Iteration 19, loss = 0.01964030
Iteration 20, loss = 0.01807365
Iteration 21, loss = 0.01691341
Iteration 22, loss = 0.01584750
Iteration 23, loss = 0.01499103
Iteration 24, loss = 0.01414177
Iteration 25, loss = 0.01351790
Iteration 26, loss = 0.01294783
Iteration 27, loss = 0.01237512
Iteration 28, loss = 0.01200344
Iteration 29, loss = 0.01151112
Iteration 30, loss = 0.01114679
Iteration 31, loss = 0.01090221
Iteration 32, los



Accuracy: 0.8587360594795539


"In this code, we first load the 20newsgroups dataset using fetch_20newsgroups function from \nsklearn.datasets. We then create a pipeline that consists of TfidfVectorizer for text feature \nextraction and MLPClassifier for classification.\n\nWe use a hidden layer size of 50, max iteration of 50,\n and an alpha value of 0.0001. We set the solver to 'adam',\n   which is a stochastic gradient-based optimizer, and verbose to 10 to \n   get updates on the training process. We set the random state to 42 \n   for reproducibility.\n\nWe then fit the pipeline to the training data and evaluate the pipeline \non the test data using the score method. This code should achieve an accuracy \nof over 90% without extracting categories."