In [None]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import requests
from io import StringIO
import numpy as np
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
import nltk
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,recall_score,precision_score
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer


import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook. That presumably includes scikit-learn warnings about failed fits
# during grid search — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Fetch the NLTK stopword corpus used by `stopwords.words('english')`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Load the 20 Newsgroups corpus with headers, footers and quoted replies removed.
news_data1 = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    random_state=1,
)
cats = news_data1.target_names
print(cats)

# Second fetch restricted to `cats`. Since `cats` is the full category list of
# the first fetch, this presumably returns the same documents — TODO confirm
# and drop one of the two calls.
news_data = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    categories=cats,
    random_state=1,
)
print(news_data.target_names)

# One row per post: raw text in 'X', integer class label in 'y'.
df = pd.DataFrame({'X': news_data.data, 'y': news_data.target})
df.head()



['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


18846

In [None]:
def process(text):
    """Turn one raw document into a list of stemmed tokens.

    Intended to be passed as the ``analyzer`` callable of a vectorizer.

    Steps, in order: lowercase, delete every character found in
    ``string.punctuation``, split on whitespace, drop English stopwords,
    stem each remaining token.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        Stemmed, stopword-free tokens in their original order.
    """
    # Lowercase and strip punctuation in a single pass over the string.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))

    # Build the stopword lookup once per document. Calling
    # stopwords.words('english') inside the filter re-evaluated it for every
    # token and searched a list linearly; a set gives O(1) membership tests.
    # NOTE: punctuation is removed before this comparison, so any stopword
    # entry that contains an apostrophe can no longer match its stripped form.
    stop_words = set(stopwords.words('english'))

    stemmer = Stemmer()
    return [stemmer.stem(tok) for tok in cleaned.split() if tok not in stop_words]

In [None]:
# Hold out 30% of the posts for evaluation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['X'],
    df['y'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Bag-of-words counts using `process` as the tokenizer/analyzer.
count_vectorizer = CountVectorizer(analyzer=process)
# Vocabulary is learned from the training split only ...
X_train_countv = count_vectorizer.fit_transform(X_train)
# ... and re-used unchanged to encode the test split.
X_test_countv = count_vectorizer.transform(X_test)

In [None]:
# Baseline Gaussian NB on bag-of-words counts.
# The count matrices are densified with .toarray() before every call.
gnb = GaussianNB()
gnb.fit(X_train_countv.toarray(), y_train)

# Predictions on the held-out split and on the training split.
y_pred_gnb = gnb.predict(X_test_countv.toarray())
y_pred_train = gnb.predict(X_train_countv.toarray())
cm = confusion_matrix(y_test, y_pred_gnb)

# Report: test accuracy, train accuracy, confusion matrix, per-class metrics.
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_gnb)))
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))

Gaussian Naïve Bayes Model accuracy score: 0.3800
Gaussian Naïve Bayes Training-set accuracy score: 0.9700
Confusion matrix

 [[ 5  0  0  0  0  0  0  0  0  0  1  0  0  1  1  2  5  0  7  2]
 [ 1  2  0  0  1  2  1  0  0  0  0  2  0  0  2  0  0  0  0  0]
 [ 0  1  6  1  1  2  0  1  0  0  0  3  0  0  2  0  0  0  0  0]
 [ 0  1  0  4  1  0  1  0  1  0  0  3  0  0  0  0  0  0  0  0]
 [ 0  1  0  1  3  0  2  2  0  1  0  0  3  0  1  0  0  0  0  0]
 [ 0  1  0  0  0  5  1  0  0  0  0  1  3  0  0  0  2  0  0  0]
 [ 0  0  2  0  0  0  4  2  0  0  0  0  0  1  1  1  1  0  1  0]
 [ 0  0  1  0  1  0  0  4  1  0  0  1  1  0  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  2  4  2  0  0  2  1  0  1  1  2  0  2  1]
 [ 0  1  0  0  0  0  1  0  0  7  0  0  0  1  0  0  2  0  1  2]
 [ 0  0  0  1  0  0  0  0  0  0  7  0  0  0  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0  0  0  0 12  1  0  1  0  3  0  1  0]
 [ 0  0  1  0  0  0  3  1  2  0  0  2  6  0  5  0  2  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  7  1  1  8  0

In [None]:
# Tune GaussianNB's variance smoothing over ten log-spaced values (1e-9 .. 1)
# with 3-fold cross-validation on the training counts.
param_grid_gnb = {
    'var_smoothing': np.logspace(-9, 0, 10),
}
grid_gnb = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_gnb,
    cv=3,
    scoring='accuracy',
)
grid_gnb.fit(X_train_countv.toarray(), y_train)

In [None]:
# Score the best estimator found by the search on the held-out counts, then report.
y_pred_gnb = grid_gnb.best_estimator_.predict(X_test_countv.toarray())
print("Best GaussianNB params:", grid_gnb.best_params_)
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(9.999999999999999e-06)}
GaussianNB Accuracy: 0.38666666666666666


In [None]:
# Baseline Multinomial NB on bag-of-words counts.
# The sparse count matrices are passed as-is: MultinomialNB accepts sparse
# input, so densifying with .toarray() only costs memory.
mnb = MultinomialNB()
mnb.fit(X_train_countv, y_train)

# Predictions on the held-out split and on the training split.
y_pred_mnb = mnb.predict(X_test_countv)
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train_countv)
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

Multinomial Naïve Bayes Model accuracy score: 0.4233
Multinomial Naïve Bayes Training-set accuracy score: 0.9314
Confusion matrix

 [[ 4  0  0  0  0  0  0  2  0  0  0  0  0  1  0  7  4  0  6  0]
 [ 0  4  0  0  1  1  0  0  0  0  0  1  1  0  2  0  0  0  1  0]
 [ 0  0  8  0  0  2  1  0  0  0  0  2  0  1  2  0  0  0  1  0]
 [ 0  1  0  7  1  0  0  0  0  0  0  0  1  0  0  0  1  0  0  0]
 [ 0  0  0  3  4  0  1  1  0  0  0  0  4  0  1  0  0  0  0  0]
 [ 1  1  0  0  1  7  0  0  0  0  0  2  0  0  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  5  2  0  0  1  0  1  0  1  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  5  0  0  0  0  1  0  2  0  1  0  1  0]
 [ 1  0  0  0  1  0  2  2  6  0  1  0  0  0  0  0  4  0  2  0]
 [ 0  0  0  0  0  0  1  1  0  2  7  0  0  0  0  0  0  0  4  0]
 [ 0  0  0  0  1  0  0  0  0  0  7  0  0  0  0  0  0  0  1  0]
 [ 0  0  1  0  0  0  0  0  0  0  0 12  1  0  0  0  3  0  2  0]
 [ 0  1  0  1  0  0  2  1  0  0  1  2  6  0  2  1  4  0  2  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  1  0  7  1  0

In [None]:
# Tune MultinomialNB on the count features with 3-fold cross-validation.
# `alpha` is an additive smoothing term and must not be negative. The previous
# grid, np.linspace(-2, 1, 10), spent six of its ten candidates on negative
# values (plus 0), whose fits fail. Use ten positive, log-spaced values
# from 1e-3 to 1 instead.
param_grid_mnb = {
    'alpha': np.logspace(-3, 0, 10),
    'fit_prior': [True, False],
}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
# MultinomialNB accepts sparse input, so no .toarray() is needed.
grid_mnb.fit(X_train_countv, y_train)

In [None]:
# Report the winning hyper-parameters and score the best estimator on the
# held-out counts. The sparse matrix is passed directly (no dense copy needed).
print("Best MultinomialNB params:", grid_mnb.best_params_)
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test_countv)
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

Best MultinomialNB params: {'alpha': np.float64(0.33333333333333304), 'fit_prior': True}
MultinomialNB Accuracy: 0.48333333333333334


In [None]:
# Baseline Bernoulli NB on bag-of-words counts.
# BernoulliNB accepts sparse input, so the count matrices are passed as-is
# instead of being densified with .toarray().
bnb = BernoulliNB()
bnb.fit(X_train_countv, y_train)

# Predictions on the held-out split and on the training split.
y_pred_bnb = bnb.predict(X_test_countv)
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train_countv)
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.1167
Bernoulli Naïve Bayes Training-set accuracy score: 0.2943
Confusion matrix

 [[ 0  0  0  0  0  0  4  0  0  0  0  0  0  0 18  0  2  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  0  0  9  0  0  0  0  1  0  0  7  0  0  0  0  0]
 [ 0  0  0  0  0  0  7  0  0  0  0  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  1  0  8  0  0  0  0  0  0  0  5  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  0  0  0  0  0  0 11  0  0  0  0  0]
 [ 0  0  0  0  0  0 12  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0  7  0  0  0  0  0]
 [ 0  0  0  0  0  0 11  0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  0  0  8  0  0  0  0  0  0  0  7  0  0  0  0  0]
 [ 0  0  0  0  0  0  4  0  0  0  1  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0  0  0 13  0  0  0  0  0  0  0 10  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  0  0  0  1  0  1 13  0  0 

In [None]:
# Tune BernoulliNB on the count features with 3-fold cross-validation:
# ten smoothing values in [0.1, 10] x ten binarisation thresholds in [0, 1].
param_grid_bnb = {
    'alpha': np.linspace(0.1, 10, 10),
    'binarize': np.linspace(0.0, 1.0, 10),
}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
# BernoulliNB accepts sparse input; skipping .toarray() avoids a dense copy
# of the whole document-term matrix.
grid_bnb.fit(X_train_countv, y_train)

In [None]:
# Report the winning hyper-parameters and score the best estimator on the
# held-out counts. The sparse matrix is passed directly (no dense copy needed).
print("Best BernoulliNB params:", grid_bnb.best_params_)
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test_countv)
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))

Best BernoulliNB params: {'alpha': np.float64(0.1), 'binarize': np.float64(0.0)}
BernoulliNB Accuracy: 0.31333333333333335


# TF-IDF features

In [None]:
# TF-IDF weighted features using `process` as the tokenizer/analyzer.
vectorizer = TfidfVectorizer(analyzer=process)
# Vocabulary and IDF weights are learned from the training split only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... and re-used unchanged to encode the test split.
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Baseline Gaussian NB on TF-IDF features.
# The TF-IDF matrices are densified with .toarray() before every call.
gnb = GaussianNB()
gnb.fit(X_train_tfidf.toarray(), y_train)

# Predictions on the held-out split and on the training split.
y_pred_gnb = gnb.predict(X_test_tfidf.toarray())
y_pred_train = gnb.predict(X_train_tfidf.toarray())
cm = confusion_matrix(y_test, y_pred_gnb)

# Report: test accuracy, train accuracy, confusion matrix, per-class metrics.
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_gnb)))
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))

Gaussian Naïve Bayes Model accuracy score: 0.3667
Gaussian Naïve Bayes Training-set accuracy score: 0.9743
Confusion matrix

 [[ 6  0  0  0  0  0  0  0  0  0  1  0  0  0  1  4  3  2  5  2]
 [ 1  3  0  0  1  1  1  0  0  0  0  2  0  0  2  0  0  0  0  0]
 [ 0  1  3  2  1  3  0  1  0  0  0  2  0  1  3  0  0  0  0  0]
 [ 0  0  0  4  1  0  1  0  2  0  0  3  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  3  0  4  0  1  1  0  0  3  0  1  0  0  0  0  0]
 [ 0  3  0  0  0  6  1  0  0  0  0  2  1  0  0  0  0  0  0  0]
 [ 0  0  1  1  0  0  3  2  0  0  0  0  1  1  1  1  1  0  1  0]
 [ 0  0  0  1  1  0  1  2  1  0  0  1  1  0  0  0  1  0  1  0]
 [ 0  0  1  0  0  0  2  3  2  0  0  2  0  0  3  1  2  0  2  1]
 [ 0  1  0  0  0  0  1  0  0  7  0  0  0  1  0  0  2  0  1  2]
 [ 0  0  0  0  1  0  1  0  0  0  7  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  0 11  0  0  2  0  4  0  1  0]
 [ 0  0  0  1  0  0  3  1  2  0  0  1  8  0  4  0  2  0  1  0]
 [ 0  0  0  0  0  0  1  0  0  0  0  0  0  7  1  1  7  0

In [None]:
# Tune GaussianNB's variance smoothing over ten log-spaced values (1e-9 .. 1)
# with 3-fold cross-validation on the training TF-IDF features.
param_grid_gnb = {
    'var_smoothing': np.logspace(-9, 0, 10),
}
grid_gnb = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_gnb,
    cv=3,
    scoring='accuracy',
)
grid_gnb.fit(X_train_tfidf.toarray(), y_train)

In [None]:
# Score the best estimator found by the search on the held-out TF-IDF features, then report.
y_pred_gnb = grid_gnb.best_estimator_.predict(X_test_tfidf.toarray())
print("Best GaussianNB params:", grid_gnb.best_params_)
print("GaussianNB Accuracy:", accuracy_score(y_test, y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(0.01)}
GaussianNB Accuracy: 0.43666666666666665


In [None]:
# Baseline Multinomial NB on TF-IDF features.
# The sparse TF-IDF matrices are passed as-is: MultinomialNB accepts sparse
# input, so densifying with .toarray() only costs memory.
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)

# Predictions on the held-out split and on the training split.
y_pred_mnb = mnb.predict(X_test_tfidf)
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train_tfidf)
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

In [None]:
# Tune MultinomialNB on the TF-IDF features with 3-fold cross-validation.
# `alpha` is an additive smoothing term and must not be negative. The previous
# grid, np.linspace(-2, 1, 10), spent six of its ten candidates on negative
# values (plus 0), whose fits fail. Use ten positive, log-spaced values
# from 1e-3 to 1 instead.
param_grid_mnb = {
    'alpha': np.logspace(-3, 0, 10),
    'fit_prior': [True, False],
}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
# MultinomialNB accepts sparse input, so no .toarray() is needed.
grid_mnb.fit(X_train_tfidf, y_train)

In [None]:
# Report the winning hyper-parameters and score the best estimator on the
# held-out TF-IDF features. The sparse matrix is passed directly (no dense copy needed).
print("Best MultinomialNB params:", grid_mnb.best_params_)
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test_tfidf)
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

In [None]:
# Baseline Bernoulli NB on TF-IDF features.
# BernoulliNB accepts sparse input, so the TF-IDF matrices are passed as-is
# instead of being densified with .toarray().
# NOTE(review): with default settings BernoulliNB presumably thresholds every
# feature to present/absent, discarding the TF-IDF weights — the recorded
# scores for this cell match the count-feature run exactly. Confirm whether
# that is intended.
bnb = BernoulliNB()
bnb.fit(X_train_tfidf, y_train)

# Predictions on the held-out split and on the training split.
y_pred_bnb = bnb.predict(X_test_tfidf)
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train_tfidf)
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.1167
Bernoulli Naïve Bayes Training-set accuracy score: 0.2943
Confusion matrix

 [[ 0  0  0  0  0  0  4  0  0  0  0  0  0  0 18  0  2  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  0  0  9  0  0  0  0  1  0  0  7  0  0  0  0  0]
 [ 0  0  0  0  0  0  7  0  0  0  0  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  1  0  8  0  0  0  0  0  0  0  5  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  0  0  0  0  0  0 11  0  0  0  0  0]
 [ 0  0  0  0  0  0 12  0  0  0  0  0  0  0  1  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0  7  0  0  0  0  0]
 [ 0  0  0  0  0  0 11  0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  0  0  8  0  0  0  0  0  0  0  7  0  0  0  0  0]
 [ 0  0  0  0  0  0  4  0  0  0  1  0  0  0  4  0  0  0  0  0]
 [ 0  0  0  0  0  0  3  0  0  0  0  0  0  0 16  0  0  0  0  0]
 [ 0  0  0  0  0  0 13  0  0  0  0  0  0  0 10  0  0  0  0  0]
 [ 0  0  0  0  0  0  2  0  0  0  0  1  0  1 13  0  0 

In [None]:
# Tune BernoulliNB on the TF-IDF features with 3-fold cross-validation:
# ten smoothing values in [0.1, 10] x ten binarisation thresholds in [0, 1].
param_grid_bnb = {
    'alpha': np.linspace(0.1, 10, 10),
    'binarize': np.linspace(0.0, 1.0, 10),
}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
# BernoulliNB accepts sparse input; skipping .toarray() avoids a dense copy
# of the whole document-term matrix.
grid_bnb.fit(X_train_tfidf, y_train)

In [None]:
# Report the winning hyper-parameters and score the best estimator on the
# held-out TF-IDF features. The sparse matrix is passed directly (no dense copy needed).
print("Best BernoulliNB params:", grid_bnb.best_params_)
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test_tfidf)
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))