In [None]:
import pandas as pd
import requests
from io import StringIO
import numpy as np
import string
from nltk.corpus import stopwords
from nltk import PorterStemmer as Stemmer
import nltk
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,recall_score,precision_score
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer


import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides warnings raised by scikit-learn while fitting models or running
# grid searches, so failed fits will not be reported.
warnings.filterwarnings('ignore')


In [None]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is used by the
# text pre-processing function defined further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
url = "https://github.com/mujahidashraf/data/blob/main/spam.csv?raw=true"
# A timeout keeps the cell from hanging indefinitely if the host is unreachable.
response = requests.get(url, timeout=30)
response.raise_for_status()  # Ensure we notice bad responses (e.g., 404 or 403)
# `response.text` is already-decoded text, so an `encoding=` argument given to
# read_csv has no effect on a StringIO buffer. The file is Latin-1 encoded, so
# tell requests which codec to use before the body is decoded.
response.encoding = 'latin1'
df = pd.read_csv(StringIO(response.text))
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Remove the empty trailing columns. Reassigning (instead of inplace=True) and
# ignoring already-missing labels keeps this cell safe to re-run.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Give the label and message columns the conventional y / X names
column_names = {'v1': 'y', 'v2': 'X'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,y,X
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return NLTK's stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        # stopwords.words() returns a list; load it once and keep it as a set
        # so membership tests are O(1) instead of a scan per token.
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def process(text):
    """Turn a raw message into a list of stemmed, stop-word-free tokens.

    Steps, in order: lowercase, delete every character found in
    ``string.punctuation``, split on whitespace, drop English stop words,
    and stem what remains with the Porter stemmer.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        The stemmed tokens (empty if nothing survives filtering).
    """
    # lowercase it
    text = text.lower()
    # remove punctuation in a single pass
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove stopwords
    # NOTE(review): punctuation is stripped first, so apostrophe-containing
    # stop words can no longer match here; order kept as in the original.
    stop_words = _stopword_set('english')
    tokens = [t for t in text.split() if t not in stop_words]
    # stemming
    st = Stemmer()
    # return token list
    return [st.stem(t) for t in tokens]

In [None]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['X'],
    df['y'],
    test_size=0.2,
    random_state=42,
)

# All Naive Bayes models with CountVectorizer (bag of words)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training messages
# only and then reused unchanged to encode the held-out messages.
count_vectorizer = CountVectorizer(analyzer=process)
X_train_countv = count_vectorizer.fit_transform(raw_documents=X_train)
X_test_countv = count_vectorizer.transform(raw_documents=X_test)

In [None]:
# GaussianNB is given dense arrays here, so expand each split exactly once
# (the training matrix was previously densified twice).
X_train_countv_dense = X_train_countv.toarray()
X_test_countv_dense = X_test_countv.toarray()

gnb = GaussianNB()
gnb.fit(X_train_countv_dense, y_train)
y_pred_gnb = gnb.predict(X_test_countv_dense)
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_gnb)))
y_pred_train = gnb.predict(X_train_countv_dense)
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_gnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))

# The dense copies are large; drop them so they do not linger in the kernel.
del X_train_countv_dense, X_test_countv_dense

Gaussian Naïve Bayes Model accuracy score: 0.8789
Gaussian Naïve Bayes Training-set accuracy score: 0.9396
Confusion matrix

 [[843 122]
 [ 13 137]]
              precision    recall  f1-score   support

         ham       0.98      0.87      0.93       965
        spam       0.53      0.91      0.67       150

    accuracy                           0.88      1115
   macro avg       0.76      0.89      0.80      1115
weighted avg       0.92      0.88      0.89      1115



In [None]:
# Tune var_smoothing over ten log-spaced values between 1e-9 and 1
param_grid_gnb = {'var_smoothing': np.logspace(start=-9, stop=0, num=10)}
grid_gnb = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_gnb,
    scoring='accuracy',
    cv=3,
)
grid_gnb.fit(X_train_countv.toarray(), y_train)

In [None]:
# Report the winning setting and score the refitted estimator on the test split
best_gnb = grid_gnb.best_estimator_
print("Best GaussianNB params:", grid_gnb.best_params_)
y_pred_gnb = best_gnb.predict(X_test_countv.toarray())
print("GaussianNB Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(0.01)}
GaussianNB Accuracy: 0.9820627802690582


In [None]:
# MultinomialNB works directly on scipy sparse matrices, so the count matrix is
# passed as-is instead of being expanded into a dense array.
mnb = MultinomialNB()
mnb.fit(X_train_countv, y_train)
y_pred_mnb = mnb.predict(X_test_countv)
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train_countv)
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

Multinomial Naïve Bayes Model accuracy score: 0.9812
Multinomial Naïve Bayes Training-set accuracy score: 0.9930
Confusion matrix

 [[961   4]
 [ 17 133]]
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.97      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
# Show the hyper-parameters the fitted MultinomialNB is currently using
para = mnb.get_params(deep=True)
para

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}

In [None]:
# alpha is the additive smoothing strength and must be non-negative. A linear
# grid starting at -2 contains negative candidates whose fits fail, so search
# 10**-2 .. 10**1 on a log scale instead.
param_grid_mnb = {'alpha': np.logspace(-2, 1, 10), 'fit_prior': [True, False]}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
# MultinomialNB accepts sparse input, so no dense copy is made.
grid_mnb.fit(X_train_countv, y_train)

In [None]:
print("Best MultinomialNB params:", grid_mnb.best_params_)
# The refitted estimator predicts straight from the sparse matrix; no dense copy needed.
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test_countv)
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

Best MultinomialNB params: {'alpha': np.float64(1.0), 'fit_prior': True}
MultinomialNB Accuracy: 0.9811659192825112


In [None]:
# BernoulliNB binarizes its input itself and accepts scipy sparse matrices, so
# the count matrix is passed as-is instead of being expanded into a dense array.
bnb = BernoulliNB()
bnb.fit(X_train_countv, y_train)
y_pred_bnb = bnb.predict(X_test_countv)
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train_countv)
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.9704
Bernoulli Naïve Bayes Training-set accuracy score: 0.9845
Confusion matrix

 [[964   1]
 [ 32 118]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       0.99      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
# Tune the smoothing strength and the binarization threshold.
# NOTE(review): the features here are word counts, so thresholds strictly
# between 0 and 1 presumably all binarize identically - confirm whether the
# full binarize grid is needed for this representation.
param_grid_bnb = {'alpha': np.linspace(0.1, 10, 10), 'binarize': np.linspace(0.0, 1.0, 10)}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
# BernoulliNB accepts sparse input, so no dense copy is made.
grid_bnb.fit(X_train_countv, y_train)

In [None]:
print("Best BernoulliNB params:", grid_bnb.best_params_)
# The refitted estimator predicts straight from the sparse matrix; no dense copy needed.
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test_countv)
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))

Best BernoulliNB params: {'alpha': np.float64(0.1), 'binarize': np.float64(0.0)}
BernoulliNB Accuracy: 0.9874439461883409


In [None]:
# Preview the ten evenly spaced alpha candidates used in the BernoulliNB grid
np.linspace(start=0.1, stop=10, num=10)

array([ 0.1,  1.2,  2.3,  3.4,  4.5,  5.6,  6.7,  7.8,  8.9, 10. ])

# With TF-IDF

In [None]:
# TF-IDF features: vocabulary and IDF weights are learned from the training
# messages only and then reused unchanged to encode the held-out messages.
vectorizer = TfidfVectorizer(analyzer=process)
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [None]:
# GaussianNB is given dense arrays here, so expand each split exactly once
# (the training matrix was previously densified twice).
X_train_tfidf_dense = X_train_tfidf.toarray()
X_test_tfidf_dense = X_test_tfidf.toarray()

gnb = GaussianNB()
gnb.fit(X_train_tfidf_dense, y_train)
y_pred_gnb = gnb.predict(X_test_tfidf_dense)
print('Gaussian Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_gnb)))
y_pred_train = gnb.predict(X_train_tfidf_dense)
print('Gaussian Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_gnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_gnb))

# The dense copies are large; drop them so they do not linger in the kernel.
del X_train_tfidf_dense, X_test_tfidf_dense

Gaussian Naïve Bayes Model accuracy score: 0.8762
Gaussian Naïve Bayes Training-set accuracy score: 0.9396
Confusion matrix

 [[842 123]
 [ 15 135]]
              precision    recall  f1-score   support

         ham       0.98      0.87      0.92       965
        spam       0.52      0.90      0.66       150

    accuracy                           0.88      1115
   macro avg       0.75      0.89      0.79      1115
weighted avg       0.92      0.88      0.89      1115



In [None]:
# Tune var_smoothing over ten log-spaced values between 1e-9 and 1
param_grid_gnb = {'var_smoothing': np.logspace(start=-9, stop=0, num=10)}
grid_gnb = GridSearchCV(
    estimator=GaussianNB(),
    param_grid=param_grid_gnb,
    scoring='accuracy',
    cv=3,
)
grid_gnb.fit(X_train_tfidf.toarray(), y_train)

In [None]:
# Report the winning setting and score the refitted estimator on the test split
best_gnb = grid_gnb.best_estimator_
print("Best GaussianNB params:", grid_gnb.best_params_)
y_pred_gnb = best_gnb.predict(X_test_tfidf.toarray())
print("GaussianNB Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_gnb))

Best GaussianNB params: {'var_smoothing': np.float64(1.0)}
GaussianNB Accuracy: 0.9695067264573991


In [None]:
# MultinomialNB works directly on scipy sparse matrices, so the TF-IDF matrix is
# passed as-is instead of being expanded into a dense array.
mnb = MultinomialNB()
mnb.fit(X_train_tfidf, y_train)
y_pred_mnb = mnb.predict(X_test_tfidf)
print('Multinomial Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_mnb)))
y_pred_train = mnb.predict(X_train_tfidf)
print('Multinomial Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_mnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_mnb))

Multinomial Naïve Bayes Model accuracy score: 0.9623
Multinomial Naïve Bayes Training-set accuracy score: 0.9740
Confusion matrix

 [[965   0]
 [ 42 108]]
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [None]:
# alpha is the additive smoothing strength and must be non-negative. A linear
# grid starting at -2 contains negative candidates whose fits fail, so search
# 10**-2 .. 10**1 on a log scale instead.
param_grid_mnb = {'alpha': np.logspace(-2, 1, 10), 'fit_prior': [True, False]}
grid_mnb = GridSearchCV(MultinomialNB(), param_grid_mnb, cv=3, scoring='accuracy')
# MultinomialNB accepts sparse input, so no dense copy is made.
grid_mnb.fit(X_train_tfidf, y_train)

In [None]:
print("Best MultinomialNB params:", grid_mnb.best_params_)
# The refitted estimator predicts straight from the sparse matrix; no dense copy needed.
y_pred_mnb = grid_mnb.best_estimator_.predict(X_test_tfidf)
print("MultinomialNB Accuracy:", accuracy_score(y_test, y_pred_mnb))

Best MultinomialNB params: {'alpha': np.float64(0.33333333333333304), 'fit_prior': True}
MultinomialNB Accuracy: 0.97847533632287


In [None]:
# BernoulliNB binarizes its input itself and accepts scipy sparse matrices, so
# the TF-IDF matrix is passed as-is instead of being expanded into a dense array.
bnb = BernoulliNB()
bnb.fit(X_train_tfidf, y_train)
y_pred_bnb = bnb.predict(X_test_tfidf)
print('Bernoulli Naïve Bayes Model accuracy score: {0:0.4f}'.format(accuracy_score(y_test, y_pred_bnb)))
y_pred_train = bnb.predict(X_train_tfidf)
print('Bernoulli Naïve Bayes Training-set accuracy score: {0:0.4f}'.format(accuracy_score(y_train, y_pred_train)))
cm = confusion_matrix(y_test, y_pred_bnb)

print('Confusion matrix\n\n', cm)
print(classification_report(y_test, y_pred_bnb))

Bernoulli Naïve Bayes Model accuracy score: 0.9704
Bernoulli Naïve Bayes Training-set accuracy score: 0.9845
Confusion matrix

 [[964   1]
 [ 32 118]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       0.99      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [None]:
# Tune the smoothing strength and the threshold above which a TF-IDF weight
# counts as "term present".
param_grid_bnb = {'alpha': np.linspace(0.1, 10, 10), 'binarize': np.linspace(0.0, 1.0, 10)}
grid_bnb = GridSearchCV(BernoulliNB(), param_grid_bnb, cv=3, scoring='accuracy')
# BernoulliNB accepts sparse input, so no dense copy is made.
grid_bnb.fit(X_train_tfidf, y_train)

In [None]:
print("Best BernoulliNB params:", grid_bnb.best_params_)
# The refitted estimator predicts straight from the sparse matrix; no dense copy needed.
y_pred_bnb = grid_bnb.best_estimator_.predict(X_test_tfidf)
print("BernoulliNB Accuracy:", accuracy_score(y_test, y_pred_bnb))

Best BernoulliNB params: {'alpha': np.float64(0.1), 'binarize': np.float64(0.1111111111111111)}
BernoulliNB Accuracy: 0.9874439461883409
