In [332]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import nltk
from nltk.corpus import stopwords
import re

In [333]:
df=pd.read_csv('spam.csv', encoding='latin-1')
df=df[['v1','v2']]
df=df.rename(columns={'v1':'label','v2':'text'})

In [334]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [335]:
df.describe()

Unnamed: 0,label,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [336]:
df.isna().sum()

label    0
text     0
dtype: int64

In [337]:
df.duplicated().sum()

403

In [338]:
df=df.drop_duplicates()

In [339]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [340]:
nltk.download('stopwords')
stop_words=set(stopwords.words('english'))

def clean_text(text):
    text=re.sub(r'\W',' ',text) #remove all special characters
    text=text.lower() #convert to lover case
    text=re.sub(r'\s+[a-zA-Z]\s+',' ',text)#remove single characters
    text=re.sub(r'\^[a-zA-Z]\s+',' ',text)#remove single characters from strat
    text=re.sub(r'\s+',' ',text) #remove multiple spaces
    text=text.strip()
    text=' '.join([word for word in text.split() if word not in stop_words]) # remove stopwords
    return text

df['text']=df['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niranjansmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [341]:
df['label'] = df['label'].map({'spam':1, 'ham':0})

In [342]:
x=df['text']
y=df['label']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)

1.Using Tfidf vectorizer

In [343]:
#initializing tfidf vectorizer
tfidf = TfidfVectorizer(max_features=3000)

#transform the textdata
x_train_tfidf =tfidf.fit_transform(x_train).toarray()
x_test_tfidf = tfidf.transform(x_test).toarray()

In [344]:
x_train_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [345]:
x_test_tfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [346]:
#initializing naive bayes clasifier
model=MultinomialNB()

model.fit(x_train_tfidf, y_train)

In [347]:
y_pred = model.predict(x_test_tfidf)

accuracy = accuracy_score(y_test,y_pred)
conf_matrix = confusion_matrix(y_test,y_pred)
classification_re = classification_report(y_test,y_pred)

In [348]:
print(f"Accuracy: {accuracy:.2f}")
print(f"\nConfusion Matrix:\n", conf_matrix)
print(f"\nClassification Report:\n", classification_re)

Accuracy: 0.98

Confusion Matrix:
 [[1330    1]
 [  37  183]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      1331
           1       0.99      0.83      0.91       220

    accuracy                           0.98      1551
   macro avg       0.98      0.92      0.95      1551
weighted avg       0.98      0.98      0.97      1551



2.using count vectorizer

In [349]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_features=3000)
x_train_counts = count_vectorizer.fit_transform(x_train).toarray()
x_test_counts = count_vectorizer.transform(x_test).toarray()


In [350]:
x_train_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [351]:
x_test_counts

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [352]:
#initializing naive bayes clasifier
model_1=MultinomialNB()

model_1.fit(x_train_counts, y_train)

In [353]:
y_pred_1 = model_1.predict(x_test_tfidf)

c_accuracy = accuracy_score(y_test,y_pred_1)
c_conf_matrix = confusion_matrix(y_test,y_pred_1)
c_classification_re = classification_report(y_test,y_pred_1)

print(f"Accuracy: {c_accuracy:.2f}")
print(f"\nConfusion Matrix:\n", c_conf_matrix)
print(f"\nClassification Report:\n", c_classification_re)

Accuracy: 0.98

Confusion Matrix:
 [[1325    6]
 [  26  194]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1331
           1       0.97      0.88      0.92       220

    accuracy                           0.98      1551
   macro avg       0.98      0.94      0.96      1551
weighted avg       0.98      0.98      0.98      1551

