In [50]:
!pip install scikit-learn



In [51]:
import sklearn

In [52]:
# Import necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, roc_auc_score
import nltk
from nltk.corpus import stopwords
from collections import Counter

# Libraries for visualisation

import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
#import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [54]:
# Load the labelled message dataset from a CSV file (path is relative to the working directory).
# Later cells read its 'Category' column (label) and its 'Message' column (text).
spam_df = pd.read_csv("spamclass.csv")

In [55]:
# Data Preprocessing
# Importing the Porter Stemmer for text stemming
from nltk.stem.porter import PorterStemmer

# Importing the string module for handling special characters
import string

# Single shared Porter stemmer instance; transform_text() calls ps.stem() on every kept token
ps = PorterStemmer()

def transform_text(text):
    """Normalize a raw message into a space-joined string of stemmed tokens.

    Pipeline: lowercase -> NLTK word tokenization -> keep alphanumeric tokens
    only -> drop English stop words -> Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw message text. Missing values (``None`` or float NaN) are treated
        as an empty message instead of raising.

    Returns
    -------
    str
        The processed tokens joined by single spaces; "" when no token survives.
    """
    # Missing values have no .lower(); treat them as an empty message.
    # (text != text is True only for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Read the stop-word list once per call and use a set for O(1) lookups,
    # instead of reloading the full list for every single token.
    english_stop_words = set(stopwords.words('english'))

    # Lowercase, then tokenize with NLTK
    tokens = nltk.word_tokenize(text.lower())

    # str.isalnum() already rejects punctuation-only tokens, so no separate
    # string.punctuation check is needed after this filter.
    kept_tokens = [
        token
        for token in tokens
        if token.isalnum() and token not in english_stop_words
    ]

    # Stem each remaining token and join back into a single string
    return " ".join(ps.stem(token) for token in kept_tokens)

In [56]:
# TF-IDF feature extractor: lowercases the text, removes English stop words,
# and keeps every term that appears in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=1,
)

In [57]:
# Display the loaded DataFrame to inspect its columns and a sample of rows
spam_df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
6650,spam,Receive free tokens when you sign up for our n...
6651,spam,Double your Bitcoin in just 24 hours with our ...
6652,spam,Become a crypto expert with our comprehensive ...
6653,spam,Claim your free Ethereum giveaway now! Limited...


In [58]:
# Per-class summary: group the rows by 'Category' and describe each group
# (count, number of unique values, most frequent value and its frequency)
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4617,4325,"Sorry, I'll call later",28
spam,2037,851,Unlock the secrets to success with our proven ...,24


In [59]:
# Encode the label numerically in a new 'spam' column:
# 1 where Category equals 'spam', 0 for every other value.
spam_df['spam'] = spam_df['Category'].eq('spam').astype('int64')

In [60]:
# Display the DataFrame again to confirm the numeric 'spam' column was added
spam_df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
6650,spam,Receive free tokens when you sign up for our n...,1
6651,spam,Double your Bitcoin in just 24 hours with our ...,1
6652,spam,Become a crypto expert with our comprehensive ...,1
6653,spam,Claim your free Ethereum giveaway now! Limited...,1


In [61]:
# Create the train/test split: 75% of the rows for training, 25% held out for testing.
# random_state is fixed so the split -- and therefore every score and prediction
# computed from it -- is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    spam_df.Message,
    spam_df.spam,
    test_size=0.25,
    random_state=42,
)

In [62]:
# Display the training messages (a Series of message text keyed by the original row index)
x_train

3611                                Shall i get my pouch?
5119    My darling sister. How are you doing. When's s...
5309    2p per min to call Germany 08448350055 from yo...
561     I'm gonna say no. Sorry. I would but as normal...
3910    This is the 2nd time we have tried to contact ...
                              ...                        
2961    Awesome question with a cute answer: Someone a...
5462    Congratulations! You're our lucky winner! Clai...
5613    Hurry! This offer ends tonight. Click now to c...
2315                                I'm meeting Darren...
3901                           I am on the way to ur home
Name: Message, Length: 4991, dtype: object

In [63]:
# Summary of the training messages; 'count' only counts non-missing values,
# so a count below the Series length indicates missing messages.
x_train.describe()

count                       4990
unique                      4037
top       Sorry, I'll call later
freq                          22
Name: Message, dtype: object

In [64]:
# Keep only the training messages that are present (not NaN)
x_train = x_train[x_train.notna()]

# Learn the vocabulary from the training messages and encode them
# as a document-term matrix of word counts
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [65]:
# Display the count matrix summary (shape, dtype, number of stored elements)
x_train_count

<4990x7749 sparse matrix of type '<class 'numpy.int64'>'
	with 69353 stored elements in Compressed Sparse Row format>

In [66]:
# Matrix display: preview only the first few rows as a dense array.
# Calling .toarray() on the whole sparse matrix allocates rows x vocabulary
# integers in memory just to print a truncated view.
x_train_count[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [67]:
# Train model
# Safeguard: make sure x_train holds no missing messages before matching labels to it
x_train = x_train[x_train.notna()]

# Keep only the labels whose index is still present in x_train
y_train = y_train.loc[x_train.index]

# Fit a multinomial Naive Bayes classifier on the word-count matrix
# (fit returns the fitted estimator, which is shown as the cell output)
model = MultinomialNB().fit(x_train_count, y_train)
model

In [68]:
# Pre-test with a ham-style message: encode it with the fitted vectorizer
# and predict its class (0 = not spam, 1 = spam)
email_ham = ["hey wanna meet for a cricket match?"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [69]:
# Pre-test with a spam-style message: encode it with the fitted vectorizer
# and predict its class (0 = not spam, 1 = spam)
email_spam = ["free sale bonus"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [70]:
# Test model on the held-out split.
# CountVectorizer rejects NaN documents, so drop missing messages here as well
# (only the training split was cleaned) and keep the labels of the remaining rows.
x_test_clean = x_test.dropna()
y_test_clean = y_test.loc[x_test_clean.index]

# Encode with the vocabulary learned from the training data (transform, not fit_transform)
x_test_count = cv.transform(x_test_clean)

# Mean accuracy on the test messages
model.score(x_test_count, y_test_clean)


0.984375

In [71]:
# Ad-hoc check with a promotional phrase.
# NOTE(review): the variable is named email_ham, but the saved output for this
# cell is 1 (spam) -- the name does not describe the message.
email_ham = ["Big Earn Money Guaranteed"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([1], dtype=int64)

In [72]:
# Ad-hoc check with a short conversational phrase.
# NOTE(review): the variable is named email_spam, but the saved output for this
# cell is 0 (not spam) -- the name does not describe the message.
email_spam = ["sorry need time thankyou"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([0], dtype=int64)

In [73]:
# Ad-hoc check with a personal invitation.
# NOTE(review): the variable is named email_spam, but the saved output for this
# cell is 0 (not spam) -- the name does not describe the message.
email_spam = ["Can I take you out for a cup of coffee?"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([0], dtype=int64)

In [74]:
# Ad-hoc check with a get-rich-quick style pitch.
# NOTE(review): the variable is named email_ham, but the saved output for this
# cell is 1 (spam) -- the name does not describe the message.
email_ham = ["guarantee: skyrocket your income with this incredible opportunity!"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([1], dtype=int64)