In [1]:
import pandas as pd

spam_df = pd.read_csv("data/spamhamdata.csv", sep="\t", header=None)

In [2]:
spam_df.columns = ["Labels", "Messages"]

In [3]:
spam_df.head(2)

Unnamed: 0,Labels,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


### Data Cleaning and text preprocessing

In [4]:
from nltk.corpus import stopwords

custom_stopwords = ["aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
                    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
                    'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan',
                    "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
                    'wouldn', "wouldn't"
                   ]

In [5]:
import re
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

In [6]:
corpus = []

for i in range(0, len(spam_df)):
    review = re.sub('[^a-zA-Z0-9]', " ", spam_df['Messages'][i]) # remove all non alphabetic symbols
    review = review.lower() #convert all letters to lower alphabetics
    review = review.split() # split words to return a list of words
    review = [ps.stem(word) for word in review if not word in custom_stopwords] # returns all root words
    review = " ".join(review)
    corpus.append(review)

In [7]:
from sklearn.model_selection import train_test_split

y = spam_df['Labels']
y = y.map({"spam": 1, "ham" : 0})
y.astype(int)
X_train, X_test, y_train, y_test = train_test_split(corpus,y, test_size=0.2, random_state=42, stratify=y, shuffle=True)

In [8]:
spam_df["Labels"].value_counts()

ham     4825
spam     747
Name: Labels, dtype: int64

In [9]:
### creating BOW(Bag of words)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = CountVectorizer(max_features=2000, ngram_range=(1,2), binary=True)

X_train = cv.fit_transform(X_train).toarray()

In [10]:
X_test = cv.transform(X_test).toarray()

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [12]:
log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)

In [13]:
log_reg.score(X_train, y_train)

0.9961857751851021

In [14]:
log_reg.score(X_test, y_test)

0.9802690582959641

In [15]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_recall_curve, precision_score

y_pred = log_reg.predict(X_test)
y_train_pred = log_reg.predict(X_train)
print(f"y_pred is \n {y_pred}")
print(f"y_train_pred is \n {y_train_pred}")

y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)

F1_score = f1_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f"F1 Score: {F1_score}")
print("+++++" * 30)
print(f"Classification Report: \n {class_report}")

y_pred is 
 [0 0 0 ... 0 0 0]
y_train_pred is 
 [0 0 0 ... 1 1 0]
F1 Score: 0.9202898550724637
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Classification Report: 
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.85      0.92       149

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [15]:
email = [input("Enter your email here>>>> \n")]

email = cv.transform(email).toarray()
pred = log_reg.predict(email)
if pred[0] == 0:
    print("The model predicts ham")
else:
    print("The model predicts spam")

Enter your email here>>>> 
Hi, you have won 20000$ today, check out this email (victoropeyemi@outlook.com) to claim reward
The model predicts spam


###### To make a better prediction, we shall write a function cleanse user taken input

In [17]:
import contractions
import string

In [21]:
def preprocess(text):
    corpus = []
    text = re.sub('[^a-zA-Z0-9]', " ", text)
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    text = url_pattern.sub(r"", text)
    text = text.lower()
    text = contractions.fix(text, slang=True)
    text = string.punctuation
    text = text.translate(str.maketrans("", "", text))
    text = text.split()
    text = [ps.stem(word) for word in text if not word in custom_stopwords]
    text = " ".join(text)
    corpus.append(text)
    return corpus

In [22]:
email = input("Enter your email here>>>> \n")

corpus = preprocess(email)

Enter your email here>>>> 
Hi, you have won 20000$ today, check out this email (victoropeyemi@outlook.com) to claim reward


In [25]:
email = cv.transform(corpus).toarray()
pred = log_reg.predict(email)
if pred[0] == 0:
    print("The model predicts ham")
else:
    print("The model predicts spam")

The model predicts ham


In [26]:
import pickle

with open("cv.pickle", "wb") as cv_file:
    pickle.dump(cv, cv_file)

In [27]:
with open("preprocessor.pkl", "wb") as preprocessor_file:
    pickle.dump(preprocess, preprocessor_file)

with open("log_reg.pkl", "wb") as log_reg_model:
    pickle.dump(log_reg, log_reg_model)