In [1]:
# Import all the required libraries 
import numpy as np
import pandas as pd
import re
import string


#import stopwords and text processing libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
# Fetch the NLTK data used later in the notebook: WordNet (lemmatizer),
# the stop-word lists, and the Punkt tokenizer models used by word_tokenize.
nltk.download("wordnet")
nltk.download("stopwords")
nltk.download("punkt")
# Recent NLTK releases load the Punkt models from the "punkt_tab" package
# instead of the pickled "punkt" one; without it word_tokenize raises
# LookupError on a fresh environment. Downloading both keeps old and new
# NLTK versions working.
nltk.download("punkt_tab")


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mathan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mathan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mathan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
#import machine learning libraries

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Read the e-mail corpus from the CSV file next to the notebook and preview
# the first rows.
csv_path = "email.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,text,spam
0,Subject: this free 7 - day trial will prove th...,1
1,"Subject: followup from iris mack hi , thank ...",0
2,Subject: make your rivals envy lt is really h...,1
3,Subject: re : telephone interview with the enr...,0
4,Subject: a 1 time charge add your property / s...,1


In [4]:
# Count the missing values in every column (isna is the canonical spelling of isnull).
df.isna().sum()

text    0
spam    0
dtype: int64

In [5]:
# Keep only the first occurrence of each distinct e-mail text, then renumber
# the rows 0..n-1 so the index has no gaps left by the dropped duplicates.
df = (
    df
    .drop_duplicates(subset=["text"], keep="first")
    .reset_index(drop=True)
)

In [6]:
# create a function for preprocessing

# Resources shared by every call of preprocessing_text. They are filled in
# lazily on first use so that they are built once instead of once per row
# when the function is applied to a whole column.
_PREPROCESSING_CACHE = {}


def _preprocessing_resources():
    """Return the cached stop-word set, stemmer, lemmatizer and punctuation table."""
    if not _PREPROCESSING_CACHE:
        _PREPROCESSING_CACHE.update(
            stop_words=frozenset(stopwords.words('english')),
            stemmer=PorterStemmer(),
            lemmatizer=WordNetLemmatizer(),
            punctuation_table=str.maketrans("", "", string.punctuation),
        )
    return _PREPROCESSING_CACHE


def preprocessing_text(text):
    """Normalise one e-mail text into a space-separated string of tokens.

    Steps, in order:
      1. lowercase the text;
      2. delete every character listed in ``string.punctuation``;
      3. tokenize with ``word_tokenize`` and drop English stop words;
      4. stem each remaining token with the Porter stemmer;
      5. lemmatize each stem with WordNet using ``pos='a'``.

    Parameters
    ----------
    text : str or None
        Raw text. ``None`` and float NaN (how pandas represents a missing
        value) are treated as empty text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; ``""`` for missing input.
    """
    # A missing value has no .lower(); return empty text instead of crashing
    # in the middle of a column-wide apply. (NaN is the only float that is
    # not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    resources = _preprocessing_resources()

    #convert all to lowercase
    text = text.lower()

    #remove punctuation
    text = text.translate(resources["punctuation_table"])

    #remove stopwords
    # NOTE(review): punctuation is stripped before this filter, so a
    # contraction such as "don't" reaches it as "dont" - confirm that is intended.
    stop_words = resources["stop_words"]
    filtered_words = [word for word in word_tokenize(text) if word not in stop_words]

    #stemming
    stemmer = resources["stemmer"]
    stemmed_words = [stemmer.stem(word) for word in filtered_words]

    #lemmatizing (applied to the stems, always with pos='a')
    lemmatizer = resources["lemmatizer"]
    lemma_words = [lemmatizer.lemmatize(stem, pos='a') for stem in stemmed_words]

    return " ".join(lemma_words)
  

In [7]:
# Work on an independent (deep) copy so that df keeps the unprocessed text.
new_data = df.copy(deep=True)

In [8]:
# Run preprocessing_text over every e-mail and overwrite the "text" column of
# the working copy with the cleaned version, then preview up to 20 rows.
cleaned_text = new_data["text"].apply(preprocessing_text)
new_data["text"] = cleaned_text
new_data.head(20)

Unnamed: 0,text,spam
0,subject free 7 day trial prove get readi beach...,1
1,subject followup iri mack hi thank email inde ...,0
2,subject make rival envi lt realli hard recolle...,1
3,subject telephon interview enron corp research...,0
4,subject 1 time charg add properti servic addit...,1
5,subject uk portfolio book setup risktrac david...,0
6,subject ctfg trend upward compani begin rollou...,1
7,subject failur notic mail deliveri agent messa...,1
8,subject profession advertis dear projecthoneyp...,1
9,subject erectil dysfunct ruin sex life multipl...,1


In [10]:
# Hold out 10% of the cleaned e-mails for testing and train on the other 90%.
# The features are the processed text, the target is the spam label, and the
# shuffle is seeded with random_state=212.
x_train_hl, x_test_hl, y_train_hl, y_test_hl = train_test_split(
    new_data["text"],
    new_data["spam"],
    test_size=0.1,
    random_state=212,
)

In [11]:
%%time
# pipeline creation 
# 1. CountVectorizer
# 2. TfidTransformer
# 3. MultinomialNB
 
pipe = Pipeline([
                 ("vect", CountVectorizer()),
                 ("tfidf", TfidfTransformer()),
                 ("model", MultinomialNB())
                 
])

nb_model_hl = pipe.fit(x_train_hl, y_train_hl)

# Fit the pipeline to the data
prediction = nb_model_hl.predict(x_test_hl)
print("Model Multinomial")
# predict on test dataset

# print accuracy score
print(round(accuracy_score(y_test_hl, prediction) * 100, 2))
#print confusion matrix
print(confusion_matrix(y_test_hl, prediction))
#print classification report
print(classification_report(y_test_hl, prediction))

Model Multinomial
100.0
[[7 0]
 [0 3]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         7
           1       1.00      1.00      1.00         3

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

Wall time: 288 ms
