# Email Spam Filter

In [1]:
import pandas as pd
import numpy as np
import pandas as pd
import tkinter as tk
import tkinter.font as TkFont
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

In [2]:
original_dataset = pd.read_csv('dataset/emails_V2.csv', encoding = "ISO-8859-1")
original_dataset.head(5)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1.0
1,Subject: the stock trading gunslinger fanny i...,1.0
2,Subject: unbelievable new homes made easy im ...,1.0
3,Subject: 4 color printing special request add...,1.0
4,"Subject: do not have money , get software cds ...",1.0


## 1. Data Preprocessing

### 1.1 Data Preprocessing - Handling Missing Data

In [3]:
original_dataset.isnull().sum()

text    3683
spam    3683
dtype: int64

In [4]:
original_dataset.dropna(inplace = True)
original_dataset.isnull().sum()

text    0
spam    0
dtype: int64

In [5]:
original_dataset

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1.0
1,Subject: the stock trading gunslinger fanny i...,1.0
2,Subject: unbelievable new homes made easy im ...,1.0
3,Subject: 4 color printing special request add...,1.0
4,"Subject: do not have money , get software cds ...",1.0
...,...,...
2038,Subject: the installation of the equipment you...,0.0
2039,Subject: all about current and near future gas...,0.0
2040,Subject: maureen ' s expenses it appears that...,0.0
2041,Subject: interview - numerical methods & finan...,0.0


In [6]:
for i in original_dataset.text[:3].values:
    print(i, '\n')

Subject: naturally irresistible your corporate identity  lt is really hard to recollect a company : the  market is full of suqgestions and the information isoverwhelminq ; but a good  catchy logo , stylish statlonery and outstanding website  will make the task much easier .  we do not promise that havinq ordered a iogo your  company will automaticaily become a world ieader : it isguite ciear that  without good products , effective business organization and practicable aim it  will be hotat nowadays market ; but we do promise that your marketing efforts  will become much more effective . here is the list of clear  benefits : creativeness : hand - made , original logos , specially done  to reflect your distinctive company image . convenience : logo and stationery  are provided in all formats ; easy - to - use content management system letsyou  change your website content and even its structure . promptness : you  will see logo drafts within three business days . affordability : your  mar

### 1.2 Data Preprocessing - Case Folding

In [7]:
dataset = original_dataset.copy()
dataset.text = dataset.text.str.lower()
dataset.head(5)

Unnamed: 0,text,spam
0,subject: naturally irresistible your corporate...,1.0
1,subject: the stock trading gunslinger fanny i...,1.0
2,subject: unbelievable new homes made easy im ...,1.0
3,subject: 4 color printing special request add...,1.0
4,"subject: do not have money , get software cds ...",1.0


### 1.3 Data Preprocessing - Data Cleaning

#### 1.3.1 Data Preprocessing - Data Cleaning - Removing subject mark

In [8]:
def remove_subject(s):
    return re.sub(r'subject[\s]*:[\s]*', '', s)

dataset.text = dataset.text.apply(lambda x: remove_subject(x))

#### 1.3.2 Data Preprocessing - Data Cleaning - Removing reply mark

In [9]:
def remove_re(s):
    return re.sub(r're[\s]*:[\s]*', '', s)

dataset.text = dataset.text.apply(lambda x: remove_re(x))

#### 1.3.3 Data Preprocessing - Data Cleaning - Removing forward mark

In [10]:
def remove_fwd(s):
    return re.sub(r'fwd?[\s]*:[\s]*', '', s)

dataset.text = dataset.text.apply(lambda x: remove_fwd(x))

#### 1.3.4 Data Preprocessing - Data Cleaning - Removing carbon copy and blind carbon copy mark

In [11]:
def remove_cc_bcc(s):
    return re.sub(r'b?cc[\s]*:[\s]*', '', s)

dataset.text = dataset.text.apply(lambda x: remove_cc_bcc(x))

#### 1.3.5 Data Preprocessing - Data Cleaning - Removing "to" mark

In [12]:
def remove_to(s):
    return re.sub(r'to[\s]*:[\s]*', '', s)

dataset.text = dataset.text.apply(lambda x: remove_to(x))

#### 1.3.6 Data Preprocessing - Data Cleaning - Removing "from" mark

In [13]:
def remove_from(s):
    return re.sub(r'from[\s]*:[\s]*', '', s)

dataset.text = dataset.text.apply(lambda x: remove_from(x))

#### 1.3.7 Data Preprocessing - Data Cleaning - Removing "sent" mark

In [14]:
def remove_sent(s):
    return re.sub(r'sent[\s]*:[\s]*', '', s)

dataset.text = dataset.text.apply(lambda x: remove_sent(x))

#### 1.3.8 Data Preprocessing - Data Cleaning - Removing email address

In [15]:
def remove_email(s):
    return re.sub(r'\w+(\s[.-]\s\w+)*\s@\s\w+(\s[.-]\s\w+)*(\s.\s\w+)+', 'email', s)

dataset.text = dataset.text.apply(lambda x: remove_email(x))

#### 1.3.9 Data Preprocessing - Data Cleaning - Removing "forwarded by" mark

In [16]:
def remove_fwd_by(s):
    return re.sub(r'forwarded by', '', s)

dataset.text = dataset.text.apply(lambda x: remove_fwd_by(x))

#### 1.3.10 Data Preprocessing - Data Cleaning - Removing "original message" mark

In [17]:
def remove_ori(s):
    return re.sub(r'original message', '', s)

dataset.text = dataset.text.apply(lambda x: remove_ori(x))

#### 1.3.11 Data Preprocessing - Data Cleaning - Removing date and time

In [18]:
def remove_date_time(s):
    return re.sub(r'\d+(\s*\/\s*\d+)*\s*\d+(\s*:\s*\d+)*\s*[ap]m', '', s)

dataset.text = dataset.text.apply(lambda x: remove_date_time(x))

#### 1.3.12 Data Preprocessing - Data Cleaning - Removing hashtag

In [19]:
def remove_hashtag(s):
    return re.sub(r'\s*#\s*[\w\d]+', ' hashtag ', s)

dataset.text = dataset.text.apply(lambda x: remove_hashtag(x))

#### 1.3.13 Data Preprocessing - Data Cleaning - Removing emoji

In [20]:
def remove_emoji(s):
    return re.sub(r'\s*\\s*[\w\d]{3}', ' ', s)

dataset.text = dataset.text.apply(lambda x: remove_emoji(x))

#### 1.3.14 Data Preprocessing - Data Cleaning - Removing link

In [21]:
def remove_link(s):
    return re.sub(r'(www(\s+.\s+\w+)+)|(https?\s+:(\s+\/)*\s+\w*(\s+.\s+\w+)*)', ' ', s)

dataset.text = dataset.text.apply(lambda x: remove_link(x))

#### 1.3.15 Data Preprocessing - Data Cleaning - Removing tab

In [22]:
def remove_tab(s):
    return re.sub(r'\\\s*tab\s*', ' ', s)

dataset.text = dataset.text.apply(lambda x: remove_tab(x))

#### 1.3.16 Data Preprocessing - Data Cleaning - Removing number

In [23]:
def remove_number(s):
    return re.sub(r'\s*\d+\s*', ' ', s)

dataset.text = dataset.text.apply(lambda x: remove_number(x))

#### 1.3.17 Data Preprocessing - Data Cleaning - Removing punctuation

In [24]:
def remove_punctuation(s):
    return re.sub(r'\s*[^\w\s]\s*', ' ', s)

dataset.text = dataset.text.apply(lambda x: remove_punctuation(x))

#### 1.3.18 Data Preprocessing - Data Cleaning - Removing line break

In [25]:
def remove_linebreak(s):
    return re.sub(r'\n{1,}', ' ', s)

dataset.text = dataset.text.apply(lambda x: remove_linebreak(x))

#### 1.3.19 Data Preprocessing - Data Cleaning - Removing two or more spaces

In [26]:
def remove_spaces(s):
    return re.sub(r'\s{2,}', ' ', s)

dataset.text = dataset.text.apply(lambda x: remove_spaces(x))

### 1.4 Data Preprocessing - Text Normalization

In [27]:
dictionary = pd.read_csv('dataset/spelling_variants_valid.csv', encoding = "ISO-8859-1")
dictionary.head(5)

Unnamed: 0,variant_spelling,standard_spelling
0,accually,actually
1,addidas,adidas
2,adn,and
3,aer,are
4,aeroplane,airplane


In [28]:
def normalize(data):
    for index in range(len(data)):
        d = dictionary.set_index('variant_spelling')['standard_spelling'].to_dict()
        p = re.compile(r'\b(' + '|'.join(d.keys()) + r')\b')
        b = p.sub(lambda x: d[x.group()], data.text.iloc[index])
        data.loc[index, 'text'] = b
        
normalize(dataset)
dataset

Unnamed: 0,text,spam
0,naturally irresistible your corporate identity...,1.0
1,the stock trading gunslinger fanny is merrill ...,1.0
2,unbelievable new homes made easy im wanting to...,1.0
3,color printing special request additional inf...,1.0
4,do not have money get software cds from here s...,1.0
...,...,...
2038,the installation of the equipment you ordered ...,0.0
2039,all about current and near future gas power ma...,0.0
2040,maureen s expenses it appears that administrat...,0.0
2041,interview numerical methods finance dear tanya...,0.0


### 1.5 Data Preprocessing - Stemming

In [29]:
ps = PorterStemmer()

def stemming(data):
    for index in range(len(data)):
        sentence = data.text.iloc[index]
        words = word_tokenize(sentence)
        stem_sentence = []
        for word in words:
            stem_sentence.append(ps.stem(word))
            stem_sentence.append(" ")
        data.loc[index, 'text'] = "".join(stem_sentence)

stemming(dataset)
dataset

Unnamed: 0,text,spam
0,natur irresist your corpor ident lt is realli ...,1.0
1,the stock trade gunsling fanni is merril but m...,1.0
2,unbeliev new home made easi im want to show yo...,1.0
3,color print special request addit inform now c...,1.0
4,do not have money get softwar cd from here sof...,1.0
...,...,...
2038,the instal of the equip you order is complet a...,0.0
2039,all about current and near futur ga power mark...,0.0
2040,maureen s expens it appear that administr maur...,0.0
2041,interview numer method financ dear tanya it wa...,0.0


### 1.6 Data Preprocessing - Removing Stop Words

In [30]:
stop_words = set(stopwords.words('english'))

def remove_sw(data):
    for index in range(len(data)):
        sentence = data.text.iloc[index]
        words = word_tokenize(sentence)
        filtered_sentence = [w for w in words if not w in stop_words]
        filtered_sentence = []
        for word in words:
            if word not in stop_words:
                filtered_sentence.append(word)
                filtered_sentence.append(" ")
        data.loc[index, 'text'] = "".join(filtered_sentence)

remove_sw(dataset)
dataset

Unnamed: 0,text,spam
0,natur irresist corpor ident lt realli hard rec...,1.0
1,stock trade gunsling fanni merril muzo colza a...,1.0
2,unbeliev new home made easi im want show thi h...,1.0
3,color print special request addit inform click...,1.0
4,money get softwar cd softwar compat great grow...,1.0
...,...,...
2038,instal equip order complet automat notif syste...,0.0
2039,current near futur ga power market comment app...,0.0
2040,maureen expens appear administr maureen expens...,0.0
2041,interview numer method financ dear tanya wa gr...,0.0


### 1.7 Data Preprocessing - Removing Emails Having Less than or Equal 3 Words

In [31]:
dataset['doc_length'] = dataset.text.apply(lambda x: len(x.split()))
dataset = dataset[~dataset.doc_length.isin([1,2,3])]
dataset

Unnamed: 0,text,spam,doc_length
0,natur irresist corpor ident lt realli hard rec...,1.0,217
1,stock trade gunsling fanni merril muzo colza a...,1.0,63
2,unbeliev new home made easi im want show thi h...,1.0,42
3,color print special request addit inform click...,1.0,44
4,money get softwar cd softwar compat great grow...,1.0,18
...,...,...,...
2038,instal equip order complet automat notif syste...,0.0,36
2039,current near futur ga power market comment app...,0.0,13
2040,maureen expens appear administr maureen expens...,0.0,37
2041,interview numer method financ dear tanya wa gr...,0.0,175


In [32]:
dataset.drop(['doc_length'], axis = 1, inplace = True)
dataset

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,text,spam
0,natur irresist corpor ident lt realli hard rec...,1.0
1,stock trade gunsling fanni merril muzo colza a...,1.0
2,unbeliev new home made easi im want show thi h...,1.0
3,color print special request addit inform click...,1.0
4,money get softwar cd softwar compat great grow...,1.0
...,...,...
2038,instal equip order complet automat notif syste...,0.0
2039,current near futur ga power market comment app...,0.0
2040,maureen expens appear administr maureen expens...,0.0
2041,interview numer method financ dear tanya wa gr...,0.0


## 2. Feature Extraction

In [33]:
x = dataset['text']
y = dataset['spam']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [34]:
vectorizer = TfidfVectorizer()
x_train_vector = vectorizer.fit_transform(x_train)
x_test_vector = vectorizer.transform(x_test)

## 3. Classification and Evaluation

In [35]:
model = LogisticRegression()
model.fit(x_train_vector, y_train)

y_pred = model.predict(x_test_vector)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print('Accuracy:', acc)
print('F1 Score:', f1)

Accuracy: 0.9484029484029484
F1 Score: 0.9606003752345216


## 4. GUI Building

In [36]:
root = tk.Tk()

root.title("Email Spam Filter")
root.iconbitmap(r'asset/message-mail-envelope-email-spam-inbox_108649.ico')
root.geometry('600x600')
root.resizable(width = False, height = False)

header = TkFont.Font(family = "San Francisco", size = 36, weight = "bold")
subheader = TkFont.Font(family = "San Francisco", size = 16)
body = TkFont.Font(family = "San Francisco", size = 10)

canvas1 = tk.Canvas(root, width = 600, height = 700)
canvas1.pack()

title = tk.Label(text = "Email Spam Filter", font = header)
canvas1.create_window(300, 50, window = title)

label1 = tk.Label(root, text = "Input your text", font = subheader)
canvas1.create_window(300, 120, window = label1)

entry1 = tk.Text(root, height = 8, width = 56)
canvas1.create_window(300, 210, window = entry1)

label2 = tk.Label(root, text = "Preprocessing", font = subheader)
canvas1.create_window(300, 350, window = label2)

result = tk.Label(root, font = body, height = 8, width = 56, bg = "black")
canvas1.create_window(300, 440, window = result)

def apply():
    global email
    email = str(entry1.get("1.0", tk.END))
    
    email = email.lower()
    email = remove_hashtag(email)
    email = remove_subject(email)
    email = remove_re(email)
    email = remove_fwd(email)
    email = remove_cc_bcc(email)
    email = remove_to(email)
    email = remove_from(email)
    email = remove_sent(email)
    email = remove_email(email)
    email = remove_fwd_by(email)
    email = remove_ori(email)
    email = remove_date_time(email)
    email = remove_hashtag(email)
    email = remove_emoji(email)
    email = remove_link(email)
    email = remove_tab(email)
    email = remove_number(email)
    email = remove_punctuation(email)
    email = remove_linebreak(email)
    email = remove_spaces(email)
    
    result = tk.Label(root, text = email, font = body, height = 8, width = 56, bg = "black", fg = "white", wraplength = 425)
    canvas1.create_window(300, 440, window = result)
    
    data = {'text': [email]}
    vec = vectorizer.transform(data['text'])
    pred = model.predict(vec)
    
    if(pred == 1):
        label2 = tk.Label(root, text = "Spam", font = subheader, fg = "red")
    elif(pred == 0):
        label2 = tk.Label(root, text = "Ham", font = subheader, fg = "green")

    canvas1.create_window(300, 550, window = label2)

button1 = tk.Button (root, text = "Execute", font = body, command = apply, fg = "white", bg = "black", activebackground = "white", activeforeground = "black")
canvas1.create_window(300, 305, window = button1)

root.mainloop()