## **SpamSieve : Email Classification Tool**


# Pipeline

In [None]:
import numpy as np
import pandas as pd
import re
import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report,confusion_matrix
import nltk
from bs4 import BeautifulSoup
from keras.models import load_model
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# remove the header of the email
def remove_header(email):
    """Remove the header section from a raw email.

    The header is taken to end at the first blank line (two consecutive
    newline characters). The returned text still begins with that blank-line
    separator, exactly as before.

    Args:
        email: Raw email text.

    Returns:
        The text from the first blank line onwards, or ``email`` unchanged
        when it contains no blank line (this case used to raise
        ``ValueError``).
    """
    body_start = email.find('\n\n')
    if body_start == -1:
        # No header/body separator found: keep the text instead of crashing.
        return email
    return email[body_start:]


def remove_html_tags(input):
    """Parse ``input`` as HTML and return only its text content."""
    return BeautifulSoup(input, 'html.parser').get_text()

# replace URLs with the placeholder word "oussama" and email addresses with "boussaid"
def remove_hyperlink(word):
    """Replace URLs and email addresses with fixed placeholder tokens.

    URLs are substituted first, then email addresses are substituted in the
    result.

    Args:
        word: Text to scan (may contain any number of URLs/addresses).

    Returns:
        The text with every URL replaced by ``"oussama"`` and every email
        address replaced by ``"boussaid"``.
    """
    regex_links = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    # The top-level-domain class is [A-Za-z]; the previous [A-Z|a-z] also
    # accepted a literal '|' because '|' is not alternation inside a class.
    regex_email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b'

    word_without_links = re.sub(regex_links, "oussama", word)
    return re.sub(regex_email, "boussaid", word_without_links)


# make word in lower case
def to_lower(word):
    """Return ``word`` converted to lower case."""
    lowered = word.lower()
    return lowered


# strip leading and trailing whitespace
def remove_whitespace(word):
    """Return ``word`` without leading and trailing whitespace."""
    trimmed = word.strip()
    return trimmed


def remove_digits(word):
  """Remove every run of digits from ``word``.

  Args:
      word: Text to clean.

  Returns:
      The text with all digit characters deleted (nothing is inserted in
      their place).
  """
  # Raw string: a plain '\d' is an invalid escape sequence in a str literal.
  return re.sub(r'\d+', '', word)

def remove_underscores(word):
  """Delete every underscore character from ``word``."""
  underscore_pattern = r'_'
  without_underscores = re.sub(underscore_pattern, '', word)
  return without_underscores


def remove_special_characters(word):
  """Replace every non-word character in ``word`` with a single space.

  Args:
      word: Text to clean.

  Returns:
      The text where each character matching the regex class ``\\W`` has
      been replaced by one space; the length of the text is unchanged.
  """
  # Raw string: a plain '\W' is an invalid escape sequence in a str literal.
  return re.sub(r'\W', ' ', word)

# remove stop words
stopwords_english = stopwords.words('english')


def remove_stopwords(word, stopword_list=stopwords_english):
  """Remove stop words from a space-separated text.

  The text is split on single space characters only, so a token that is
  attached to other whitespace or punctuation (for example ``"the\\n"``) is
  not recognised as a stop word and is kept.

  Args:
      word: Text to filter.
      stopword_list: Iterable of stop words to drop. Defaults to the NLTK
          English stop-word list loaded into ``stopwords_english``.

  Returns:
      The remaining tokens joined back together with single spaces.
  """
  # Build a set once per call: membership tests are O(1) instead of scanning
  # the whole stop-word list for every token.
  blocked = frozenset(stopword_list)
  kept_tokens = [token for token in word.split(" ") if token not in blocked]
  return " ".join(kept_tokens)

def EmailsPreprocessor(sentence):
    """Run a raw email through every cleaning step and return the result.

    The steps are applied one after another, each receiving the output of
    the previous one, in the order listed below.
    """
    # NOTE(review): stop words are removed before special characters are
    # replaced; the order is kept as-is since the saved model presumably
    # expects text produced by exactly this sequence -- confirm before
    # reordering.
    cleaning_steps = (
        remove_header,
        remove_html_tags,
        to_lower,
        remove_hyperlink,
        remove_whitespace,
        remove_digits,
        remove_underscores,
        remove_stopwords,
        remove_special_characters,
    )

    cleaned = sentence
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return cleaned


def Tokenizer_email(email, max_len=3000, word_index_path='word_index.json'):
  """Convert a preprocessed email into a fixed-length integer sequence.

  A Keras ``Tokenizer`` is created and given the word index stored in the
  JSON file, the email is mapped to word ids with ``texts_to_sequences``,
  and the result is passed to ``pad_sequences``.

  Args:
      email: Preprocessed email text.
      max_len: Length of the returned sequence (``maxlen`` argument of
          ``pad_sequences``). Defaults to 3000.
      word_index_path: Path of the JSON file holding the word index.
          Defaults to ``'word_index.json'`` in the working directory.

  Returns:
      The array returned by ``pad_sequences`` for this single email.

  Raises:
      OSError: If the word-index file cannot be opened.
      json.JSONDecodeError: If the file is not valid JSON.
  """
  # JSON is UTF-8 by specification; do not rely on the platform default.
  with open(word_index_path, 'r', encoding='utf-8') as json_file:
      loaded_word_index = json.load(json_file)

  tokenizer = Tokenizer()
  # Reuse the saved vocabulary instead of fitting the tokenizer again.
  tokenizer.word_index = loaded_word_index
  email_seq = np.array(tokenizer.texts_to_sequences([email]))

  return pad_sequences(email_seq, maxlen=max_len)


def Emails_Classifier(email):
  """Classify a raw email and return the label ``'Spam'`` or ``'Ham'``.

  The email is cleaned with ``EmailsPreprocessor``, tokenized with
  ``Tokenizer_email`` and scored by the model loaded from
  ``'Email_classifier.h5'``. The cleaned text, the token array and the raw
  prediction are printed along the way.
  """
  cleaned_email = EmailsPreprocessor(email)
  print(cleaned_email)

  padded_tokens = Tokenizer_email(cleaned_email)
  print(padded_tokens)

  # The model is loaded from disk on every call.
  classifier = load_model('Email_classifier.h5')
  # NOTE(review): the prediction is presumably a score between 0.0 and 1.0
  # -- confirm against the model definition.
  score = classifier.predict(padded_tokens)
  print(score)

  # Scores above 0.5 are labelled spam, everything else ham.
  return 'Spam' if score[0] > 0.5 else 'Ham'



# Get New Data

In [None]:
import os
import glob
from urllib.request import urlretrieve
import tarfile
import shutil


# Local directory names. TAR_DIR is a sub-directory of DATASETS_DIR.
# NOTE(review): MODELS_DIR is not referenced in the visible code.
DATASETS_DIR = 'datasets'
MODELS_DIR = 'models'
TAR_DIR = os.path.join(DATASETS_DIR, 'tar')

# Archive URLs of the SpamAssassin public corpus.
EASY_HAM_URL = (
    'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
)
HARD_HAM_URL = (
    'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'
)
SPAM2_URL = (
    'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
)



def download_dataset(url):
    """Download a tar archive (unless already cached) and extract it.

    The archive is stored in ``TAR_DIR`` under the last path component of
    ``url`` and extracted into ``DATASETS_DIR``. A previous extraction of
    the same dataset is deleted first, and a file named ``cmds`` inside the
    extracted directory is removed if present.

    Args:
        url: URL of the tar archive to fetch.

    Returns:
        Path of the extracted dataset directory, built from ``DATASETS_DIR``
        and the first member name of the archive.
    """
    os.makedirs(TAR_DIR, exist_ok=True)

    filename = url.rsplit('/', 1)[-1]
    tarpath = os.path.join(TAR_DIR, filename)

    # Download only when there is no readable archive on disk yet. A missing
    # or corrupt file both lead to a (re-)download, as before, but without a
    # bare ``except`` and without leaking an open archive handle.
    if not (os.path.isfile(tarpath) and tarfile.is_tarfile(tarpath)):
        print("Downloading", tarpath)
        urlretrieve(url, tarpath)

    with tarfile.open(tarpath) as tar:
        dirname = os.path.join(DATASETS_DIR, tar.getnames()[0])
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)

        # Use the 'data' extraction filter where the running Python offers
        # it, so archive members cannot be written outside DATASETS_DIR.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=DATASETS_DIR, filter='data')
        else:
            tar.extractall(path=DATASETS_DIR)

        cmds_path = os.path.join(dirname, 'cmds')
        if os.path.isfile(cmds_path):
            os.remove(cmds_path)

    return dirname


def load_dataset(dirpath):
    """Load every email file found directly inside ``dirpath``.

    Files are read as bytes and decoded as UTF-8; bytes that cannot be
    decoded are dropped. Entries are processed in sorted path order so the
    returned list is reproducible across runs and file systems, and
    anything that is not a regular file (e.g. a sub-directory) is skipped.

    Args:
        dirpath: Directory containing one email per file.

    Returns:
        A list with the decoded text of each file.
    """
    files = []
    # glob returns entries in arbitrary order; sort for stable indexing.
    filepaths = sorted(glob.glob(os.path.join(dirpath, '*')))
    for path in filepaths:
        if not os.path.isfile(path):
            continue
        with open(path, 'rb') as f:
            byte_content = f.read()
        files.append(byte_content.decode('utf-8', errors='ignore'))

    return files


# download and extract the archives; each call returns the extracted directory
spam2_dir = download_dataset(SPAM2_URL)
hard_ham_dir = download_dataset(HARD_HAM_URL)



# load the emails from the extracted dataset directories
spam2 = load_dataset(spam2_dir)
hard_ham = load_dataset(hard_ham_dir)


# the labels were previously swapped: spam2 holds spam, hard_ham holds ham
print("Emails Spam :", len(spam2))
print("Emails Ham :", len(hard_ham))


Downloading datasets/tar/20050311_spam_2.tar.bz2
Downloading datasets/tar/20030228_hard_ham.tar.bz2
Emails Ham : 1396
Emails Spam : 250


In [None]:
# Show the raw text of the loaded spam2 email at index 1.
print(spam2[1])

From paulson6@arabia.com  Mon Jun 25 13:11:28 2001
Return-Path: <paulson6@arabia.com>
Delivered-To: yyyy@netnoteinc.com
Received: from exchange.harbin.cc (unknown [202.97.247.130]) by
    mail.netnoteinc.com (Postfix) with ESMTP id CABAA114155 for
    <jm7@netnoteinc.com>; Mon, 25 Jun 2001 12:18:19 +0100 (IST)
Received: from 207.173.146.92 (eli-207-173-146-92.fgn.net
    [207.173.146.92]) by exchange.harbin.cc with SMTP (Microsoft Exchange
    Internet Mail Service Version 5.5.2650.21) id NM8LGL9X; Mon,
    25 Jun 2001 18:26:25 +0800
Message-Id: <0000104257bd$00001f24$00007177@>
To: <Undisclosed Recipients@netnoteinc.com>
From: paulson6@arabia.com
Subject: *THE LEGAL CABLE TV DESCRAMBLER*
Date: Sun, 24 Jun 2001 20:45:01 -0700
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
X-Priority: 3
X-Msmail-Priority: Normal


<HTML>
<BODY>

<FONT face=3D"MS Sans Serif">
<FONT size=3D2><B> <BR>
</B>
<FONT color=3D"#008080"><B> NOTE: THIS IS AN ADVERTISEMENT FOR LEGAL TV<BR=
>
   DE-SC

In [None]:
# Classify the spam2 email at index 2; the classifier also prints the cleaned
# text, the token array and the raw prediction before returning the label.
Emails_Classifier(spam2[2])

multi part message mime format          nextpartace db content type  text plain   charset  windows   content transfer encoding  bit   ann arbor annuity exchange    giveaway    think annuities   think ann arbor  just short list many companies represent         fill form free entry    giveaway    name      e mail      phone      fax      city     state                      agent use only  employees family members ann arbor annuity exchange  subsidiaries ineligible   we want anybody receive mailing wish to receive them  professional communication sent insurance professionals  removed mailing list  reply to this message  instead  go here  oussama legal notice             nextpartace db content type  text html   charset  iso    content transfer encoding  quoted printable       giveaway                                                          just short list many companies   represent                                                                                                             

'Spam'