In [13]:
import numpy as np
import pandas as pd
import re
import json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report,confusion_matrix
import nltk
from bs4 import BeautifulSoup
from keras.models import load_model
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# remove the head of eamil
def remove_header(email):
    """remove the header from an email"""
    return email[email.index('\n\n'):]


def remove_html_tags(input):
    soup = BeautifulSoup(input, 'html.parser')
    return soup.get_text()

# replace URLs with oussama word and emails with boussaid
def remove_hyperlink(word):
    regex_links = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    word_without_links =  re.sub(regex_links,"oussama", word)
    regex_email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
    return re.sub(regex_email,"boussaid", word_without_links)


# make word in lower case
def to_lower(word):
    return word.lower()


# remove whitespaces
def remove_whitespace(word):
    return word.strip()


def remove_digits(word):
  '''This function removes all the numbers'''
  return re.sub('\d+', '', word)

def remove_underscores(word):
  '''This function removes all the underscores'''
  return re.sub(r'_', '', word)


def remove_special_characters(word):
  '''This function removes all the special characters'''
  return re.sub('\W', ' ', word)

# remove stop words
stopwords_english = stopwords.words('english')
def remove_stopwords(word,stopword_list=stopwords_english):
  '''This function removes the stop words'''
  word_list = word.split(" ")
  cleaned_txt = [w for w in word_list if not w in stopword_list]
  cleaned_string = " ".join(cleaned_txt)

  return cleaned_string

def EmailsPreprocessor(sentence):

    Preprocessor_utils = [remove_header,
                      remove_html_tags,
                      to_lower,
                      remove_hyperlink,
                      remove_whitespace,
                      remove_digits,
                      remove_underscores,
                      remove_stopwords,
                      remove_special_characters]

    for tool in Preprocessor_utils:
        sentence = tool(sentence)

    return sentence


def Tokenizer_email(email):
  max_len = 3000 # max number of words in a question to use
  # Load word_index from the saved JSON file
  with open('word_index.json', 'r') as json_file:
      loaded_word_index = json.load(json_file)

  tokenizer = Tokenizer()
  tokenizer.word_index = loaded_word_index
  eamil_seq = np.array(tokenizer.texts_to_sequences([email]))
  # print(eamil_seq)

  return pad_sequences(eamil_seq,maxlen=max_len)


def Emails_Classifier(email):
  email_pro = EmailsPreprocessor(email)
  print(email_pro)
  email_tok = Tokenizer_email(email_pro)
  print(email_tok)
  # load model's metadata
  model = load_model('Email_classifier.h5')
  # Model predict  a number from 0.0 to 1.0
  y_pred = model.predict(email_tok)

  print(y_pred)

  if y_pred[0] > 0.5 :
    return 'Spam'

  else :
    return 'Ham'



# Get New Data

In [25]:
import os
import glob
from urllib.request import urlretrieve
import tarfile
import shutil


DATASETS_DIR = 'datasets'
MODELS_DIR = 'models'
TAR_DIR = os.path.join(DATASETS_DIR, 'tar')

EASY_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
SPAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
HARD_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'
SPAM2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'



def download_dataset(url):
    """download and unzip data from a url into the specified path"""

    # create directory if it doesn't exist
    if not os.path.isdir(TAR_DIR):
        os.makedirs(TAR_DIR)

    filename = url.rsplit('/', 1)[-1]
    tarpath = os.path.join(TAR_DIR, filename)

    # download the tar file if it doesn't exist
    try:
        print("Downloading", tarpath)
        tarfile.open(tarpath)
    except:
        urlretrieve(url, tarpath)

    with tarfile.open(tarpath) as tar:
        dirname = os.path.join(DATASETS_DIR, tar.getnames()[0])
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
        tar.extractall(path=DATASETS_DIR)

        cmds_path = os.path.join(dirname, 'cmds')
        if os.path.isfile(cmds_path):
            os.remove(cmds_path)

    return dirname


def load_dataset(dirpath):
    """load emails from the specified directory"""

    files = []
    filepaths = glob.glob(dirpath + '/*')
    for path in filepaths:
        with open(path, 'rb') as f:
            byte_content = f.read()
            str_content = byte_content.decode('utf-8', errors='ignore')
            files.append(str_content)

    return files


# download the data
spam_dir = download_dataset(SPAM_URL)
easy_ham_dir = download_dataset(EASY_HAM_URL)
spam2 = download_dataset(SPAM2_URL)
hard_ham_dir = download_dataset(HARD_HAM_URL)



# load the datasets from datasets/tar/*
spam = load_dataset(spam_dir)
easy_ham = load_dataset(easy_ham_dir)
spam2 = load_dataset(spam2)
hard_ham = load_dataset(hard_ham_dir)


print("Emails Ham :", len(easy_ham))
print("Emails Spam :" ,len(spam))


Downloading datasets/tar/20021010_spam.tar.bz2
Downloading datasets/tar/20030228_easy_ham.tar.bz2
Downloading datasets/tar/20050311_spam_2.tar.bz2
Downloading datasets/tar/20030228_hard_ham.tar.bz2
Emails Ham : 2500
Emails Spam : 501


In [10]:
easy_ham[1]

'From rssfeeds@jmason.org  Thu Sep 26 16:34:02 2002\nReturn-Path: <rssfeeds@spamassassin.taint.org>\nDelivered-To: yyyy@localhost.spamassassin.taint.org\nReceived: from localhost (jalapeno [127.0.0.1])\n\tby jmason.org (Postfix) with ESMTP id 6664D16F18\n\tfor <jm@localhost>; Thu, 26 Sep 2002 16:34:00 +0100 (IST)\nReceived: from jalapeno [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Thu, 26 Sep 2002 16:34:00 +0100 (IST)\nReceived: from dogma.slashnull.org (localhost [127.0.0.1]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g8QFSTg24435 for\n    <jm@jmason.org>; Thu, 26 Sep 2002 16:28:29 +0100\nMessage-Id: <200209261528.g8QFSTg24435@dogma.slashnull.org>\nTo: yyyy@spamassassin.taint.org\nFrom: joelonsoftware <rssfeeds@spamassassin.taint.org>\nSubject: We\'re trying to decide if FogBUGZ 3.0 should support custom\n    fields. Histor\nDate: Thu, 26 Sep 2002 15:28:28 -0000\nContent-Type: text/plain; encoding=utf-8\n\nURL: http://www.j

In [29]:
Emails_Classifier(spam2[2])

multi part message mime format          nextpartace db content type  text plain   charset  windows   content transfer encoding  bit   ann arbor annuity exchange    giveaway    think annuities   think ann arbor  just short list many companies represent         fill form free entry    giveaway    name      e mail      phone      fax      city     state                      agent use only  employees family members ann arbor annuity exchange  subsidiaries ineligible   we want anybody receive mailing wish to receive them  professional communication sent insurance professionals  removed mailing list  reply to this message  instead  go here  oussama legal notice             nextpartace db content type  text html   charset  iso    content transfer encoding  quoted printable       giveaway                                                          just short list many companies   represent                                                                                                             

'Spam'