# **Importing Data**



In [102]:
import os
import numpy as np
import pandas as pd
import glob
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
nltk.download('stopwords')

DATASETS_DIR = 'datasets'
MODELS_DIR = 'models'
TAR_DIR = os.path.join(DATASETS_DIR, 'tar')

# SPAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
SPAM2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2'
# EASY_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'
EASY_HAM2_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2'
HARD_HAM_URL = 'https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from urllib.request import urlretrieve
import tarfile
import shutil

def download_dataset(url):
    """download and unzip data from a url into the specified path"""

    # create directory if it doesn't exist
    if not os.path.isdir(TAR_DIR):
        os.makedirs(TAR_DIR)

    filename = url.rsplit('/', 1)[-1]
    tarpath = os.path.join(TAR_DIR, filename)

    # download the tar file if it doesn't exist
    try:
        print("Downloading", tarpath)
        tarfile.open(tarpath)
    except:
        urlretrieve(url, tarpath)

    with tarfile.open(tarpath) as tar:
        dirname = os.path.join(DATASETS_DIR, tar.getnames()[0])
        if os.path.isdir(dirname):
            shutil.rmtree(dirname)
        tar.extractall(path=DATASETS_DIR)

        cmds_path = os.path.join(dirname, 'cmds')
        if os.path.isfile(cmds_path):
            os.remove(cmds_path)

    return dirname


def load_dataset(dirpath):
    """load emails from the specified directory"""

    files = []
    filepaths = glob.glob(dirpath + '/*')
    for path in filepaths:
        with open(path, 'rb') as f:
            byte_content = f.read()
            str_content = byte_content.decode('utf-8', errors='ignore')
            files.append(str_content)
            # files.append(email.parser.BytesParser(policy=email.policy.default).parse(f))

    return files


In [None]:
# download the data

# spam_dir = download_dataset(SPAM_URL)
spam2_dir = download_dataset(SPAM2_URL)
# easy_ham_dir = download_dataset(EASY_HAM_URL)
easy_ham2_dir = download_dataset(EASY_HAM2_URL)
hard_ham_dir = download_dataset(HARD_HAM_URL)


# load the datasets from datasets/tar/*

# spam = load_dataset(spam_dir)
spam2 = load_dataset(spam2_dir)
# easy_ham = load_dataset(easy_ham_dir)
easy_ham2 = load_dataset(easy_ham2_dir)
hard_ham = load_dataset(hard_ham_dir)

print("Emails Ham :", len( easy_ham2 + hard_ham))
print("Emails Spam :" ,len(spam2))

Downloading datasets/tar/20050311_spam_2.tar.bz2
Downloading datasets/tar/20030228_easy_ham_2.tar.bz2
Downloading datasets/tar/20030228_hard_ham.tar.bz2
Emails Ham : 1650
Emails Spam : 1396


In [107]:
from  sklearn.utils import shuffle

# create the full dataset
X = spam2 + easy_ham2 + hard_ham
lables = np.concatenate((np.ones(len(spam2)), np.zeros(len(easy_ham2) + len(hard_ham)))).astype(int)

# shuffle the dataset
X, lables = shuffle(X, lables, random_state=42)

print("Data Impoted!!!!")

Data Impoted!!!!


# **Data Preparation**

In [60]:
# remove the head of eamil
def remove_header(email):
    """remove the header from an email"""
    return email[email.index('\n\n'):]


def remove_html_tags(input):
    soup = BeautifulSoup(input, 'html.parser')
    return soup.get_text()

# replace URLs with oussama word and emails with boussaid
def remove_hyperlink(word):
    regex_links = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    word_without_links =  re.sub(regex_links,"oussama", word)
    regex_email = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,7}\b'
    return re.sub(regex_email,"boussaid", word_without_links)


# make word in lower case
def to_lower(word):
    return word.lower()


# remove whitespaces
def remove_whitespace(word):
    return word.strip()


def remove_digits(word):
  '''This function removes all the numbers'''
  return re.sub('\d+', '', word)

def remove_underscores(word):
  '''This function removes all the underscores'''
  return re.sub(r'_', '', word)


def remove_special_characters(word):
  '''This function removes all the special characters'''
  return re.sub('\W', ' ', word)

# remove stop words
stopwords_english = stopwords.words('english')
def remove_stopwords(word,stopword_list=stopwords_english):
  '''This function removes the stop words'''
  word_list = word.split(" ")
  cleaned_txt = [w for w in word_list if not w in stopword_list]
  cleaned_string = " ".join(cleaned_txt)

  return cleaned_string

print("Preprocessing functions readyy!")

Preprocessing functions readyy!


In [None]:
print(X[0])

Received: from hq.pro-ns.net (localhost [127.0.0.1])
	by hq.pro-ns.net (8.12.5/8.12.5) with ESMTP id g6NEkmhY030813
	(version=TLSv1/SSLv3 cipher=EDH-DSS-DES-CBC3-SHA bits=168 verify=NO)
	for <cypherpunks-forward@ds.pro-ns.net>; Tue, 23 Jul 2002 09:46:48 -0500 (CDT)
	(envelope-from cpunks@hq.pro-ns.net)
Received: (from cpunks@localhost)
	by hq.pro-ns.net (8.12.5/8.12.5/Submit) id g6NEklSB030808
	for cypherpunks-forward@ds.pro-ns.net; Tue, 23 Jul 2002 09:46:47 -0500 (CDT)
Received: from einstein.ssz.com (cpunks@[207.200.56.4])
	by hq.pro-ns.net (8.12.5/8.12.5) with ESMTP id g6NEfuhX028821
	for <cypherpunks@ds.pro-ns.net>; Tue, 23 Jul 2002 09:41:56 -0500 (CDT)
	(envelope-from cpunks@einstein.ssz.com)
Received: (from cpunks@localhost)
	by einstein.ssz.com (8.8.8/8.8.8) id JAA13628
	for cypherpunks@ds.pro-ns.net; Tue, 23 Jul 2002 09:50:29 -0500
Received: (from mdom@localhost)
	by einstein.ssz.com (8.8.8/8.8.8) id JAA13602
	for cypherpunks-outgoing; Tue, 23 Jul 2002 09:47:45 -0500
Received: 

In [None]:
# remove head
email_without_head = remove_header(X[0])
print(email_without_head)




<> Get up to 4 receivers installed in 4 rooms! 
<> No Equipment To Buy. 
<> Fist 3 Months of Free Service! - 
<> Up to 170 CHANNELS of CD quality sound and picture 
<> PROGRAMMING LESS EXPENSIVE than cable TV in most markets 

You can receive FREE INSTALLATION of a Dish Network 
Satellite TV System! You can also upgrade to a Personal Digital Video Recorder. 
(Retail value $499 if you had to buy this!) 

Click here to get your FREE INSTALLATION of a Dish Network satellite TV 
System and 3 months of free service before this promotion expires: 

http://www.proleadcom.com/discsat/index.php


--DeathToSpamDeathToSpamDeathToSpam--


-------------------------------------------------------
This sf.net email is sponsored by:ThinkGeek
Welcome to geek heaven.
http://thinkgeek.com/sf
_______________________________________________
Spamassassin-Sightings mailing list
Spamassassin-Sightings@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/spamassassin-sightings





In [65]:
email_without_html = remove_html_tags(email_without_head)
print(email_without_html)




<> Get up to 4 receivers installed in 4 rooms! 
<> No Equipment To Buy. 
<> Fist 3 Months of Free Service! - 
<> Up to 170 CHANNELS of CD quality sound and picture 
<> PROGRAMMING LESS EXPENSIVE than cable TV in most markets 

You can receive FREE INSTALLATION of a Dish Network 
Satellite TV System! You can also upgrade to a Personal Digital Video Recorder. 
(Retail value $499 if you had to buy this!) 

Click here to get your FREE INSTALLATION of a Dish Network satellite TV 
System and 3 months of free service before this promotion expires: 

http://www.proleadcom.com/discsat/index.php


--DeathToSpamDeathToSpamDeathToSpam--


-------------------------------------------------------
This sf.net email is sponsored by:ThinkGeek
Welcome to geek heaven.
http://thinkgeek.com/sf
_______________________________________________
Spamassassin-Sightings mailing list
Spamassassin-Sightings@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/spamassassin-sightings





In [66]:
# to lower
email_lower = to_lower(email_without_html)
print(email_lower)




<> get up to 4 receivers installed in 4 rooms! 
<> no equipment to buy. 
<> fist 3 months of free service! - 
<> up to 170 channels of cd quality sound and picture 
<> programming less expensive than cable tv in most markets 

you can receive free installation of a dish network 
satellite tv system! you can also upgrade to a personal digital video recorder. 
(retail value $499 if you had to buy this!) 

click here to get your free installation of a dish network satellite tv 
system and 3 months of free service before this promotion expires: 

http://www.proleadcom.com/discsat/index.php


--deathtospamdeathtospamdeathtospam--


-------------------------------------------------------
this sf.net email is sponsored by:thinkgeek
welcome to geek heaven.
http://thinkgeek.com/sf
_______________________________________________
spamassassin-sightings mailing list
spamassassin-sightings@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/spamassassin-sightings





In [41]:
# remove
email_without_urls = remove_hyperlink(email_lower)
print(email_without_urls)




<> get up to 4 receivers installed in 4 rooms! 
<> no equipment to buy. 
<> fist 3 months of free service! - 
<> up to 170 channels of cd quality sound and picture 
<> programming less expensive than cable tv in most markets 

you can receive free installation of a dish network 
satellite tv system! you can also upgrade to a personal digital video recorder. 
(retail value $499 if you had to buy this!) 

click here to get your free installation of a dish network satellite tv 
system and 3 months of free service before this promotion expires: 




--deathtospamdeathtospamdeathtospam--


-------------------------------------------------------
this sf.net email is sponsored by:thinkgeek
welcome to geek heaven.

_______________________________________________
spamassassin-sightings mailing list







In [44]:
# email without whitespace
email_without_whitespace = remove_whitespace(email_without_urls)
print(email_without_whitespace)

<> get up to 4 receivers installed in 4 rooms! 
<> no equipment to buy. 
<> fist 3 months of free service! - 
<> up to 170 channels of cd quality sound and picture 
<> programming less expensive than cable tv in most markets 

you can receive free installation of a dish network 
satellite tv system! you can also upgrade to a personal digital video recorder. 
(retail value $499 if you had to buy this!) 

click here to get your free installation of a dish network satellite tv 
system and 3 months of free service before this promotion expires: 




--deathtospamdeathtospamdeathtospam--


-------------------------------------------------------
this sf.net email is sponsored by:thinkgeek
welcome to geek heaven.

_______________________________________________
spamassassin-sightings mailing list


In [46]:
# remove numbers
email_without_numbers = remove_digits(email_without_whitespace)
print(email_without_numbers)

<> get up to  receivers installed in  rooms! 
<> no equipment to buy. 
<> fist  months of free service! - 
<> up to  channels of cd quality sound and picture 
<> programming less expensive than cable tv in most markets 

you can receive free installation of a dish network 
satellite tv system! you can also upgrade to a personal digital video recorder. 
(retail value $ if you had to buy this!) 

click here to get your free installation of a dish network satellite tv 
system and  months of free service before this promotion expires: 




--deathtospamdeathtospamdeathtospam--


-------------------------------------------------------
this sf.net email is sponsored by:thinkgeek
welcome to geek heaven.

_______________________________________________
spamassassin-sightings mailing list


In [50]:
# remove Underscore
email_without_under = remove_underscores(email_without_numbers)
print(email_without_under)

<> get up to  receivers installed in  rooms! 
<> no equipment to buy. 
<> fist  months of free service! - 
<> up to  channels of cd quality sound and picture 
<> programming less expensive than cable tv in most markets 

you can receive free installation of a dish network 
satellite tv system! you can also upgrade to a personal digital video recorder. 
(retail value $ if you had to buy this!) 

click here to get your free installation of a dish network satellite tv 
system and  months of free service before this promotion expires: 




--deathtospamdeathtospamdeathtospam--


-------------------------------------------------------
this sf.net email is sponsored by:thinkgeek
welcome to geek heaven.


spamassassin-sightings mailing list


In [52]:
# remove stop words
email_without_stopwords = remove_stopwords(email_without_under)
print(email_without_stopwords)

<> get  receivers installed  rooms! 
<> equipment buy. 
<> fist  months free service! - 
<>  channels cd quality sound picture 
<> programming less expensive cable tv markets 

you receive free installation dish network 
satellite tv system! also upgrade personal digital video recorder. 
(retail value $ buy this!) 

click get free installation dish network satellite tv 
system  months free service promotion expires: 




--deathtospamdeathtospamdeathtospam--


-------------------------------------------------------
this sf.net email sponsored by:thinkgeek
welcome geek heaven.


spamassassin-sightings mailing list


In [55]:
# remove without SP
email_without_SC = remove_special_characters(email_without_stopwords)
print(email_without_SC)

   get  receivers installed  rooms      equipment buy      fist  months free service         channels cd quality sound picture     programming less expensive cable tv markets   you receive free installation dish network  satellite tv system  also upgrade personal digital video recorder    retail value   buy this     click get free installation dish network satellite tv  system  months free service promotion expires         deathtospamdeathtospamdeathtospam                                                             this sf net email sponsored by thinkgeek welcome geek heaven    spamassassin sightings mailing list


In [71]:
# remove HTML tags
exemple_with_html_tags = remove_header(X[100])
print(exemple_with_html_tags)



<html>

<head>
<meta http-equiv=3D"Content-Language" content=3D"en-us">
<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dwindows-=
1252">
<meta name=3D"GENERATOR" content=3D"Microsoft FrontPage 4.0">
<meta name=3D"ProgId" content=3D"FrontPage.Editor.Document">
<title></title>
</head>

<body>

<table border=3D"0" width=3D"423" height=3D"166" bgcolor=3D"#800080">
  <tr>
    <td width=3D"464" height=3D"160">

<table border=3D"0" width=3D"100%" height=3D"110" bordercolor=3D"#FFFF99" =
bgcolor=3D"#FFFF00">
  <tr>
    <td width=3D"514" height=3D"22" bgcolor=3D"#FFFF99" align=3D"center">

<p><b><font face=3D"Verdana" color=3D"#FF0000" size=3D"2">NEW! -&gt;</font=
><font face=3D"Verdana" color=3D"#FF0000" size=3D"3">
Vigoral Herbal Love Enhancers </font><font face=3D"Verdana" color=3D"#FF00=
00" size=3D"2">&lt;- NEW!</font></b></p>

    </td>
  </tr>
  <tr>
    <td width=3D"514" height=3D"82" bgcolor=3D"#800080" align=3D"center">

<b><font face=3D"Times New Roman" size=3D"3"

In [72]:
exemple_with_html_tags_removed = remove_html_tags(exemple_with_html_tags)
print(exemple_with_html_tags_removed)

















NEW! ->
Vigoral Herbal Love Enhancers <- NEW!




Straight fr=
om our lab to you!
We Now Offer =
3
NEW - Special=
ly Formulated
& 100% Natural - products to help stimulate your moments with that special someone f=
or
Only
$24.99!






"Th=
e Most
Exciting Love
Making Experience We've Ever Had! Vigoral is #1, Hands Down!"       &=
nbsp;           &nb=
sp;
  
- Ricardo & Isabelle P. of Las Vegas, NV<=
/font>










SUPER
                    VigoralFor
                    Men










SUPER

                    VigoretteFor
                    Women










S=
UPER
Vigorgel=
For
                    Everyone!









=
All
      Only  $24.99 each & Get FREE Shipping!*-LIMITED
      TIME OFFER-


=
->
      CLICK
      HERE TO ORDER NOW! <-





 



Your
      email address was obtained from an opt-in list, Opt-in MRSA List   Purchase
      Code # 248-3-550.  If you wish to be unsubscribed from thi=
s list, please
      Click
      here and press send to be removed.

# Pipline

In [73]:
def EmailsPreprocessor(sentence):

    Preprocessor_utils = [remove_header,
                      remove_html_tags,
                      to_lower,
                      remove_hyperlink,
                      remove_whitespace,
                      remove_digits,
                      remove_underscores,
                      remove_stopwords,
                      remove_special_characters]

    for tool in Preprocessor_utils:
        sentence = tool(sentence)

    return sentence

X_features = [EmailsPreprocessor(email) for email in X]

print("All dataset preprocessed!!!")

  soup = BeautifulSoup(input, 'html.parser')


All dataset preprocessed!!!


In [74]:
print(X_features[0])

   get  receivers installed  rooms      equipment buy      fist  months free service         channels cd quality sound picture     programming less expensive cable tv markets   you receive free installation dish network  satellite tv system  also upgrade personal digital video recorder    retail value   buy this     click get free installation dish network satellite tv  system  months free service promotion expires    oussama     deathtospamdeathtospamdeathtospam                                                             this sf net email sponsored by thinkgeek welcome geek heaven  oussama  spamassassin sightings mailing list boussaid oussama


In [75]:
print(X_features[100])

new     vigoral herbal love enhancers    new      straight fr  om lab you  we offer    new   special  ly formulated     natural   products help stimulate moments special someone f  or only            th  e most exciting love making experience we ve ever had  vigoral    hands down            nbsp             nb  sp       ricardo   isabelle p  las vegas  nv    font            super                     vigoralfor                     men           super                      vigorettefor                     women           s  uper vigorgel  for                     everyone             all             get free shipping   limited       time offer               click       order now               your       email address obtained opt in list  opt in mrsa list   purchase       code        if wish unsubscribed thi  s list  please       click       press send removed  previously       unsubscribed still receiving message  may email our   abuse       control center  condone spam shape form  tha  n

In [109]:
# create a cleaned datasets to analyse

cleaned_data_dir = {
    'eamil': X_features,
    'target': lables
}

cleaned_data_df = pd.DataFrame(cleaned_data_dir)
cleaned_data_df

Unnamed: 0,eamil,target
0,get receivers installed rooms equipm...,1
1,ba ce b bf bb ea c a ba b b aa b f ...,1
2,oussama sa sinfamily afinet sa sinport htons...,0
3,dear homeowner dear homeowner ...,1
4,tue aug am wintermute wrote rick m...,0
...,...,...
3041,thu aug pm padraig brady wrote yes...,0
3042,biz info available here dear sir madam th...,1
3043,can t predict future always prepare it ...,1
3044,cleanse body naturally feel healthy ...,1


In [110]:
# save cleaned data in csv file
cleaned_data_df.to_csv('Cleaned_data.csv')

# Tokenization and Sequencing

Tokenization is the process of splitting the text into smaller units such as sentences, words or subwords

In [77]:
from sklearn.model_selection import train_test_split

# split the data into stratified training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_features, lables, test_size=0.2, random_state=42)
print("X_train shape : ",len(X_train))
print("y_train shape : ",len(y_train))
print("X_test shape : ",len(X_test))
print("y_test shape : ",len(y_test))

X_train shape :  2436
y_train shape :  2436
X_test shape :  610
y_test shape :  610


In [119]:
## some config values
embed_size = 100 # how big is each word vector
max_feature = 50000 # get the most frequent words
max_len = 3000 # max number of words in a question to use

In [98]:
from keras.preprocessing.text import Tokenizer

# num_words : I’d like to get the 50000 most frequent words in the corpus
tokenizer = Tokenizer(num_words=max_feature)
tokenizer.fit_on_texts([X_train[0], X_train[100]])

"""
 word_index : map each word with an integer(index) in the corpus(dataset).
 word_docs : how many docs(sentence) the word appear.
 word_counts : the frequency of each word in the entire corpus(dataset).
"""
print("Exemple of corpus :\n",[X_train[0], X_train[100]])


Exemple of corpus :
 [' lockergnome tech specialist                      gnomereport     tips make windows useful  do deal windows daily basis  might well  maximize time it  find tips  will apply operating systems  too  world computing  isn t mysterious understand works   gnometome  begin work faster   smarter   download windows productivity tips now     i lot weekend  really get much done   ya know  knees killing hauling stuff  the basement storage attic  pile  made far kitchen  stripped painted  the floor collection stuff previously  stored  erected shelving i d room  stuff  woke next morning discover sick  ferret  likely fumes leeched next  room despite ventilation setup prevent  migration bad air  so  ended dismantling ferret  cage move upstairs week i m sure  it s safe  thankfully  critters well now   i also started replacing cheapo rusted screws  with stainless steel deck screws back  found  the ones picked weak  leaving stripped  out heads bent shafts  project stopped barely  ge

In [97]:
# Get corpus word index
word_index = tokenizer.word_index

word_index

{'windows': 1,
 'i': 2,
 'the': 3,
 's': 4,
 'oussama': 5,
 'q': 6,
 'tips': 7,
 'latest': 8,
 'system': 9,
 'it': 10,
 'work': 11,
 'much': 12,
 'way': 13,
 'remove': 14,
 'systems': 15,
 'now': 16,
 'get': 17,
 'drive': 18,
 'wd': 19,
 'uninstall': 20,
 'lockergnome': 21,
 'tech': 22,
 'make': 23,
 'well': 24,
 't': 25,
 'really': 26,
 'stuff': 27,
 'm': 28,
 'screws': 29,
 'back': 30,
 'heat': 31,
 'notebook': 32,
 'better': 33,
 'that': 34,
 'play': 35,
 'to': 36,
 'list': 37,
 'error': 38,
 'intel': 39,
 'g': 40,
 'celeron': 41,
 'without': 42,
 'mail': 43,
 'add': 44,
 'boussaid': 45,
 'web': 46,
 'time': 47,
 'download': 48,
 'weekend': 49,
 'basement': 50,
 'room': 51,
 'also': 52,
 'need': 53,
 'two': 54,
 'running': 55,
 'hot': 56,
 'and': 57,
 'a': 58,
 'dvd': 59,
 'watch': 60,
 'save': 61,
 'boxes': 62,
 'letters': 63,
 'in': 64,
 'look': 65,
 'mapped': 66,
 'many': 67,
 'microsoft': 68,
 'blue': 69,
 'provide': 70,
 'chipset': 71,
 'here': 72,
 'macro': 73,
 'style': 74,
 

In [99]:
# Get dictionary of words by number of documents in which they appear
word_docs = tokenizer.word_docs

sorted(word_docs.items())

[('a', 1),
 ('able', 1),
 ('about', 1),
 ('acceptable', 1),
 ('accomplished', 1),
 ('account', 1),
 ('activex', 1),
 ('add', 1),
 ('added', 2),
 ('adding', 1),
 ('address', 1),
 ('advertise', 1),
 ('again', 1),
 ('ago', 1),
 ('air', 1),
 ('alignment', 1),
 ('all', 1),
 ('alone', 1),
 ('along', 1),
 ('alpha', 1),
 ('also', 1),
 ('alternatively', 1),
 ('always', 1),
 ('amd', 1),
 ('among', 1),
 ('ancient', 1),
 ('and', 2),
 ('anglophones', 1),
 ('annoyingly', 1),
 ('anyway', 1),
 ('apple', 1),
 ('applecore', 1),
 ('applescript', 1),
 ('applet', 1),
 ('applications', 1),
 ('apply', 1),
 ('applying', 1),
 ('arms', 1),
 ('around', 1),
 ('as', 1),
 ('aspell', 1),
 ('attempts', 1),
 ('attic', 1),
 ('audio', 1),
 ('august', 1),
 ('author', 1),
 ('available', 1),
 ('avoid', 1),
 ('avr', 1),
 ('away', 1),
 ('back', 1),
 ('bad', 1),
 ('balance', 1),
 ('barely', 1),
 ('basement', 1),
 ('basis', 1),
 ('began', 1),
 ('begin', 1),
 ('believe', 1),
 ('bent', 1),
 ('best', 1),
 ('better', 2),
 ('beware

In [100]:
# Get dictionary of words by frequency of appearance in corpus
word_counts = tokenizer.word_counts

sorted(word_counts.items())

[('a', 3),
 ('able', 1),
 ('about', 1),
 ('acceptable', 2),
 ('accomplished', 1),
 ('account', 1),
 ('activex', 1),
 ('add', 4),
 ('added', 2),
 ('adding', 1),
 ('address', 2),
 ('advertise', 1),
 ('again', 1),
 ('ago', 1),
 ('air', 2),
 ('alignment', 1),
 ('all', 1),
 ('alone', 2),
 ('along', 1),
 ('alpha', 1),
 ('also', 3),
 ('alternatively', 1),
 ('always', 1),
 ('amd', 1),
 ('among', 1),
 ('ancient', 1),
 ('and', 3),
 ('anglophones', 1),
 ('annoyingly', 1),
 ('anyway', 1),
 ('apple', 1),
 ('applecore', 1),
 ('applescript', 1),
 ('applet', 1),
 ('applications', 1),
 ('apply', 1),
 ('applying', 1),
 ('arms', 1),
 ('around', 1),
 ('as', 1),
 ('aspell', 2),
 ('attempts', 1),
 ('attic', 1),
 ('audio', 1),
 ('august', 1),
 ('author', 1),
 ('available', 1),
 ('avoid', 1),
 ('avr', 2),
 ('away', 1),
 ('back', 4),
 ('bad', 2),
 ('balance', 1),
 ('barely', 1),
 ('basement', 3),
 ('basis', 1),
 ('began', 1),
 ('begin', 1),
 ('believe', 1),
 ('bent', 1),
 ('best', 1),
 ('better', 4),
 ('beware

In [114]:
# Tokenize All dataset

tokenizer = Tokenizer(num_words=max_feature)
tokenizer.fit_on_texts(X_train)

# create sequences for LSTM
X_train_features = np.array(tokenizer.texts_to_sequences(X_train))
X_test_features = np.array(tokenizer.texts_to_sequences(X_test))

print(X_train_features[0])

[666, 205, 2219, 4291, 357, 44, 90, 645, 207, 594, 90, 362, 1491, 305, 118, 7526, 26, 20, 83, 357, 260, 489, 787, 284, 595, 113, 1748, 2388, 48, 7527, 816, 297, 6870, 788, 64, 915, 5383, 267, 90, 2248, 357, 66, 10, 404, 2127, 232, 18, 91, 596, 5100, 73, 15038, 5384, 15039, 436, 7, 7528, 1329, 21495, 7529, 196, 713, 9229, 9230, 9231, 7, 5704, 1263, 436, 1403, 2938, 12494, 21496, 10, 2, 1781, 436, 12495, 235, 1350, 1158, 3948, 15040, 1088, 21497, 21498, 235, 1781, 1595, 21499, 817, 1856, 8214, 493, 1809, 167, 5101, 21500, 15040, 21501, 782, 15041, 185, 10, 45, 265, 20, 4, 941, 7530, 10348, 118, 66, 10, 49, 646, 4870, 8215, 21502, 8216, 95, 15042, 4871, 4663, 8216, 105, 318, 7, 1204, 3212, 8217, 2482, 9230, 224, 3949, 10349, 21503, 832, 2340, 4872, 401, 646, 1384, 1577, 49, 69, 15043, 2389, 8216, 196, 91, 12496, 8218, 1782, 577, 843, 949, 169, 15044, 29, 84, 75, 6871, 318, 823, 8216, 2341, 91, 3213, 437, 170, 306, 21504, 1577, 8219, 3214, 522, 422, 1031, 118, 1722, 1000, 64, 1290, 1171, 1

  X_train_features = np.array(tokenizer.texts_to_sequences(X_train))
  X_test_features = np.array(tokenizer.texts_to_sequences(X_test))


# Padding

Convert all of the sentences to the same length.

In [120]:
from keras.preprocessing.sequence import pad_sequences

X_train_features = pad_sequences(X_train_features,maxlen=max_len)
x_test_features = pad_sequences(X_test_features,maxlen=max_len)
print(len(X_train_features[0]))
print(len(X_train_features[1]))

3000
3000


# **Modeling**

In [123]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional
from keras.models import Model
from keras import Sequential

In [124]:
# create the model

embedding_vecor_length = 32

model = Sequential()
model.add(Embedding(max_feature, embedding_vecor_length, input_length=max_len))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3000, 32)          1600000   
                                                                 
 bidirectional (Bidirection  (None, 128)               49664     
 al)                                                             
                                                                 
 dense (Dense)               (None, 16)                2064      
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1651745 (6.30 MB)
Trainable params: 1651745 (6.30 MB)
Non-trainable params: 0 (0.00 Byte)
__________________