## Classifying Emails Using RNN


starts by importing all the nessesary dependencies


In [1]:
import pandas as pd
import numpy as np

In [2]:
import tensorflow as tf

downloaded spam dataset from keras

In [3]:
df = pd.read_csv('spam_dataset.csv')

df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.shape

(5572, 2)

In [5]:
df["Category"].head(7)

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
Name: Category, dtype: object

In [6]:
df["Category"] = [1 if category == "spam" else 0 for category in df['Category']]

In [7]:
df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
emails = df.iloc[:, 1].values

In [9]:
print("rotal number of emails:", len(emails))

rotal number of emails: 5572


In [10]:
emails[0:10]

array(['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
       'Ok lar... Joking wif u oni...',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
       'U dun say so early hor... U c already then say...',
       "Nah I don't think he goes to usf, he lives around here though",
       "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv",
       'Even my brother is not like to speak with me. They treat me like aids patent.',
       "As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune",
       'WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 090617014

In [11]:
labels = df.iloc[:, 0].values

In [12]:
print("Total number of labels: ", len(labels))

Total number of labels:  5572


In [13]:
labels[0:10]

array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1], dtype=int64)

# Tokenization
#### using keras tokenization we create a vocabulatry an the word to number mapping for word tokenization

In [14]:
from keras.preprocessing.text import Tokenizer

In [15]:
test_token = Tokenizer()

In [16]:
test_sentences = ['here is my bag', 'im headed to the market']

In [17]:
test_token.fit_on_texts(test_sentences)

In [18]:
test_token.word_index

{'here': 1,
 'is': 2,
 'my': 3,
 'bag': 4,
 'im': 5,
 'headed': 6,
 'to': 7,
 'the': 8,
 'market': 9}

In [19]:
test_token.index_word

{1: 'here',
 2: 'is',
 3: 'my',
 4: 'bag',
 5: 'im',
 6: 'headed',
 7: 'to',
 8: 'the',
 9: 'market'}

In [20]:
test_sentence = "here to my bag im headed"
test_token_list = test_token.texts_to_sequences([test_sentence])[0]
print(test_token_list)

[1, 7, 3, 4, 5, 6]


In [21]:
VOCAB_SIZE = 30000

In [22]:
UNK_TOK = '<UNK>'

In [23]:
tokenizer = Tokenizer(num_words = VOCAB_SIZE, oov_token=UNK_TOK)

In [24]:
tokenizer.fit_on_texts(emails)

In [25]:
sequences = tokenizer.texts_to_sequences(emails)

In [26]:
print(sequences[0])

[50, 472, 4427, 841, 756, 659, 66, 9, 1328, 89, 124, 352, 1329, 149, 2994, 1330, 68, 59, 4428, 145]


## Padding
#### Here we use Keras ''pad_sequences'' function to pad a smaller sentence , with dummy text or numbers in oreer to improve usage 
###### Note: padding is done on numbers ie: already tookenized sentences ans seen below 

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [28]:
test_seqences = [[200, 40, 67, 56, 22], [23, 10, 90, 856], [200, 45], [300, 89,67]]


In [29]:
pad_sequences(test_seqences, maxlen=5, padding='post', truncating ='post')

array([[200,  40,  67,  56,  22],
       [ 23,  10,  90, 856,   0],
       [200,  45,   0,   0,   0],
       [300,  89,  67,   0,   0]])

#### Actual Padding of tokenized words

In [30]:
MAX_LEN = 32

In [31]:
padded_sequences = np.array(pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post'))

In [32]:
print(padded_sequences[:5])

[[  50  472 4427  841  756  659   66    9 1328   89  124  352 1329  149
  2994 1330   68   59 4428  145    0    0    0    0    0    0    0    0
     0    0    0    0]
 [  47  337 1500  473    7 1941    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [  48  490    9   20    5  797  902    3  176 1942 1106  660 1943 2332
   262 2333   72 1942    3 1944    3  338  490  556  961   74  391  180
   661  392 2995    0]
 [   7  248  151   24  382 2996    7  140  154   58  151    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]
 [1025    2   99  109   70  491    3  962   70 1945  222  113  474    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0]]


## Building The Model

#### import Sequenyila from keras dence and all the model paramitter we will be using the feed forwrd bidirectional architecture and our RNN model

In [33]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, SimpleRNN, Flatten, Dense

In [34]:
model = Sequential()
model.add(Embedding(VOCAB_SIZE, 16, input_length=MAX_LEN))
model.add(Bidirectional(SimpleRNN(64, return_sequences=True)))
model.add(Bidirectional(SimpleRNN(64), merge_mode="concat"))
model.add(Flatten())
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation = 'sigmoid'))

### Compile the model sepesifing loss functuon and optimizer variables 

In [35]:
model.compile(optimizer='adam', loss='binary_crossentropy')

In [36]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 16)            480000    
                                                                 
 bidirectional (Bidirectiona  (None, 32, 128)          10368     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              24704     
 nal)                                                            
                                                                 
 flatten (Flatten)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 24)                3096      
                                                                 
 dense_1 (Dense)             (None, 1)                 2

In [37]:
model.fit(padded_sequences, labels, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1a141fa08b0>

## Prediction using the trained model

In [45]:
sample_text = 'Get 100% off on this new Samsong that you won a lottery on!!!!!'
print(sample_text)
sample_sequence = tokenizer.texts_to_sequences([sample_text])[0]
sample_sequence_padded = pad_sequences([sample_sequence], 
                                        maxlen=MAX_LEN,
                                        padding='post',
                                        truncating='post')

Get 100% off on this new Samsong that you won a lottery on!!!!!


In [46]:
pred_prob = model.predict(sample_sequence_padded)
print('Poberbility of being spam:', pred_prob)

Poberbility of being spam: [[0.99991876]]


In [47]:
pred_prob[0]

array([0.99991876], dtype=float32)

In [49]:
print(pred_prob[0][0])

0.99991876
