In [1]:
import pandas as pd
import re
import string
import numpy as np
from sklearn.feature_extraction import _stop_words

In [2]:
data = pd.read_csv("data/spamham.csv")

In [3]:
data.columns = ['spam', 'text']

In [4]:
#data = data[['text','spam']]

In [5]:
data.head()

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Splitting data

In [6]:
from sklearn.model_selection import train_test_split
emails_train, emails_test, target_train, target_test = train_test_split(data.text,data.spam,test_size = 0.2) 

In [7]:
data.info

<bound method DataFrame.info of       spam                                               text
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [8]:
emails_train.shape

(4457,)

# Preprocessing

In [9]:
def remove_hyperlink(word):
    return  re.sub(r"http\S+", "", word)

def to_lower(word):
    result = word.lower()
    return result

def remove_number(word):
    result = re.sub(r'\d+', '', word)
    return result

def remove_punctuation(word):
    result = word.translate(str.maketrans(dict.fromkeys(string.punctuation)))
    return result

def remove_whitespace(word):
    result = word.strip()
    return result

def replace_newline(word):
    return word.replace('\n','')



def clean_up_pipeline(sentence):
    cleaning_utils = [remove_hyperlink,
                      replace_newline,
                      to_lower,
                      remove_number,
                      remove_punctuation,remove_whitespace]
    for o in cleaning_utils:
        sentence = o(sentence)
    return sentence

x_train = [clean_up_pipeline(o) for o in emails_train]
x_test = [clean_up_pipeline(o) for o in emails_test]

x_train[0]

'darren was saying dat if u meeting da ge den we dun meet  dinner cos later u leave xy will feel awkward den u meet him  lunch lor'

In [10]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train_y = le.fit_transform(target_train.values)
test_y = le.transform(target_test.values)

In [11]:
train_y

array([0, 0, 0, ..., 1, 0, 0])

# Tokenize

In [12]:
## some config values 
embed_size = 100 # how big is each word vector
max_feature = 50000 # how many unique words to use (i.e num rows in embedding vector)
max_len = 2000 # max number of words in a question to use

In [13]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=max_feature)

tokenizer.fit_on_texts(x_train)

x_train_features = np.array(tokenizer.texts_to_sequences(x_train))
x_test_features = np.array(tokenizer.texts_to_sequences(x_test))

x_train_features[:10]

  x_train_features = np.array(tokenizer.texts_to_sequences(x_train))
  x_test_features = np.array(tokenizer.texts_to_sequences(x_test))


array([list([740, 59, 560, 315, 31, 6, 255, 94, 1035, 369, 37, 239, 179, 329, 160, 95, 6, 228, 941, 27, 200, 3482, 369, 6, 179, 124, 262, 74]),
       list([136, 10, 4, 1529, 2, 92, 92, 77, 14]),
       list([269, 543, 34, 68, 8, 4, 587]),
       list([269, 543, 182, 100, 2, 6, 4, 741, 544, 111, 3483, 229]),
       list([4, 316, 256, 20, 32, 5, 516, 22, 32, 5, 1845, 35, 1846, 7, 3484]),
       list([270, 107, 25, 1, 2389, 1847, 7, 3485, 7, 351, 475, 36, 295, 1, 257, 287, 18, 2390, 25, 1, 330, 13, 207, 9, 1332, 438, 19, 14, 44, 1530]),
       list([32, 53, 64, 130, 1, 60, 140]),
       list([80, 22, 230, 41, 21, 200, 144, 338, 1333, 3486, 19, 942, 10, 208]),
       list([99, 35, 13, 231, 7, 517, 7, 3, 27, 271, 71, 647, 4, 588, 864, 48, 3487, 32, 1848, 3488, 1531, 3489, 1849]),
       list([80, 55, 16, 95])], dtype=object)

# Padding

In [14]:
from keras.preprocessing.sequence import pad_sequences
x_train_features = pad_sequences(x_train_features,maxlen=max_len)
x_test_features = pad_sequences(x_test_features,maxlen=max_len)
x_train_features[:10]

array([[   0,    0,    0, ...,  124,  262,   74],
       [   0,    0,    0, ...,   92,   77,   14],
       [   0,    0,    0, ...,    8,    4,  587],
       ...,
       [   0,    0,    0, ...,  942,   10,  208],
       [   0,    0,    0, ..., 1531, 3489, 1849],
       [   0,    0,    0, ...,   55,   16,   95]])

# Model

In [15]:
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional
from keras.models import Model

In [16]:
# create the model
import tensorflow as tf
embedding_vecor_length = 32

model = tf.keras.Sequential()
model.add(Embedding(max_feature, embedding_vecor_length, input_length=max_len))
model.add(Bidirectional(tf.keras.layers.LSTM(64)))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2000, 32)          1600000   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              49664     
 l)                                                              
                                                                 
 dense (Dense)               (None, 16)                2064      
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 1,651,745
Trainable params: 1,651,745
Non-trainable params: 0
______________________________________________

In [17]:
history = model.fit(x_train_features, train_y, batch_size=128, epochs=10, validation_data=(x_test_features, test_y))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
from sklearn.metrics import confusion_matrix,f1_score, precision_score,recall_score

In [20]:
y_predict  = [1 if o>0.5 else 0 for o in model.predict(x_test_features)]

In [21]:
cf_matrix =confusion_matrix(test_y,y_predict)

In [22]:
tn, fp, fn, tp = confusion_matrix(test_y,y_predict).ravel()

In [24]:
print("Precision: {:.2f}%".format(100 * precision_score(test_y, y_predict)))
print("Recall: {:.2f}%".format(100 * recall_score(test_y, y_predict)))
print("F1 Score: {:.2f}%".format(100 * f1_score(test_y,y_predict)))

Precision: 98.56%
Recall: 88.39%
F1 Score: 93.20%


In [25]:
f1_score(test_y,y_predict)

0.9319727891156463

In [32]:
import pickle

In [37]:
#from keras.models import load_model
#model.save('model.h5')

model_final = load_model('model.h5')

In [39]:
text = np.array(tokenizer.texts_to_sequences(["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]))

In [40]:
text

array([[ 629,   76,    4,  768,  428,  216,    3,   17,  100,  425,    2,
        7662,  143,  966,    2,  125,   16,  125,  426,  511,  510,   65]])

In [41]:
test_features = pad_sequences(text,maxlen=max_len)
test_features

array([[  0,   0,   0, ..., 511, 510,  65]])

In [42]:
y_predict  = [1 if o>0.5 else 0 for o in model_final.predict(test_features)]
y_predict

[1]

In [45]:
def predict(model, sample_mail):
    
    pred_to_label = {0: 'Ham', 1: 'Spam'}
    
    text = np.array(tokenizer.texts_to_sequences([sample_mail]))
    test_features = pad_sequences(text,maxlen=max_len)
    
    y_predict  = [1 if o>0.5 else 0 for o in model.predict(test_features)]
    

    data = []
    for mail, pred in zip(sample_mail, y_predict):
        data.append((mail, pred, pred_to_label[pred]))

    return data


if __name__=="__main__":
    # Text to classify should be in a list.
    
    sample_mail = ["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]

    
    predictions = predict(model_final, sample_mail)
    print(predictions)

[('WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.', 0, 'Ham')]
