In [67]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras.metrics import Recall, Precision

import pickle
import tqdm 
import time

In [95]:
sequence_length = 100
embedding_length = 100
test_size = 0.25

batch_size = 64
epochs = 20

label2int = {'ham':0, 'spam':1}
int2label = {0:'ham', 1:'spam'}

In [69]:
df = pd.read_csv('TextFiles/smsspamcollection.tsv', sep='\t')

In [70]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [71]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [72]:
df.isnull().count()

label      5572
message    5572
length     5572
punct      5572
dtype: int64

In [73]:
x = df['message']
y = df['label']
type(x)

pandas.core.series.Series

In [74]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x)

In [75]:
X = tokenizer.texts_to_sequences(x)

In [76]:
print(X[0])

[49, 471, 4436, 842, 755, 657, 64, 8, 1327, 88, 123, 351, 1328, 148, 2997, 1329, 67, 58, 4437, 144]


In [77]:
X = np.array(X)
y = np.array(y)

  """Entry point for launching an IPython kernel.


In [78]:
X = pad_sequences(X, maxlen=sequence_length)

In [79]:
print(X[0])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0   49  471 4436  842
  755  657   64    8 1327   88  123  351 1328  148 2997 1329   67   58
 4437  144]


In [80]:
y = [label2int[label] for label in y]
y = to_categorical(y)

In [81]:
print(y[0])

[1. 0.]


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)

In [83]:
def get_embedding_vectors(tokenizer, dim=100):
    embedding_index = {}
    with open(f"glove.6B/glove.6B.{dim}d.txt", encoding='utf8') as f:
        for line in tqdm.tqdm(f, "Reading Glove"):
            values = line.split()
            word = values[0]
            vectors = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = vectors
    
    word_index = tokenizer.word_index
    embedding_matrix = np.zeros((len(word_index)+1, dim))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [93]:
def get_model(tokenizer, lstm_units):
    
    embedding_matrix = get_embedding_vectors(tokenizer)
    model = Sequential()
    model.add(Embedding(len(tokenizer.word_index)+1,
                       embedding_length,
                       weights=[embedding_matrix],
                       trainable=False,
                       input_length=sequence_length))
    model.add(LSTM(lstm_units, recurrent_dropout=0.2))
    model.add(Dropout(0.3))
    model.add(Dense(2, activation='softmax'))
    
    model.compile(optimizer='rmsprop', loss='binary_crossentropy',
                 metrics = ['accuracy'])
    
    model.summary()
    
    return model

In [94]:
model = get_model(tokenizer=tokenizer, lstm_units=128)

Reading Glove: 400000it [00:11, 33798.62it/s]


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 100)          901200    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
Total params: 1,018,706
Trainable params: 117,506
Non-trainable params: 901,200
_________________________________________________________________


In [86]:
model_chekpoints = ModelCheckpoint('result/spam_classifier_{val_loss:.2f}',
                                   save_best_only=True, verbose=1)
tensorboard = TensorBoard(f"logs/spam_classifier_{time.time()}")

In [96]:
print("X_train.shape: ",X_train.shape)
print("X_test.shape: ",X_test.shape)
print("y_train.shape: ",y_train.shape)
print("X_test.shape: ",y_test.shape)

X_train.shape:  (4179, 100)
X_test.shape:  (1393, 100)
y_train.shape:  (4179, 2)
X_test.shape:  (1393, 2)


In [97]:
model.fit(X_train,y_train, validation_data=(X_test, y_test),
         batch_size=batch_size,epochs=epochs, verbose=1)

Train on 4179 samples, validate on 1393 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.callbacks.History at 0x2420fa0aa48>

In [98]:
result = model.evaluate(X_test,y_test)



In [99]:
loss = result[0]
accuracy = result[1]
# precision = result[2]
# recall = result[3]

In [91]:
print(f"[+] Accuracy: {accuracy*100:.2f}%")
print(f"[+] Precision: {precision*100:.2f}%")
print(f"[+] Recall: {recall*100:.2f}%")

[+] Accuracy: 98.42%
[+] Precision: 98.42%
[+] Recall: 98.42%


In [100]:
text = 'Congratulations! you have won 100,000$ this week, click here to claim.'
sequence = tokenizer.texts_to_sequences([text])
sequence = pad_sequences(sequence, maxlen=sequence_length)
prediction = model.predict(sequence)
int2label[np.argmax(prediction[0])]

'spam'

In [101]:
prediction

array([[0.0386228 , 0.96137714]], dtype=float32)

In [51]:
pickle.dump(tokenizer, open('tokenizer.sav', 'wb'))
pickle.dump(pad_sequences, open('pad_sequence.sav', 'wb'))


TypeError: can't pickle _thread.RLock objects

In [102]:
pickle.dump(model, open('classifier.sav', 'wb'))

In [53]:
from tensorflow.keras import models

In [59]:
import os
filepath = os.path.join('logs/')

In [60]:
models.save_model(model, filepath, save_format='h5')

OSError: Unable to create file (unable to open file: name = 'logs/', errno = 13, error message = 'Permission denied', flags = 13, o_flags = 302)

In [65]:
import joblib

In [66]:
joblib.dump(model, open('classifier.sav', 'wb'))

TypeError: can't pickle _thread.RLock objects

In [116]:
mymodel = Sequential([
    Dense(2, activation='relu')
])
mymodel.compile(optimizer='rmsprop', loss='binary_crossentropy',
                 metrics = ['accuracy'])

In [117]:
x = np.array([[1,2,3,4]])
y = np.array([[2,4,6,8]])
mymodel.fit(x,y,epochs=2)

ValueError: Error when checking target: expected dense_7 to have shape (2,) but got array with shape (4,)