In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Sequential
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline

In [29]:
# Load the labelled SMS corpus. The preview shows a `Text` column holding the raw
# message and a `Label` column (apparently 1 = spam, 0 = ham, judging by the rows).
df = pd.read_csv(
    'spam.csv',
    encoding='latin-1',  # NOTE(review): presumably the file is not valid UTF-8 — confirm
)
df.head()

Unnamed: 0,Text,Label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [30]:
# Features are the raw message strings; targets are the labels reshaped into a
# column vector of shape (n_samples, 1).
X = df.Text
Y = df.Label.values.reshape(-1, 1)

# Hold out 10% of the messages as the final test set.
#  - random_state pins the shuffle so that Restart & Run All reproduces the same
#    partition (and therefore comparable metrics) on every run.
#  - stratify keeps the spam/ham proportion the same in the train and test
#    partitions, so the small test fold is representative of the full dataset.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    random_state=42,
    stratify=df.Label,
)

In [32]:
# Preprocessing hyper-parameters (also read by the model definition).
max_words = 1000  # vocabulary cap handed to the Tokenizer
max_len = 150     # every message is encoded as exactly this many token ids

# Build the word index from the TRAINING messages only, so nothing about the
# held-out test messages leaks into the vocabulary.
to_token = Tokenizer(num_words=max_words)
to_token.fit_on_texts(X_train)

# Turn each training message into a list of integer word ids, then pad short
# messages and truncate long ones so all rows have length max_len.
sequences = to_token.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [33]:
def RNN(vocab_size=None, seq_len=None, embed_dim=50, lstm_units=64,
        dense_units=256, dropout_rate=0.5):
    '''Build the (uncompiled) LSTM binary classifier with the Keras functional API.

    Architecture:
        Input -> Embedding -> LSTM -> Dense + ReLU -> Dropout -> Dense(1) + sigmoid

    Called with no arguments it builds exactly the network this notebook has
    always used (shape taken from the notebook-level `max_words` / `max_len`).

    Args:
        vocab_size: Size of the embedding vocabulary. Defaults to the
            notebook-level `max_words` when None.
        seq_len: Length of each padded input sequence. Defaults to the
            notebook-level `max_len` when None.
        embed_dim: Dimensionality of the learned word embeddings.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Width of the hidden fully-connected layer ('FC1').
        dropout_rate: Fraction of 'FC1' activations dropped during training.

    Returns:
        A keras `Model` mapping a (batch, seq_len) matrix of token ids to a
        (batch, 1) sigmoid output. The model is not compiled.
    '''
    # Fall back to the notebook globals so existing `RNN()` calls keep working.
    if vocab_size is None:
        vocab_size = max_words
    if seq_len is None:
        seq_len = max_len

    inputs = Input(name='inputs', shape=[seq_len])
    layer = Embedding(vocab_size, embed_dim, input_length=seq_len)(inputs)
    # The LSTM returns only its final state, i.e. one vector per message.
    layer = LSTM(lstm_units)(layer)
    layer = Dense(dense_units, name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(dropout_rate)(layer)
    # Single sigmoid unit: pairs with the binary_crossentropy loss used at compile time.
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=inputs, outputs=layer)

In [34]:
# Instantiate the network, attach the training configuration, then print the
# layer-by-layer summary. The target is a single 0/1 label, hence binary
# cross-entropy on the sigmoid output.
model = RNN()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_1 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
__________

In [35]:
# Train for at most 10 epochs, holding back the last 20% of the training rows
# for validation. EarlyStopping ends training as soon as val_loss fails to
# improve by at least min_delta (no patience is configured), which is why the
# log may stop before epoch 10.
model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[
        EarlyStopping(monitor='val_loss', min_delta=0.0001),
    ],
)

Train on 4011 samples, validate on 1003 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<keras.callbacks.History at 0x278808ef438>

In [37]:
# Encode the held-out messages with the tokenizer fitted on the training data
# and bring them to the same fixed length the model was trained on.
test_sequences = to_token.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)

# Per-message sigmoid scores, kept for inspection in a later cell.
output = model.predict(test_sequences_matrix)

# evaluate() returns the loss followed by the compiled metrics, i.e.
# [loss, accuracy] for this model; the two values are unpacked into the report.
accr = model.evaluate(test_sequences_matrix, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr))

Test set
  Loss: 0.022
  Accuracy: 0.991


In [38]:
# Pair every held-out message with the model's sigmoid score and list them from
# the highest score to the lowest.
df2 = pd.DataFrame(X_test).assign(prediction=output)
print(df2.sort_values(by='prediction', ascending=False))

                                                   Text  prediction
938   Urgent! call 09061749602 from Landline. Your c...    0.999998
3629  Get the official ENGLAND poly ringtone or colo...    0.999998
2728  Urgent Please call 09066612661 from landline. ...    0.999998
4965  URGENT! We are trying to contact U. Todays dra...    0.999998
3215  URGENT! We are trying to contact U. Todays dra...    0.999998
2908  URGENT! Your Mobile number has been awarded wi...    0.999998
5342  u r subscribed 2 TEXTCOMP 250 wkly comp. 1st w...    0.999998
4181  Urgent! Please call 0906346330. Your ABTA comp...    0.999997
843   Urgent! call 09066350750 from your landline. Y...    0.999997
2307  WIN a year supply of CDs 4 a store of ur choic...    0.999997
812   Congratulations ur awarded either å£500 of CD ...    0.999997
249   Congratulations ur awarded 500 of CD vouchers ...    0.999996
4497  Latest Nokia Mobile or iPOD MP3 Player +å£400 ...    0.999996
526   Today's Offer! Claim ur å£150 worth of dis