Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from keras.preprocessing.text import text_to_word_sequence

Using TensorFlow backend.


In [2]:
# Kaggle dataset: drop the three unnamed filler columns so that only the
# `label` and `msg` columns remain.
data1 = pd.read_csv('spam.csv', encoding='latin-1')  # Kaggle dataset
data1 = data1.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
print(data1.head())

# Second dataset: first sheet of the workbook, minus its `code` column.
# The context manager closes the workbook handle as soon as the sheet is
# parsed instead of leaving it open for the lifetime of the kernel.
with pd.ExcelFile('revisedindiandataset.xls') as workbook:
    data2 = workbook.parse(0)
data2 = data2.drop(['code'], axis=1)
print(data2.head())

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the row labels of the two sources overlap (both start at 0), so label-based
# lookups on `data` would return rows from both datasets.
data = pd.concat([data1, data2], ignore_index=True)

  label                                                msg
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
  label                                                msg
0   ham  Dear Customer, +916300623587 is now available ...
1   ham  Dear Customer, You have a missed call from +91...
2  spam  Join Hike to get Rs 40. Earn upto Rs. 10,000 b...
3  spam  Just sent you some money and invited you to Hi...
4  spam  Just sent you some money and invited you to Hi...


# Splitting each message into 3-word windows for word2vec training

In [3]:
# Build two parallel structures from the combined messages:
#   pro_data  - one token list per row of `data` (empty for messages that
#               tokenise to nothing)
#   sentences - every 3-token window of each message, where the message is
#               framed by the '</s>' (start) and '</e>' (end) markers
sentences = []
pro_data = []
for row in data.itertuples():
    # itertuples() puts the index at position 0, so row[2] is the second
    # column of `data` (the message text).
    tokens = text_to_word_sequence(str(row[2]))
    pro_data.append(tokens)

    # Framing the tokens with the boundary markers lets a single sliding
    # window produce the leading, inner and trailing trigrams alike: a message
    # with n tokens yields exactly n windows, and an empty one yields none.
    framed = ['</s>'] + tokens + ['</e>']
    sentences.extend(framed[start:start + 3] for start in range(len(tokens)))

In [4]:
# Sanity check: show the first raw message, followed by the first two
# 3-token windows that were generated from it.
print(data['msg'].iloc[0])
for window_idx in (0, 1):
    print(sentences[window_idx])

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
['</s>', 'go', 'until']
['go', 'until', 'jurong']


# word2vec training

In [5]:
from gensim.models import Word2Vec

# Dimensionality of every learned word vector; the padding and LSTM cells
# further down reuse `size` as the per-time-step feature count.
size = 150

# min_count=1 keeps every token in the vocabulary, including words seen once,
# so each token in `pro_data` has a vector to look up later.
# NOTE(review): `size=` is the pre-4.0 gensim keyword (4.0 renamed it to
# `vector_size`); keep it in sync with the installed gensim version.
model = Word2Vec(sentences, size=size, min_count=1)



In [6]:
# Inspect one learned embedding. The lookup goes through `model.wv` (the
# trained KeyedVectors); indexing the Word2Vec model directly is deprecated
# and is what triggered the warning captured in this cell's output.
model.wv['god']

  """Entry point for launching an IPython kernel.


array([-0.16490524, -0.12317795, -0.20400028,  0.50917351, -0.06683756,
        0.29778257,  0.01752055, -0.1428301 ,  0.20175029, -0.01860907,
       -0.06080187, -0.14596462,  0.02184702,  0.22056867, -0.31917828,
        0.00341994,  0.05246627, -0.17653848, -0.15657805,  0.05114596,
        0.30727157,  0.05585392, -0.23821613,  0.34458381, -0.61221409,
        0.09606561, -0.06882115, -0.27109337, -0.19593148, -0.0268448 ,
       -0.1761073 , -0.21702495,  0.19700979,  0.01971228,  0.47714841,
        0.03118076, -0.24821253,  0.16127042, -0.27863744,  0.2055781 ,
        0.05842168,  0.0051453 , -0.01437454, -0.08824436,  0.22952513,
        0.02204015, -0.01527987, -0.39313459,  0.16025591,  0.18031368,
       -0.09067744, -0.07197399, -0.05545877,  0.17090571, -0.30757448,
       -0.49316847, -0.24528784, -0.209005  , -0.22037736,  0.08996142,
       -0.01346877,  0.39833558,  0.16763233,  0.25513238,  0.16989066,
        0.09064545, -0.17535852,  0.10748672, -0.07899807, -0.14

# Pre-padding

In [6]:
# Turn every tokenised message into a fixed-length sequence of word vectors.
# Messages are pre-padded: the words occupy the LAST positions of the
# `msg_limit` time steps and the leading positions stay all-zero.
msg_limit = 200

# Allocate the whole (messages, time steps, features) tensor up front instead
# of building nested Python lists and converting them afterwards.
data_vec = np.zeros((len(pro_data), msg_limit, size))
for msg_idx, words in enumerate(pro_data):
    # Keep at most the last `msg_limit` words. Without this cap a longer
    # message would produce negative positions, which wrap around and
    # silently overwrite the vectors of the final words.
    kept = words[-msg_limit:]
    if not kept:
        continue  # empty message: the row stays all padding
    # Look words up via `model.wv`; indexing the model directly is deprecated.
    data_vec[msg_idx, msg_limit - len(kept):] = [model.wv[word] for word in kept]

  import sys


# Splitting data into training and test sets

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the messages for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    data_vec,
    data['label'],
    test_size=0.30,
    random_state=0,
)

# One-hot encode the training labels only (one indicator column per distinct
# label). Y_test keeps the original string labels so it can be compared with
# the string predictions later on.
Y_train = pd.get_dummies(Y_train).values

# Model

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout

# Sequence classifier: one 200-unit LSTM over the padded word-vector sequences
# (`msg_limit` time steps, `size` features per step), dropout for
# regularisation, then a 2-way softmax matching the one-hot training labels.
classifier = Sequential()
# The time-step count follows `msg_limit` so the network input always matches
# the padding length used to build `data_vec`.
classifier.add(LSTM(200, input_shape=(msg_limit, size)))
classifier.add(Dropout(0.2))
classifier.add(Dense(2, activation='softmax'))
classifier.compile(
    loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 200)               280800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 402       
Total params: 281,202
Trainable params: 281,202
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
# Train the model: 7 passes over the training set in mini-batches of 50.
# Left as the cell's final expression so the returned History object is shown.
classifier.fit(
    X_train,
    Y_train,
    batch_size=50,
    epochs=7,
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x122ed6304e0>

# Evaluating Accuracy

In [11]:
# Per-message softmax outputs for the held-out set: one row per message,
# two scores per row.
Y_pr = classifier.predict(X_test)

# Map each score pair back to a string label. Column 0 is treated as 'ham'
# and column 1 as 'spam'; a tie goes to 'ham'.
Y_pred = ['ham' if scores[0] >= scores[1] else 'spam' for scores in Y_pr]

In [12]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels, both in sorted
# label order, so the diagonal holds the correctly classified counts.
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)
print(cm)

[[2398   40]
 [  61  543]]
