In [0]:
!pip install -U -q PyDrive

[?25l[K    1% |▎                               | 10kB 17.2MB/s eta 0:00:01[K    2% |▋                               | 20kB 1.6MB/s eta 0:00:01[K    3% |█                               | 30kB 2.4MB/s eta 0:00:01[K    4% |█▎                              | 40kB 1.7MB/s eta 0:00:01[K    5% |█▋                              | 51kB 2.0MB/s eta 0:00:01[K    6% |██                              | 61kB 2.4MB/s eta 0:00:01[K    7% |██▎                             | 71kB 2.8MB/s eta 0:00:01[K    8% |██▋                             | 81kB 3.2MB/s eta 0:00:01[K    9% |███                             | 92kB 2.5MB/s eta 0:00:01[K    10% |███▎                            | 102kB 2.7MB/s eta 0:00:01[K    11% |███▋                            | 112kB 2.8MB/s eta 0:00:01[K    12% |████                            | 122kB 4.0MB/s eta 0:00:01[K    13% |████▎                           | 133kB 4.0MB/s eta 0:00:01[K    14% |████▋                           | 143kB 7.4MB/s eta 0:00:01[

In [0]:
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, CuDNNGRU, Embedding, Dropout, CuDNNLSTM, SimpleRNN, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

np.random.seed(1)
tf.set_random_seed(1)

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
download_path = os.path.expanduser('~/data')
try:
  os.makedirs(download_path)
except: pass

output_file = os.path.join(download_path)

file_list = drive.ListFile(
    {'q': "'1B6wmxiZt8NHWnRe4gsZD5yYTrevTyZim' in parents"}).GetList()

for f in file_list:
  print('title: %s, id: %s' % (f['title'], f['id']))
  fname = os.path.join(download_path, f['title'])
  print('downloading to {}'.format(fname))
  f_ = drive.CreateFile({'id': f['id']})
  f_.GetContentFile(fname)

title: smsspamcollection.zip, id: 1kNI0uCWaJPqEeDqRkNlmh5jX7QzQHzLo
downloading to /root/data/smsspamcollection.zip
title: SMSSpamCollection.txt, id: 1p8nsjPoR5DKjMSKK_EobFYYpavSxU5sJ
downloading to /root/data/SMSSpamCollection.txt


In [0]:
output_file = '/root/data/SMSSpamCollection.txt'

with open(output_file) as f:
    mylist = f.read().splitlines()
    
#mylist

In [0]:
def text_to_df(list):
    result = pd.DataFrame(columns=['text', 'spam'])
    
    for text in list:
        if (text.find('ham\t') == 0):
            df = pd.DataFrame([[text[4:], 0]], columns=['text', 'spam'])
            result = result.append(df)
            
        elif (text.find('spam\t') == 0):
            df = pd.DataFrame([[text[5:], 1]], columns=['text', 'spam'])
            result = result.append(df)
        
    return result

In [0]:
dataset = text_to_df(mylist)

dataset.head()

Unnamed: 0,text,spam
0,"Go until jurong point, crazy.. Available only ...",0
0,Ok lar... Joking wif u oni...,0
0,Free entry in 2 a wkly comp to win FA Cup fina...,1
0,U dun say so early hor... U c already then say...,0
0,"Nah I don't think he goes to usf, he lives aro...",0


In [0]:
dataset.spam.value_counts()

0    4827
1     747
Name: spam, dtype: int64

In [0]:
x_data, y_data = dataset.text.values, dataset.spam.values

print(len(x_data))
print(len(y_data))

5574
5574


In [0]:
# Tokenizing word
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_data)

if num_words is None:
    num_words = len(tokenizer.word_index)
    print(num_words)

In [0]:
# Print Token words
#tokenizer.word_index
#test_df = pd.tokenizer.word_index

In [0]:
# Get tokenize to data
x_tokens = tokenizer.texts_to_sequences(x_data)

In [0]:
# Print sample of data
print(x_data[0])
print(np.array(x_tokens[0]))

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
[  49  471 4435  842  755  658   64    8 1327   88  123  351 1328  148
 2996 1329   67   58 4436  144]


In [0]:
# Analysis for best shape in data
num_tokens = [len(tokens) for tokens in x_data]
num_tokens = np.array(num_tokens)

print(np.mean(num_tokens))
print(np.max(num_tokens))

80.47829207032652
910


In [0]:
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
max_tokens = int(max_tokens)
max_tokens

200

In [0]:
np.sum(num_tokens < max_tokens) / len(num_tokens)

0.9795479009687836

In [0]:
x_data_pad = pad_sequences(x_tokens, maxlen=max_tokens, padding='pre', truncating='post')
#x_data_pad = x_data_pad / num_words

x_data_pad.shape

(5574, 200)

In [0]:
np.array(x_tokens[0])

array([  49,  471, 4435,  842,  755,  658,   64,    8, 1327,   88,  123,
        351, 1328,  148, 2996, 1329,   67,   58, 4436,  144])

In [0]:
x_data_pad[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [0]:
# Train Test Split by line
x_train, y_train = x_data_pad[0:1674], y_data[0:1674]
x_test, y_test = x_data_pad[1674:], y_data[1674:]

In [0]:
def spam_caught_score(model):
  y_pred = model.predict_classes(x_test)

  all_spam = 0
  spam_found = 0

  for i in range(len(y_test)):
    if y_test[i] == 1:
      all_spam += 1
      if y_pred[i][0] == y_test[i]:
        spam_found += 1

  print('Number of spam : ' + str(all_spam))
  print('Number of spam caught : ' + str(spam_found))
  print(spam_found / all_spam)

In [0]:
def block_ham_score(model):
  y_pred = model.predict_classes(x_test)

  all_ham = 0
  block_ham_found = 0

  for i in range(len(y_test)):
    if y_test[i] == 0:
      all_ham += 1
      if y_pred[i][0] != y_test[i]:
        block_ham_found += 1

  print('Number of ham : ' + str(all_ham))
  print('Number of blocked ham : ' + str(block_ham_found))
  print(block_ham_found / all_ham)

In [0]:
# Define parameter
embedding_size = 32
EPOCHS = 10
batch_size = None

In [0]:
x_train.shape

(1674, 200)

In [0]:
print(num_words)
print(embedding_size)
print(max_tokens)

10000
32
200


In [0]:
# GRU
model_gru = Sequential()

#model_gru.add(Embedding(input_dim=num_words, output_dim=embedding_size, input_length=max_tokens))

model_gru.add(CuDNNGRU(units=embedding_size, return_sequences=True))

model_gru.add(CuDNNGRU(units=max_tokens, return_sequences=True))

model_gru.add(CuDNNGRU(units=embedding_size))
model_gru.add(Dropout(0.2))

model_gru.add(Dense(embedding_size/2, activation='relu'))

model_gru.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=0.01)

model_gru.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model_gru.summary()

ValueError: ignored

In [0]:
%%time
model_gru.fit(x_train, y_train, epochs=EPOCHS, batch_size=batch_size)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 29.2 s, sys: 7.59 s, total: 36.7 s
Wall time: 36.1 s


<tensorflow.python.keras.callbacks.History at 0x7f413105ee80>

In [0]:
model_gru.evaluate(x_test, y_test)



[0.12697152848956952, 0.9802564102564103]

In [0]:
spam_caught_score(model_gru)
block_ham_score(model_gru)

Number of spam : 509
Number of spam caught : 439
0.862475442043222
Number of ham : 3391
Number of blocked ham : 7
0.0020642878207018578


In [0]:
# LSTM
model_lstm = Sequential()

model_lstm.add(Embedding(input_dim=num_words, output_dim=embedding_size, input_length=max_tokens))

model_lstm.add(CuDNNLSTM(units=embedding_size, return_sequences=True))

#model_lstm.add(CuDNNLSTM(units=max_tokens, return_sequences=True))

model_lstm.add(CuDNNLSTM(units=embedding_size))
model_lstm.add(Dropout(0.2))

model_lstm.add(Dense(embedding_size/2, activation='relu'))
model_lstm.add(Dense(1, activation='sigmoid'))

optimizer = Adam(lr=0.01)

model_lstm.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 32)           320000    
_________________________________________________________________
cu_dnnlstm (CuDNNLSTM)       (None, 200, 32)           8448      
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 32)                8448      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 337,441
Trainable params: 337,441
Non-trainable params: 0
_________________________________________________________________


In [0]:
%%time
model_lstm.fit(x_train, y_train, epochs=EPOCHS, batch_size=batch_size)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

In [0]:
model_lstm.evaluate(x_test, y_test)



[0.1887233146644865, 0.9815384615384616]

In [0]:
spam_caught_score(model_lstm)
block_ham_score(model_lstm)

Number of spam : 509
Number of spam caught : 447
0.8781925343811395
Number of ham : 3391
Number of blocked ham : 10
0.002948982601002654
