In [41]:
pip install -q kaggle

In [1]:
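# Optional (Google Colab only): uncomment to upload `kaggle.json`,
# which a later cell copies into ~/.kaggle/.
# from google.colab import files
# files.upload()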

In [43]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [44]:
# Download the uciml/sms-spam-collection-dataset archive with the Kaggle CLI.
!kaggle datasets download -d uciml/sms-spam-collection-dataset

sms-spam-collection-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [45]:
!unzip sms-spam-collection-dataset.zip

Archive:  sms-spam-collection-dataset.zip
replace spam.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: spam.csv                


In [46]:
import numpy as np 
import pandas as pd 
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import random
import pickle
import json

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score,confusion_matrix

import nltk

# Fetch the NLTK packages this notebook relies on.
for nltk_package in ('stopwords', 'punkt'):
    nltk.download(nltk_package)

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [47]:
# Read the extracted CSV (decoded as latin-1) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='spam.csv', encoding='latin-1')
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [48]:
# Count how many messages carry each label in column `v1`.
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [49]:
# Encode the label column numerically: ham -> 0, spam -> 1.
label_to_int = {'ham': 0, 'spam': 1}
data['v1'] = data['v1'].replace(label_to_int)

In [50]:
# Display the frame to check the numeric labels in `v1`.
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,,,
5568,0,Will Ì_ b going to esplanade fr home?,,,
5569,0,"Pity, * was in mood for that. So...any other s...",,,
5570,0,The guy did some bitching but I acted like i'd...,,,


In [51]:
# English stop words, held in a set: the text-cleaning step tests membership once per
# token, and a set lookup is O(1) where scanning a list is O(n).
# NOTE: this rebinds the name `stopwords`, which the import cell bound to the NLTK
# corpus object. `nltk.corpus.stopwords` itself is untouched, so the cell can be re-run.
stopwords = set(nltk.corpus.stopwords.words('english'))

In [52]:
def _clean_text(text):
    """Tokenize `text`, keep purely alphabetic tokens, lowercase them and drop stop words.

    Returns the remaining tokens joined by single spaces.
    """
    tokens = nltk.tokenize.word_tokenize(text)
    lowered = (tok.lower() for tok in tokens if tok.isalpha())
    return ' '.join(tok for tok in lowered if tok not in stopwords)


# Clean every message element-wise. Series.map reads and writes through the same
# index, so it does not depend on positional (`iloc`) and label (`at`) access
# agreeing, and it avoids materialising a full row for every message.
data['v2'] = data['v2'].map(_clean_text)

In [53]:
# Display the frame to check the cleaned text in `v2`.
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,go jurong point available bugis n great world ...,,,
1,0,ok lar joking wif u oni,,,
2,1,free entry wkly comp win fa cup final tkts may...,,,
3,0,u dun say early hor u c already say,,,
4,0,nah think goes usf lives around though,,,
...,...,...,...,...,...
5567,1,time tried contact u pound prize claim easy ca...,,,
5568,0,b going esplanade fr home,,,
5569,0,pity mood suggestions,,,
5570,0,guy bitching acted like interested buying some...,,,


In [54]:
# Hold out 20% of the messages for testing; the fixed random_state makes the split repeatable.
messages = data['v2'].to_numpy()
labels = data['v1'].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.20,
    random_state=42,
)

In [55]:
# --- Text vectorisation settings ---
VOCAB_SIZE = 2000                    # tokenizer `num_words` and Embedding input size
OOV_TOK = '<OOV>'                    # placeholder token for out-of-vocabulary words
MAX_LENGTH = 100                     # every sequence is padded/truncated to this length
PADDING_TYPE = TRUNC_TYPE = 'post'   # pad and truncate at the end of a sequence

# --- Model settings ---
EMBEDDING_DIM = 128                  # width of each embedding vector

In [56]:
# Build the word index from the training messages only.
tokenizer = Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token=OOV_TOK,
    lower=True,
    split=' ',
)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index

In [57]:
# Convert each training message into a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(X_train)

In [58]:
# Peek at the first five encoded messages.
train_sequences[:5]

[[1611, 27, 1334, 206, 187, 232, 1612],
 [607, 1, 1335, 1, 86, 1613, 1, 1614, 1],
 [32, 1143, 562, 223, 1, 1],
 [1615, 1, 1, 6, 7, 1],
 [608, 261, 609, 262, 487, 3, 656, 295, 610]]

In [59]:
# Bring every training sequence to exactly MAX_LENGTH entries.
pad_options = dict(maxlen=MAX_LENGTH, padding=PADDING_TYPE, truncating=TRUNC_TYPE)
train_padded = pad_sequences(train_sequences, **pad_options)

In [60]:
# Inspect the first padded training sequence.
train_padded[0]

array([1611,   27, 1334,  206,  187,  232, 1612,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0], dtype=int32)

In [61]:
# Encode the held-out messages with the tokenizer fitted on the training data,
# then pad them with the same settings as the training sequences.
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(
    sequences=test_sequences,
    truncating=TRUNC_TYPE,
    padding=PADDING_TYPE,
    maxlen=MAX_LENGTH,
)

In [62]:
# Fix TensorFlow's global seed so training is repeatable from run to run (the data
# split already uses random_state=42). Some GPU ops may still be non-deterministic.
tf.random.set_seed(42)

# Embedding -> two stacked bidirectional LSTMs -> flatten -> small dense head
# ending in a single sigmoid unit.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_LENGTH),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, activation='tanh', return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, recurrent_dropout=0.2, return_sequences=True)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

# NOTE(review): the test split is also passed as validation data, so any later score
# on (test_padded, y_test) is not measured on data the training loop never saw.
history = model.fit(train_padded, y_train, epochs=10, validation_data=(test_padded, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [63]:
# Score the trained model on the padded test split.
model.evaluate(test_padded, y_test)



[0.1536809206008911, 0.9811659455299377]

In [132]:
# Minimum predicted probability at which a message is reported as spam.
SPAM_THRESHOLD = 0.8

# Draw from a plain list: random.choice() indexes its argument as seq[i], which on a
# pandas Series is a label lookup and breaks when the index is not 0..n-1.
rand_sent = random.choice(data['v2'].tolist())
print('Sentence --> ', rand_sent)

# Encode and pad the message exactly like the training data.
pred_sequences = tokenizer.texts_to_sequences([rand_sent])
print('tokenized Sentence -->', pred_sequences)

pred_padded = pad_sequences(pred_sequences,
                            maxlen=MAX_LENGTH,
                            padding=PADDING_TYPE,
                            truncating=TRUNC_TYPE)
print('Tokenized and Padded Sentence -->', pred_padded)

# The model output for one message is squeezed down to a scalar.
prob = model.predict(pred_padded)
prob = np.squeeze(prob)

if prob > SPAM_THRESHOLD:
    print('It is a Spam')
else:
    print('It is not a Spam')

Sentence -->  ur currently pounds maximize ur send collect cc po box
tokenized Sentence --> [[4, 682, 465, 1112, 4, 17, 326, 1113, 256, 210]]
Tokenized and Padded Sentence --> [[   4  682  465 1112    4   17  326 1113  256  210    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]
It is a Spam


In [71]:
# Save the trained model to disk as 'Model.h5'.
model.save('Model.h5')

In [72]:
import io

# Export the fitted tokenizer to 'tokenizer.json'.
# The to_json() result is passed through json.dumps() once more before writing,
# so a reader has to json.load() the file to get the to_json() value back.
tokenizer_json = tokenizer.to_json()
serialized_tokenizer = json.dumps(tokenizer_json, ensure_ascii=False)
with io.open('tokenizer.json', mode='w', encoding='utf-8') as out_file:
    out_file.write(serialized_tokenizer)

In [None]:
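# Reload sketch (kept commented out). `tokenizer_from_json` must first be imported from
# tensorflow.keras.preprocessing.text; the loaded value gets its own name rather than
# `data`, so the DataFrame is not overwritten.
# with open('tokenizer.json') as f:
#     tokenizer_json = json.load(f)
#     tokenizer = tokenizer_from_json(tokenizer_json)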