In [157]:
import string
import re
import pandas as pd
import numpy as np
import keras
import nltk
import tensorflow as tf

nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from textblob import Word

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from keras.datasets import imdb
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from warnings import filterwarnings
filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [158]:
df = pd.read_csv("../input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [159]:
df["text"] = df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))
df["text"] = df["text"].str.replace('[^\w\s]','')
df["text"] = df["text"].str.replace('\d','')
df["text"] = df["text"].apply(lambda x: ' '.join([x for x in x.split() if len(x) > 3]))

In [160]:
stop_words = set(stopwords.words("english"))
punctuation = list(string.punctuation)
stop_words.update(punctuation)

df["text"] = df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))

In [161]:
df["text"] = df["text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
df["text"] = df["text"].apply(lambda x: " ".join(re.sub(r'http\S+', '', x) for x in x.split()))
df.head()

Unnamed: 0,target,text
0,ham,jurong point crazy available bugis great world...
1,ham,joking
2,spam,free entry wkly comp final tkts text receive e...
3,ham,early already
4,ham,dont think go life around though


In [162]:
test_list = []
for i in range(len(df["text"])):
  test_list.append(df["text"][i])
 
sentiment = df['target']


In [163]:
y = np.array(list(map(lambda x: 1 if x=="spam" else 0, sentiment)))

In [164]:
X_train, X_test,Y_train, Y_test = train_test_split(test_list, y, test_size=0.2, random_state = 45, stratify = y)

In [165]:
len(Y_train)

4457

In [166]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

In [167]:
words_to_index = tokenizer.word_index
len(words_to_index)

6267

In [168]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)



  return word_to_vec_map

In [169]:
word_to_vec_map = read_glove_vector('../input/glove-twitter/glove.twitter.27B.100d.txt')

In [170]:
maxLen = 150

In [171]:
vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [172]:
def ham_spam(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = LSTM(128, return_sequences=True)(embeddings)

  X = Dropout(0.6)(X)

  X = LSTM(128, return_sequences=True)(X)

  X = Dropout(0.6)(X)

  X = LSTM(128)(X)

  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model

In [173]:
def conv1d_model(input_shape):

  X_indices = Input(input_shape)

  embeddings = embedding_layer(X_indices)

  X = Conv1D(512,3,activation='relu')(embeddings)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  
  X = MaxPooling1D(3)(X)

  X = Conv1D(256,3,activation='relu')(X)
  X = Dropout(0.8)(X)
  X = MaxPooling1D(3)(X)

  X = GlobalMaxPooling1D()(X)

  X = Dense(256, activation='relu')(X)
  X = Dense(1, activation='sigmoid')(X)

  model = Model(inputs=X_indices, outputs=X)

  return model


In [174]:
model = ham_spam((maxLen,))
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        [(None, 150)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 100)          626700    
_________________________________________________________________
lstm_15 (LSTM)               (None, 150, 128)          117248    
_________________________________________________________________
dropout_11 (Dropout)         (None, 150, 128)          0         
_________________________________________________________________
lstm_16 (LSTM)               (None, 150, 128)          131584    
_________________________________________________________________
dropout_12 (Dropout)         (None, 150, 128)          0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 128)               1315

In [175]:
model_1d = conv1d_model((maxLen,))
model_1d.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 150)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 150, 100)          626700    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 148, 512)          154112    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 49, 512)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 47, 256)           393472    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 15, 256)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 13, 256)           1968

In [176]:
X_train_indices = tokenizer.texts_to_sequences(X_train)

In [177]:
X_train_indices = pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
X_train_indices.shape

(4457, 150)

In [178]:

model_1d.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss='binary_crossentropy', metrics=['accuracy'])

In [179]:
model_1d.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f1343829490>

In [180]:
model.compile(optimizer=tf.keras.optimizers.Adam(0.0001), loss='binary_crossentropy', metrics=['accuracy'])

In [181]:
model.fit(X_train_indices, Y_train, batch_size=64, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f1306b526d0>

In [182]:
X_test_indices = tokenizer.texts_to_sequences(X_test)

X_test_indices = pad_sequences(X_test_indices, maxlen=maxLen, padding='post')

In [183]:
model.evaluate(X_test_indices, Y_test)



[0.11409913748502731, 0.9605380892753601]

In [184]:
model_1d.evaluate(X_test_indices, Y_test)



[0.1550135463476181, 0.9533632397651672]

In [185]:
preds = model_1d.predict(X_test_indices)

In [186]:
n = np.random.randint(0,4458)

X_test[n]

'olowoyey uscedu great time argentina secretary everything blessing'

In [187]:
if preds[n] > 0.5:
  print('predicted target : spam')
else: 
  print('precicted target : ham')

if (Y_test[n] == 1):
  print('correct target : spam')
else:
  print('correct target : ham')

precicted target : ham
correct target : ham


In [188]:
preds[n]

array([6.627117e-05], dtype=float32)

In [189]:
Y_test[n]

0

In [190]:
model_1d.save('model1d.h5')

In [191]:
test_list_idx = tokenizer.texts_to_sequences(test_list)

In [192]:
def add_score_predictions(df, test_list_idx):

  df['target score'] = 0

  test_list_idx = pad_sequences(test_list_idx, maxlen=maxLen, padding='post')

  test_preds = model.predict(test_list_idx)

  df['test score'] = test_preds

  pred_sentiment = np.array(list(map(lambda x : 'spam' if x > 0.5 else 'ham',test_preds)))

  df['test sentiment'] = 0

  df['test sentiment'] = pred_sentiment

  return df

In [193]:
df = add_score_predictions(df, test_list_idx)

In [194]:
df[df["target"]=='spam']

Unnamed: 0,target,text,target score,test score,test sentiment
2,spam,free entry wkly comp final tkts text receive e...,0,0.937970,spam
5,spam,freemsg darling week word back like still chgs...,0,0.006345,ham
8,spam,winner valued network customer selected receiv...,0,0.939470,spam
9,spam,mobile month entitled update latest colour mob...,0,0.939648,spam
11,spam,chance cash pound send cost pday day tsandcs a...,0,0.925860,spam
...,...,...,...,...,...
5537,spam,want explicit sec ring cost pmin gsex pobox,0,0.080773,ham
5540,spam,asked mobile chatlines inclu free min india cu...,0,0.920094,spam
5547,spam,contract mobile mnths latest motorola nokia fr...,0,0.939715,spam
5566,spam,reminder pound free call credit detail great o...,0,0.938488,spam


In [195]:
from sklearn.metrics import classification_report
classes_pred = np.argmax(preds, axis = 1)

print(classification_report(Y_test, classes_pred))

              precision    recall  f1-score   support

           0       0.87      1.00      0.93       966
           1       0.00      0.00      0.00       149

    accuracy                           0.87      1115
   macro avg       0.43      0.50      0.46      1115
weighted avg       0.75      0.87      0.80      1115

