In [61]:
#Spam Detection Model using CNN
import tensorflow as tf
tf.__version__

from tensorflow.keras.layers import Dense,LSTM,Input,Conv1D,Embedding,MaxPooling1D,GlobalMaxPooling1D

from tensorflow.keras.models import Sequential

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error


In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [63]:
data = pd.read_csv('./Datasets/SMSSpamCollection', sep="\t", names = ['label','message'])

In [64]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [65]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [66]:
#Binarize labels
data['label']=data['label'].map({'ham':0, 'spam':1})
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [67]:
#Seperate data as features and label
features = data.message.values
label = data.label.values

In [68]:
#Train test split
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(features,
                                                 label,
                                                 test_size=0.2,
                                                 random_state = 1)

In [69]:
# For Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

#Decide the Vocabulary Word Frequency Size

vocabFreqWordSize = None

# If the word 'spiderman' exists a minimum of num_words times in the corpus,
# then it will be added in the vocab dictionary

#Convert the sentences into sequence of words

tokenizer = Tokenizer(num_words=vocabFreqWordSize)

#Fit the training data

tokenizer.fit_on_texts(X_train)




In [70]:
#Lets create the sequence objects

seq_train = tokenizer.texts_to_sequences(X_train)
seq_test = tokenizer.texts_to_sequences(X_test)

In [71]:
X_train[0]

"Hi , where are you? We're at  and they're not keen to go out i kind of am but feel i shouldn't so can we go out tomo, don't mind do you?"

In [72]:
len(seq_train[3])

34

In [73]:
len(seq_train)

4457

In [74]:
#Lets pad the sequence data
#This creates a matrix where the
# rows = Documents
# Cols = Time Steps (For our understanding its the vocab with padding)

from tensorflow.keras.preprocessing.sequence import pad_sequences

trainData = pad_sequences(seq_train)
T = trainData.shape[1]
T

189

In [75]:
testData = pad_sequences(seq_test , maxlen=T)
testData.shape

(1115, 189)

In [76]:
#Build the Model
#Question: Can we use ANN?
#Answer: We can use ANN if the dataset is small. BUt preferred is CNN due to data size and avoiding memory overflow issue
# and without compromising the quality of the data.

In [77]:
#Get the Vocab Size of the Tokenizer
V = len(tokenizer.word_index)

In [78]:
#CNN for Text data -- Conv1D



from tensorflow.keras.layers import Dense,Input,Dropout,Conv1D,Embedding,MaxPooling1D,GlobalMaxPooling1D,GlobalAveragePooling1D
from tensorflow.keras.models import Model

# 1. Create/Convert Seq data into Word Embedding / Embedded Data

#Input layer -  It takes in sequence of Integers i.e. T  (time step size ////// vocab size)

i = Input(shape=(T,))

#Create WordEmbedding ---- This layer will take sequence of integers and return sequence of word vectors
# Ideally input size is No of Docs X Time Steps
# When creating word embedding the dimension of matrix will be (Total_Vocabulary_Size + 1,Embedding Dim)
# Reason for + 1 is indexing of embedding starts from 1 and not 0


#You can decide the Embedding Dimensionality -- Hyperparameter
#Prashant Recommend ----> 10 to 100
D = 20

x = Embedding(V + 1,D)(i)

#First Convolution Layer

x = Conv1D(32,3,activation='relu')(x)
x = MaxPooling1D(3)(x)

#Second Convolution Layer

x = Conv1D(64,3,activation='relu')(x)
x = MaxPooling1D(3)(x)

#Third Convolution Layer

x = Conv1D(128,3,activation='relu')(x)
#x = Flatten()(x)
x = GlobalMaxPooling1D()(x)

#Dense Layer -- Output Layer

x = Dense(1,activation='sigmoid')(x)


model = Model(i,x)


In [79]:
vocab_size = V+1
embeding_dim = 128
max_len = T

# Define the CNN model
model2 = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embeding_dim, input_length=max_len),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])



In [80]:
#Compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [81]:
#Train
history = model.fit(trainData,y_train, epochs=5, validation_data=(testData,y_test))

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.8414 - loss: 0.4426 - val_accuracy: 0.9435 - val_loss: 0.1742
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9672 - loss: 0.0992 - val_accuracy: 0.9865 - val_loss: 0.0423
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.9930 - loss: 0.0231 - val_accuracy: 0.9928 - val_loss: 0.0293
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9987 - loss: 0.0067 - val_accuracy: 0.9910 - val_loss: 0.0295
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9996 - loss: 0.0039 - val_accuracy: 0.9910 - val_loss: 0.0287


In [83]:
print(f"Testing score is {model.score(testData,y_test)} and TrainingScore is {model.score(trainData,y_train)} ")

AttributeError: 'Functional' object has no attribute 'score'

In [84]:
#Classification report
from sklearn.metrics import classification_report, confusion_matrix


print(classification_report(y_test,model.predict(testData)))

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [85]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Make predictions
y_pred = model.predict(testData)

# Print classification report
print(classification_report(y_test, y_pred))

# Create confusion matrix
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [51]:
# ANN Model Approach Code Style

In [52]:


max_len=T #no of cols in the pad_seq of train data
embedding_dim=20 #hyperparameter
vocab_size=len(tokenizer.word_index) #trained tokenizer object word index

model = Sequential()
model.add(Embedding(vocab_size, embeding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))


In [53]:
#Custom Callback for early stopping

class MyThresholdCallBack(tf.keras.callbacks.Callback):
    def __init__(self,cl):
        super(MyThresholdCallBack, self).__init__()
        self.cl = cl

    def on_epoch_end(self, epoch, logs=None):
        test_score = logs["val_accuracy"]
        train_score = logs["accuracy"]

        if test_score > train_score and test_score > self.cl:
        #if test_score > self.cl:
            self.model.stop_training = True

In [54]:
myR2ScoreMonitor = MyThresholdCallBack(cl=0.70)

In [55]:
#Compile
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [59]:
#Train
history = model.fit(trainData,y_train, epochs=10000, validation_data=(testData,y_test),callbacks=[myR2ScoreMonitor])

Epoch 1/10000
[1m118/140[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 16ms/step - accuracy: 0.8617 - loss: 0.3997

2024-09-12 12:25:35.137733: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: INVALID_ARGUMENT: indices[7,187] = 7968 is not in [0, 7968)
	 [[{{function_node __inference_one_step_on_data_7453}}{{node sequential_2_1/embedding_7_1/GatherV2}}]]


InvalidArgumentError: Graph execution error:

Detected at node sequential_2_1/embedding_7_1/GatherV2 defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/oysterable/.local/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/oysterable/.local/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/opt/anaconda3/lib/python3.11/asyncio/base_events.py", line 607, in run_forever

  File "/opt/anaconda3/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once

  File "/opt/anaconda3/lib/python3.11/asyncio/events.py", line 80, in _run

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/Users/oysterable/.local/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/oysterable/.local/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/Users/oysterable/.local/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/Users/oysterable/.local/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/Users/oysterable/.local/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/Users/oysterable/.local/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/Users/oysterable/.local/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/cc/_kcvjw313pl6kbf8c3bdtnyc0000gn/T/ipykernel_97624/299157595.py", line 2, in <module>

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/backend/tensorflow/trainer.py", line 51, in train_step

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/models/sequential.py", line 212, in call

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/models/functional.py", line 175, in call

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/ops/function.py", line 171, in _run_through_graph

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/models/functional.py", line 560, in call

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/layers/layer.py", line 901, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/ops/operation.py", line 46, in __call__

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 156, in error_handler

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/layers/core/embedding.py", line 140, in call

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/ops/numpy.py", line 4918, in take

  File "/opt/anaconda3/lib/python3.11/site-packages/keras/src/backend/tensorflow/numpy.py", line 1967, in take

indices[7,187] = 7968 is not in [0, 7968)
	 [[{{node sequential_2_1/embedding_7_1/GatherV2}}]] [Op:__inference_one_step_on_iterator_7502]

In [None]:
#Create Deployment code of User Input in Jupyter Notebook --

### LSTM

In [None]:
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim,max_len))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dense(24, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

Embedding

Layer >> Converts input text data into DENSE VECTORS.

