In [23]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import spacy
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM, Embedding, Dropout
from tensorflow.keras.utils import to_categorical

In [15]:
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
df = pd.read_csv("/content/drive/MyDrive/Datasets/SPAM text message 20170820 - Data.csv")
print(df.info(),'\n',df.describe())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None 
        Category                 Message
count      5572                    5572
unique        2                    5157
top         ham  Sorry, I'll call later
freq       4825                      30


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
df['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [19]:
nlp = spacy.load("en_core_web_lg")

def preprocess(text):
  doc = nlp(text)
  filtered_tokens = []
  for token in doc:
    if token.is_punct or token.is_stop:
      continue
    filtered_tokens.append(token.lemma_)
  return " ".join(filtered_tokens)

In [20]:
df['text'] = df['Message'].apply(preprocess)
df.head()

Unnamed: 0,Category,Message,text
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win FA Cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,U dun early hor U c
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf live


In [21]:
df['vector'] = df['text'].apply(lambda x: nlp(x).vector)
df.head()

Unnamed: 0,Category,Message,text,vector
0,ham,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...,"[1.1192545, 0.980326, 0.26543233, -0.8769394, ..."
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni,"[-0.14939333, 1.0167166, 0.4778967, -1.6510634..."
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win FA Cup final tkts 2...,"[-0.2184723, -2.4377646, 1.8605095, 0.9640945,..."
3,ham,U dun say so early hor... U c already then say...,U dun early hor U c,"[-3.6434948, 3.2535734, 4.5511975, -1.37153, 3..."
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think go usf live,"[0.04441598, 3.1517997, -3.422078, -0.27837402..."


In [26]:
df["spam"] = df['Category'].apply(lambda x: 1 if x == "spam" else 0)

In [27]:
data = df
X = np.array(data["vector"].tolist())
y = data["spam"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = np.expand_dims(X_train, axis=1)
X_test = np.expand_dims(X_test, axis=1)

def build_and_evaluate_model(model_type="rnn"):
    model = Sequential()
    if model_type == "rnn":
        model.add(SimpleRNN(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False))
    elif model_type == "lstm":
        model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
    model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0)

    y_pred = model.predict(X_test).flatten()
    y_pred_class = (y_pred > 0.5).astype(int)

    f1 = classification_report(y_test, y_pred_class, output_dict=True)['1']['f1-score']
    roc_auc = roc_auc_score(y_test, y_pred)

    return f1, roc_auc

rnn_f1, rnn_roc_auc = build_and_evaluate_model(model_type="rnn")

lstm_f1, lstm_roc_auc = build_and_evaluate_model(model_type="lstm")

print("RNN - F1 Score: {:.4f}, ROC-AUC: {:.4f}".format(rnn_f1, rnn_roc_auc))
print("LSTM - F1 Score: {:.4f}, ROC-AUC: {:.4f}".format(lstm_f1, lstm_roc_auc))


  super().__init__(**kwargs)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


  super().__init__(**kwargs)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step
RNN - F1 Score: 0.8789, ROC-AUC: 0.9823
LSTM - F1 Score: 0.9306, ROC-AUC: 0.9872
