In [None]:
# library for: dataframe manipulation
import numpy as np
import pandas as pd

# library for: text preprocessing
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# library for: visualization
import matplotlib.pyplot as plt

# library for: modelling
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# library for: metrics
from sklearn.metrics import accuracy_score

# library for: save model
import pickle

In [None]:
import nltk

# Fetch the NLTK resources used by the preprocessing steps:
#   - 'stopwords' backs nltk.corpus.stopwords
#   - 'punkt' / 'punkt_tab' back nltk.tokenize.word_tokenize
# Recent NLTK releases load the tokenizer model from 'punkt_tab' rather than the
# pickled 'punkt' package, so both are downloaded to keep word_tokenize working
# regardless of the installed NLTK version.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Read the SMS spam collection CSV into a DataFrame and preview it
df = pd.read_csv(filepath_or_buffer="../datasets/sms_spam_collection.csv")
df

In [None]:
# check label distribution (relative frequency of each class)
label_distribution = df["label"].value_counts(normalize=True)
label_distribution

In [None]:
# convert label to numeric: "spam" -> 1, every other value -> 0
#
# The conversion is skipped when the column is already numeric. Without this
# guard, running the cell a second time would compare the encoded integers to
# "spam", find no match, and silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df["label"]):
    df["label"] = np.where(df["label"]=="spam", 1, 0)

In [None]:
# check label distribution again, now on the numeric labels
encoded_label_distribution = df["label"].value_counts(normalize=True)
encoded_label_distribution

# text preprocessing

In [None]:
# text preprocessing : lowercase
# Every message is passed through str.lower so later steps are case-insensitive.

df["message"] = df["message"].map(lambda text: text.lower())
df["message"]

In [None]:
# text preprocessing : replace URLs, drop possessive 's, remove non alphanumeric
#
# URLs and possessives have to be handled BEFORE the non-alphanumeric strip:
# once ":", "/", "." and "'" have been turned into spaces, the patterns
# r"http\S+" and r"\'s" no longer have anything meaningful left to match.
# NOTE(review): any inference-side preprocessing must apply the same order —
# verify against the serving code.

def _strip_non_alphanumeric(text):
    """Return `text` with URLs replaced by " link ", possessive 's replaced by
    a space, and every remaining character outside [A-Za-z0-9] replaced by a
    space."""
    text = re.sub(r"http\S+", " link ", text)
    text = re.sub(r"\'s", " ", text)
    return re.sub(r"[^A-Za-z0-9]", " ", text)

df["message"] = df["message"].apply(_strip_non_alphanumeric)
df["message"]

In [None]:
# text preprocessing : remove possessive 's (replaced by a single space)
# NOTE(review): this only has an effect while apostrophes are still present in
# the text — confirm no earlier step has already stripped them.

possessive_pattern = re.compile(r"\'s")
df["message"] = df["message"].map(lambda text: possessive_pattern.sub(" ", text))
df["message"]

In [None]:
# text preprocessing : replace URLs with the placeholder token "link"
# A URL here is "http" followed by one or more non-whitespace characters.
# NOTE(review): this relies on URLs still being intact (":" and "/" present) —
# confirm no earlier step has already broken them apart.

url_pattern = re.compile(r"http\S+")
df["message"] = df["message"].map(lambda text: url_pattern.sub(" link ", text))
df["message"]

In [None]:
# text preprocessing : remove numbers
# Deletes standalone integers and decimals (digits bounded by word boundaries);
# digits embedded inside a word are left alone.

number_pattern = re.compile(r"\b\d+(?:\.\d+)?\b")
df["message"] = df["message"].map(lambda text: number_pattern.sub("", text))
df["message"]

In [None]:
# text preprocessing : remove punctuation
# Drops every character that is neither a word character nor whitespace.

punctuation_pattern = re.compile(r'[^\w\s]')
df["message"] = df["message"].map(lambda text: punctuation_pattern.sub('', text))
df["message"]

In [None]:
# text preprocessing : remove extra whitespace
def _collapse_whitespace(text):
    """Trim `text` and squeeze every run of whitespace into one space."""
    pieces = text.split()
    return " ".join(pieces)

df["message"] = df["message"].map(_collapse_whitespace)
df["message"]

In [None]:
# text preprocessing : remove stopwords

# Build the English stop-word set once. Doing this inside the function would
# re-read the corpus list and rebuild the set for every single message.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

def remove_stopwords(text):
    """Return `text` with English stop words removed.

    The text is split with NLTK's word_tokenize; tokens whose lowercase form is
    in the NLTK English stop-word list are dropped, and the remaining tokens
    are joined back together with single spaces.
    """
    return ' '.join(
        word for word in word_tokenize(text)
        if word.lower() not in _ENGLISH_STOP_WORDS
    )

# Filter stop words out of every message, then preview the cleaned column
df["message"] = df["message"].map(remove_stopwords)
df["message"]

# data splitting

In [None]:
# split data to train and test
# 20% of the rows are held out for testing. Stratifying on the label keeps the
# class ratio the same in both partitions; the fixed seed makes the split
# reproducible.

features, target = df["message"], df["label"]
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=42,
)

# tokenization

In [None]:
# define tokenizer: only the most frequent words (num_words=100) are kept when
# texts are converted to sequences; everything else maps to the "<OOV>" token
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")

# fit on texts — training split ONLY.
# Fitting on x_test as well would let test-set word frequencies shape the
# vocabulary (data leakage) and make the validation metrics optimistic.
# Unseen test words are handled by the "<OOV>" token instead.
tokenizer.fit_on_texts(x_train)

# tokenize: turn each message into a list of integer word indices
train_sequence = tokenizer.texts_to_sequences(x_train)
test_sequence = tokenizer.texts_to_sequences(x_test)

In [None]:
# padding: bring every sequence to exactly 20 tokens (pad_sequences defaults
# decide the pad/truncate side)
train_padded, test_padded = (
    pad_sequences(sequences, maxlen=20)
    for sequences in (train_sequence, test_sequence)
)

# modelling

In [None]:
# define model: embedding -> LSTM -> small dense layer -> sigmoid output
# NOTE(review): the embedding's input_dim (250) is larger than the tokenizer's
# num_words (100) — harmless, but confirm whether the two were meant to match.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(250, 16, input_length=20))
model.add(tf.keras.layers.LSTM(16))
model.add(tf.keras.layers.Dense(8, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# define optimizer and loss and metrics (binary classification setup)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# use callback to early stopping

class myCallback(tf.keras.callbacks.Callback):
  """Stop training as soon as the training accuracy exceeds 98%."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's 'accuracy' metric and request a stop when it is > 0.98.

    `logs` defaults to None instead of a shared mutable dict, and a missing
    'accuracy' entry is ignored rather than raising a TypeError on the
    comparison.
    """
    accuracy = (logs or {}).get('accuracy')
    if accuracy is not None and accuracy > 0.98:
      print("\nAkurasi telah mencapai > 98%!")
      self.model.stop_training = True

callbacks = myCallback()

## training

In [None]:
# training: up to 30 epochs, with the custom callback able to stop earlier.
# The padded test split is passed as validation data, so the 'val_*' metrics in
# `history` are computed on the test set.
history = model.fit(
    x=train_padded,
    y=y_train,
    validation_data=(test_padded, y_test),
    epochs=30,
    callbacks=[callbacks],
)

## evaluate model

In [None]:
# Loss of Training and Testing
# One curve per split, plotted against the epoch index.

fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.plot(history.history['val_loss'])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train Loss', "Test Loss"], loc='upper right')
plt.show()

In [None]:
# accuracy training and testing
# One curve per split, plotted against the epoch index.

fig, ax = plt.subplots()
ax.plot(history.history['accuracy'])
ax.plot(history.history['val_accuracy'])
ax.set_title('Model Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train Acc', "Test Acc"], loc='lower right')
plt.show()

## predict on the test set

In [None]:
# True labels as a single-column array. np.asarray accepts both the original
# pandas Series and an already-converted ndarray, so re-running this cell does
# not fail the way `.to_numpy()` would on an ndarray.
y_test = np.asarray(y_test).reshape(-1, 1)
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 predictions
y_test_pred = np.where(model.predict(test_padded)>0.5, 1, 0)

In [None]:
# Fraction of test messages whose thresholded prediction equals the true label
accuracy_score(y_true=y_test, y_pred=y_test_pred)

# save model

In [None]:
# save tokenizer: pickle the fitted Tokenizer to the app's models directory
with open("../app/models/tokenizer.pkl", mode="wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# save model: write the trained network to a .keras file in the same directory
model.save("../app/models/sms_spam_prediction.keras")