In [1]:
# import libraries
import tensorflow as tf
import pandas as pd
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

2.15.0


In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2024-05-23 21:41:26--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv’


2024-05-23 21:41:27 (10.1 MB/s) - ‘train-data.tsv’ saved [358233/358233]

--2024-05-23 21:41:27--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv’


2024-05-23 21:41:27 (5.18 MB/s) - ‘valid-data.tsv’ saved [118774/118774]



# Reading data

In [3]:
# Reading the dataset
df_train = pd.read_csv("train-data.tsv", sep = '\t', header=None, names = ['label', 'text'])
df_valid = pd.read_csv("valid-data.tsv", sep = '\t', header=None, names = ['label', 'text'])

print(df_train.head())

# Transforming 'ham' to 0 and 'spam' to 1
df_train['label'] = pd.Categorical(df_train['label']).codes
df_valid['label'] = pd.Categorical(df_valid['label']).codes

print(df_train.head())

  label                                               text
0   ham  ahhhh...just woken up!had a bad dream about u ...
1   ham                           you can never do nothing
2   ham  now u sound like manky scouse boy steve,like! ...
3   ham  mum say we wan to go then go... then she can s...
4   ham  never y lei... i v lazy... got wat? dat day ü ...
   label                                               text
0      0  ahhhh...just woken up!had a bad dream about u ...
1      0                           you can never do nothing
2      0  now u sound like manky scouse boy steve,like! ...
3      0  mum say we wan to go then go... then she can s...
4      0  never y lei... i v lazy... got wat? dat day ü ...


# Pre processing the data

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [5]:
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)

    # Remove non-alphanumeric tokens and stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    cleaned_text = ' '.join(tokens)
    return cleaned_text

df_train['text'] = df_train['text'].apply(clean_text)
df_valid['text'] = df_valid['text'].apply(clean_text)

print(df_train.head())

   label                                               text
0      0  ahhhh woken bad dream u tho dont like u right ...
1      0                                      never nothing
2      0  u sound like manky scouse boy steve like trave...
3      0  mum say wan go go shun bian watch da glass exh...
4      0  never lei v lazy got wat dat day ü send da url...


In [6]:
# Tokenize only using training data
max_words = 1000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_train['text'])

# Transform the text to sequences
sequences_train = tokenizer.texts_to_sequences(df_train['text'])
sequences_val = tokenizer.texts_to_sequences(df_valid['text'])

# Find the maximum length of sequences in the training data for padding
max_length = 500

# Pad sequences to ensure they are of the same length
X_train = pad_sequences(sequences_train, maxlen=max_length)
X_val = pad_sequences(sequences_val, maxlen=max_length)

# Convert labels to numpy arrays
y_train = np.array(df_train['label'])
y_val = np.array(df_valid['label'])

# Creating the model

In [19]:
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

input_layer= x = Input(max_length)
x = Embedding(max_words, 125)(x)
x = LSTM(128, dropout = 0.5)(x)

x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x)

x = Dense(2, activation='softmax')(x)

model = tf.keras.models.Model(inputs=input_layer, outputs=x)
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 500, 125)          125000    
                                                                 
 lstm_2 (LSTM)               (None, 128)               130048    
                                                                 
 dense_4 (Dense)             (None, 64)                8256      
                                                                 
 batch_normalization_2 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dense_5 (Dense)             (None, 2)                 130       
                                                           

In [22]:
from tensorflow.keras.optimizers import SGD, AdamW, Adam

early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)
model.compile(optimizer = Adam(), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(
    x = X_train,
    y = y_train,
    epochs = 10,
    batch_size = 128,
    validation_data = (X_val, y_val),
    shuffle = True,
    callbacks = [early_stopping],
    class_weight = {0: 1.5, 1: 1.0}
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7eb2a1cba260>

# Prediction

In [43]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])

ham_spam = {0: "ham", 1: "spam"}

def predict_message(pred_text):
    cleaned_text = clean_text(pred_text)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    x = pad_sequences(sequence, maxlen=max_length)

    prediction = model.predict(x)

    # Selecting from which class the result belong according to the index with the higher probability
    result = np.argmax(prediction[0])

    confidence = prediction[0][result]

    return ([confidence, ham_spam[result]])

pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[0.97976935, 'ham']


In [44]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:

        print(f"pred: {prediction[1]} - answer: {ans}\n")

        passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You passed the challenge. Great job!
