In [70]:
# import libraries
try:
  # %tensorflow_version only exists in Colab.
  !pip install tf-nightly
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2.14.0-dev20230528


In [71]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

--2023-05-28 16:06:18--  https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 358233 (350K) [text/tab-separated-values]
Saving to: ‘train-data.tsv.1’


2023-05-28 16:06:18 (14.2 MB/s) - ‘train-data.tsv.1’ saved [358233/358233]

--2023-05-28 16:06:18--  https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 118774 (116K) [text/tab-separated-values]
Saving to: ‘valid-data.tsv.1’


2023-05-28 16:06:19 (2.50 MB/s) - ‘valid-data.tsv.1’ saved [118774/118774]



In [72]:
# Load train and test sets using pandas
train_df = pd.read_csv(train_file_path, sep='\t', header=None)
test_df = pd.read_csv(test_file_path, sep='\t', header=None)

train_df.head()

Unnamed: 0,0,1
0,ham,ahhhh...just woken up!had a bad dream about u ...
1,ham,you can never do nothing
2,ham,"now u sound like manky scouse boy steve,like! ..."
3,ham,mum say we wan to go then go... then she can s...
4,ham,never y lei... i v lazy... got wat? dat day ü ...


In [73]:
# Split train and test data into text (features) and labels
X_train, y_train = train_df[1].values, train_df[0].values
X_test, y_test = test_df[1].values, test_df[0].values

In [74]:
# Convert label strings to 0s and 1s
y_train, y_test = np.where(y_train == "ham", 0, 1), np.where(y_test == "ham", 0, 1)


In [75]:
# Tokenize the text data using the Tokenizer class from keras
tokenizer = keras.preprocessing.text.Tokenizer(num_words=10000, lower=True, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

In [76]:
# Convert text data to sequences of integer indices
X_train_seqs = tokenizer.texts_to_sequences(X_train)
X_test_seqs = tokenizer.texts_to_sequences(X_test)

In [77]:
# Pad sequences to a fixed length of 100
X_train_padded = keras.preprocessing.sequence.pad_sequences(X_train_seqs, maxlen=50, padding='post', truncating='post')
X_test_padded = keras.preprocessing.sequence.pad_sequences(X_test_seqs, maxlen=50, padding='post', truncating='post')

In [78]:
# Create the model
max_words = 10000
max_len = 50

i = tf.keras.layers.Input(shape=[max_len])
x = tf.keras.layers.Embedding(max_words, 50, input_length=max_len)(i)
x = tf.keras.layers.LSTM(64)(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
x = tf.keras.layers.Dropout(0.5)(x)
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(inputs=i, outputs=x)

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Model Summary
model.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 50)]              0         
                                                                 
 embedding_9 (Embedding)     (None, 50, 50)            500000    
                                                                 
 lstm_7 (LSTM)               (None, 64)                29440     
                                                                 
 dense_15 (Dense)            (None, 256)               16640     
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_16 (Dense)            (None, 1)                 257       
                                                                 
Total params: 546337 (2.08 MB)
Trainable params: 546337 (2.

In [84]:
# Train the model with early stopping
early_stop = keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=128, validation_data=(X_test_padded, y_test), callbacks=[early_stop])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [80]:
# Make predictions
y_pred_proba = model.predict(X_test_padded)
y_pred = np.where(y_pred_proba > 0.5, 1, 0)



In [81]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print("Test set accuracy:", accuracy)

Test set accuracy: 0.9906609058380127


In [82]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):

    # Preprocess the[pred_text (note: this assumes the tokenizer and padding have already been done on the full dataset)
    message_seq = tokenizer.texts_to_sequences([pred_text])
    message_padded = keras.preprocessing.sequence.pad_sequences(message_seq, maxlen=100, padding='post', truncating='post')
    # Make a prediction
    proba = model.predict(message_padded)[0][0]
    if proba > 0.6:
        label = "spam"
    else:
        label = "ham"
    return [proba, label]


pred_text = "how are you doing today?"

prediction = predict_message(pred_text)
print(prediction)

[0.002128429, 'ham']


In [83]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True

  for msg, ans in zip(test_messages, test_answers):
    prediction = predict_message(msg)
    if prediction[1] != ans:
      passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()


You haven't passed yet. Keep trying.
