In [1]:
# import libraries
try:
  # Uninstall tf-nightly and install a stable TensorFlow version
  #!pip uninstall -y tf-nightly tensorflow
  !pip install tensorflow
except Exception:
  pass
import tensorflow as tf
import pandas as pd
from tensorflow import keras
!pip install tensorflow-datasets
import tensorflow_datasets as tfds
import numpy as np
import matplotlib.pyplot as plt

print(tf.__version__)

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [3]:
from tensorflow.keras import layers

In [4]:
import pandas as pd

# Read both tab-separated files without a header row, naming the two columns
# 'label' and 'message'.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t', header=None, names=['label', 'message'])
    for tsv_path in (train_file_path, test_file_path)
)
print(train_df.head())

In [5]:
# Raw message text of each split, taken from the 'message' column
train_messages = train_df['message'].values
test_messages = test_df['message'].values

# Integer-encode the string labels: 'ham' -> 0, 'spam' -> 1.
# A label missing from the mapping raises KeyError.
label_mapping = dict(ham=0, spam=1)
train_labels = np.array(list(map(label_mapping.__getitem__, train_df['label'])))
test_labels = np.array(list(map(label_mapping.__getitem__, test_df['label'])))

In [6]:
# Text standardization used by the TextVectorization layer.
# re and string are imported in this cell so the function does not depend on
# imports that are only executed by a later cell.
import re
import string


def custom_standardization(input_data):
  """Lowercase the text, replace '<br />' tags with a space and strip punctuation.

  Args:
    input_data: string tensor (or tensor-like value accepted by tf.strings ops)
      holding the raw text.

  Returns:
    A string tensor with the same entries after lowercasing, replacing every
    '<br />' with a single space and deleting every character listed in
    string.punctuation.
  """
  lowercase = tf.strings.lower(input_data)
  stripped_html = tf.strings.regex_replace(lowercase, '<br />', ' ')
  # re.escape keeps characters such as ']' and '\' literal inside the character class.
  return tf.strings.regex_replace(stripped_html,
                                  '[%s]' % re.escape(string.punctuation),
                                  '')

In [7]:
# Vectorization settings
max_features = 10_000   # passed to the layer as max_tokens
sequence_length = 120   # passed to the layer as output_sequence_length

# Turns raw strings into integer token ids, cleaning the text with
# custom_standardization first.
vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_sequence_length=sequence_length,
    output_mode='int',
)



In [8]:
# re and string are used by custom_standardization, which is passed to
# vectorize_layer as its `standardize` callable.
import re
import string
# Fit the vectorization layer on the training messages only; the validation
# messages are not passed to adapt().
vectorize_layer.adapt(train_messages)

In [9]:
# Run both splits through the adapted layer to get integer token sequences
train_sequences, test_sequences = map(vectorize_layer, (train_messages, test_messages))

# Show the first training message next to the first 20 ids of its encoding
print(
    "\nVectorization example:",
    f"Original: {train_messages[0]}",
    f"Vectorized: {train_sequences[0].numpy()[:20]}...",
    sep="\n",
)

In [10]:
embedding_dim = 16

# Embedding -> dropout -> average over the sequence axis -> dropout -> one
# sigmoid unit, so the model outputs a single value in (0, 1) per message.
model = tf.keras.Sequential([
  layers.Embedding(input_dim=max_features + 1, output_dim=embedding_dim),
  layers.Dropout(rate=0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(rate=0.2),
  layers.Dense(units=1, activation='sigmoid'),
])

model.summary()

In [11]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# as the reported metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [30]:
# Banner: blank line, rule, title, rule
print("\n" + "=" * 50, "Training the model...", "=" * 50, sep="\n")

# Train on the vectorized training split and report metrics on the
# validation split after every epoch.
history = model.fit(
    x=train_sequences,
    y=train_labels,
    validation_data=(test_sequences, test_labels),
    epochs=60,
    verbose=1,
)

In [31]:
# Sanity check straight after training: score one hand-written message
test_msg = "you have won £1000 cash! call to claim your prize."
vectorized = vectorize_layer([test_msg])
prediction = model.predict(vectorized, verbose=0)[0, 0]
print(
    f"Direct prediction after training: {prediction}",
    f"Label: {'spam' if prediction > 0.5 else 'ham'}",
    sep="\n",
)

# Inspect the vocabulary held by the vectorization layer
print(
    f"Vocabulary size: {len(vectorize_layer.get_vocabulary())}",
    f"First 20 words: {vectorize_layer.get_vocabulary()[:20]}",
    sep="\n",
)

In [32]:
# Class balance of the training labels (0 = ham, 1 = spam)
print(
    "Training set:",
    f"Ham: {np.count_nonzero(train_labels == 0)}",
    f"Spam: {np.count_nonzero(train_labels == 1)}",
    f"Spam ratio: {np.count_nonzero(train_labels == 1) / len(train_labels):.2%}",
    sep="\n",
)

print("\n" + "=" * 50, "Evaluating on test data...", "=" * 50, sep="\n")

# evaluate() returns the loss followed by the compiled metrics (accuracy here)
loss, accuracy = model.evaluate(test_sequences, test_labels)
print(f"\nTest Accuracy: {accuracy:.4f}", f"Test Loss: {loss:.4f}", sep="\n")

In [33]:
#Save model
# Only `model` is written to disk. vectorize_layer is a separate object that is
# not one of the model's layers, so raw text still has to be vectorized before
# it is passed to a model loaded from this file.
model.save('spam_model.keras')
print("\nModel saved as 'spam_model.keras'")

In [34]:
# function to predict messages based on model
# (should return list containing prediction and label, ex. [0.008318834938108921, 'ham'])
def predict_message(pred_text):
  """Score a single message with the trained model.

  Args:
    pred_text: the message text to classify.

  Returns:
    A two-element list [score, label]: score is the model's sigmoid output as
    a Python float, and label is 'spam' when the score is above 0.5, otherwise
    'ham'.
  """
  # The layer and the model both work on batches, so wrap the text in a
  # one-element list and read back the single output value.
  score = model.predict(vectorize_layer([pred_text]), verbose=0)[0][0]

  if score > 0.5:
    label = 'spam'
  else:
    label = 'ham'

  return [float(score), label]

# Try the function on one example message
pred_text = "Hi ,how are you today "

prediction = predict_message(pred_text)
print(prediction)

In [35]:
def test_predictions():
  """Run predict_message on seven fixed messages and report whether all labels match.

  Prints the predicted label, score and expected label for every message,
  followed by a pass/fail line for the whole set.
  """
  # (message, expected label) pairs
  cases = (
      ("how are you doing today", "ham"),
      ("sale today! to stop texts call 98912460324", "spam"),
      ("i dont want to go. can we try it a different day? available sat", "ham"),
      ("our new mobile video service is live. just install on your phone to start watching.", "spam"),
      ("you have won £1000 cash! call to claim your prize.", "spam"),
      ("i'll bring it tomorrow. don't forget the milk.", "ham"),
      ("wow, is your arm alright. that happened to me one time too", "ham"),
  )

  all_match = True
  for msg, ans in cases:
    score, label = predict_message(msg)
    is_match = label == ans
    print(f"Message: {msg[:40]}...")
    print(f"  Predicted: {label} (prob: {score:.4f})")
    print(f"  Expected: {ans}")
    print(f"  Match: {is_match}\n")
    all_match = all_match and is_match

  if not all_match:
    print("You haven't passed yet. Keep trying.")
  else:
    print("You passed the challenge. Great job!")

test_predictions()