# **Importing relevant libraries**


In [None]:
import csv

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer

## **Load Data**


In [None]:
# 1. Load and preprocess data
# NOTE(review): assumes the file is the raw tab-separated dump (one
# "<label>\t<message>" record per line, no CSV quoting) -- confirm.
# With pandas' default quote handling, a message containing a double quote can
# open a "quoted field" that swallows the following lines and silently merges
# rows. QUOTE_NONE treats quote characters as ordinary text, so every physical
# line is exactly one record.
df = pd.read_csv(
    "./spam/SMSSpamCollection",
    sep="\t",
    names=["type", "message"],
    quoting=csv.QUOTE_NONE,
)
df.head()


In [None]:
# Derive the boolean target column ("spam" is True for spam messages) and drop
# the raw text label. The step is guarded and builds a new frame instead of
# mutating in place, so re-running this cell is a no-op rather than a KeyError
# on the already-removed "type" column.
if "type" in df.columns:
    df = df.assign(spam=df["type"] == "spam").drop(columns="type")

In [None]:
# Sample 80% of the rows for training (fixed seed for a repeatable split);
# every row whose index label was not sampled becomes validation data.
df_train = df.sample(frac=0.8, random_state=0)
in_train = df.index.isin(df_train.index)
df_val = df.loc[~in_train]


## **Vectorize text data**


In [None]:
# 2. Vectorize text data
# Token-count features with a vocabulary of at most 5000 terms. The vocabulary
# is fitted on the training messages only; validation messages are encoded
# with that same fitted vocabulary.
vectorizer = CountVectorizer(max_features=5000)
train_messages = df_train["message"]
val_messages = df_val["message"]
messages_train_sparse = vectorizer.fit_transform(train_messages)
messages_val_sparse = vectorizer.transform(val_messages)

## **Convert Data to TensorFlow tensors**


In [None]:
# 3. Convert data to TensorFlow tensors
def _dense_features(sparse_counts):
    """Densify the vectorizer's sparse output into a float32 tensor.

    Uses .toarray(), which yields a plain ndarray, rather than .todense(),
    which yields the legacy np.matrix type.
    """
    return tf.convert_to_tensor(sparse_counts.toarray(), dtype=tf.float32)


def _label_column(labels_numpy):
    """Turn a 1-D label array into a column tensor of shape (n, 1)."""
    return tf.reshape(tf.convert_to_tensor(labels_numpy), (-1, 1))


X_train = _dense_features(messages_train_sparse)
# Boolean labels become 0.0 / 1.0 floats, as expected by the cross-entropy loss.
y_train_numpy = df_train["spam"].values.astype(np.float32)
y_train = _label_column(y_train_numpy)

X_val = _dense_features(messages_val_sparse)
y_val_numpy = df_val["spam"].values.astype(np.float32)
y_val = _label_column(y_val_numpy)


## **Define the model**


In [None]:
# 4. Define the model, loss function, and optimizer using TensorFlow Keras
# Seed TensorFlow so the Dense layer's random weight initialisation (and hence
# the training run) is repeatable, matching the seeded train/validation split.
tf.random.set_seed(0)

# Take the input width from the training matrix instead of hard-coding 5000:
# the vectorizer's max_features is only an upper bound on the vocabulary size,
# so the real number of feature columns can be smaller.
n_features = X_train.shape[1]

# A single Dense unit with no activation: the model outputs raw logits.
# An explicit Input object replaces the deprecated `input_shape` layer argument.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(1)
])

# from_logits=True makes the loss apply the sigmoid itself.
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)

optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

## **Training Loop**


In [None]:
# 5. Training loop
# Full-batch gradient descent: each iteration runs the whole training set
# through the model and applies one optimizer update.
epochs = 15000
for i in range(epochs):
    # Record the forward pass so the loss can be differentiated afterwards.
    with tf.GradientTape() as grad_tape:
        logits = model(X_train, training=True)
        loss = loss_fn(y_train, logits)

    weights = model.trainable_variables
    grads = grad_tape.gradient(loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    # Progress report every 1000 iterations (loss is the value before this update).
    if not i % 1000:
        print(f"Epoch {i}, Loss: {loss.numpy()}")



## **Evaluate the model**


In [None]:
# 6. Evaluate the model
# Map the training logits through a sigmoid and show the resulting values
# together with their extremes.
train_logits = model(X_train, training=False)
y_pred_sigmoid_train = tf.nn.sigmoid(train_logits)

lowest = tf.reduce_min(y_pred_sigmoid_train).numpy()
highest = tf.reduce_max(y_pred_sigmoid_train).numpy()

print("\nSigmoid outputs on training data:")
print(y_pred_sigmoid_train)
print(f"Min sigmoid output: {lowest}")
print(f"Max sigmoid output: {highest}")

## **Define evaluation function**


In [None]:
# 7. Define evaluation function
def _conditional_rate(hits, condition):
    """Fraction of True entries of `hits` among the positions where `condition` is True.

    Both arguments are boolean tensors of the same shape. Returns float('nan')
    when no position satisfies `condition`, so an undefined metric is reported
    as NaN; otherwise returns the mean as a NumPy float32 scalar.
    """
    selected = tf.boolean_mask(hits, condition)
    if int(tf.size(selected).numpy()) == 0:
        return float('nan')
    return tf.reduce_mean(tf.cast(selected, tf.float32)).numpy()


def evaluate_model_tf(X, y, threshold=0.25):
    """Print accuracy, sensitivity, specificity and precision of the global `model`.

    Args:
        X: feature tensor passed to `model`.
        y: true labels; any non-zero value counts as the positive class.
        threshold: a sample is predicted positive when its sigmoid
            probability is strictly greater than this value.

    Returns:
        None. The four metrics are printed; a metric whose denominator is
        empty (e.g. no actual positives) is printed as nan.
    """
    y_prob = tf.nn.sigmoid(model(X, training=False))

    # Flatten both sides to 1-D before comparing. Otherwise a rank-1 `y`
    # against the model's (n, 1) output would broadcast to an (n, n)
    # comparison and yield meaningless metrics without raising an error.
    y_pred_bool = tf.reshape(y_prob > threshold, [-1])
    y_true_bool = tf.reshape(tf.cast(y, tf.bool), [-1])

    accuracy = tf.reduce_mean(tf.cast(tf.equal(y_pred_bool, y_true_bool), tf.float32))
    print(f"accuracy: {accuracy.numpy()}")

    # Sensitivity: share of actual positives that were predicted positive.
    sensitivity = _conditional_rate(y_pred_bool, y_true_bool)
    print(f"sensitivity: {sensitivity}")

    # Specificity: share of actual negatives that were predicted negative.
    specificity = _conditional_rate(tf.logical_not(y_pred_bool), tf.logical_not(y_true_bool))
    print(f"specificity: {specificity}")

    # Precision: share of predicted positives that are actually positive.
    precision = _conditional_rate(y_true_bool, y_pred_bool)
    print(f"precision: {precision}")

# Report the same metrics for both splits, training first.
for split_name, split_X, split_y in (
    ("training", X_train, y_train),
    ("validation", X_val, y_val),
):
    print(f"\nEvaluating on the {split_name} data")
    evaluate_model_tf(split_X, split_y)



## **Prediction**


In [None]:
# 8. Predict on custom messages
custom_messages_text = [
    "Winner! Great deal, call us to get this product for free",
    "Tomorrow is my birthday, do you come to the party?"
]
# Single source for the decision cut-off (same value as evaluate_model_tf's
# default), so the printed label and the comparison cannot drift apart.
spam_threshold = 0.25

# Encode with the vocabulary fitted on the training messages.
custom_messages_sparse = vectorizer.transform(custom_messages_text)
# .toarray() yields a plain ndarray; .todense() would yield the legacy np.matrix type.
X_custom = tf.convert_to_tensor(custom_messages_sparse.toarray(), dtype=tf.float32)

custom_preds_logits = model(X_custom, training=False)
custom_preds_sigmoid = tf.nn.sigmoid(custom_preds_logits)

print("\nPredictions for custom messages:")
# Extract the single probability column once instead of indexing the tensor
# twice per message.
spam_probs = custom_preds_sigmoid.numpy()[:, 0]
for text, spam_prob in zip(custom_messages_text, spam_probs):
    print(f"Message: \"{text}\"")
    print(f"Spam probability (TensorFlow): {spam_prob:.4f}")
    print(f"Predicted as Spam (threshold {spam_threshold}): {spam_prob > spam_threshold}")
    print("-" * 20)