# Spam Detection with Keras
In this example, we will write a Keras model to classify messages as Spam or Ham.

In [4]:
import sys, os, random, pathlib, io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

%matplotlib inline

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED);

print(f"Using Tensorflow {tf.__version__}")

Using Tensorflow 2.11.0


## Download the dataset
We will use the `SMS Spam Collection` dataset, available at UCI archives at the following URL `https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip`. We will use `tf.keras.utils.get_file(...)` to download the dataset to the `./data` folder.

In [5]:
DATASET_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
DEST_FOLDER = pathlib.Path(os.getcwd()) / "data" / "spam"

In [None]:
DEST_FOLDER.mkdir(exist_ok=True)
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip", origin=DATASET_URL, extract=True)
print(f"Dataset downloaded to {path_to_zip}")

!unzip $path_to_zip -d $DEST_FOLDER

Dataset downloaded to /home/mjbhobe/.keras/datasets/smsspamcollection.zip
Archive:  /home/mjbhobe/.keras/datasets/smsspamcollection.zip
replace /home/mjbhobe/code/git-projects/dl-keras/data/spam/SMSSpamCollection? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [None]:
# check if we can read the data file
lines = io.open(str(DEST_FOLDER / "SMSSpamCollection")).read().strip().split("\n")
lines[:5]

['ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham\tOk lar... Joking wif u oni...',
 "spam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham\tU dun say so early hor... U c already then say...',
 "ham\tNah I don't think he goes to usf, he lives around here though"]

Notice that the label &amp; message itself are separated by a `\t` character on each line. 
Now let us read in all the lines into a `list`.

In [None]:
spam_dataset = []

for line in lines:
    label, text = line.split("\t")
    if label.strip() == "spam":
        spam_dataset.append((1, text.strip()))
    else:
        spam_dataset.append((0, text.strip()))
print(spam_dataset[:5])

[(0, 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'), (0, 'Ok lar... Joking wif u oni...'), (1, "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"), (0, 'U dun say so early hor... U c already then say...'), (0, "Nah I don't think he goes to usf, he lives around here though")]


In [None]:
df = pd.DataFrame(spam_dataset, columns=["Spam", "Message"])
df.head()

Unnamed: 0,Spam,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
import re


def message_length(x):
    return len(x)


def num_capitals(x):
    """count # of capital letters in x (assuming x is an English sentence)"""
    _, count = re.subn(r"[A-Z]", "", x)
    return count


def num_punctuation(x):
    """count # of punctuations in x"""
    _, count = re.subn(r"\W", "", x)
    return count

In [None]:
# apply these functions to the Dataframe
df["Length"] = df["Message"].apply(message_length)
df["Capitals"] = df["Message"].apply(num_capitals)
df["Punctuations"] = df["Message"].apply(num_punctuation)

In [None]:
df.describe()  # display stats of all numeric cols

Unnamed: 0,Spam,Length,Capitals,Punctuations
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,80.443488,5.621636,18.942591
std,0.340699,59.841746,11.683233,14.825994
min,0.0,2.0,0.0,0.0
25%,0.0,36.0,1.0,8.0
50%,0.0,61.0,2.0,15.0
75%,0.0,122.0,4.0,27.0
max,1.0,910.0,129.0,253.0


## Build a model to classify sentiment based on the above info
Let's build a model to classify sentiment based on the `length`, `num capitals` and `num punctuations` fields that we just added.

In [None]:
df2 = df[["Spam", "Length", "Capitals", "Punctuations"]]

# split into train/test sets
train = df2.sample(frac=0.80, random_state=SEED)
test = df2.drop(train.index)

X_train = train[["Length", "Capitals", "Punctuations"]]
y_train = train[["Spam"]]

X_test = test[["Length", "Capitals", "Punctuations"]]
y_test = test[["Spam"]]

print(
    f"X_train.shape: {X_train.shape} - y_train.shape: {y_train.shape} - "
    f"X_test.shape: {X_test.shape} - y_test.shape: {y_test.shape}"
)

X_train.shape: (4459, 3) - y_train.shape: (4459, 1) - X_test.shape: (1115, 3) - y_test.shape: (1115, 1)


In [None]:
def make_model(input_dim=3, num_units=12):
    model = tf.keras.models.Sequential(
        [
            tf.keras.layers.Dense(num_units, input_dim=input_dim, activation="relu"),
            tf.keras.layers.Dense(1, activation="sigmoid"),
        ]
    )

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model

In [None]:
model = make_model()
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                48        
                                                                 
 dense_1 (Dense)             (None, 1)                 13        
                                                                 
Total params: 61
Trainable params: 61
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
hist = model.fit(X_train, y_train, epochs=15, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
np.bincount(y_test.values.ravel()), len(X_test)

(array([960, 155]), 1115)

In [None]:
model.evaluate(X_test, y_test)



[0.20948441326618195, 0.9345291256904602]

In [None]:
y_pred = (model.predict(X_test).ravel() >= 0.5).astype(np.int32)
y_pred[:10]



array([0, 0, 0, 1, 0, 1, 1, 0, 0, 0], dtype=int32)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)
print(cm)