In [1]:
import os
import string

import joblib
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources used by the preprocessing helpers.
# NOTE(review): newer NLTK releases may additionally need the "punkt_tab"
# resource for word_tokenize — confirm against the installed version.
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hongong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hongong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Path to the labelled SMS dataset, relative to the notebook's working directory.
DATASET_PATH = "data/2cls_spam_text_cls.csv"
# Load the CSV into a DataFrame.
df = pd.read_csv(DATASET_PATH)
# Print (rows, columns), then display the first rows as a quick sanity check.
print(df.shape)
df.head()

(5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Pull the raw message texts and their category labels out of the frame as plain Python lists.
messages = df["Message"].values.tolist()
labels = df["Category"].values.tolist()

# Data preprocessing

In [7]:
def lowercase(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered


def punctuation_removal(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a character to None tells str.translate to drop it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)


def tokenize(text):
    """Split ``text`` into a list of tokens using ``nltk.word_tokenize``."""
    tokens = nltk.word_tokenize(text)
    return tokens


def remove_stopwords(tokens, stop_words=None):
    """Drop stop words from a token sequence.

    Args:
        tokens: Iterable of tokens to filter.
        stop_words: Optional collection of tokens to drop. When omitted, NLTK's
            English stop-word list is loaded. Callers filtering many documents
            can pass a prebuilt set to avoid reloading the list on every call.

    Returns:
        list: The tokens that are not stop words, in their original order.
        Matching is exact (case-sensitive).
    """
    if stop_words is None:
        stop_words = nltk.corpus.stopwords.words("english")
    # Set membership is O(1); testing against a list rescans it for every token.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    return [token for token in tokens if token not in stop_words]


def stemming(tokens):
    """Return the Porter stem of every token, preserving order."""
    porter = nltk.PorterStemmer()
    stems = []
    for word in tokens:
        stems.append(porter.stem(word))

    return stems


def preprocess_text(text):
    """Turn a raw message into a list of normalized tokens.

    Steps, in order: lower-case, strip punctuation, tokenize, drop stop words,
    stem.
    """
    cleaned = punctuation_removal(lowercase(text))
    raw_tokens = tokenize(cleaned)

    return stemming(remove_stopwords(raw_tokens))

In [9]:
# Rebuild from the raw column so this cell can be re-run safely: feeding the
# already-tokenized lists back into preprocess_text would fail on text.lower().
messages = [preprocess_text(message) for message in df["Message"].values.tolist()]
print(messages[:5])

[['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat'], ['ok', 'lar', 'joke', 'wif', 'u', 'oni'], ['free', 'entri', '2', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', '21st', 'may', '2005', 'text', 'fa', '87121', 'receiv', 'entri', 'questionstd', 'txt', 'ratetc', 'appli', '08452810075over18'], ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'], ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though']]


In [9]:
def create_dictionary(messages):
    """Build the vocabulary: every distinct token, in order of first appearance.

    Args:
        messages: Iterable of token sequences.

    Returns:
        list: Unique tokens ordered by first occurrence. Tokens must be
        hashable.
    """
    # dict keys keep insertion order and give O(1) duplicate checks, replacing
    # a linear `token not in list` scan for every token.
    unique_tokens = dict.fromkeys(
        token for tokens in messages for token in tokens
    )

    return list(unique_tokens)


def create_features(tokens, dictionary):
    """Build a bag-of-words count vector for one tokenized message.

    Args:
        tokens: Iterable of tokens from a single message.
        dictionary: Vocabulary list; position ``i`` of the result counts
            occurrences of ``dictionary[i]``.

    Returns:
        numpy.ndarray: 1-D float array of length ``len(dictionary)``. Tokens
        absent from the vocabulary are ignored.
    """
    # One pass to map token -> position, instead of two linear list scans
    # (`in` and `.index`) per token. setdefault keeps the first position for a
    # repeated vocabulary entry, matching list.index.
    token_index = {}
    for position, word in enumerate(dictionary):
        token_index.setdefault(word, position)

    features = np.zeros(len(dictionary))

    for token in tokens:
        position = token_index.get(token)
        if position is not None:
            features[position] += 1

    return features

In [18]:
# Build the vocabulary from the preprocessed messages.
dictionary = create_dictionary(messages)
# One count vector per message, stacked into a 2-D array (messages x vocabulary).
X = np.array([create_features(tokens, dictionary) for tokens in messages])
# Show the matrix shape and a preview of the first rows.
print(X.shape)
print(X[:5])

(5572, 8166)
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [19]:
# Learn the class -> integer mapping from the string labels, then apply it.
le = LabelEncoder().fit(labels)
y = le.transform(labels)
print(f"Classes: {le.classes_}")
print(f"Encoded labels: {y}")

Classes: ['ham' 'spam']
Encoded labels: [0 0 1 ... 0 0 0]


# Train, validation, and test split

In [20]:
# Fraction of the full dataset held out for validation.
VAL_SIZE = 0.2
# Fraction of the remaining (non-validation) data held out for testing:
# 0.125 * 0.8 = 0.1 of the full dataset, leaving 0.7 for training.
TEST_SIZE = 0.125
# Fixed seed so both splits are reproducible.
SEED = 0

# First split: carve the validation set out of the full data.
# NOTE(review): the splits are not stratified; consider stratify=y if the
# class balance across splits matters.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, shuffle=True, random_state=SEED
)
# Second split: carve the test set out of what remains for training.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, shuffle=True, random_state=SEED
)

In [21]:
%%time
# Fit a Gaussian Naive Bayes classifier on the training split; the %%time
# cell magic above reports how long the whole cell takes.
model = GaussianNB()
print("Start training...")
# fit() returns the estimator itself, so rebinding `model` keeps the same object.
model = model.fit(X_train, y_train)
print("Training completed!")

Start training...
Training completed!
CPU times: user 118 ms, sys: 92.2 ms, total: 211 ms
Wall time: 238 ms


In [22]:
# Validation split: predict, then score.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Test split: predict, then score.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Val accuracy: {val_accuracy}")
print(f"Test accuracy: {test_accuracy}")

Val accuracy: 0.8816143497757848
Test accuracy: 0.8602150537634409


In [5]:
def predict(text, model, dictionary, label_encoder=None):
    """Classify a single raw message.

    Args:
        text: Raw message string.
        model: Fitted classifier exposing ``predict``.
        dictionary: Vocabulary list used to build the feature vector.
        label_encoder: Optional fitted encoder used to map the numeric
            prediction back to its class name. Defaults to the notebook-level
            ``le``.

    Returns:
        The predicted class label, as decoded by the label encoder.
    """
    encoder = le if label_encoder is None else label_encoder

    # Vectorize the preprocessed tokens, not the raw string: iterating the raw
    # string would count individual characters against the vocabulary.
    processed_text = preprocess_text(text)
    features = create_features(processed_text, dictionary)
    features = np.asarray(features).reshape(1, -1)  # single sample -> (1, n_features)
    prediction = model.predict(features)
    prediction_cls = encoder.inverse_transform(prediction)[0]

    return prediction_cls

In [24]:
# Smoke-test the trained classifier on a single hand-written message.
test_input = "I am actually thinking a way of doing something useful"
prediction_cls = predict(test_input, model, dictionary)
print(f"Prediction: {prediction_cls}")

Prediction: ham


In [3]:
# Output locations for the persisted artifacts, relative to the working directory.
MODEL_PATH = "models/spam_classifier.joblib"  # fitted classifier
DICTIONARY_PATH = "models/dictionary.joblib"  # vocabulary list
LE_PATH = "models/label_encoder.joblib"  # fitted label encoder

In [26]:
# Make sure the target directories exist first; joblib.dump does not create
# missing parent directories, so a fresh checkout would otherwise fail here.
for artifact_path in (MODEL_PATH, DICTIONARY_PATH, LE_PATH):
    os.makedirs(os.path.dirname(artifact_path) or ".", exist_ok=True)

# save the model
joblib.dump(model, MODEL_PATH)

# save the dictionary
joblib.dump(dictionary, DICTIONARY_PATH)

# save the label encoder
joblib.dump(le, LE_PATH)

['models/label_encoder.joblib']

In [10]:
# Reload the three persisted artifacts (classifier, vocabulary, label encoder).
# joblib.load unpickles its input, so only load files from a trusted source.
model = joblib.load(MODEL_PATH)
dictionary = joblib.load(DICTIONARY_PATH)
le = joblib.load(LE_PATH)

# Repeat the earlier smoke test to confirm the reloaded artifacts still work together.
test_input = "I am actually thinking a way of doing something useful"
prediction_cls = predict(test_input, model, dictionary)
print(f"Prediction: {prediction_cls}")

Prediction: ham
