# Download the dataset and install dependencies

In [1]:
# Download the spam SMS dataset archive from Kaggle into the current working directory.
!kaggle datasets download -d team-ai/spam-text-message-classification

Dataset URL: https://www.kaggle.com/datasets/team-ai/spam-text-message-classification
License(s): CC0-1.0
Downloading spam-text-message-classification.zip to /home/VLAI/minhth/ViCLEVR-X/notebooks
100%|█████████████████████████████████████████| 208k/208k [00:00<00:00, 368kB/s]
100%|█████████████████████████████████████████| 208k/208k [00:00<00:00, 367kB/s]


In [2]:
# Extract the downloaded archive. `-o` overwrites already-extracted files without
# prompting, so re-running this cell (e.g. Restart & Run All) does not stop at
# unzip's interactive "replace?" question.
!unzip -o ./spam-text-message-classification.zip

Archive:  ./spam-text-message-classification.zip
  inflating: SPAM text message 20170820 - Data.csv  


In [3]:
# List the working directory to confirm the CSV was extracted.
# Explicit `%ls` instead of a bare `ls`: the bare form relies on automagic,
# which can be switched off and is only applied to single-line cells.
%ls

 random_word_baseline.ipynb               Text_Classification.ipynb
'SPAM text message 20170820 - Data.csv'   [0m[01;32mvisualize.ipynb[0m*
 [01;31mspam-text-message-classification.zip[0m     vqa_baseline.ipynb
 test_ggtrans.ipynb                       vqax_baseline.ipynb


In [6]:
# `%pip` installs into the environment of the running kernel (a plain `!pip`
# may resolve to a different interpreter). The version is pinned to the one
# this notebook was run with so the results stay reproducible.
%pip install scikit-learn==1.5.1

Collecting scikit-learn
  Downloading scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m978.0 kB/s[0m eta [36m0:00:00[0mMB/s[0m eta [36m0:00:01[0m
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[?25hDownloading scipy-1.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

# Load the data and import libraries

In [10]:
import os
import pandas as pd

# Path of the CSV extracted from the Kaggle archive, relative to the working directory.
data_path = './SPAM text message 20170820 - Data.csv'
# Read the whole file into a DataFrame; the rendered output shows a `Category`
# (ham/spam) column and a `Message` column.
data = pd.read_csv(data_path)
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [11]:
import string
import nltk
# Fetch the NLTK resources used by the preprocessing functions: the stop-word
# lists and the Punkt tokenizer data behind nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK 3.9 and later load Punkt from the 'punkt_tab' package rather than the
# pickled 'punkt' models; without it word_tokenize raises LookupError there.
nltk.download('punkt_tab')
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package stopwords to /home/minhth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/minhth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Pull the raw message texts and their ham/spam labels out of the frame as
# plain Python lists (one entry per row, same order as the CSV).
message_list = data['Message'].to_numpy().tolist()
label_list = data['Category'].to_numpy().tolist()

In [13]:
# Sanity check: the two lists should have the same length (one entry per row).
len(message_list), len(label_list)

(5572, 5572)

# Preprocessing

In [23]:
def lowercase(text: str):
    """Return ``text`` with every cased character converted to lowercase."""
    lowered = text.lower()
    return lowered

def punctuation_removal(text: str):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    Characters are removed, not replaced by spaces, so words that were joined
    only by punctuation merge (``"a.b"`` becomes ``"ab"``).
    """
    # Mapping a character to None tells str.translate to drop it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

def tokenize(text):
    """Split ``text`` into tokens with ``nltk.word_tokenize`` and return them."""
    word_tokens = nltk.word_tokenize(text)
    return word_tokens

def remove_stopwords(tokens):
    """Return ``tokens`` without NLTK's English stop words.

    Args:
        tokens: Iterable of (hashable) token strings.

    Returns:
        A new list with the remaining tokens in their original order;
        duplicates are kept.
    """
    # A set gives constant-time membership tests; the stop-word list returned
    # by NLTK would otherwise be scanned linearly for every single token.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    return [token for token in tokens if token not in stop_words]

def stemming(tokens):
    """Return the stem of every token, produced by ``nltk.PorterStemmer``.

    Order and length of the input are preserved.
    """
    porter = nltk.PorterStemmer()

    stems = []
    for word in tokens:
        stems.append(porter.stem(word))
    return stems

def preprocess_text(text):
    """Turn one raw message into its list of normalized tokens.

    Steps, in order: lowercase, strip punctuation, tokenize, drop stop words,
    stem.
    """
    # String-level clean-up first ...
    cleaned = punctuation_removal(lowercase(text))
    # ... then the token-level steps.
    return stemming(remove_stopwords(tokenize(cleaned)))

# Preprocess every raw message; `messages` holds one token list per message,
# in the same order as `message_list`.
messages = list(map(preprocess_text, message_list))

In [26]:
def create_dictionary(messages):
    """Build the vocabulary: every distinct token, in order of first appearance.

    Args:
        messages: Iterable of token lists (one list per message). Tokens must
            be hashable.

    Returns:
        A list of unique tokens ordered by first occurrence.
    """
    # dict.fromkeys de-duplicates in linear time while keeping insertion
    # order; checking membership in a growing list is quadratic in the
    # vocabulary size.
    return list(dict.fromkeys(token for tokens in messages for token in tokens))

In [27]:
# Vocabulary of all distinct tokens across the preprocessed messages.
dictionary = create_dictionary(messages)

In [28]:
def create_features(tokens, dictionary):
    """Build a bag-of-words count vector for one tokenized message.

    Args:
        tokens: Iterable of (hashable) tokens.
        dictionary: Vocabulary, either as a list of tokens (the position in
            the list is the feature index) or as a prebuilt ``{token: index}``
            dict whose indices lie in ``range(len(dictionary))``.

    Returns:
        A float NumPy array of length ``len(dictionary)`` where entry ``i`` is
        the number of times vocabulary token ``i`` occurs in ``tokens``.
        Tokens that are not in the vocabulary are ignored.
    """
    if isinstance(dictionary, dict):
        index_of = dictionary
    else:
        # Build the lookup once instead of scanning the list twice per token
        # (`in` plus `.index`). setdefault keeps the first position of a
        # repeated entry, which is what list.index would return.
        index_of = {}
        for position, word in enumerate(dictionary):
            index_of.setdefault(word, position)

    features = np.zeros(len(dictionary))
    for token in tokens:
        position = index_of.get(token)
        if position is not None:
            features[position] += 1
    return features

In [29]:
# Dense feature matrix: one count vector per message, stacked row-wise, so the
# shape is (len(messages), len(dictionary)).
X = np.array([create_features(tokens, dictionary) for tokens in messages])

In [31]:
# Encode the string labels as integers. Per the printed output, 'ham' maps to
# 0 and 'spam' to 1. `le` is kept so predictions can be decoded later.
le = LabelEncoder()
le.fit(label_list)
y = le.transform(label_list)

print(f'Classes: {le.classes_}')
print(f'Encoded classes: {y}')

Classes: ['ham' 'spam']
Encoded classes: [0 0 1 ... 0 0 0]


In [33]:
# Split configuration. VAL_SIZE is a fraction of the full dataset; TEST_SIZE is
# a fraction of what remains after the validation split, i.e. 0.125 * 0.8 = 0.1
# of the full dataset. Overall: 70% train / 20% validation / 10% test.
SEED = 0
VAL_SIZE = 0.2
TEST_SIZE = 0.125

# First carve out the validation set ...
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=VAL_SIZE, shuffle=True, random_state=SEED
)

# ... then take the test set from the remaining training portion.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train, test_size=TEST_SIZE, shuffle=True, random_state=SEED
)

In [34]:
# Gaussian Naive Bayes classifier with default settings.
# NOTE(review): the features are token counts; a multinomial Naive Bayes may be
# a better fit for count data — worth comparing before relying on this model.
model = GaussianNB()

In [35]:
# Fit the classifier on the training split only; validation and test data are
# held back for evaluation.
print('Start training...')
model = model.fit(X_train, y_train)
print('Training completed')

Start training...
Training completed


In [36]:
# Validation split: predict, then score.
y_val_pred = model.predict(X_val)
val_acc = accuracy_score(y_val, y_val_pred)

# Test split: predict, then score.
y_test_pred = model.predict(X_test)
test_acc = accuracy_score(y_test, y_test_pred)

In [37]:
# Accuracy on the validation split, followed by accuracy on the test split.
print(val_acc, test_acc)

0.8816143497757848 0.8602150537634409


In [38]:
def predict(text, model, dictionary):
    """Classify a single raw message and return its label.

    Args:
        text: Raw, unprocessed message string.
        model: Fitted classifier exposing ``predict`` on a 2-D feature array.
        dictionary: Vocabulary that the model's training features were built
            with.

    Returns:
        The predicted class label, decoded with the module-level label encoder
        ``le`` (so ``le`` must already be fitted).
    """
    tokens = preprocess_text(text)
    # Build the features from the preprocessed tokens. Passing the raw string
    # here would make create_features iterate over single characters, so the
    # vector would not match the features the model was trained on.
    features = create_features(tokens, dictionary)
    # The model expects a 2-D array: one row per sample.
    features = np.array(features).reshape(1, -1)
    prediction = model.predict(features)
    prediction_cls = le.inverse_transform(prediction)[0]

    return prediction_cls

# Try the trained model on one hand-written message.
test_input = 'I am actually thinking a way of doing something useful'
prediction_cls = predict(test_input, model, dictionary)
print(f'Prediction: {prediction_cls}')

Prediction: ham
