Importing the dataset

In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("uciml/sms-spam-collection-dataset")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.
Path to dataset files: /kaggle/input/sms-spam-collection-dataset


**Reading the dataset**

In [2]:
import pandas as pd

data = pd.read_csv(f"{path}/spam.csv", encoding='latin-1')
data.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
data.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


** Removing special characters, converting it to lowercase, and tokenizing.**

In [4]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')  # 👈 add this line


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
import re
from nltk.tokenize import word_tokenize

def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove symbols/numbers
    text = text.lower()                      # lowercase
    tokens = word_tokenize(text)             # split into words
    return tokens

data['cleaned_text'] = data['v2'].apply(clean_text)
data[['v2','cleaned_text']].head()


Unnamed: 0,v2,cleaned_text
0,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."


**Convert the cleaned text into a numerical format using TF-IDF.**

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join tokens back into a single string because TF-IDF needs text, not lists
data['cleaned_text_str'] = data['cleaned_text'].apply(lambda x: ' '.join(x))

# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # limit to top 5000 important words

# Fit and transform the cleaned text
X = tfidf.fit_transform(data['cleaned_text_str']).toarray()

print("TF-IDF shape:", X.shape)
print("Sample vector:", X[0])


TF-IDF shape: (5572, 5000)
Sample vector: [0. 0. 0. ... 0. 0. 0.]


**We can experiment with various machine learning algorithms, such as Naive Bayes, Support Vector Machines, and more **

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Prepare data
data['cleaned_text_str'] = data['cleaned_text'].apply(lambda x: ' '.join(x))
X = data['cleaned_text_str']
y = (data['v1'].str.lower() == 'spam').astype(int)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF + Model
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

# Train Naive Bayes
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
preds = nb.predict(X_test_tfidf)

# Evaluate
print("🔹Naive Bayes Results🔹")
print("Accuracy:", accuracy_score(y_test, preds))
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))


🔹Naive Bayes Results🔹
Accuracy: 0.9650224215246637
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       0.99      0.74      0.85       149

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115

[[965   1]
 [ 38 111]]


In [8]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Prepare data
data['cleaned_text_str'] = data['cleaned_text'].apply(lambda x: ' '.join(x))
X = data['cleaned_text_str']
y = (data['v1'].str.lower() == 'spam').astype(int)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)

# Train SVM
svm_model = LinearSVC(C=1.0)
svm_model.fit(X_train_tfidf, y_train)
svm_preds = svm_model.predict(X_test_tfidf)

# Evaluate
print("🔹SVM Results🔹")
print("Accuracy:", accuracy_score(y_test, svm_preds))
print(classification_report(y_test, svm_preds))
print(confusion_matrix(y_test, svm_preds))


🔹SVM Results🔹
Accuracy: 0.9838565022421525
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       0.99      0.89      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[965   1]
 [ 17 132]]


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Prepare data
data['cleaned_text_str'] = data['cleaned_text'].apply(lambda x: ' '.join(x))
X = data['cleaned_text_str']
y = (data['v1'].str.lower() == 'spam').astype(int)

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english', ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf  = tfidf.transform(X_test)

# Train Logistic Regression
log_model = LogisticRegression(max_iter=200, C=2.0)
log_model.fit(X_train_tfidf, y_train)
log_preds = log_model.predict(X_test_tfidf)

# Evaluate
print("🔹Logistic Regression Results🔹")
print("Accuracy:", accuracy_score(y_test, log_preds))
print(classification_report(y_test, log_preds))
print(confusion_matrix(y_test, log_preds))


🔹Logistic Regression Results🔹
Accuracy: 0.9713004484304932
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       0.99      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

[[965   1]
 [ 31 118]]


**Exploring advanced techniques in NLP**

In [10]:
from transformers import BertTokenizer, BertModel
import torch

# Load BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example sentence
sentence = "Win a free iPhone now!"

# Turn text into tokens BERT understands
inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=64)

# Pass through BERT to get features
with torch.no_grad():
    outputs = model(**inputs)

# Get sentence vector (mean of all word embeddings)
sentence_embedding = outputs.last_hidden_state.mean(dim=1)

print(sentence_embedding.shape)  # torch.Size([1, 768])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([1, 768])


**Improving and finalizing your model**

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = vectorizer.fit_transform(data['cleaned_text_str'])
y = (data['v1'].str.lower() == 'spam').astype(int)

# Model
model = MultinomialNB()

# 5-fold cross validation
scores = cross_val_score(model, X_tfidf, y, cv=5, scoring='f1')

print("Cross-Validation F1 Scores:", scores)
print("Average F1 Score:", scores.mean())


Cross-Validation F1 Scores: [0.90909091 0.85171103 0.84942085 0.89219331 0.88148148]
Average F1 Score: 0.876779515031879


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

svm = LinearSVC()
params = {'C': [0.1, 0.5, 1.0, 2.0, 5.0]}

grid = GridSearchCV(svm, param_grid=params, scoring='f1', cv=5)
grid.fit(X_tfidf, y)

print("Best C value:", grid.best_params_)
print("Best F1 Score:", grid.best_score_)


Best C value: {'C': 2.0}
Best F1 Score: 0.9172957629841519


In [13]:
from sklearn.naive_bayes import MultinomialNB

params = {'alpha': [0.1, 0.3, 0.5, 1.0]}
grid = GridSearchCV(MultinomialNB(), param_grid=params, scoring='f1', cv=5)
grid.fit(X_tfidf, y)

print("Best alpha:", grid.best_params_)


Best alpha: {'alpha': 0.3}
