In [11]:
import sys
import numpy as np
import pandas as pd
import sklearn

print("Python:", sys.version)
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)
print("Sklearn:", sklearn.__version__)
print("✅ Environment + notebook OK")


Python: 3.12.3 (main, Nov  6 2025, 13:44:16) [GCC 13.3.0]
NumPy: 2.4.0
Pandas: 2.3.3
Sklearn: 1.8.0
✅ Environment + notebook OK


In [10]:
import pandas as pd

In [12]:
path = "../data/raw/SMSSpamCollection"

df = pd.read_csv(
    path,
    sep="\t",
    header=None,
    names=["label","text"]
)
df.head()


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [13]:
df.info ()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   object
 1   text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [14]:
df["label"].value_counts()

label
ham     4825
spam     747
Name: count, dtype: int64

In [15]:
df["text_length"] = df["text"].str.len()

df.groupby("label")["text_length"].agg(
    count="count",
    mean="mean",
    min="min",
    max="max"
)

Unnamed: 0_level_0,count,mean,min,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ham,4825,71.482487,2,910
spam,747,138.670683,13,223


In [9]:
df["text_length"] = df["text"].str.len()

df.sort_values("text_length").groupby("label").head(1)[
    ["label", "text_length", "text"]
]

Unnamed: 0,label,text_length,text
4498,ham,2,Ok
3742,spam,13,2/2 146tf150p


In [7]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]","", text)
    text = re.sub(r"\s+"," ", text).strip()
    return text 

In [16]:
df["clean_text"] = df["text"].apply(clean_text)

df[["text", "clean_text"]].head()


Unnamed: 0,text,clean_text
0,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=10)

X_bow = vectorizer.fit_transform(df["clean_text"])

print("Vocabulary:", vectorizer.get_feature_names_out())
print("Matrix shape:", X_bow.shape)


Vocabulary: ['and' 'for' 'in' 'is' 'me' 'my' 'the' 'to' 'you' 'your']
Matrix shape: (5572, 10)


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=10)

X_tfidf = tfidf.fit_transform(df["clean_text"])

print("TF-IDF Vocabulary:", tfidf.get_feature_names_out())
print("TF-IDF Matrix shape:", X_tfidf.shape)

TF-IDF Vocabulary: ['and' 'for' 'in' 'is' 'me' 'my' 'the' 'to' 'you' 'your']
TF-IDF Matrix shape: (5572, 10)


In [20]:
import pandas as pd

feature_names = tfidf.get_feature_names_out()

tfidf_df = pd.DataFrame(
    X_tfidf[0].toarray(),
    columns=feature_names
)

tfidf_df


Unnamed: 0,and,for,in,is,me,my,the,to,you,your
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
import numpy as np

mean_tfidf = X_tfidf.mean(axis=0).A1
terms = tfidf.get_feature_names_out()

tfidf_scores = pd.DataFrame({
    "term": terms,
    "mean_tfidf": mean_tfidf
}).sort_values(by="mean_tfidf", ascending=False)

tfidf_scores


Unnamed: 0,term,mean_tfidf
7,to,0.184487
8,you,0.179376
6,the,0.115765
2,in,0.094187
3,is,0.086821
0,and,0.084766
4,me,0.082368
5,my,0.074408
1,for,0.069702
9,your,0.067706


In [24]:
X = df["clean_text"]
y = df["label"]

In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english")

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [27]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

0,1,2
,"alpha  alpha: float or array-like of shape (n_features,), default=1.0 Additive (Laplace/Lidstone) smoothing parameter (set alpha=0 and force_alpha=True, for no smoothing).",1.0
,"force_alpha  force_alpha: bool, default=True If False and alpha is less than 1e-10, it will set alpha to 1e-10. If True, alpha will remain unchanged. This may cause numerical errors if alpha is too close to 0. .. versionadded:: 1.2 .. versionchanged:: 1.4  The default value of `force_alpha` changed to `True`.",True
,"fit_prior  fit_prior: bool, default=True Whether to learn class prior probabilities or not. If false, a uniform prior will be used.",True
,"class_prior  class_prior: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",


In [29]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.968609865470852

In [30]:
sample_sms = ["Congratulations! You've won a $1,000 Walmart gift card. Click here to claim your prize"]

sample_tfidf = tfidf.transform(sample_sms)
model.predict(sample_tfidf)

array(['spam'], dtype='<U4')