In [1]:
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import csv

# Reading the file

In [5]:
# Load the spreadsheet. header=None tells pandas not to treat any row as a
# header, so the two column names are supplied explicitly.
column_names = ["Label", "Message"]
df = pd.read_excel('/SpamHamData.xlsx', header=None, names=column_names)
df

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Number of missing values in each column of the freshly loaded frame.
null_counts = df.isnull().sum()
print(null_counts)

Label      0
Message    0
dtype: int64


There are no null values, so now let's check the ratio of spam to ham.

In [9]:
# Build one boolean mask per class and count the True entries.
is_ham = df["Label"] == "ham"
is_spam = df["Label"] == "spam"
ham_count = is_ham.sum()
spam_count = is_spam.sum()

print(f"Count of Ham messages: {ham_count}")
print(f"Count of Spam messages: {spam_count}")
print(f"Total messages: {len(df)}")

Count of Ham messages: 4825
Count of Spam messages: 747
Total messages: 5572


In [8]:
# Share of each class among the labelled (ham + spam) messages, in percent.
labelled_total = ham_count + spam_count
ham_pct = ham_count / labelled_total * 100
spam_pct = spam_count / labelled_total * 100

print(f"Percentage of Ham messages: {ham_pct:.2f}%")
print(f"Percentage of Spam messages: {spam_pct:.2f}%")

Percentage of Ham messages: 86.59%
Percentage of Spam messages: 13.41%


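The ham and spam counts add up to the total number of messages (4825 + 747 = 5572), so every message is labelled as either ham or spam. The ratio is as above: roughly 87% ham to 13% spam.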

# Preprocessing

In [13]:
# Fetch the NLTK resources this notebook relies on. The calls are kept as
# separate statements so the last one's return value remains the cell output.
# 'punkt' / 'punkt_tab' are tokenizer packages (presumably what
# nltk.word_tokenize needs -- confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')
# Stopword corpus read by clean_text through stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
# Cache for the stopword set so the corpus is loaded once per kernel session
# (re-running this cell resets it).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        # stopwords.words() builds a new list on each call; keep the result and
        # store it as a set so membership checks are constant-time.
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE["english"]


def clean_text(text):
    """Normalise a message into a space-separated string of content tokens.

    The input is coerced to ``str``, lowercased and tokenized with
    ``nltk.word_tokenize``. A token is kept only when it is purely
    alphanumeric (``str.isalnum``) and is not an English stopword, which
    also discards punctuation tokens.

    Args:
        text: The raw message. Any object is accepted because it is passed
            through ``str()`` first.

    Returns:
        The kept tokens joined by single spaces; an empty string when no
        token survives the filter.
    """
    tokens = nltk.word_tokenize(str(text).lower())
    stop_words = _english_stopwords()
    # isalnum() already rejects every punctuation token, so no separate
    # string.punctuation check is required.
    return ' '.join(
        token
        for token in tokens
        if token.isalnum() and token not in stop_words
    )

In [18]:
# Run every raw message through clean_text and store the result next to it.
processed_messages = df['Message'].map(clean_text)
df['processed msg'] = processed_messages
df

Unnamed: 0,Label,Message,processed msg
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u pound prize 2 claim...
5568,ham,Will ü b going to esplanade fr home?,ü b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestions
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [20]:
# Re-check for missing values now that the cleaned-text column exists.
null_counts_after_cleaning = df.isnull().sum()
print(null_counts_after_cleaning)

Label            0
Message          0
processed msg    0
dtype: int64


# Splitting into train and test

In [22]:
# Splitting the data: 14:3:3 ratio → 70% train, 15% validation, 15% test.
# train_test_split comes from the import cell at the top of the notebook.

# Only the label and the cleaned text are carried into the splits.
df = df[["Label", "processed msg"]]

# Step 1: hold out 30% of the rows, stratified on the label so both parts
# keep the same ham/spam proportions.
train_data, temp_data = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df["Label"],
)
# Step 2: cut the hold-out in half to get the validation and test sets.
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data["Label"],
)

# The three splits must partition the data: no row lost, none duplicated.
assert len(train_data) + len(val_data) + len(test_data) == len(df)

print(f"Train set: {len(train_data)}")
print(f"Validation set: {len(val_data)}")
print(f"Test set: {len(test_data)}")

Train set: 3900
Validation set: 836
Test set: 836


In [23]:
# Write each split to its own CSV file, leaving out the DataFrame index.
split_exports = (
    ("train_data.csv", train_data),
    ("val_data.csv", val_data),
    ("test_data.csv", test_data),
)
for csv_name, split_frame in split_exports:
    split_frame.to_csv(csv_name, index=False)

In [24]:
from google.colab import files

# Pass each exported CSV to Colab's files.download helper, in the same order
# the files were written.
for csv_name in ("train_data.csv", "val_data.csv", "test_data.csv"):
    files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>