In [3]:
import re
import string
import pandas as pd

In [4]:
df=pd.read_csv('spam.csv',encoding='latin-1')[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
def clean_data(data):
    data = data.lower()  # Convert to lowercase
    data = re.sub(r'\d+', '', data)  # Remove digits
    data = data.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    data = re.sub(r'\W+', ' ', data)  # Remove words
    return data

In [6]:
cleaned_corpus = [clean_data(doc) for doc in df['message']]
print(cleaned_corpus)



In [7]:
from nltk.tokenize import word_tokenize

# Import nltk to manage downloads and resources
import nltk

# Download the 'punkt_tab' tokenizer model (needed for word_tokenize to work)
nltk.download('punkt_tab')


# Tokenize each document in the cleaned_corpus

tokenized_corpus = [word_tokenize(doc) for doc in cleaned_corpus]

# Print the tokenized corpus (list of lists of words)
print(tokenized_corpus)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Incorta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!




In [8]:
from nltk.corpus import stopwords

nltk.download('stopwords')

# Create a set of English, Spanish, French, German, and Italian stopwords for faster lookup
stop_words = set(stopwords.words('english') + stopwords.words('spanish') + stopwords.words('french') + stopwords.words('german') + stopwords.words('italian'))

# Remove stopwords from each document in the tokenized corpus
filtered_corpus = [[word for word in doc if word not in stop_words] for doc in tokenized_corpus]

# Print the corpus after stopword removal
print(filtered_corpus)



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Incorta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download the WordNet lexical database (needed for lemmatization)
nltk.download('wordnet')

# Initialize the stemmer (reduces words to their root form, e.g., "running" -> "run")
stemmer = PorterStemmer()

# Initialize the lemmatizer (reduces words to their base form using vocabulary & grammar rules, e.g., "better" -> "good")
lemmatizer = WordNetLemmatizer()


stemmed_corpus = [[stemmer.stem(word) for word in doc] for doc in filtered_corpus]


lemmatized_corpus = [[lemmatizer.lemmatize(word) for word in doc] for doc in filtered_corpus]

# Print the corpus after stemming
print(stemmed_corpus)

# Print the corpus after lemmatization
print(lemmatized_corpus)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Incorta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'great', 'world', 'buffet', 'cine', 'got', 'amor', 'wat'], ['ok', 'lar', 'joke', 'wif', 'u', 'oni'], ['free', 'entri', 'wkli', 'comp', 'win', 'fa', 'cup', 'final', 'tkt', 'st', 'may', 'text', 'fa', 'receiv', 'entri', 'questionstd', 'txt', 'ratetc', 'appli', 'over'], ['u', 'dun', 'say', 'earli', 'hor', 'u', 'alreadi', 'say'], ['nah', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'], ['freemsg', 'hey', 'darl', 'week', 'word', 'back', 'id', 'like', 'fun', 'still', 'tb', 'ok', 'xxx', 'std', 'chg', 'send', 'å', 'rcv'], ['even', 'brother', 'like', 'speak', 'treat', 'like', 'aid', 'patent'], ['request', 'mell', 'mell', 'oru', 'minnaminungint', 'nurungu', 'vettam', 'set', 'callertun', 'caller', 'press', 'copi', 'friend', 'callertun'], ['winner', 'valu', 'network', 'custom', 'select', 'receivea', 'å', 'prize', 'reward', 'claim', 'call', 'claim', 'code', 'kl', 'valid', 'hour'], ['mobil', 'month', 'u', 'r', 'entitl', 'updat', 'latest'

In [10]:
import contractions

# Apply contractions.fix() to each document in the cleaned_corpus
# This expands all contractions in every text document
expanded_corpus = [contractions.fix(doc) for doc in cleaned_corpus]

# Print the new corpus after expanding contractions
print(expanded_corpus)



In [11]:
# Option 1: TextBlob
from textblob import TextBlob
print(TextBlob("spamm mesagee").correct())

# Option 2: SymSpell (much faster)
from symspellpy.symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)

suggestions = sym_spell.lookup("mesagee", Verbosity.CLOSEST, max_edit_distance=2)
for s in suggestions:
    print(s.term, s.distance, s.count)


2025-08-30 21:28:17,804: E symspellpy.symspellpy] Dictionary file not found at frequency_dictionary_en_82_765.txt.


spasm message


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert labels: ham -> 0, spam -> 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# 2. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

# 3. TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=3000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the text data
max_words = 5000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=100, input_length=max_len),
    tf.keras.layers.LSTM(64, return_sequences=False),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=64)



Epoch 1/5




[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 47ms/step - accuracy: 0.9343 - loss: 0.2294 - val_accuracy: 0.9758 - val_loss: 0.0935
Epoch 2/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.9890 - loss: 0.0456 - val_accuracy: 0.9830 - val_loss: 0.0615
Epoch 3/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - accuracy: 0.9944 - loss: 0.0224 - val_accuracy: 0.9848 - val_loss: 0.0535
Epoch 4/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.9978 - loss: 0.0100 - val_accuracy: 0.9857 - val_loss: 0.0517
Epoch 5/5
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - accuracy: 0.9984 - loss: 0.0063 - val_accuracy: 0.9848 - val_loss: 0.0622


<keras.src.callbacks.history.History at 0x203cd5b30e0>

In [14]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)

# Predict
y_pred_rf = rf.predict(X_test_tfidf)

# Evaluation
print("Random Forest Results")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Random Forest Results
Accuracy: 0.979372197309417
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.85      0.92       150

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train_tfidf, y_train)

y_pred_model = model.predict(X_test_tfidf)

print("XGBoost Results")
print("Accuracy:", accuracy_score(y_test, y_pred_model))
print("Classification Report:\n", classification_report(y_test, y_pred_model))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Results
Accuracy: 0.9730941704035875
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.98       965
           1       0.95      0.85      0.89       150

    accuracy                           0.97      1115
   macro avg       0.96      0.92      0.94      1115
weighted avg       0.97      0.97      0.97      1115

