In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load data: tab-separated file without a header row; the two columns are
# named 'label' and 'message' here.
df = pd.read_csv('data/SMSSpamCollection.csv', sep='\t', header=None, names=['label', 'message'])

# Data cleaning: drop exact duplicate rows, then renumber the index from 0 so
# it lines up row-for-row with the TF-IDF frame concatenated at the end of
# this cell (that frame is built from a plain array and gets a default index).
df = df.drop_duplicates().reset_index(drop=True)

# Feature engineering: simple per-message statistics.
# - message_length: number of characters
# - num_digits: number of digit characters
# - num_unique_words: number of distinct \w+ tokens (case-sensitive)
df['message_length'] = df['message'].apply(len)
df['num_digits'] = df['message'].apply(lambda text: sum(char.isdigit() for char in text))
df['num_unique_words'] = df['message'].apply(lambda text: len(set(re.findall(r'\w+', text))))

# Lemmatize every whitespace-separated token. The lemmatizer is created once
# and reused instead of being constructed again for each word.
# NOTE(review): tokens keep their case and attached punctuation, so many of
# them probably pass through unchanged -- confirm this is intended.
lemmatizer = WordNetLemmatizer()
lemmatized_messages = df['message'].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(word) for word in text.split())
)

# TF-IDF over the lemmatized text, limited to 150 features with English stop
# words removed.
tfidf = TfidfVectorizer(stop_words='english', max_features=150)
tfidf_result = tfidf.fit_transform(lemmatized_messages).toarray()
tfidf_feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf_feature_names)

# Save the fitted vectorizer. The target directory is created first so the
# cell also works when 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
with open('models/vec.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

# Replace the raw text with the numeric features: label, the three statistics,
# then one column per TF-IDF term.
df = pd.concat([df.drop(columns=['message']), tfidf_df], axis=1)

In [3]:
# Balance the dataset: keep every spam row and draw the same number of ham rows.
# NOTE: this cell overwrites `df`, so re-running it without re-running the
# previous cell finds no 'ham'/'spam' labels (they are already 0/1 by then).
ham = df[df['label'] == 'ham']
spam = df[df['label'] == 'spam']
ham_count, spam_count = len(ham), len(spam)

# Sample WITHOUT replacement whenever there are enough ham rows, so that
# undersampling never picks the same row twice (duplicates were removed on
# purpose during cleaning). Fall back to sampling with replacement only when
# ham has fewer rows than requested, where sampling without replacement would fail.
ham = ham.sample(spam_count, replace=spam_count > ham_count, random_state=42)
df = pd.concat([ham, spam], axis=0)

# Set the target variable to 0 or 1: 'ham' -> 0, anything else -> 1.
df['label'] = (df['label'] != 'ham').astype('int64')

In [4]:
# Write the prepared frame to disk as a semicolon-delimited CSV.
# The index is left out of the file.
prepared_csv_path = 'data/PreparedMessages.csv'
df.to_csv(prepared_csv_path, index=False, sep=';')