In [None]:
#######################################################################################
# Adapted from: https://www.youtube.com/watch?v=nkPNQk4-3UE
#######################################################################################

import string
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Download the NLTK 'stopwords' corpus; a later cell reads it via stopwords.words('english').
nltk.download('stopwords')


In [None]:
# Load the labelled e-mail dataset from the working directory.
df = pd.read_csv('spam_ham_dataset.csv')

# Flatten Windows-style line breaks so each message body becomes one line of text.
df['text'] = df['text'].map(lambda body: body.replace('\r\n', ' '))

# Column names, dtypes and non-null counts as a quick sanity check.
df.info()

In [None]:
# Shared preprocessing state used by the cells below.

# English stop words held in a set for fast membership tests during token filtering.
stopwords_set = set(stopwords.words('english'))

# Accumulates one cleaned document string per e-mail.
corpus = []

# Porter stemmer applied to every token that survives stop-word filtering.
stemmer = PorterStemmer()

In [None]:
# Prepare email text
def prepare_email_text(email_text):
    """Normalise a raw e-mail body into a space-separated string of stemmed tokens.

    Steps, in order: lower-case the text, delete every character listed in
    ``string.punctuation``, split on whitespace, drop tokens that appear in the
    module-level ``stopwords_set``, and stem the remaining tokens with the
    module-level ``stemmer``.

    Args:
        email_text: The raw message text as a ``str``.

    Returns:
        The processed tokens joined by single spaces (an empty string when no
        token survives).
    """
    # Translation table that maps every punctuation character to "deleted".
    punctuation_remover = str.maketrans('', '', string.punctuation)
    tokens = email_text.lower().translate(punctuation_remover).split()

    stemmed_tokens = []
    for token in tokens:
        if token in stopwords_set:
            continue
        stemmed_tokens.append(stemmer.stem(token))

    return ' '.join(stemmed_tokens)

In [None]:
# Rebuild the corpus from scratch on every execution. Appending to a list created
# in another cell meant that re-running this cell duplicated every document,
# leaving `corpus` longer than `df` and misaligned with the labels.
corpus = [prepare_email_text(raw_text) for raw_text in df['text']]

In [None]:
# Bag-of-words features: token counts over the vocabulary learned from `corpus`.
vectorizer = CountVectorizer()

# fit_transform learns the vocabulary and returns a sparse count matrix;
# .toarray() converts it to a dense NumPy array (memory grows with rows x vocabulary size).
x = vectorizer.fit_transform(corpus).toarray()
y = df.label_num

# Hold out 20% of the rows for evaluation and fit on the remaining 80%.
# (`test_size`, not `train_size`: the latter would fit on only 20% of the data.)
# A fixed random_state keeps the split identical across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# n_jobs=-1 trains the trees on all available CPU cores.
# random_state pins the forest's internal randomness so that, for a given
# train/test split, the fitted model and the score below are repeatable.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(x_train, y_train)

# Mean accuracy on the held-out rows; left as the last expression so the cell displays it.
clf.score(x_test, y_test)

In [None]:
# Take the e-mail at position 10 of the dataset and run it through the same
# preprocessing function and the already-fitted vectorizer used for training.
email_to_classify = prepare_email_text(df['text'].iloc[10])

# The vectorizer expects an iterable of documents, so wrap the single string in a list.
email_corpus = [email_to_classify]
x_email = vectorizer.transform(email_corpus)


In [None]:
# Predicted label for the vectorised e-mail held in `x_email` (returned as a one-element array).
clf.predict(x_email)

In [None]:
# Label stored in the dataset for the row at position 10, for comparison with the prediction.
df.label_num.iloc[10]

In [None]:
# Read one e-mail per line from emails.txt, stripping surrounding whitespace
# and skipping lines that are blank after stripping.
with open("emails.txt", "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    email_data = [line for line in stripped_lines if line]

# Guard the empty case: scikit-learn estimators reject input with zero samples.
if email_data:
    # Preprocess and vectorise all e-mails in one pass rather than invoking the
    # vectorizer and the forest once per message.
    processed_texts = [prepare_email_text(text) for text in email_data]
    x_emails = vectorizer.transform(processed_texts)

    # `clf` was fitted on `df.label_num`, so each prediction is a value taken
    # from that column (not a 'ham'/'spam' string).
    predictions = clf.predict(x_emails)

    for text, pred in zip(email_data, predictions):
        # print prediction + first 100 chars of raw email
        preview = text[:100] + ("..." if len(text) > 100 else "")
        print(f"{pred} → {preview}")


