In [17]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def load_emails_from_directory(directory):
    emails = []
    if not os.path.exists(directory):
        print(f"Directory does not exist: {directory}")
        return emails
    try:
        for root, _, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                with open(filepath, 'r', encoding='latin-1') as file:
                    emails.append(file.read())
    except PermissionError as e:
        print(f"Permission error accessing directory {directory}: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return emails

def load_data(spam_dir, easy_ham_dir, hard_ham_dir):
    # Load spam emails
    spam_emails = load_emails_from_directory(spam_dir)
    spam_labels = ['spam'] * len(spam_emails)
    
    # Load easy ham emails
    easy_ham_emails = load_emails_from_directory(easy_ham_dir)
    easy_ham_labels = ['ham'] * len(easy_ham_emails)
    
    # Load hard ham emails
    hard_ham_emails = load_emails_from_directory(hard_ham_dir)
    hard_ham_labels = ['ham'] * len(hard_ham_emails)
    
    # Combine all emails and labels
    all_emails = spam_emails + easy_ham_emails + hard_ham_emails
    all_labels = spam_labels + easy_ham_labels + hard_ham_labels
    
    return pd.DataFrame({'text': all_emails, 'label': all_labels})

# Define directories
spam_dir = 'C:/Users/Dell/Documents/ML/archive/spam_2'
easy_ham_dir = 'C:/Users/Dell/Documents/ML/archive/easy_ham'
hard_ham_dir = 'C:/Users/Dell/Documents/ML/archive/hard_ham'

# Load data
emails = load_data(spam_dir, easy_ham_dir, hard_ham_dir)

# Display the number of emails loaded
print(f"Loaded {len(emails)} emails.")
print(emails['label'].value_counts())

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(emails['text'], emails['label'], test_size=0.2, random_state=42)


Loaded 8398 emails.
label
ham     5604
spam    2794
Name: count, dtype: int64


In [18]:
# Vectorize text data
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


In [19]:
# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vect, y_train)

# Make predictions
y_pred = model.predict(X_test_vect)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')
f1 = f1_score(y_test, y_pred, pos_label='spam')

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Accuracy: 0.9404761904761905
Precision: 0.9829424307036247
Recall: 0.833634719710669
F1 Score: 0.9021526418786693


In [20]:
# Check predictions on some sample data from the test set
sample_texts = X_test[:5]
sample_labels = y_test[:5]
sample_predictions = model.predict(vectorizer.transform(sample_texts))

for text, label, prediction in zip(sample_texts, sample_labels, sample_predictions):
    print(f"Text: {text[:60]}...")  # Print the first 60 characters for brevity
    print(f"Actual Label: {label}")
    print(f"Predicted Label: {prediction}")
    print()


Text: From chol2001948@bellsouth.net  Wed Jun 27 08:46:51 2001
Ret...
Actual Label: spam
Predicted Label: ham

Text:     Mac OS X            	   2   °      â              ...
Actual Label: ham
Predicted Label: ham

Text:     Mac OS X            	   2   °      â              ...
Actual Label: spam
Predicted Label: spam

Text: From fork-admin@xent.com  Fri Sep  6 11:41:33 2002
Return-Pa...
Actual Label: ham
Predicted Label: ham

Text: From rssfeeds@jmason.org  Tue Sep 24 10:48:03 2002
Return-Pa...
Actual Label: ham
Predicted Label: ham



In [21]:
# Function to predict if a new email is spam or not
def predict_email(email_text):
    email_vect = vectorizer.transform([email_text])  # Vectorize the email text
    prediction = model.predict(email_vect)  # Predict using the trained model
    return prediction[0]

# Test the function with a new email
new_email = """
    Your email has won you $1,000,000! Click here to claim your prize. 
    This is not a scam. Please provide your bank details to transfer the money.
"""

prediction = predict_email(new_email)
print(f"The new email is classified as: {prediction}")


The new email is classified as: ham


In [22]:
import os 
import pandas as pd
from sklearn.model_selection import train_test_split



def load_email_dir(directory):
    emails = []
    if not os.path.exists(directory):
        print(f"The directory doesnt exist", {directory})
        return emails
    try:
        for root, _, files in os.walk(directory):
            for filename in files:
                filepath = os.path.join(root, filename)
                with open(filepath, 'r', encoding='latin-1') as file:
                    emails.append(file.read())
    except PermissionError as e:
        print(f"Permission error accessing directory {directory}: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")
    return emails

        