0. Libraries

In [71]:
# Importing required libraries

import os
import email
import email.policy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

1. Load the data from ./dataset

In [72]:
# Directories holding the two e-mail classes (relative to the working directory)

HAM_DIR = "./dataset/ham"
SPAM_DIR = "./dataset/spam"

In [47]:
# List the message files of each class in sorted order, skipping the 'cmds' entry

ham_filenames = sorted(entry for entry in os.listdir(HAM_DIR) if entry != 'cmds')
spam_filenames = sorted(entry for entry in os.listdir(SPAM_DIR) if entry != 'cmds')

In [74]:
# Number of spam and ham files found, in that order
len(spam_filenames), len(ham_filenames)

(500, 2500)

In [49]:
# Load the data from the directory

def load_emails(directory, filename):
    """Parse a single e-mail file into a message object.

    Args:
        directory: Directory that contains the file.
        filename: Name of the file inside ``directory``.

    Returns:
        The message parsed from the file's bytes using ``email.policy.default``.
    """
    with open(os.path.join(directory, filename), "rb") as f:
        # email.message_from_binary_file builds a BytesParser internally and
        # imports email.parser itself, so this does not depend on some other
        # library having already loaded the email.parser submodule.
        return email.message_from_binary_file(f, policy=email.policy.default)
    
# Parse every listed file of each class into a message object
ham_emails = [load_emails(HAM_DIR, name) for name in ham_filenames]
spam_emails = [load_emails(SPAM_DIR, name) for name in spam_filenames]

In [75]:
# Print the content of one spam message as an example

spam = spam_emails[100]
spam_content = spam.get_content()
print(spam_content)

<html>
<head>
<title>Digital Publishing Tools - Free Software Alert!</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body bgcolor="#FFFFFF" text="#000000">
<center>
<table width="582" border="2" cellspacing="0" cellpadding="5" bordercolor="#0077CC">
<tr>
<td colspan="3" width="582" align="center" bgcolor="#0077CC"><!5122qHWL1-032pyeM4045IIgM3-001oYhw0942jQSK5-726UDqG9283lEHR8-145EiGhl64>
<a href="http://3dpageturningebook.com" style="text-decoration:none;">
<b><font face="Verdana, Arial, Helvetica, sans-serif" size="4" color="#FFFFFF">Publish Like a Professional with Digital Publishing Tools</font></b>
</a>
</td>
</tr>

<tr>
<td colspan="1" width="204" valign="top">
<b><font face="Verdana Arial, Helvetica, sans-serif" color="#000066" size="2">Easily Create Professional:</font></b>
<font face="Verdana, Arial, Helvetica, sans-serif" size="1" color="#000066">
<ul>
<li>eBooks</li>
<li>eBrochures</li>
<li>eCatalogs</li>
<li>Resumes</li>
<li>Newslett

In [76]:
# Print the content of one ham message as an example

ham = ham_emails[100]
ham_content = ham.get_content()
print(ham_content)

Vernon,

I'm changing the instructions in the SpamAssassin INSTALL file 
right now to:

tar xfvz dcc-dccproc.tar.Z
cd dcc-dccproc-X.X.X
./configure && make && make install
cdcc 'info'


Let me know ASAP if that's innapropriate, since we're shipping 
2.40 today!

C

On Monday, September 2, 2002, at 10:02  AM, Vernon Schryver wrote:

>> Here are the instructions in the spamassassin README:
>>
>>     # tar xfvz dcc-dccproc.tar.Z
>>     # cd dcc-dccproc-X.X.X
>>     # ./configure && make && make install
>>     # cdcc 'new map'
>>     # cdcc 'add dcc.rhyolite.com'
>>     # cdcc 'info'
>
> That's ok, except that the 'new map' and "add dcc.rhyolite.com'
> are respectively unnecessary and wrong.  The map file that comes
> with the source points to localhost and dcc.dcc-servers.net.  Those
> two shipped entries usually do the right thing if there is a local
> server.  If there is no local server or if the local server fails,
> requests are instantly sent to one of the public server names listed

2. Preprocessing (Data cleaning + Feature Extraction)

In [52]:
# Training and testing dataset creation and split

# Ham messages come first and are labelled 0; spam messages follow, labelled 1
ham_labels = [0] * len(ham_emails)
spam_labels = [1] * len(spam_emails)

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array(ham_labels + spam_labels)

# Hold out 20% of the messages for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [53]:
# Clean the raw training and test messages with the project-local preprocess.py

import importlib
import preprocess  # preprocess.py

# Re-execute the module so the current version of preprocess.py is used
importlib.reload(preprocess)

clean = preprocess.temizleme
train_cleaned = clean(X_train)
test_cleaned = clean(X_test)

In [78]:
# Example of a preprocessed spam email

# Slice instead of indexing so temizleme receives a one-element list
spam_example = spam_emails[100:101]
# Pass the slice defined above; the previous name `spam_ornek` is not defined
# anywhere in this notebook and raises NameError on a fresh kernel.
spam_cleaned_example = preprocess.temizleme(spam_example)
spam_cleaned_example

['url url url url url digital publishing tool - free software alert ! publish like a professional with digital publishing tool easily create professional : ebooks ebrochures ecatalogs resume newsletter presentation magazine photo album invitation much , much more save money ! - save tree save on printing , postage and advertising cost digital publishing tool download new free version now ! * limited time offer choose from these display style : 3d page turn slide show sweep/wipe embed hyperlink and link to anywhere online , such a your website , order page or contact form . distribute via floppy , cd-rom , e-mail or online . take your marketing to the next level ! for more info , sample or a free download , click the appropriate link to the right ! server demand is extremely high for this limited time free software offer . please try these link periodically if a site seems slow or unreachable . website NUM website NUM website NUM if you wish to be removed from our mailing list , please 

____________________

In [63]:
# Bag-of-words feature extraction with CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(
    lowercase=False,
    stop_words='english',
)

# The vocabulary is learned from the training texts only; the test texts are
# then mapped onto that same vocabulary.
X_train_cv = cv.fit_transform(train_cleaned)
X_test_cv = cv.transform(test_cleaned)

# Dense copies of the sparse count matrices
X_train_arr, X_test_arr = X_train_cv.toarray(), X_test_cv.toarray()
X_test_arr

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [79]:
# (rows, columns) of the dense training and test matrices, one per line
print(X_train_arr.shape, X_test_arr.shape, sep="\n")


(2400, 30501)
(600, 30501)


Example Feature Extraction

In [80]:
# Example e-mail: feature extraction on the preprocessed spam message (CV)

# Use a separate vectorizer for the demo. Calling fit_transform on the shared
# `cv` would replace the vocabulary learned from the training set with the
# vocabulary of this single message.
example_cv = CountVectorizer(stop_words='english', lowercase=False)

# `spam_cleaned_example` is the cleaned one-message list built earlier; the
# previous name `spam_ornek_temizleme` is not defined anywhere in this notebook.
spam_temiz_cv = example_cv.fit_transform(spam_cleaned_example)
example_cv.get_feature_names_out()

array(['3d', 'NUM', 'advertising', 'affiliate', 'album', 'alert',
       'appropriate', 'button', 'cd', 'choose', 'cick', 'click',
       'contact', 'copyright', 'cost', 'create', 'demand', 'digital',
       'display', 'distribute', 'download', 'easily', 'ebooks',
       'ebrochures', 'ecatalogs', 'embed', 'expiration', 'extremely',
       'floppy', 'form', 'free', 'fully', 'functional', 'ha', 'high',
       'hyperlink', 'id', 'info', 'invitation', 'level', 'like', 'limit',
       'limited', 'link', 'list', 'magazine', 'mail', 'mailing',
       'marketing', 'money', 'new', 'newsletter', 'offer', 'online',
       'order', 'page', 'periodically', 'photo', 'postage',
       'presentation', 'printing', 'professional', 'publish',
       'publishing', 'removed', 'resume', 'right', 'rom', 'sample',
       'save', 'server', 'site', 'slide', 'slow', 'software', 'spread',
       'style', 'sweep', 'time', 'tool', 'tree', 'try', 'turn',
       'unreachable', 'unsubscribe', 'url', 'version', 'websi

_____________________________

3. Training the classifier Models

In [66]:
# Logistic Regression model from the project-local LogisticRegressionScratch.py

import importlib
import LogisticRegressionScratch as LogisticRegression

# Re-execute the module so the current version of the file is used
importlib.reload(LogisticRegression)

LR = LogisticRegression.LogisticRegression()
LR.fit(X_train_arr, y_train)

# Predictions on the held-out test matrix
y_pred_LR = LR.predict(X_test_arr)




In [67]:
# Multinomial Naive Bayes model from the project-local MultinomialNBScratch.py

import MultinomialNBScratch as MultinomialNB

# Re-execute the module so the current version of the file is used
importlib.reload(MultinomialNB)

MNB = MultinomialNB.MultinomialNB()
MNB.fit(X_train_arr, y_train)

# Predictions on the held-out test matrix
y_pred_MNB = MNB.predict(X_test_arr)



In [68]:
# Bernoulli Naive Bayes model from the project-local BernoulliNBScratch.py

import BernoulliNBScratch as BernoulliNB

# Re-execute the module so the current version of the file is used
importlib.reload(BernoulliNB)

BNB = BernoulliNB.BernoulliNB()
BNB.fit(X_train_arr, y_train)

# Predictions on the held-out test matrix
y_pred_BNB = BNB.predict(X_test_arr)

4. Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from tabulate import tabulate

# Per-class metrics of each model as nested dicts (output_dict=True)
rep_LR = classification_report(y_test, y_pred_LR, output_dict=True)
rep_MNB = classification_report(y_test, y_pred_MNB, output_dict=True)
rep_BNB = classification_report(y_test, y_pred_BNB, output_dict=True)

# Metrics for class '0' (ham was labelled 0 when y was built)
lr_metrics_0 = rep_LR['0']
mlt_metrics_0 = rep_MNB['0']
ber_metrics_0 = rep_BNB['0']

# Metrics for class '1' (spam was labelled 1 when y was built)
lr_metrics_1 = rep_LR['1']
mlt_metrics_1 = rep_MNB['1']
ber_metrics_1 = rep_BNB['1']

# Build the table: a header row, then one ham row and one spam row per model.
# A loop replaces six hand-written rows so every row reads the same keys.
metric_keys = ('precision', 'recall', 'f1-score', 'support')
table = [['', 'Precision', 'Recall', 'F1-Score', 'Support']]
for model_name, metrics_ham, metrics_spam in (
    ('Logistic Regression', lr_metrics_0, lr_metrics_1),
    ('Multinomial Naive Bayes', mlt_metrics_0, mlt_metrics_1),
    ('Bernoulli Naive Bayes', ber_metrics_0, ber_metrics_1),
):
    table.append([f'{model_name} (ham)'] + [metrics_ham[key] for key in metric_keys])
    table.append([f'{model_name} (spam)'] + [metrics_spam[key] for key in metric_keys])

# Print the table side by side
print(tabulate(table, headers='firstrow', tablefmt='pipe'))

# accuracy_score takes (y_true, y_pred), the same order used for
# classification_report above. The value is the same either way for accuracy,
# but the documented order avoids mistakes if another metric is swapped in.
acc_LR = accuracy_score(y_test, y_pred_LR)
acc_MNB = accuracy_score(y_test, y_pred_MNB)
acc_BNB = accuracy_score(y_test, y_pred_BNB)
print()
print("\033[1mAccuracy\033[0m")
print(f"Logistic Regression: {acc_LR}")
print(f"Multinomial Naive Bayes: {acc_MNB}")
print(f"Bernoulli Naive Bayes: {acc_BNB}")

In [None]:
# Confusion matrices of the three models, drawn side by side

cf_matrix_LR = confusion_matrix(y_test, y_pred_LR)
cf_matrix_MNB = confusion_matrix(y_test, y_pred_MNB)
cf_matrix_BNB = confusion_matrix(y_test, y_pred_BNB)

# One row of three panels
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

panels = (
    ('Logistic Regression', cf_matrix_LR, 'Blues'),
    ('Multinomial Naive Bayes', cf_matrix_MNB, 'Oranges'),
    ('Bernoulli Naive Bayes', cf_matrix_BNB, 'Greens'),
)
for ax, (model_name, matrix, cmap) in zip(axes, panels):
    # fmt='' writes each count in full rather than with seaborn's default format
    sns.heatmap(matrix, annot=True, fmt='', cmap=cmap, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    # confusion_matrix puts the true labels on rows and the predicted labels on
    # columns; label the axes so each panel can be read on its own.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

# Adjust the layout and spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()
