In [1]:
%pip install numpy pandas
import numpy as np
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [6]:
import pandas as pd
import os

df = None
file_name = "spambase.data"
path = "./datasets/spambase/" + file_name

# Check if directory exists, if not create it
os.makedirs("./datasets/spambase", exist_ok=True)

try:
    df = pd.read_csv(path, header=None)
    print(f"Loaded data from {path}")
except FileNotFoundError:
    try:
        url = (
            "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/"
            + file_name
        )
        df = pd.read_csv(url, header=None)
        df.to_csv(path, index=False, header=False)
        print(f"Downloaded and saved data from {url}")
    except Exception as e:
        print(f"Failed to download the file: {e}")
        raise

# Check if the dataframe is empty
if df.empty:
    raise Exception("Dataframe is empty")

# Print head and describe
print(df.head())
print(df.describe())

Downloaded and saved data from https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data
     0     1     2    3     4     5     6     7     8     9   ...    48  \
0  0.00  0.64  0.64  0.0  0.32  0.00  0.00  0.00  0.00  0.00  ...  0.00   
1  0.21  0.28  0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...  0.00   
2  0.06  0.00  0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...  0.01   
3  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   
4  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   

      49   50     51     52     53     54   55    56  57  
0  0.000  0.0  0.778  0.000  0.000  3.756   61   278   1  
1  0.132  0.0  0.372  0.180  0.048  5.114  101  1028   1  
2  0.143  0.0  0.276  0.184  0.010  9.821  485  2259   1  
3  0.137  0.0  0.137  0.000  0.000  3.537   40   191   1  
4  0.135  0.0  0.135  0.000  0.000  3.537   40   191   1  

[5 rows x 58 columns]
                0            1            2            3    

In [7]:
# Data exploration
is_na = df.isna().any().any()
print('Is there any NA? ', is_na)
# Il n'y a pas de valeurs manquantes

Is there any NA?  False


In [8]:
# Features and labels
x = df.drop(df.columns[-1], axis=1).values
y = df[df.columns[-1]].values

In [9]:
# Split data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Train model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)

In [10]:
# Normalize the data to remove (in)significant features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [11]:
# Predictions on test data
y_pred = model.predict(x_test)
print(y_pred)

[1 0 0 ... 0 1 0]


In [12]:
# Prediction on custom data
from api import extract_2d
features = extract_2d("Tu as gagné 1 million , envoie ton RIB pour recevoir ton gain!!!")
y_pred = model.predict(features)
print(y_pred)

test_features = extract_2d("You have won 1 MILLION $, please send you ADDRESS !!!")
y_pred = model.predict(test_features)
print(y_pred)

test_features = extract_2d(
    "You won a trip to LAS VEGAS, please send your rib to receive your gain!!!"
)
y_pred = model.predict(test_features)
print(y_pred)

test_features = extract_2d(
    "Has ganado 1 MILLÓN de dinero, por favor envíate la DIRECCIÓN !!!"
)
y_pred = model.predict(test_features)
print(y_pred)

[0]
[1]
[1]
[1]


In [9]:
# Save model
# import joblib
# joblib.dump(model, 'model.pkl')
# joblib.dump(scaler, 'scaler.pkl')
# print('Model saved as model.pkl')
# print('Scaler saved as scaler.pkl')

# Load model
# model = joblib.load("model.pkl")
# scaler = joblib.load("scaler.pkl")
# print("Model loaded from model.pkl")
# print("Scaler loaded from scaler.pkl")

# Predict if the model is still working
# y_pred = model.predict(x_test)
# print("Accuracy: ", model.score(x_test, y_test))

In [13]:
import tarfile
import os

# Create directories for spam and ham datasets
os.makedirs("./datasets/spam", exist_ok=True)
os.makedirs("./datasets/ham", exist_ok=True)

# URLs for the datasets
datasets = {
    "spam": ["20021010_spam.tar.bz2", "20030228_spam.tar.bz2"],
    "ham": [
        "20021010_easy_ham.tar.bz2",
        "20021010_hard_ham.tar.bz2",
        "20030228_easy_ham.tar.bz2",
    ],
}


# Function to download and extract datasets
def download_and_extract(file_name, dataset_type):
    url = "https://spamassassin.apache.org/old/publiccorpus/" + file_name
    os.system(f"wget {url} -P ./datasets/{dataset_type}")
    tar = tarfile.open(f"./datasets/{dataset_type}/{file_name}")
    tar.extractall(f"./datasets/{dataset_type}")
    tar.close()

# Download and extract spam datasets
for file in datasets["spam"]:
    try:
        # Check if the file is already downloaded
        if not os.path.exists(f"./datasets/spam/{file}"):
            download_and_extract(file, "spam")    
    except FileNotFoundError:
        print(f"File {file} not found")

# Download and extract ham datasets
for file in datasets["ham"]:
    try:
        if not os.path.exists(f"./datasets/ham/{file}"):
            download_and_extract(file, "ham")
    except FileNotFoundError:
        print(f"File {file} not found")

--2024-12-01 15:02:51--  https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to spamassassin.apache.org (spamassassin.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1192582 (1.1M) [application/x-bzip2]
Saving to: ‘./datasets/spam/20021010_spam.tar.bz2’

     0K .......... .......... .......... .......... ..........  4% 1.82M 1s
    50K .......... .......... .......... .......... ..........  8% 2.62M 0s
   100K .......... .......... .......... .......... .......... 12% 8.03M 0s
   150K .......... .......... .......... .......... .......... 17% 6.00M 0s
   200K .......... .......... .......... .......... .......... 21% 3.93M 0s
   250K .......... .......... .......... .......... .......... 25% 3.85M 0s
   300K .......... .......... .......... .......... .......... 30% 7.44M 0s
   350K .......... .......... ........

In [14]:
# All the data is spam

import numpy as np
import re
import os
from api import extract_1d, extract_2d, prioritize_features
from sklearn.model_selection import train_test_split


def is_html_content(email_body):
    """
    Check if the email body contains a significant amount of HTML content.
    This function filters out emails with minimal HTML.
    """
    if (
        re.search(r"<[^>]+>", email_body) and email_body.count("<") > 20
    ):  # Adjust threshold if needed
        return True
    return False


def extract_email_body(email):
    """
    Extract the main content of an email by removing headers.
    If no clear separation is found, return the entire email content.
    """
    parts = email.split("\n\n", 1)
    if len(parts) == 2:
        _, body = parts
    else:
        body = email  # Return the entire content if no clear separation is found
    body = body.strip()
    return body

def load_emails(paths):
    emails = []
    for path in paths:
        for file in os.listdir(path):
            file_path = os.path.join(path, file)
            if os.path.isfile(file_path):
                with open(file_path, "r", errors="ignore") as f:
                    email_content = f.read()
                    if not is_html_content(email_content):
                        emails.append(extract_email_body(email_content))
    return emails


# Path to spam and ham datasets
spam_path = ["./datasets/spam/spam"]
ham_paths = ["./datasets/ham/easy_ham", "./datasets/ham/hard_ham"]
# Load spam emails
spam_emails = load_emails(spam_path)
ham_emails = load_emails(ham_paths)

print(f"Number of spam emails: {len(spam_emails)}")
print(f"Number of ham emails: {len(ham_emails)}")

spam_features = np.array([extract_1d(email) for email in spam_emails])
ham_features = np.array([extract_1d(email) for email in ham_emails])

# On priorise les features les plus importantes
spam_features = prioritize_features(spam_features)

print(f'Spam features: {spam_features.shape}')
print(f'Ham features: {ham_features.shape}')

# Labels: 1 for spam, 0 for ham
spam_labels = np.ones(len(spam_features))
ham_labels = np.zeros(len(ham_features))

# Combine spam and ham features and labels using np.concatenate
all_features = np.concatenate((spam_features, ham_features), axis=0) 
all_labels = np.concatenate((spam_labels, ham_labels), axis=0)

# Display features and labels for verification
print("Sample of Spam Features:") 
print(spam_features[:3]) 
print("Sample of Ham Features:") 
print(ham_features[:3]) 
print("Labels:") 
print(all_labels[:20])

# Split into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(
    all_features, all_labels, test_size=0.2, random_state=0
)

Number of spam emails: 531
Number of ham emails: 4637
Spam features: (531, 57)
Ham features: (4637, 57)
Sample of Spam Features:
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 3.85894877e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 6.91949434e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 4.65735196e-01
  0.00000000e+00 0.00000000e+00 0.00000000e+00 3.99201597e-01
  0.00000000e+00 2.66134398e-01 5.52228876e+00 0.00000000e+00
  6.65335995e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  0

In [19]:
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [20]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
from api import extract_2d


from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

# Assuming your labels are 0 for non-spam and 1 for spam
all_labels = [0, 1]

# Generate the confusion matrix with specified labels
cm = confusion_matrix(y_test, y_pred, labels=all_labels)
print("Confusion Matrix:")
print(cm)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label=1)
recall = recall_score(y_test, y_pred, pos_label=1)
f1 = f1_score(y_test, y_pred, pos_label=1)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=all_labels))

# New sample inputs for prediction
emails_to_test = [
    "You have won a free trip to Paris! Reply to this email to claim your prize GO CHECK YOUR EMAIL NOW",
    "Coucou maman",
    "You have won 1 million dollars! Please send your bank details to claim your prize.",
    "Your account has been suspended. Please verify your identity to restore access.",
    "Salut je vais te hacker ramnène moi 500 EUROS dans les 24 heures!!!!!!",
    "Meeting tomorrow at 10 AM. Please confirm your attendance.",
    "Your order has been confirmed. Click the link to track your delivery.",
]

# Predict and display results
for email in emails_to_test:
    features = extract_2d(email)
    y_pred = model.predict(features)
    if y_pred == 1:
        print(f"'{email}' is classified as SPAM.")
    else:
        print(f"'{email}' is classified as NOT SPAM.")

spam_mean_features = np.mean(spam_features, axis=0)
ham_mean_features = np.mean(ham_features, axis=0)

print("Mean Features for Spam Emails:")
print(spam_mean_features)
print("Mean Features for Ham Emails:")
print(ham_mean_features)

Confusion Matrix:
[[917   2]
 [  0 115]]
Accuracy: 99.81%
Precision: 98.29%
Recall: 100.00%
F1 Score: 99.14%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       919
           1       0.98      1.00      0.99       115

    accuracy                           1.00      1034
   macro avg       0.99      1.00      1.00      1034
weighted avg       1.00      1.00      1.00      1034

'You have won a free trip to Paris! Reply to this email to claim your prize GO CHECK YOUR EMAIL NOW' is classified as SPAM.
'Coucou maman' is classified as NOT SPAM.
'You have won 1 million dollars! Please send your bank details to claim your prize.' is classified as NOT SPAM.
'Your account has been suspended. Please verify your identity to restore access.' is classified as NOT SPAM.
'Salut je vais te hacker ramnène moi 500 EUROS dans les 24 heures!!!!!!' is classified as SPAM.
'Meeting tomorrow at 10 AM. Please confirm your attendanc