In [2]:
%pip install numpy pandas
import numpy as np
import pandas as pd

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [3]:
# https://archive.ics.uci.edu/dataset/94/spambase
df = None
file_name = 'spambase.data'
path = './datasets/spambase/' + file_name
try:
    df = pd.read_csv(path, header=None)
except FileNotFoundError:
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/' + file_name
    df = pd.read_csv(path, header=None)

if df.empty:
   raise Exception('Dataframe is empty')

print(df.head())
print(df.describe())

     0     1     2    3     4     5     6     7     8     9   ...    48  \
0  0.00  0.64  0.64  0.0  0.32  0.00  0.00  0.00  0.00  0.00  ...  0.00   
1  0.21  0.28  0.50  0.0  0.14  0.28  0.21  0.07  0.00  0.94  ...  0.00   
2  0.06  0.00  0.71  0.0  1.23  0.19  0.19  0.12  0.64  0.25  ...  0.01   
3  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   
4  0.00  0.00  0.00  0.0  0.63  0.00  0.31  0.63  0.31  0.63  ...  0.00   

      49   50     51     52     53     54   55    56  57  
0  0.000  0.0  0.778  0.000  0.000  3.756   61   278   1  
1  0.132  0.0  0.372  0.180  0.048  5.114  101  1028   1  
2  0.143  0.0  0.276  0.184  0.010  9.821  485  2259   1  
3  0.137  0.0  0.137  0.000  0.000  3.537   40   191   1  
4  0.135  0.0  0.135  0.000  0.000  3.537   40   191   1  

[5 rows x 58 columns]
                0            1            2            3            4   \
count  4601.000000  4601.000000  4601.000000  4601.000000  4601.000000   
mean      0.104553     0

In [4]:
# Data exploration
is_na = df.isna().any().any()
print('Is there any NA? ', is_na)
# Il n'y a pas de valeurs manquantes

Is there any NA?  False


In [5]:
# Features and labels
x = df.drop(df.columns[-1], axis=1).values
y = df[df.columns[-1]].values

In [6]:
# Split data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)

# Train model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(x_train, y_train)

In [7]:
# Normalize the data to remove (in)significant features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [8]:
# Predictions on test data
y_pred = model.predict(x_test)
print(y_pred)

[1 0 0 ... 0 1 0]


In [9]:
# Prediction on custom data
from api import extract
features = extract("Tu as gagné 1 million , envoie ton RIB pour recevoir ton gain!!!")
y_pred = model.predict(features)
print(y_pred)

test_features = extract("You have won 1 MILLION $, please send you ADDRESS !!!")
y_pred = model.predict(test_features)
print(y_pred)

test_features = extract("Has ganado 1 MILLÓN de dinero, por favor envíate la DIRECCIÓN !!!")
y_pred = model.predict(test_features)
print(y_pred)

Counts: [0, 1, 1, 0]
Language detected: fr
Features extracted
[0]
Counts: [3, 0, 0, 0]
Language detected: en
Features extracted
[1]
Counts: [0, 0, 1, 0]
Language detected: es
Features extracted
[1]


In [10]:
# Save model
import joblib
joblib.dump(model, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')
print('Model saved as model.pkl')
print('Scaler saved as scaler.pkl')

# Load model
model = joblib.load("model.pkl")
scaler = joblib.load("scaler.pkl")
print("Model loaded from model.pkl")
print("Scaler loaded from scaler.pkl")

# Predict if the model is still working
y_pred = model.predict(x_test)
print("Accuracy: ", model.score(x_test, y_test))

Model saved as model.pkl
Scaler saved as scaler.pkl
Model loaded from model.pkl
Scaler loaded from scaler.pkl
Accuracy:  0.7965242577842143


In [11]:
# https://spamassassin.apache.org/old/publiccorpus/

import tarfile
import os
try:
    if not os.path.exists('./datasets/spam/'):
        raise FileNotFoundError
except FileNotFoundError:
    url = (
        'https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2'
    )
    os.system('wget ' + url)
    tar = tarfile.open(file_name)
    tar.extractall('./datasets/')
    tar.close()

try:
    url = (
        'https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2'
    )
    os.system('wget ' + url)
    tar = tarfile.open(file_name)
    tar.extractall('./datasets/')
    tar.close()
except FileNotFoundError:
    pass


--2024-11-30 15:29:06--  https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
Resolving spamassassin.apache.org (spamassassin.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to spamassassin.apache.org (spamassassin.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1183768 (1.1M) [application/x-bzip2]
Saving to: ‘20030228_spam.tar.bz2.2’

     0K .......... .......... .......... .......... ..........  4% 1.04M 1s
    50K .......... .......... .......... .......... ..........  8% 3.45M 1s
   100K .......... .......... .......... .......... .......... 12% 11.7M 0s
   150K .......... .......... .......... .......... .......... 17% 6.75M 0s
   200K .......... .......... .......... .......... .......... 21% 11.3M 0s
   250K .......... .......... .......... .......... .......... 25% 7.56M 0s
   300K .......... .......... .......... .......... .......... 30% 8.64M 0s
   350K .......... .......... .......... .......... 

In [20]:
# Load spam data and path = "./datasets/spam_train/" and make a train_test_split
path = './datasets/spam/'
email_spam = []

for file in os.listdir(path):
    with open(path + file, 'r', errors='ignore') as f:
        email_spam.append(f.read())

# Extract features and make a train_test_split
# All the data is spam
features = []
for email in email_spam:
    features.append(extract(email))

features = np.array(features).reshape(len(features), -1)
labels = np.array([1] * len(features))

x_train_spam, x_test_spam, y_train_spam, y_test_spam = train_test_split(features, labels, test_size=0.3, random_state=0)


Counts: [2, 2, 2, 2]
Language detected: en
Features extracted
Counts: [17, 1, 0, 1]
Language detected: en
Features extracted
Counts: [4, 0, 0, 0]
Language detected: en
Features extracted
Counts: [4, 0, 0, 0]
Language detected: en
Features extracted
Counts: [32, 3, 2, 3]
Language detected: en
Features extracted
Counts: [4, 0, 0, 0]
Language detected: en
Features extracted
Counts: [37, 3, 0, 3]
Language detected: en
Features extracted
Counts: [7, 0, 0, 0]
Language detected: en
Features extracted
Counts: [12, 1, 0, 1]
Language detected: en
Features extracted
Counts: [48, 4, 0, 4]
Language detected: en
Features extracted
Counts: [13, 4, 1, 3]
Language detected: en
Features extracted
Counts: [7, 0, 0, 0]
Language detected: en
Features extracted
Counts: [1, 0, 0, 0]
Language detected: en
Features extracted
Counts: [5, 0, 0, 0]
Language detected: en
Features extracted
Counts: [11, 1, 0, 1]
Language detected: en
Features extracted
Counts: [8, 1, 1, 1]
Language detected: en
Features extracted
C

In [27]:
model.fit(x_train_spam, y_train_spam)
y_pred = model.predict(x_test_spam)

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
)

# Assuming your labels are 0 for non-spam and 1 for spam
all_labels = [0, 1]

# Generate the confusion matrix with specified labels
cm = confusion_matrix(y_test_spam, y_pred, labels=all_labels)
print("Confusion Matrix:")
print(cm)

# Evaluate the model
accuracy = accuracy_score(y_test_spam, y_pred)
precision = precision_score(y_test_spam, y_pred, pos_label=1)
recall = recall_score(y_test_spam, y_pred, pos_label=1)
f1 = f1_score(y_test_spam, y_pred, pos_label=1)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")

# Generate a classification report
print("\nClassification Report:")
print(classification_report(y_test_spam, y_pred, labels=all_labels))

from api import extract

features = extract("Coucou maman")
y_pred = model.predict(features)
if y_pred == 1:
    print("Fuck overfeating")
else: 
    print("Winning")

Confusion Matrix:
[[  0   0]
 [  0 151]]
Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1 Score: 100.00%

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      1.00      1.00       151

    accuracy                           1.00       151
   macro avg       0.50      0.50      0.50       151
weighted avg       1.00      1.00      1.00       151

Counts: [0, 0, 0, 0]
Features extracted
[1]
