**1. Téléchargement et Extraction du Dataset**

In [17]:
# Télécharger le dataset Lingspam depuis Kaggle
!kaggle datasets download -d mandygu/lingspam-dataset --force

# Extraire le dataset
import zipfile
import os

with zipfile.ZipFile('/content/lingspam-dataset.zip', 'r') as zip_ref:
  zip_ref.extractall('kaggle_spam_ds')
import pandas as pd

kaggle_df = pd.read_csv('kaggle_spam_ds/messages.csv')
kaggle_df.head()




Dataset URL: https://www.kaggle.com/datasets/mandygu/lingspam-dataset
License(s): unknown
Downloading lingspam-dataset.zip to /content
  0% 0.00/3.12M [00:00<?, ?B/s]
100% 3.12M/3.12M [00:00<00:00, 97.6MB/s]


Unnamed: 0,subject,message,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


**2. Chargement et Prétraitement des Emails**

In [18]:
import os
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Télécharger les stopwords
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Définir le dossier contenant les emails
dataset_folder = "lingspam_dataset"

# Initialiser les listes d'emails et labels
emails = []
labels = []

# Fonction de nettoyage des emails
def clean_email(text):
    text = text.lower()
    text = re.sub(r"subject:.*", "", text)  # Supprimer l’en-tête "Subject:"
    text = re.sub(r"<.*?>", "", text)  # Supprimer le HTML
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # Supprimer les URLs
    text = re.sub(r"[^\w\s]", "", text)  # Supprimer la ponctuation
    text = re.sub(r"\d+", "", text)  # Supprimer les chiffres
    return " ".join(word for word in text.split() if word not in stop_words)  # Supprimer les stopwords

# Charger les emails du dataset
for root, dirs, files in os.walk(dataset_folder):
    for file in files:
        if file.endswith(".txt"):  # Vérifier que c'est bien un email
            with open(os.path.join(root, file), "r", encoding="latin-1") as f:
                content = f.read()
                label = 1 if "spmsg" in file else 0  # Spam si "spmsg" est dans le nom
                emails.append(clean_email(content))
                labels.append(label)

# Convertir en DataFrame
df = pd.DataFrame({"email": emails, "label": labels})
print(f" Nombre total d'emails : {df.shape[0]}")


 Nombre total d'emails : 0


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**3. Tokenisation avec BERT**

In [19]:
import torch

# Vérifier si GPU disponible
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Charger le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

emails = kaggle_df['message']
labels = kaggle_df['label']

X_train, X_test, y_train, y_test = train_test_split(emails, labels, test_size=0.2, random_state=42)

# Tokenisation des emails
from tqdm import tqdm

tokenized_inputs = []
for email in tqdm(X_train):
    inputs = tokenizer(email, return_tensors='pt', truncation=True, max_length=512, padding="max_length")
    tokenized_inputs.append(inputs["input_ids"].squeeze(0))

labels = torch.tensor(y_train.values, dtype=torch.float32)

# Créer un DataLoader
from torch.utils.data import DataLoader, TensorDataset

train_data = TensorDataset(torch.stack(tokenized_inputs), labels)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True)


100%|██████████| 2314/2314 [00:35<00:00, 65.42it/s]


**4. Création du Modèle Deep RNN**

In [20]:
from torch import nn

class DeepRNNClassifier(nn.Module):
    def __init__(self, vocab_size, hidden_dim, output_dim, embedding_dim=256, dropout=0.5):
        super(DeepRNNClassifier, self).__init__()

        # Embedding Layer
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # Deep RNN avec 3 couches
        self.rnn1 = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.rnn2 = nn.RNN(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.rnn3 = nn.RNN(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)

        # Fully Connected Layers
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid()  # Activation sigmoïde pour la classification binaire
        )

    def forward(self, input_sequence):
        embeddings = self.embedding(input_sequence)

        # Passage à travers 3 couches RNN
        hidden_states, _ = self.rnn1(embeddings)
        hidden_states, _ = self.rnn2(hidden_states)
        hidden_states, _ = self.rnn3(hidden_states)

        # Prendre la dernière sortie du RNN
        output = hidden_states[:, -1, :]  # (batch_size, hidden_dim)
        output = self.fc(output)

        return output


**5. Entraînement du Modèle**

In [21]:
# Définition des hyperparamètres
input_dim = tokenizer.vocab_size + 1
hidden_dim = 256
output_dim = 1
dropout = 0.5

# Initialiser le modèle
model = DeepRNNClassifier(input_dim, hidden_dim, output_dim, dropout=dropout).to(device)

# Définir la perte et l'optimiseur
bce = nn.BCELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=1.00004)

# Boucle d'entraînement
losses = []
n_epochs = 5

for epoch in range(n_epochs):
    print(f"Epoch {epoch+1}/{n_epochs}")
    epoch_loss = 0.0

    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        # Forward
        output_y = model(batch_X)
        output_y = output_y.squeeze(1)

        # Calcul de la perte
        loss = bce(output_y, batch_y.float())

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Ajouter la perte
        epoch_loss += loss.item()

    scheduler.step()
    epoch_loss /= len(train_loader)
    losses.append(epoch_loss)
    print(f"Loss: {epoch_loss:.4f}")


Epoch 1/5
Loss: 0.4747
Epoch 2/5
Loss: 0.4202
Epoch 3/5
Loss: 0.3917
Epoch 4/5
Loss: 0.3574
Epoch 5/5
Loss: 0.3680
