In [96]:
import pandas as pd

df = pd.read_csv('../dataset.csv')
df.drop("Unnamed: 0", axis=1, inplace=True)
df.head(1)


Unnamed: 0,url,title,description,cat1,cat2,cat3
0,https://www.tourisme-cambresis.fr/1-les-templi...,"Aventure-jeu : ""Les Templiers du coffre d'or""",Le jeu aventure « Les templiers du coffre d’or...,Jeu,Famille,Détente


In [97]:
def extract_domain(url):
    url = url[url.find('//')+2:]
    url = url[:url.find('/')]
    return url

df['domain'] = df['url'].apply(extract_domain)

print("nb ligne df", len(df))

df = df.dropna(subset=['description'])

print("nb ligne df1", len(df))

df1 = df.copy()
df2 = df.dropna(subset=['cat2'])
df3 = df.dropna(subset=['cat3'])

print("nb ligne df2", len(df2))
print("nb ligne df3", len(df3))

df[['domain', 'url']].head(1)


nb ligne df 391
nb ligne df1 390
nb ligne df2 317
nb ligne df3 134


Unnamed: 0,domain,url
0,www.tourisme-cambresis.fr,https://www.tourisme-cambresis.fr/1-les-templi...


In [98]:
# Concaténer les colonnes 'title' et 'description' pour former les textes
df['text'] = df['domain'] + " | " + df['title'] + " " + df['description']

df.head(1)


Unnamed: 0,url,title,description,cat1,cat2,cat3,domain,text
0,https://www.tourisme-cambresis.fr/1-les-templi...,"Aventure-jeu : ""Les Templiers du coffre d'or""",Le jeu aventure « Les templiers du coffre d’or...,Jeu,Famille,Détente,www.tourisme-cambresis.fr,"www.tourisme-cambresis.fr | Aventure-jeu : ""Le..."


In [99]:
from sklearn.preprocessing import LabelEncoder

# Encoder les catégories
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['cat1'])

df.head(1)


Unnamed: 0,url,title,description,cat1,cat2,cat3,domain,text,label
0,https://www.tourisme-cambresis.fr/1-les-templi...,"Aventure-jeu : ""Les Templiers du coffre d'or""",Le jeu aventure « Les templiers du coffre d’or...,Jeu,Famille,Détente,www.tourisme-cambresis.fr,"www.tourisme-cambresis.fr | Aventure-jeu : ""Le...",14


In [100]:
from sklearn.model_selection import train_test_split

# Séparer les données en ensemble d'entraînement et de test
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'], df['label'], test_size=0.2)


In [101]:
from torch.utils.data import Dataset
import torch

# Définir un Dataset personnalisé
class EventDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [102]:
from torch.nn.utils.rnn import pad_sequence

# Fonction de padding pour DataLoader
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    labels = [item['label'] for item in batch]

    # input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    # attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0)
    # labels = torch.tensor(labels, dtype=torch.long)
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.tensor([item['labels'] for item in batch], dtype=torch.long)


    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'label': labels
    }


In [103]:
from transformers import BertTokenizer

# Initialiser le tokenizer BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128


In [104]:
from transformers import BertModel
from torch.utils.data import DataLoader

# Créer les datasets et dataloaders
train_dataset = EventDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = EventDataset(val_texts, val_labels, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)


In [105]:
import torch.nn as nn

# Définir le modèle de classification
class EventClassifier(nn.Module):
    def __init__(self, n_classes):
        super(EventClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        output = self.drop(pooled_output)
        return self.out(output)

In [106]:
import torch.optim as optim

# Initialiser le modèle, la perte et l'optimiseur
model = EventClassifier(n_classes=len(label_encoder.classes_))
optimizer = optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()


In [107]:
import numpy as np

# Entraîner le modèle
def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

# Définir les appareils (GPU si disponible)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
model = model.to(device)

# Entraîner et évaluer le modèle
EPOCHS = 10

for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        len(train_dataset)
    )

    val_acc, val_loss = eval_model(
        model,
        val_loader,
        loss_fn,
        device,
        len(val_dataset)
    )

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss} accuracy {train_acc}')
    print(f'Val loss {val_loss} accuracy {val_acc}')

# Sauvegarder le modèle entraîné
torch.save(model.state_dict(), 'event_classifier_model.pth')

RuntimeError: stack expects each tensor to be equal size, but got [213] at entry 0 and [178] at entry 2