# Définition du modèle

Cette fois ci, on entraine tout et pas seulement pas la couche de classification.

In [1]:
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn

# Charger le modèle pré-entraîné
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Modèle de classification avec une couche fully-connected
class SentenceClassificationModel(nn.Module):
    def __init__(self, base_model, embedding_dim):
        super(SentenceClassificationModel, self).__init__()
        self.base_model = base_model
        self.fc = nn.Linear(embedding_dim, 2)  # Deux classes en sortie

    def forward(self, texts):
        embeddings = self.base_model.encode(texts, convert_to_tensor=True).to(device)
        return self.fc(embeddings)  # CrossEntropyLoss gère Softmax

# Obtenir la dimension des embeddings
embedding_dim = model.get_sentence_embedding_dimension()

# Initialiser le modèle personnalisé
custom_model = SentenceClassificationModel(model, embedding_dim)

# Passer sur GPU si disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
custom_model = custom_model.to(device)


  from tqdm.autonotebook import tqdm, trange
2024-10-22 13:02:56.170483: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-22 13:02:56.199388: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-22 13:02:56.232248: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-22 13:02:56.242152: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-22 13:02:56.267269

In [2]:
def count_trainable_parameters(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Nombre de paramètres entraînables : {trainable_params}")

count_trainable_parameters(custom_model)

Nombre de paramètres entraînables : 22713986


# Définition du datset

In [3]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

# Charger le dataset Allociné
dataset = load_dataset("tblard/allocine")

# Redéfinir les ensembles train, validation, test
dataset = {
    'train': dataset['train'],
    'validation': dataset['validation'],
    'test': dataset['test']
}

# Classe Dataset pour PyTorch
class AllocineDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Récupérer texte et label
        text = self.dataset[idx]['review']
        label = self.dataset[idx]['label']
        return text, torch.tensor(label, dtype=torch.long)

# Créer les datasets
train_dataset = AllocineDataset(dataset['train'])
val_dataset = AllocineDataset(dataset['validation'])
test_dataset = AllocineDataset(dataset['test'])

# Afficher la taille des datasets
print(f"Taille train : {len(train_dataset)}")
print(f"Taille validation : {len(val_dataset)}")
print(f"Taille test : {len(test_dataset)}")

# Créer les DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

Taille du dataset d'entraînement : 160000
Taille du dataset de validation : 20000
Taille du dataset de test : 20000


# Boucle d'entrainement et validation

In [4]:
from tqdm import tqdm
import torch.optim as optim

# Définir la fonction de perte et l'optimiseur
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(custom_model.parameters(), lr=1e-3)

# Fonction d'entraînement avec visualisation et validation
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        print(f"Epoch {epoch+1}/{num_epochs}:")

        # Boucle d'entraînement
        for inputs, labels in tqdm(train_loader, desc="Training", leave=False):
            # Déplacer les labels sur le GPU
            labels = labels.to(device)

            # Réinitialiser les gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)

            # Calculer la perte
            loss = criterion(outputs, labels)

            # Backward pass et optimisation
            loss.backward()
            optimizer.step()

            # Accumuler la perte
            running_loss += loss.item()

        # Afficher la perte moyenne pour l'époque
        avg_train_loss = running_loss / len(train_loader)
        print(f'Epoch {epoch+1}/{num_epochs} completed. Training Loss: {avg_train_loss:.4f}')

        # Validation après chaque époque
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader, desc="Validating", leave=False):
                # Déplacer les inputs sur le GPU
                labels = labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()

                # Calcul de l'accuracy
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        avg_val_loss = val_loss / len(val_loader)
        accuracy = 100 * correct / total
        print(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}%\n')


In [5]:
# Fonction d'évaluation du modèle
def evaluate(model, test_loader):
    model.eval()  # Mode évaluation
    correct = 0
    total = 0
    print("Evaluating model on test data...")
    
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Evaluating", leave=False):
            labels = labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Calculer l'accuracy
    accuracy = 100 * correct / total
    print(f'Accuracy on test data: {accuracy:.2f}%\n')
    return accuracy

# Entrainement

In [6]:
# Entraîner le modèle
print("Starting training...")
train(custom_model, train_loader, val_loader, criterion, optimizer, num_epochs=5)

# Évaluer le modèle sur les données de test
print("Training complete. Starting evaluation...")
evaluate(custom_model, test_loader)

Starting training...
Epoch 1/5:


                                                             

Epoch 1/5 completed. Training Loss: 0.5749


                                                             

Validation Loss: 0.5323, Accuracy: 73.91%

Epoch 2/5:


                                                             

Epoch 2/5 completed. Training Loss: 0.5320


                                                             

Validation Loss: 0.5170, Accuracy: 74.69%

Epoch 3/5:


                                                             

Epoch 3/5 completed. Training Loss: 0.5222


                                                             

Validation Loss: 0.5100, Accuracy: 74.93%

Epoch 4/5:


                                                             

Epoch 4/5 completed. Training Loss: 0.5173


                                                             

Validation Loss: 0.5059, Accuracy: 75.22%

Epoch 5/5:


                                                             

Epoch 5/5 completed. Training Loss: 0.5142


                                                             

Validation Loss: 0.5048, Accuracy: 75.19%

Training complete. Starting evaluation...
Evaluating model on test data...


                                                             

Accuracy on test data: 75.06%





75.06