# Définition du modèle avec SentenceTransformers

In [1]:
from sentence_transformers import SentenceTransformer
import torch
import torch.nn as nn

# Load the pretrained sentence encoder
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Freeze every pretrained weight: requires_grad is switched off on all parameters
model.requires_grad_(False)

class SentenceClassificationModel(nn.Module):
    """Linear classification head on top of a sentence encoder.

    The encoder is only used through its ``encode`` method; the embeddings it
    returns are passed to a single ``nn.Linear`` layer that outputs the logits.

    Args:
        base_model: Object exposing ``encode(texts, convert_to_tensor=True)``
            (a ``SentenceTransformer`` in this notebook).
        embedding_dim: Size of the embeddings returned by ``base_model``. When
            ``None`` it is read from
            ``base_model.get_sentence_embedding_dimension()`` if that method
            exists, and falls back to 384 (the previous hard-coded value).
        num_classes: Number of output logits (defaults to 2, as before).
    """

    def __init__(self, base_model, embedding_dim=None, num_classes=2):
        super().__init__()
        self.base_model = base_model

        if embedding_dim is None:
            dim_getter = getattr(base_model, "get_sentence_embedding_dimension", None)
            embedding_dim = dim_getter() if callable(dim_getter) else None
        if embedding_dim is None:
            # Keep the original behaviour when the encoder cannot report its size
            embedding_dim = 384

        self.fc = nn.Linear(embedding_dim, num_classes)

    def forward(self, texts):
        """Encode ``texts`` with the base model and return the head's logits.

        Args:
            texts: Raw text input forwarded unchanged to ``base_model.encode``.

        Returns:
            Tensor of logits produced by ``self.fc``.
        """
        embeddings = self.base_model.encode(texts, convert_to_tensor=True)
        # Follow the head's own device instead of relying on a notebook-level
        # `device` global, so the module works wherever it has been moved.
        embeddings = embeddings.to(self.fc.weight.device)
        return self.fc(embeddings)

# Select the GPU when one is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the custom classifier around the encoder and place it on that device
custom_model = SentenceClassificationModel(model).to(device)

  from tqdm.autonotebook import tqdm, trange
2024-10-22 14:21:25.070533: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-22 14:21:25.089805: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-22 14:21:25.112124: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-22 14:21:25.118750: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-22 14:21:25.136345

In [2]:
# Print the module tree of the classifier (base encoder plus the `fc` head)
print(custom_model)

SentenceClassificationModel(
  (base_model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  )
  (fc): Linear(in_features=384, out_features=2, bias=True)
)


# Chargement du dataset

In [3]:
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset

# Load the IMDb dataset
dataset = load_dataset("imdb")

# Hold out 20% of the training split for validation; the fixed seed makes the
# train/validation partition identical on every run of the notebook
train_valid_split = dataset['train'].train_test_split(test_size=0.2, seed=42)

# Rebind `dataset` to the three splits used in the rest of the notebook
dataset = {
    'train': train_valid_split['train'],
    'validation': train_valid_split['test'],
    'test': dataset['test']
}

# PyTorch Dataset adapter around one dataset split
class IMDbDataset(Dataset):
    """Expose an indexable collection of rows as ``(text, label)`` samples.

    Args:
        dataset: Indexable, sized collection whose items are mappings with a
            ``'text'`` entry and a ``'label'`` entry.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the raw text and its label as a ``torch.long`` tensor."""
        # Look the row up once: indexing the backing dataset twice would
        # fetch the same row two times for every sample.
        row = self.dataset[idx]
        return row['text'], torch.tensor(row['label'], dtype=torch.long)

# Wrap the train, validation and test splits in the PyTorch Dataset adapter
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Report the size of each split
print(f"Taille du dataset d'entraînement : {len(train_dataset)}")
print(f"Taille du dataset de validation : {len(val_dataset)}")
print(f"Taille du dataset de test : {len(test_dataset)}")

# Build the batch iterators; only the training loader shuffles its samples
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=32)
    for split_dataset in (val_dataset, test_dataset)
)

Taille du dataset d'entraînement : 20000
Taille du dataset de validation : 5000
Taille du dataset de test : 25000


# Entraînement

In [4]:
from tqdm import tqdm
import torch.optim as optim

# Loss function and optimizer. Only parameters that still require gradients
# are handed to Adam, so weights with requires_grad=False are not tracked by
# the optimizer at all.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    [p for p in custom_model.parameters() if p.requires_grad],
    lr=1e-3,
)

# Training loop with progress bars and a validation pass after every epoch
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=5):
    """Train ``model`` and validate it after each epoch.

    Args:
        model: Module called as ``model(inputs)`` to obtain the outputs passed
            to ``criterion``.
        train_loader: Iterable of ``(inputs, labels)`` batches used for training.
        val_loader: Iterable of ``(inputs, labels)`` batches used for validation.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the updates.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        list[dict]: One entry per epoch with the keys ``train_loss``,
        ``val_loss`` and ``val_accuracy`` (a percentage). A metric is NaN when
        the corresponding loader yielded no batch.
    """
    # Take the target device from the model itself rather than from a
    # notebook-level global, so the function works for any model placement.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    history = []
    for epoch in range(num_epochs):
        # Training mode
        model.train()
        running_loss = 0.0
        num_train_batches = 0
        print(f"Epoch {epoch+1}/{num_epochs}:")

        for inputs, labels in tqdm(train_loader, desc="Training", leave=False):
            labels = labels.to(model_device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_train_batches += 1

        # Average over the batches actually seen; NaN instead of a
        # ZeroDivisionError when the loader is empty.
        avg_train_loss = running_loss / num_train_batches if num_train_batches else float("nan")
        print(f'Epoch {epoch+1}/{num_epochs} completed. Training Loss: {avg_train_loss:.4f}')

        # Validation pass: evaluation mode, no gradient tracking
        model.eval()
        val_loss = 0.0
        num_val_batches = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in tqdm(val_loader, desc="Validating", leave=False):
                labels = labels.to(model_device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                num_val_batches += 1

                # Predicted class = index of the largest output along dim 1
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        avg_val_loss = val_loss / num_val_batches if num_val_batches else float("nan")
        accuracy = 100 * correct / total if total else float("nan")
        print(f'Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}%\n')

        history.append({
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
            "val_accuracy": accuracy,
        })

    return history

In [5]:
# Evaluation loop with a progress bar
def evaluate(model, test_loader):
    """Compute the classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: Module called as ``model(inputs)``; the predicted class is the
            index of the largest output along dimension 1.
        test_loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        float: Accuracy as a percentage, or NaN when ``test_loader`` yields no
        sample.
    """
    # Take the target device from the model itself rather than from a
    # notebook-level global, so the function works for any model placement.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()  # evaluation mode
    correct = 0
    total = 0
    print("Evaluating model on test data...")
    with torch.no_grad():
        for inputs, labels in tqdm(test_loader, desc="Evaluating", leave=False):
            labels = labels.to(model_device)
            outputs = model(inputs)
            # `.data` is unnecessary under no_grad; use the outputs directly
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # NaN instead of a ZeroDivisionError when nothing was evaluated
    accuracy = 100 * correct / total if total else float("nan")
    print(f'Accuracy on test data: {accuracy:.2f}%\n')
    return accuracy

In [6]:
# Fit the model on the training split, validating after each epoch
print("Starting training...")
train(
    model=custom_model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=5,
)

# Measure accuracy on the test split; the returned value is the cell's output
print("Training complete. Starting evaluation...")
evaluate(model=custom_model, test_loader=test_loader)

Starting training...
Epoch 1/5:


                                                           

Epoch 1/5 completed. Training Loss: 0.5975


                                                             

Validation Loss: 0.5428, Accuracy: 76.02%

Epoch 2/5:


                                                           

Epoch 2/5 completed. Training Loss: 0.5085


                                                             

Validation Loss: 0.4968, Accuracy: 76.70%

Epoch 3/5:


                                                           

Epoch 3/5 completed. Training Loss: 0.4741


                                                             

Validation Loss: 0.4761, Accuracy: 77.70%

Epoch 4/5:


                                                           

Epoch 4/5 completed. Training Loss: 0.4556


                                                             

Validation Loss: 0.4633, Accuracy: 78.42%

Epoch 5/5:


                                                           

Epoch 5/5 completed. Training Loss: 0.4440


                                                             

Validation Loss: 0.4555, Accuracy: 78.52%

Training complete. Starting evaluation...
Evaluating model on test data...


                                                             

Accuracy on test data: 79.35%





79.348