In [2]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from roberta import Roberta
from oscar import Oscar
from model import Transformer

In [3]:
# Initialize Oscar object
oscar = Oscar(language="fr", split="train")

# Define Roberta model
dictionary = list(oscar.get_vocab().keys())
# model_name = "roberta-base"  
# model = Roberta(model_name) # si on utilise Roberta : faire un controle h et remplacer tous les models.parameters() par model.model.parameters()
model = Transformer(dictionary) # si on utilise model : faire un controle h et remplacer tous les models.models.parameters() par model.parameters()

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning rate scheduler
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

# Set device (CPU/GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define DataLoader for Oscar dataset
batch_size = 32  # Change this to whatever fits in our GPU
dataloader = DataLoader(range(len(oscar)), batch_size=batch_size, shuffle=True)

Found cached dataset oscar-mini (C:/Users/33631/.cache/huggingface/datasets/nthngdy___oscar-mini/unshuffled_deduplicated_fr-language=fr/0.0.0/d61b181331745a38dd31e8c6cc23d46566b96e255384c4421f2396af24a01dff)


File Tokenization/oscar_text.txt found.


In [4]:
# Training loop
num_epochs = 1 # Change this too if we want to train for more epochs
best_loss = float('inf')
patience, trials = 10, 0
for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    for batch_idx in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
       # Get batch data from the Oscar dataset
        # Convert batch_idx to list of inputs
        batch_idx = batch_idx.tolist()
        inputs = [oscar.get_masked_item(i) for i in batch_idx]
        inputs = torch.tensor(inputs).to(device)
        
        # Get the correct targets
        targets = [oscar[i] for i in batch_idx]


        # Forward pass
        outputs = model(inputs)
        
        # Calculate loss
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    scheduler.step()

    # Print average loss for the epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss}")

    # Early stopping
    if avg_loss < best_loss:
        trials = 0
        best_loss = avg_loss
        torch.save(model.state_dict(), "roberta_model.pth")
    else:
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch}')
            break


Epoch 1/1:   0%|          | 0/6221 [00:07<?, ?it/s]


IndexError: index out of range in self

In [7]:
batch_idx=[187445,  48310,  70665, 162565,   1991,  58635, 159944, 177223]
text = [oscar[i] for i in batch_idx]
print(text)

[[1, 4, 10502, 677, 28, 7488, 5, 4, 99, 4, 132, 14146, 9616, 6236, 744, 5, 4, 26, 1582, 4, 4, 29, 17, 4, 19, 4, 10, 6, 15, 9, 8644, 709, 59, 4, 4, 3142, 55, 11467, 5, 396, 125, 5, 53, 9, 41, 14774, 359, 34, 87, 56, 4, 4393, 1646, 6, 12466, 30, 109, 4, 34, 30044, 20, 4263, 6, 115, 5709, 34, 462, 13, 11, 1300, 6102, 6, 428, 10, 6, 18, 5709, 4, 462, 13, 414, 4, 506, 709, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 28842, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 9798, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 31487, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 4, 3, 