In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from roberta import Roberta
from oscar import Oscar

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# Initialize Oscar object
oscar = Oscar(language="fr", split="train")

# Define Roberta model
dictionary = list(oscar.get_vocab().keys())
model_name = "roberta-base"  # Replace with your model name
model = Roberta(model_name)

# Define your loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.model.parameters(), lr=0.001)

# Learning rate scheduler
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

# Set device (CPU/GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.model.to(device)

# Define DataLoader for Oscar dataset
batch_size = 8  # Change this to whatever fits in our GPU
dataloader = DataLoader(range(len(oscar)), batch_size=batch_size, shuffle=True)

File Tokenization/oscar_text.txt found.


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Training loop
num_epochs = 1 # Change this too if we want to train for more epochs
best_loss = float('inf')
patience, trials = 10, 0
for epoch in range(num_epochs):
    total_loss = 0
    model.model.train()
    for batch_idx in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
       # Get batch data from the Oscar dataset
        print(batch_idx)
        inputs = [oscar.get_item(i) for i in batch_idx]
        inputs = torch.tensor(inputs).to(device)
        
        # Get the correct targets
        targets = torch.tensor([oscar.get_item(i) for i in batch_idx], dtype=torch.long).to(device)


        # Forward pass
        outputs = model.model(**inputs)

        # Flatten the logits and targets
        logits = outputs.logits.view(-1, outputs.logits.shape[-1])
        targets = targets.view(-1)

        # Calculate loss
        loss = criterion(logits, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    scheduler.step()

    # Print average loss for the epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss}")

    # Early stopping
    if avg_loss < best_loss:
        trials = 0
        best_loss = avg_loss
        torch.save(model.model.state_dict(), "roberta_model.pth")
    else:
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch}')
            break


Epoch 1/1:   0%|          | 0/24883 [00:00<?, ?it/s]


tensor([118393,  71608,  40390, 187390,  43236, 119729,  25278, 181854])


TypeError: len() of a 0-d tensor

In [24]:
batch_idx=[187445,  48310,  70665, 162565,   1991,  58635, 159944, 177223]
text = [oscar.get_item(i) for i in batch_idx]
print(text)

['<s>', '▁Haut', '▁en', '▁très', '▁bon', '▁état', '.', '▁Le', '▁col', '▁et', '▁le', '▁pli', '▁des', '▁manches', '▁sont', '▁en', '▁simili', 'cu', 'ir', '.', '▁Matière', '▁un', '▁peu', '▁transparente', '▁(3', 'ème', '▁photo', ')', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>'