In [1]:
import torch 
from torch import nn 
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm
from oscar import Oscar
from model import Transformer
import numpy as np
from matplotlib import pyplot as plt
import random as rd
from torch.optim import Adam

In [7]:
# Sequence length shared by the dataset and the model. Keeping it in a single
# constant prevents the two from silently drifting apart.
MAX_SEQ_LEN = 200
# The reduced dataset keeps one example out of every SUBSET_STRIDE.
SUBSET_STRIDE = 10

# Initialize the Oscar dataset object
oscar = Oscar(language="fr", split="train", max_length=MAX_SEQ_LEN)

# Build the model on top of the dataset vocabulary
dictionary = list(oscar.get_vocab().keys())
# model_name = "roberta-base"
# model = Roberta(model_name)  # when using Roberta: search-and-replace (Ctrl+H) every model.parameters() with model.model.parameters()
model = Transformer(dictionary, max_seq_len=MAX_SEQ_LEN)  # when using this model: search-and-replace (Ctrl+H) every model.model.parameters() with model.parameters()

# Define the loss function (the optimizer is created in the training cell)
criterion = nn.CrossEntropyLoss()

# Set device (CPU/GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion.to(device)

# Indices of every SUBSET_STRIDE-th example (i.e. one tenth of the dataset)
indices = list(range(0, len(oscar), SUBSET_STRIDE))

# Create the subset for a smaller dataset
subset = torch.utils.data.Subset(oscar, indices)

# Define DataLoader for the Oscar dataset. The full dataset is used by default;
# swap in the commented line below to train on the reduced subset instead.
batch_size = 128  # Change this to whatever fits in our GPU
dataloader = DataLoader(oscar, batch_size=batch_size, shuffle=True, num_workers=6)
# dataloader = DataLoader(subset, batch_size=batch_size, shuffle=True, num_workers=6)

Found cached dataset oscar-mini (C:/Users/33631/.cache/huggingface/datasets/nthngdy___oscar-mini/unshuffled_deduplicated_fr-language=fr/0.0.0/d61b181331745a38dd31e8c6cc23d46566b96e255384c4421f2396af24a01dff)


File Tokenization/oscar_text.txt found.


In [None]:

# Define the optimizer
# optimizer = Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01)
optimizer = Adam(model.parameters(), lr=0.0001)

# Define the learning rate scheduler: StepLR multiplies the learning rate by
# `gamma` every `step_size` calls to scheduler.step() (called once per epoch below).
# NOTE(review): step_size (10) is larger than num_epochs (5), so the decay is
# never applied with the current settings.
# scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10000, num_training_steps=1000000)
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# Training loop settings
num_epochs = 5  # Change this too if we want to train for more epochs
best_loss = float('inf')
# NOTE(review): patience (10) is larger than num_epochs (5), so early stopping
# cannot trigger with the current settings.
patience, trials = 10, 0

losses = []        # loss of every batch, accumulated across all epochs
total_losses = []  # summed batch loss of each epoch

model.train()  # Ensure the model is in training mode
# Clear any gradients left over from a previous (possibly interrupted) run of this cell.
optimizer.zero_grad()
for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (inputs, targets) in enumerate(tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")):
        inputs, targets = inputs.to(device), targets.to(device)

        # Mask with -inf strictly above the main diagonal and 0 elsewhere.
        # NOTE(review): the mask is shaped (max_seq_len, batch_size); a causal
        # attention mask is normally square (seq_len, seq_len) — confirm against
        # what Transformer.forward expects.
        attn_mask = torch.full([model.max_seq_len, len(inputs)], -np.inf)
        attn_mask = torch.triu(attn_mask, diagonal=1).to(device)

        # Forward pass
        outputs = model(inputs, targets, attn_mask)

        # Flatten every dimension but the last of the outputs, flatten the
        # targets, and compute the cross-entropy loss.
        loss = criterion(outputs.view(-1, outputs.size(-1)), targets.view(-1))
        loss.backward()  # Backward pass

        # One optimizer update per batch (no gradient accumulation), then reset
        # the gradients for the next batch.
        optimizer.step()
        optimizer.zero_grad()

        # Read the scalar loss once and reuse it for both records.
        batch_loss = loss.item()
        total_loss += batch_loss
        losses.append(batch_loss)

    total_losses.append(total_loss)
    scheduler.step()

    # Display the per-batch loss curve collected so far (nothing is written to disk)
    plt.figure()
    plt.plot(losses)
    plt.show()

    # Print average loss for the epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss}")

    # Early stopping: checkpoint on improvement, otherwise count the stalled epoch
    if avg_loss < best_loss:
        trials = 0
        best_loss = avg_loss
        torch.save(model.state_dict(), "train_camembert.pth")
    else:
        trials += 1
        if trials >= patience:
            # Report the epoch 1-based, like the other progress messages.
            print(f'Early stopping on epoch {epoch + 1}')
            break

In [None]:
# Draw the per-batch training loss on a fresh figure, then report the latest value.
plt.figure().gca().plot(losses)
plt.show()
print(losses[-1])

In [None]:
# Load the weights from the .pth file written by the training cell.
# map_location="cpu" lets a checkpoint saved during a GPU run be read on a
# CPU-only machine; load_state_dict later copies the values into the model's
# own parameters, whichever device they are on.
# NOTE: torch.load unpickles the file, so only open checkpoints from a trusted source.
weights1 = torch.load('train_camembert.pth', map_location='cpu')

In [None]:
# Rebuild the network with the hyper-parameters of the checkpoint being restored.
# Variant for the camembert and camembert2 checkpoints:
# model = Transformer(dictionary, num_layers=6)
# Variant for the camembert3 and camembert4_lr10-3 checkpoints:
model = Transformer(
    dictionary,
    num_layers=6,
    d_model=768,
    num_heads=12,
)
# Copy the previously loaded weights into the freshly built model.
model.load_state_dict(weights1)

In [None]:
# Pick the compute device: CUDA when it is available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Switch the model to evaluation mode and move it onto the chosen device.
model.eval()
model.to(device)

def predict(sentence):
    """Greedily fill in ``sentence`` starting from its first mask token.

    Relies on the notebook globals ``model`` and ``device``. On every step the
    first mask position is located, the model is run on the current sentence,
    the arg-max of row ``i`` of its output is written at the mask position, and
    the mask token is moved one slot to the right.

    Decoding stops when ``i`` reaches ``model.max_seq_len``, when the mask
    reaches the hard-coded cut-off position (where the end token is written
    instead), or when no mask token is left in the sentence.

    Args:
        sentence: token ids as a list or tensor.

    Returns:
        A long tensor on ``device`` holding the filled-in token ids. A sentence
        without any mask token is returned unchanged.
    """
    # Special ids hard-coded by the notebook.
    mask_id = 4         # presumably the id of <mask> — TODO confirm against the Oscar vocabulary
    eos_id = 2          # presumably the end-of-sequence id — TODO confirm
    stop_position = 49  # mask position at which decoding is cut off

    # Convert once, outside the loop; the tensor is then updated in place.
    sentence = torch.as_tensor(sentence, dtype=torch.long, device=device)
    # Initial second model input: the single id 1 (presumably begin-of-sequence — TODO confirm).
    output = torch.as_tensor([1], dtype=torch.long, device=device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # NOTE(review): the output row index starts at 6 and advances by one per
        # generated token, independently of where the mask sits — confirm intent.
        for i in range(6, model.max_seq_len):
            mask_positions = (sentence == mask_id).nonzero(as_tuple=True)[0]
            if mask_positions.numel() == 0:
                # Nothing left to fill in.
                break
            idx = int(mask_positions[0])
            if idx == stop_position:
                sentence[idx] = eos_id
                break

            predicted = model(sentence, output, None)
            output = torch.argmax(predicted[i])

            sentence[idx] = output
            # Move the mask forward only while there is room for it.
            if idx + 1 < sentence.shape[0]:
                sentence[idx + 1] = mask_id
    return sentence

# Demo: tokenize a short French prompt containing a <mask> placeholder, run
# predict on it, and print the result converted back to tokens.
sentence = "je vais acheter du <mask>"
a = predict(oscar.tokenize_text(sentence))
print(oscar.ids_to_tokens(a.tolist()))

#    for i in range(200):
#         predictions = model(sentence, output)
#        next_token_prob_distribution = predictions[i]
#        next_token_index = torch.argmax(next_token_prob_distribution).item()
#        sentence = (sentence[0][:-1] + next_token_index, )
#        sentence = (sentence[0] + 2, )
#        if next_token_index == 2:
#           break
#    return sentence[0]

In [7]:
# Pull a hand-picked set of dataset entries by index and print them for inspection.
batch_idx = [187445, 48310, 70665, 162565, 1991, 58635, 159944, 177223]
text = [oscar[index] for index in batch_idx]
print(text)

[[1, 4, 10502, 677, 28, 7488, 5, 4, 99, 4, 132, 14146, 9616, 6236, 744, 5, 4, 26, 1582, 4, 4, 29, 17, 4, 19, 4, 10, 6, 15, 9, 8644, 709, 59, 4, 4, 3142, 55, 11467, 5, 396, 125, 5, 53, 9, 41, 14774, 359, 34, 87, 56, 4, 4393, 1646, 6, 12466, 30, 109, 4, 34, 30044, 20, 4263, 6, 115, 5709, 34, 462, 13, 11, 1300, 6102, 6, 428, 10, 6, 18, 5709, 4, 462, 13, 414, 4, 506, 709, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 28842, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 9798, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 4, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 31487, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 3, 3, 4, 3, 

In [None]:
# Load the weights from the .pth file. map_location="cpu" lets a checkpoint
# saved during a GPU run be read on a CPU-only machine; load_state_dict copies
# the values into the model's own parameters, whichever device they are on.
# NOTE: torch.load unpickles the file, so only open checkpoints from a trusted source.
weights = torch.load('camembert.pth', map_location='cpu')

# Apply these weights to the model
# NOTE(review): an earlier cell's comments tie the "camembert" checkpoint to
# Transformer(dictionary, num_layers=6); make sure `model` was built with that
# configuration before loading, otherwise the parameter shapes may not match.
model.load_state_dict(weights)

In [None]:
# Use the GPU when CUDA is available; fall back to the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Put the model in evaluation mode, then transfer it to the selected device.
model.eval()
model.to(device)

def predict(sentence):
    """Attempt at a step-by-step (up to 200 steps) generation loop for ``sentence``.

    Relies on the notebook globals ``model`` and ``device``.

    NOTE(review): this redefines ``predict`` and therefore shadows the earlier
    definition in this notebook once the cell is run. The loop body looks
    unfinished (see the inline notes) — confirm whether this cell is still
    needed.
    """
    # Convert the ids to a long tensor on the target device; the second model
    # input starts as the single id 1.
    sentence = torch.Tensor(sentence)
    output = torch.Tensor([1])
    sentence, output = sentence.long(), output.long()
    sentence, output = sentence.to(device), output.to(device)
    for i in range(200):
        # NOTE(review): the model is called with two arguments here, whereas the
        # training loop and the earlier predict pass a third (mask) argument.
        predictions = model(sentence, output)
        # Take row i of the model output and pick its highest-scoring index.
        next_token_prob_distribution = predictions[i]
        next_token_index = torch.argmax(next_token_prob_distribution).item()
        # NOTE(review): `+` on a tensor is element-wise addition, not
        # concatenation, so these two lines do not append a token; they also
        # rebind `sentence` to a 1-tuple, which is what gets passed to the model
        # on the next iteration. If `sentence` is 1-D, `sentence[0]` is a scalar
        # tensor and cannot be sliced with `[:-1]` — confirm the intended logic.
        sentence = (sentence[0][:-1] + next_token_index, )
        sentence = (sentence[0] + 2, )
        # Stop once id 2 is predicted (presumably end-of-sequence — TODO confirm).
        if next_token_index == 2:
          break
    return sentence[0]

In [None]:
# Demo: run predict on a short French prompt containing a <mask> placeholder;
# the returned value is the cell's displayed output.
# NOTE(review): here the result of tokenize_text is additionally passed through
# tokens_to_ids, whereas the earlier demo cell feeds tokenize_text's output to
# predict directly — confirm which preprocessing predict expects.
sentence = "je vais acheter du <mask>"
predict(oscar.tokens_to_ids(oscar.tokenize_text(sentence)))