In [1]:
import torch
import torch.nn as nn

# Example tensors
predicted_output = torch.tensor([3.0, 4.0])  # float32 tensor (logits)
target = torch.tensor([1, 2])  # int64 tensor

# Loss function
criterion = nn.BCEWithLogitsLoss()

# Integer targets trigger the dtype error this cell demonstrates. The error is
# caught and displayed so the rest of the notebook still runs on
# "Restart Kernel -> Run All".
try:
    loss = criterion(predicted_output, target)
except RuntimeError as err:
    print(f"{type(err).__name__}: {err}")

RuntimeError: result type Float can't be cast to the desired output type Long

In [2]:
# Same logits as before; the labels still start out as int64
predicted_output = torch.tensor([3.0, 4.0])
target = torch.tensor([1, 2])

# Cast the labels to float32 so their dtype matches the logits
target = target.to(torch.float32)

# Loss function
criterion = nn.BCEWithLogitsLoss()

# With matching dtypes the call goes through without a dtype error.
# Binary cross-entropy targets are meant to lie in [0, 1]; the label value 2
# used here is outside that range, so the resulting number is not a meaningful loss.
loss = criterion(predicted_output, target)

In [4]:
# Logits for two samples (float32)
predicted_output = torch.tensor([3.0, 4.0])
# Binary labels, created directly as float32 so no cast is needed afterwards
target = torch.tensor([1.0, 0.0], dtype=torch.float32)

# Loss function
criterion = nn.BCEWithLogitsLoss()

# Float logits and float 0/1 labels: the loss is computed without errors
loss = criterion(predicted_output, target)
print(f"Loss: {loss.item()}")

Loss: 2.0333685874938965


In [5]:
# Multi-label example: a batch of 2 samples, each with logits for 4 classes
predicted_output = torch.tensor(
    [
        [1.2, -0.5, 2.0, 0.1],   # 1st sample
        [-1.0, 2.5, -0.8, 1.1],  # 2nd sample
    ]
)  # shape: [2, 4]

# One 0/1 flag per class and sample, stored as float32
target = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],  # 1st sample
        [0.0, 1.0, 0.0, 1.0],  # 2nd sample
    ],
    dtype=torch.float32,
)

# Loss function
criterion = nn.BCEWithLogitsLoss()

# Calculate loss (default reduction averages over every sample/class entry)
loss = criterion(predicted_output, target)
print(f"Loss: {loss.item()}")

Loss: 0.33240896463394165


In [7]:
from transformers import AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset
train_dataset = load_dataset('multi_eurlex', 'en',split='train')

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the dataset
train_dataset = train_dataset.map(lambda x: tokenizer(x['text'], padding=True, truncation=True, max_length=512), batched=True)

# Expose only the model inputs as torch tensors.
# No dtype is forced here: a blanket dtype=torch.int64 would also turn float
# multi-label targets into Long tensors, which BCEWithLogitsLoss rejects
# ("result type Float can't be cast to the desired output type Long").
# NOTE(review): integer columns are expected to come out as int64 by default — confirm.
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Define the model (e.g., XLM-Roberta)
from transformers import XLMRobertaForSequenceClassification

# XLM-RoBERTa with a freshly initialised sequence-classification head
model = XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=50,  # Example number of labels
)

# Output locations and basic hyper-parameters for the run
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

# Wire model, arguments, data and tokenizer together
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()


Generating train split: 100%|██████████| 55000/55000 [01:24<00:00, 647.69 examples/s] 
Generating test split: 100%|██████████| 5000/5000 [00:48<00:00, 103.81 examples/s]
Generating validation split: 100%|██████████| 5000/5000 [00:50<00:00, 99.61 examples/s] 
Map: 100%|██████████| 55000/55000 [00:44<00:00, 1236.77 examples/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


MissingConfigException: Yaml file '/home/onyxia/work/NLP-Legal-document-classification/mlruns/0/meta.yaml' does not exist.

In [8]:
from transformers import Trainer, TrainingArguments

# Relies on `model`, `tokenizer` and `train_dataset` from the previous cells

training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    disable_tqdm=False,  # keep the progress bar
    report_to=[],  # no reporting integrations, so MLflow is not used
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Start fine-tuning
trainer.train()


  trainer = Trainer(


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:
from transformers import Trainer, TrainingArguments
import torch

# Tokenizer: must come from the same checkpoint as the model. The `model` handed
# to the Trainer in this notebook is loaded from "xlm-roberta-base", so a BERT
# tokenizer would produce token ids from a different vocabulary.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Batch tokenization helper (padding and truncation enabled)
def tokenize_function(examples):
    """Tokenize the ``text`` entries of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``'text'`` entry holding the texts of the batch.

    Returns:
        Whatever ``tokenizer`` returns for those texts when called with padding
        and truncation switched on and a maximum length of 512.
    """
    tokenizer_options = {
        "padding": True,      # pad the texts of the batch to a common length
        "truncation": True,   # cut texts that exceed max_length
        "max_length": 512,
    }
    return tokenizer(examples['text'], **tokenizer_options)

# Apply tokenize_function to the training split, one batch of rows per call
train_dataset = train_dataset.map(function=tokenize_function, batched=True)

# Ensure labels are in the correct format for multi-label classification
def preprocess_labels(examples, num_labels=None):
    """Turn per-example lists of class ids into float32 multi-hot vectors.

    Calling ``torch.tensor`` directly on the label lists fails as soon as two
    examples carry a different number of labels, and a list of class ids is not
    a valid target for a multi-label loss anyway. Each example is therefore
    encoded as a vector of length ``num_labels`` with 1.0 at every listed class
    id and 0.0 elsewhere.

    NOTE(review): this assumes ``examples['labels']`` holds class ids (not
    already multi-hot vectors), so it must not be applied twice to the same
    data — confirm against the dataset.

    Args:
        examples: Batch mapping whose ``'labels'`` entry is a sequence with one
            iterable of integer class ids per example (may be empty).
        num_labels: Width of the multi-hot vectors. Defaults to
            ``model.config.num_labels`` of the notebook-level ``model``.

    Returns:
        ``{'labels': tensor}`` where ``tensor`` is float32 with shape
        ``(len(examples['labels']), num_labels)``.

    Raises:
        ValueError: If a class id is outside ``[0, num_labels)``.
    """
    if num_labels is None:
        # Keep the target width in sync with the classification head
        num_labels = model.config.num_labels

    label_lists = examples['labels']
    multi_hot = torch.zeros((len(label_lists), num_labels), dtype=torch.float32)
    for row, label_ids in enumerate(label_lists):
        for label_id in label_ids:
            label_id = int(label_id)  # also accepts 0-d tensors / numpy ints
            if not 0 <= label_id < num_labels:
                raise ValueError(
                    f"label id {label_id} is outside the range [0, {num_labels})"
                )
            multi_hot[row, label_id] = 1.0
    return {'labels': multi_hot}

# Apply preprocess_labels to the labels column, one batch of rows per call
train_dataset = train_dataset.map(preprocess_labels, batched=True)

# Re-declare the torch format WITHOUT a forced dtype. A blanket
# dtype=torch.int64 would cast float labels to Long, which BCEWithLogitsLoss
# rejects ("result type Float can't be cast to the desired output type Long").
# NOTE(review): check that a sample's `labels` really arrive as float32 here.
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    # No evaluation strategy is set: the Trainer below receives no eval_dataset,
    # so step-wise evaluation could not run.
    report_to=[],  # no reporting integrations; avoids the MLflow meta.yaml failure
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import f1_score

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

# Model with a multi-label head.
# NOTE(review): `num_labels` is not defined in the visible cells — it must be set
# before this cell runs.
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels, problem_type="multi_label_classification")

# Pick the device once and move the model there; the loops below move every
# batch to `device`, so the model has to live on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer (created after the model has been moved to its device)
# NOTE(review): the AdamW class exported by transformers is deprecated in favour
# of torch.optim.AdamW — consider switching the import.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Custom DataLoader
# NOTE(review): `test_datasets` is not defined in the visible cells.
train_dataloader = DataLoader(train_dataset, batch_size=8)
test_dataloader = DataLoader(test_datasets["fr"], batch_size=8)

# F1 metric function
def compute_metrics(predictions, labels, threshold=0.5):
    """Micro-averaged F1 between thresholded predictions and 0/1 labels.

    Both inputs are detached and copied to the CPU before being handed to
    scikit-learn, so tensors that live on a GPU can be passed in directly.

    Args:
        predictions: Per-class scores; an entry counts as a positive prediction
            when it is strictly greater than ``threshold``. With the default
            threshold these should be probabilities (e.g. sigmoid outputs),
            not raw logits.
        labels: 0/1 targets with the same shape as ``predictions``.
        threshold: Decision threshold applied to ``predictions``.

    Returns:
        The value returned by ``f1_score(..., average='micro')``.
    """
    scores = torch.as_tensor(predictions).detach().cpu()
    targets = torch.as_tensor(labels).detach().cpu()
    preds = (scores > threshold).int().numpy()
    return f1_score(targets.int().numpy(), preds, average='micro')

# Training Loop
# Send every batch to the device the model's weights are on, so model and
# inputs can never end up on different devices.
device = next(model.parameters()).device

for epoch in range(5):  # Number of epochs
    model.train()
    for batch in train_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss  # returned by the model because the batch carries `labels`
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} completed")

# Evaluation Loop
model.eval()
# Evaluate on the device the model's weights are on
device = next(model.parameters()).device

all_probs = []
all_labels = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        # compute_metrics thresholds at 0.5, which is only meaningful for
        # probabilities, so the logits go through a sigmoid first. Results are
        # moved to the CPU because scikit-learn cannot read GPU tensors.
        all_probs.append(torch.sigmoid(outputs.logits).cpu())
        all_labels.append(batch['labels'].cpu())

# Score the whole test set once rather than printing one F1 per batch
if all_probs:
    f1 = compute_metrics(torch.cat(all_probs), torch.cat(all_labels))
    print(f"F1 Score: {f1}")
else:
    print("No evaluation batches: F1 score not computed")