In [2]:
import pandas as pd
import torch
from datasets import Dataset, load_dataset


# Load your dataset
dataset = load_dataset('manjuvallayil/factver_master', split='train', trust_remote_code=True)

df_pandas = pd.DataFrame(dataset)

# Function to handle None values in the aggregation
def join_evidence_texts(evidence_texts):
    return ' [SEP] '.join([text if text is not None else '' for text in evidence_texts])

# Group the evidence texts for each claim
grouped_df = df_pandas.groupby('Claim_text').agg({
    'Evidence_text': join_evidence_texts,
    'Label': 'first'
}).reset_index()

# Convert to Hugging Face Dataset format
hf_dataset = Dataset.from_pandas(grouped_df)
print(hf_dataset[:1])
# Clear CUDA cache
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

{'Claim_text': ['"Jake Fisher, senior director of auto testing at Consumer Reports, has stated that reliability issues with electric vehicles were expected due to most automakers, except early EV leader Tesla, only recently launching fully electric models."'], 'Evidence_text': ['Reliability issues with electric vehicles were expected, since most automakers, with the exception of early EV leader Tesla, launched fully electric models in recent years, said Jake Fisher, senior director of auto testing at Consumer Reports. [SEP] “By having all this new technology, there’s a lot of potential problems with them.” [SEP] Unlike all-electric vehicles, hybrid cars and trucks were among the most reliable in the study. That’s largely because many hybrids, such as the Toyota Prius, have been on the market for years, so automakers have been able to work out problems they’ve encountered. [SEP] However, Tesla owners continue to report problems with body hardware, paint and trim in their vehicles across

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer for T5
tokenizer = AutoTokenizer.from_pretrained("t5-base")

# Add padding token if it's not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize the dataset
def tokenize_function(examples):
    # Ensure the inputs are treated as strings
    inputs = [claim + ' [SEP] ' + evidence for claim, evidence in zip(examples['Claim_text'], examples['Evidence_text'])]
    labels = ["true" if label == "T" else "false" for label in examples['Label']]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    model_inputs["labels"] = tokenizer(labels, max_length=512, truncation=True, padding="max_length")["input_ids"]
    return model_inputs

tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Map:   0%|          | 0/590 [00:00<?, ? examples/s]

In [4]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test
train_test_split_ratio = 0.8
train_dataset, test_dataset = train_test_split(tokenized_dataset, test_size=(1 - train_test_split_ratio))

# Convert back to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(pd.DataFrame(train_dataset))
test_dataset = Dataset.from_pandas(pd.DataFrame(test_dataset))

print("Train Dataset:", train_dataset)
print("Test Dataset:", test_dataset)

Train Dataset: Dataset({
    features: ['Claim_text', 'Evidence_text', 'Label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 472
})
Test Dataset: Dataset({
    features: ['Claim_text', 'Evidence_text', 'Label', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 118
})


In [5]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

# Load the T5 model with gradient checkpointing enabled
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model.gradient_checkpointing_enable()

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Clear any cached memory
torch.cuda.empty_cache()

# Define training arguments with further optimizations
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=1,  # Further reduced batch size
    per_device_eval_batch_size=1,   # Further reduced batch size
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,  # Enable mixed precision training
    gradient_accumulation_steps=32,  # Increased gradient accumulation steps
    max_grad_norm=1.0,
    max_steps=500,  # Limit the number of steps for quick testing
    dataloader_pin_memory=False,  # Disable dataloader pin memory to save memory
    dataloader_num_workers=2,  # Reduce the number of data loader workers to save memory
)

# Define a simple data collator that ensures correct padding
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, pad_to_multiple_of=8)

# Define a function to compute metrics
from datasets import load_metric

# Load the accuracy metric with trust_remote_code=True
metric = load_metric("accuracy", trust_remote_code=True)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Split the dataset into smaller subsets to avoid OOM errors
def train_in_subsets(train_dataset, subset_size=500):
    num_samples = len(train_dataset)
    for start_idx in range(0, num_samples, subset_size):
        end_idx = min(start_idx + subset_size, num_samples)
        subset = train_dataset.select(range(start_idx, end_idx))

        # Initialize the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=subset,
            eval_dataset=test_dataset,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        # Clear the cache before each training step
        torch.cuda.empty_cache()

        # Start training on the subset
        trainer.train()

# Start training on smaller subsets to manage memory
train_in_subsets(train_dataset)

  metric = load_metric("accuracy", trust_remote_code=True)


  0%|          | 0/500 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


{'loss': 16.0201, 'learning_rate': 4.91e-05, 'epoch': 0.68}


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/118 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


OutOfMemoryError: CUDA out of memory. Tried to allocate 5.82 GiB. GPU 0 has a total capacty of 15.70 GiB of which 5.72 GiB is free. Including non-PyTorch memory, this process has 9.95 GiB memory in use. Of the allocated memory 8.48 GiB is allocated by PyTorch, and 1.19 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF