In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import numpy as np

# Making a dataset

In [2]:
df = pd.read_csv("/kaggle/input/true-statements/true_statements.csv")
del df['answer']
ds = Dataset.from_pandas(df)
ds = ds.train_test_split(test_size=0.15)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'statement'],
        num_rows: 18659
    })
    test: Dataset({
        features: ['text', 'statement'],
        num_rows: 3293
    })
})

# Initializing model and tokenizer

In [3]:
model_checkpoint = "t5-small"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

# Setting the training args

In [4]:
def tokenize_function(examples):
    # Ensure text and statement are lists of strings
    texts = [str(text) for text in examples["text"]]
    statements = [str(statement) for statement in examples["statement"]]
    
    # Tokenize the inputs
    model_inputs = tokenizer(texts, max_length=256, truncation=True, padding="max_length")
    
    # Tokenize the targets
    labels = tokenizer(statements, max_length=128, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Apply the tokenization to your dataset
tokenized_ds = ds.map(tokenize_function, batched=True, remove_columns=["text", "statement"])

batch_size = 40
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-FT-false_statements",
    evaluation_strategy = "epoch",
    learning_rate=2e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=50,
    predict_with_generate=True,
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/18659 [00:00<?, ? examples/s]

Map:   0%|          | 0/3293 [00:00<?, ? examples/s]



In [5]:
!pip install wandb
import wandb
wandb.login(key='ENTER YOUR KEY')

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [6]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mnewzns1710[0m ([33medu_oro[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.17.7
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240910_092122-i23b9o3q[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mt5-small-FT-false_statements[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/edu_oro/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/edu_oro/huggingface/runs/i23b9o3q[0m
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,No log,0.190359
2,No log,0.189692
3,0.268100,0.189477
4,0.268100,0.196642
5,0.154100,0.205302
6,0.154100,0.213593
7,0.124600,0.21446
8,0.124600,0.222297
9,0.099200,0.224616
10,0.099200,0.239578


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(devic

TrainOutput(global_step=11700, training_loss=0.04803648314924321, metrics={'train_runtime': 14245.6162, 'train_samples_per_second': 65.49, 'train_steps_per_second': 0.821, 'total_flos': 6.31335668416512e+16, 'train_loss': 0.04803648314924321, 'epoch': 50.0})