**imports**

In [1]:
import os
import time
import torch
import pandas as pd
#from tqdm import tqdm #cool progress bar - not used in Kaggle for log vis, kept for reference
from torch.utils.data import Dataset, DataLoader
import transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer

2025-05-05 06:10:01.943504: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746425402.141672      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746425402.209720      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Data Preprocessing/Model Preparation

In [2]:
#pick the compute device: CUDA when PyTorch can see a GPU, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#load the pre-trained T5-small weights and place them on the chosen device
#(nn.Module.to returns the module itself, so the call can be chained)
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)

#matching SentencePiece tokenizer for the same checkpoint
tokenizer = T5Tokenizer.from_pretrained("t5-small")

#padding is used when batching and when masking labels, so fail early if no pad token is defined
assert tokenizer.pad_token_id is not None, "Tokenizer is missing pad_token_id"

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
#hyperparameters, grouped by what they control

#--- data ---
MAX_LENGTH = 85 #token budget for both inputs and targets (author reports checking that all train/valid/test inputs fit)
BATCH_SIZE = 16 #samples per batch; chosen to balance convergence speed and GPU memory on Kaggle

#--- optimisation ---
LEARNING_RATE = 3e-5 #AdamW step size for fine-tuning
WEIGHT_DECAY = 0.01 #moderate regularization to limit overfitting
MAX_NORM = 1.0 #gradient-norm clipping threshold
EPOCHS = 3 #few enough passes that no early stopping is used

#--- reproducibility ---
RANDOM = 213 #seed shared by every RNG that gets seeded

In [4]:
#reproducibility settings (remove this cell for non-deterministic runs)

#cuDNN: request deterministic kernels and turn off the benchmark-based algorithm auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

#seed PyTorch directly, then let transformers seed the RNGs it manages
torch.manual_seed(RANDOM)
transformers.set_seed(RANDOM)

In [5]:
#put data in custom dataset so it works with DataLoader (won't work with anything without __len__ and __getitem__)
class ThoughtReframingDataset(Dataset):
    """Map-style dataset that turns (negative thought, reframed thought) rows into T5 inputs.

    Each item is tokenized on access and returned as CPU tensors. Moving data to the
    GPU is left to the training/evaluation loop (one transfer per batch), which keeps
    the dataset usable with DataLoader worker processes and independent of any global
    device variable.

    Args:
        dataframe: pandas DataFrame with "negative_thought" and "reframed_thought" columns.
        tokenizer: callable tokenizer exposing ``pad_token_id`` and returning a mapping with
            "input_ids" and "attention_mask" tensors of shape (1, max_length).
        max_length: optional padded/truncated sequence length. When None, the module-level
            ``MAX_LENGTH`` constant is looked up at item-access time.
    """

    def __init__(self, dataframe, tokenizer, max_length=None):
        self.data = dataframe.reset_index(drop=False)  #keeps original indices as an "index" column
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.data)

    def _encode(self, text):
        """Tokenize one string, truncated and padded to the configured length, as PyTorch tensors."""
        max_length = MAX_LENGTH if self.max_length is None else self.max_length
        return self.tokenizer(text, truncation=True, padding="max_length", max_length=max_length, return_tensors="pt")

    def __getitem__(self, idx):
        """Return the model inputs, loss labels and original row index for position ``idx``.

        Returns:
            dict with "input_ids", "attention_mask" and "labels" (1-D CPU tensors) plus
            "index" (the row's index in the dataframe passed to the constructor).
        """
        data_point = self.data.iloc[idx]
        #format prompt properly - only input and output to test how it affects generation
        input_tokens = self._encode(f"reframe this thought: {data_point['negative_thought']}")
        target_tokens = self._encode(data_point["reframed_thought"])

        #squeeze(0) drops the batch dimension added by return_tensors="pt"
        target_input_ids = target_tokens["input_ids"].squeeze(0)
        #padding positions are set to -100 so they are excluded from the loss computation
        labels = target_input_ids.masked_fill(target_input_ids == self.tokenizer.pad_token_id, -100)

        return {
            "input_ids": input_tokens["input_ids"].squeeze(0),
            "attention_mask": input_tokens["attention_mask"].squeeze(0),
            "labels": labels,
            "index": data_point["index"] #preserved original index
        }

In [6]:
#read the raw training and validation splits from disk
train_df = pd.read_csv("./data/train_data.csv")
valid_df = pd.read_csv("./data/valid_data.csv")

#wrap each split in the custom dataset so samples are tokenized on access
train_dataset = ThoughtReframingDataset(dataframe=train_df, tokenizer=tokenizer)
valid_dataset = ThoughtReframingDataset(dataframe=valid_df, tokenizer=tokenizer)

#batching: training batches are reshuffled every epoch, validation keeps file order
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=BATCH_SIZE,
)

In [7]:
#optimizer: AdamW (Adam with decoupled weight decay) over every model parameter
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

In [8]:
#output locations used by the rest of the notebook
checkpoint_dir = "./checkpoints/t5" #per-epoch training checkpoints
final_dir = "./final/t5" #final model + tokenizer
model_outputs_dir = "./model_outputs" #generated text for evaluation

#create each one up front; exist_ok makes the cell safe to re-run
for directory in (checkpoint_dir, final_dir, model_outputs_dir):
    os.makedirs(directory, exist_ok=True)

In [9]:
#write the tokenizer files next to where the final model will go
#(done once, before training, so they exist even if training is interrupted)
tokenizer.save_pretrained(save_directory=final_dir)

('./final/tokenizer_config.json',
 './final/special_tokens_map.json',
 './final/spiece.model',
 './final/added_tokens.json')

# Model Training

In [10]:
#train time: one training pass and one validation pass per epoch, then a checkpoint
for epoch in range(EPOCHS):
    epoch_start = time.time() #how long whole epoch takes
    train_start = time.time() #how long training takes
    model.train() #put model in training mode
    total_loss = 0.0 #running sum of per-batch training losses

    #progress bar left out on purpose - it gets in the way of viewing Kaggle logs
    for batch in train_loader: #tqdm(train_loader, desc=f"Epoch {epoch + 1} Training"):
        #get necessary data from batch and make sure it lives on the model's device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        #clear gradients left over from the previous step (they accumulate across backward passes)
        optimizer.zero_grad()

        #forward pass - passing labels makes the model return the loss directly
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward() #backpropagation
        #gradient clipping after backprop to stop exploding gradient
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_NORM)
        optimizer.step() #update weights
        total_loss += loss.item() #accumulate loss for the epoch average

    avg_train_loss = total_loss / len(train_loader) #mean loss per training batch
    print(f"Epoch {epoch + 1} Training Loss: {avg_train_loss:.4f}")
    train_end = time.time()

    #validation time
    valid_start = time.time() #how long validation takes
    model.eval() #put model in evaluate mode for validation dataset
    valid_loss = 0.0 #running sum of per-batch validation losses
    with torch.no_grad(): #gradients are not needed when only measuring the loss
        #same as train but without the backward/optimizer steps
        for batch in valid_loader: #tqdm(valid_loader, desc=f"Epoch {epoch + 1} Validation"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            valid_loss += outputs.loss.item()

    avg_valid_loss = valid_loss / len(valid_loader) #mean loss per validation batch
    print(f"Epoch {epoch + 1} Validation Loss: {avg_valid_loss:.4f}")
    valid_end = time.time()
    epoch_end = time.time()

    #save checkpoint of model (in case it stops/runs out of compute time partway through training)
    #"loss" holds the epoch's mean training loss (not just the last batch's loss);
    #"train_loss"/"valid_loss" record both epoch averages explicitly
    torch.save({
        "epoch": epoch,
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "loss": avg_train_loss,
        "train_loss": avg_train_loss,
        "valid_loss": avg_valid_loss
    }, os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch+1}.pt"))

    print(f"Epoch {epoch + 1} summary:")
    print(f"  Training time: {train_end - train_start:.2f} seconds")
    print(f"  Validation time: {valid_end - valid_start:.2f} seconds")
    print(f"  Total epoch time: {epoch_end - epoch_start:.2f} seconds\n")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 Training Loss: 2.2551
Epoch 1 Validation Loss: 2.0148
Epoch 1 summary:
  Training time: 148.28 seconds
  Validation time: 15.46 seconds
  Total epoch time: 163.74 seconds

Epoch 2 Training Loss: 2.1324
Epoch 2 Validation Loss: 1.9743
Epoch 2 summary:
  Training time: 147.23 seconds
  Validation time: 15.49 seconds
  Total epoch time: 162.73 seconds

Epoch 3 Training Loss: 2.0882
Epoch 3 Validation Loss: 1.9526
Epoch 3 summary:
  Training time: 147.70 seconds
  Validation time: 15.46 seconds
  Total epoch time: 163.16 seconds



In [11]:
#training is done: persist the fine-tuned weights and config to the final directory
model.save_pretrained(save_directory=final_dir)

# Model Generation

In [12]:
#read the held-out test split
test_df = pd.read_csv("./data/test_data.csv")

#same dataset wrapper and batch size as training; no shuffle, so batches follow file order
test_dataset = ThoughtReframingDataset(dataframe=test_df, tokenizer=tokenizer)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE)

In [13]:
model.eval() #model in eval mode for generation
#collect generated and reference texts side by side - references are decoded from the
#label ids so both go through the same tokenizer round-trip
generated_texts = []
true_texts = []
indices = [] #original row index of every sample, stored as plain ints for sorting later

test_start = time.time() #how long testing takes
with torch.no_grad(): #no gradients needed for generation
    for batch in test_loader: #tqdm(test_loader, desc="Testing"):
        #model inputs go to the model's device
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        #labels are only decoded, never fed to the model, so no device transfer is needed;
        #masked_fill returns a new tensor, putting pad ids back where -100 was used for the loss
        labels = batch["labels"].masked_fill(batch["labels"] == -100, tokenizer.pad_token_id)
        batch_indices = batch["index"]

        #generate predictions based on input
        generated_ids = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=MAX_LENGTH)
        #decode predicted and true outputs into readable text
        generated = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        true = tokenizer.batch_decode(labels, skip_special_tokens=True)

        generated_texts.extend(generated)
        true_texts.extend(true)
        #a collated index tensor is converted to Python ints; iterating the tensor directly
        #would store 0-d tensors, which make a poor DataFrame sort key
        indices.extend(batch_indices.tolist() if hasattr(batch_indices, "tolist") else batch_indices)

test_end = time.time()
print(f"Testing time: {test_end - test_start:.2f} seconds")

Testing time: 74.79 seconds


In [14]:
#assemble the results, restore the original csv row order, then write them to disk
output_df = (
    pd.DataFrame({
        "Original_Index": indices,
        "True_Text": true_texts,
        "Generated_Text": generated_texts
    })
    .sort_values(by="Original_Index") #back to original csv order
    .reset_index(drop=True)
    .drop(columns=["Original_Index"]) #only needed for the sort
)
output_df.to_csv(model_outputs_dir + "/generated_output_t5.csv", index=False)