In [None]:
#!pip install transformers
#!pip install sentencepiece

In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import mean_squared_error, accuracy_score
import numpy as np

In [None]:
import pandas as pd

# CSV holding the text pairs; later cells read its 'Player_Input' and
# 'Output' columns.
DATASET_CSV_PATH = '/content/Trial-Dataset.csv'

dataset = pd.read_csv(DATASET_CSV_PATH)

In [None]:
class PlayerInputOutputDataset(Dataset):
    """Map-style dataset that tokenizes (Player_Input, Output) text pairs.

    Rows are tokenized lazily on access. The ``Player_Input`` column becomes
    the model input and the ``Output`` column becomes the labels.

    Args:
        data: DataFrame with ``Player_Input`` and ``Output`` columns; rows are
            fetched positionally via ``.iloc``.
        tokenizer: Callable tokenizer that returns a mapping containing
            ``input_ids`` and ``attention_mask`` tensors.
        source_max_token_len: Length source sequences are padded/truncated to.
        target_max_token_len: Length target sequences are padded/truncated to.
    """

    # Value written over padding positions in the labels.
    # NOTE(review): presumably the loss's ignore index, so padding does not
    # contribute to the loss — confirm against the model being trained.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, source_max_token_len=128, target_max_token_len=128):
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to a fixed ``max_length`` with this dataset's tokenizer."""
        # Use the tokenizer handed to the constructor, not a notebook-level
        # global, so the dataset works with whichever tokenizer it was given.
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one row.

        All three values are flattened to 1-D tensors. Padding positions in
        ``labels`` are replaced with ``IGNORE_INDEX``.
        """
        data_row = self.data.iloc[index]

        source_encoding = self._encode(data_row['Player_Input'], self.source_max_token_len)
        target_encoding = self._encode(data_row['Output'], self.target_max_token_len)

        # Prefer the tokenizer's own pad id; fall back to 0 (the value this
        # class previously hard-coded) when the tokenizer does not expose one.
        pad_token_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_token_id is None:
            pad_token_id = 0

        # Clone so the tokenizer's returned tensor is not modified in place.
        labels = target_encoding['input_ids'].clone()
        labels[labels == pad_token_id] = self.IGNORE_INDEX

        return dict(
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
        )


In [None]:
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Fixed seed so the train/validation split and the training-batch shuffle
# order do not change from one run of the notebook to the next.
SEED = 42

# Hold out 10% of the rows for validation.
train_df, val_df = train_test_split(dataset, test_size=0.1, random_state=SEED)

train_dataset = PlayerInputOutputDataset(train_df, tokenizer)
val_dataset = PlayerInputOutputDataset(val_df, tokenizer)

# Only the training loader shuffles. It gets its own seeded generator so the
# shuffle order is reproducible without reseeding the global torch RNG.
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    generator=torch.Generator().manual_seed(SEED),
)
val_loader = DataLoader(val_dataset, batch_size=8)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the pretrained checkpoint named by `model_name` and move it to `device`.
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)


In [None]:
num_epochs = 20
learning_rate = 3e-4
adam_epsilon = 1e-8
# The scheduler is stepped once per training batch, so its horizon is
# batches-per-epoch times the maximum number of epochs.
total_steps = len(train_loader) * num_epochs
model_path = "/content/t5_fine_tuned"

# Stop after this many consecutive epochs without a new best validation loss.
early_stopping_patience = 3
early_stopping_counter = 0
best_val_loss = float('inf')

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly because torch's default (0.01) differs
# from the transformers default (0.0); this keeps the update rule unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=learning_rate, eps=adam_epsilon, weight_decay=0.0
)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)


def _batch_loss(batch):
    """Move one DataLoader batch to `device` and return the model's loss for it."""
    outputs = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch['labels'].to(device),
    )
    return outputs.loss


for epoch in range(num_epochs):
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print('-' * 10)

    # ---- training pass ----
    model.train()
    train_losses = []
    for batch in train_loader:
        optimizer.zero_grad()

        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()
        scheduler.step()  # learning rate is decayed per batch, not per epoch

        train_losses.append(loss.item())

    train_loss = np.mean(train_losses)
    print(f'Train loss {train_loss}')

    # ---- validation pass (no gradients, eval mode) ----
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_loader:
            val_losses.append(_batch_loss(batch).item())

    val_loss = np.mean(val_losses)
    print(f'Validation loss {val_loss}')

    if val_loss < best_val_loss:
        # New best: checkpoint model and tokenizer together and reset patience.
        best_val_loss = val_loss
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print('Early stopping triggered.')
            break




Epoch 1/20
----------
Train loss 4.783982515335083
Validation loss 3.2447972297668457
Epoch 2/20
----------
Train loss 3.5266423333774912
Validation loss 2.7534426053365073
Epoch 3/20
----------
Train loss 3.0623910535465586
Validation loss 2.474454085032145
Epoch 4/20
----------
Train loss 2.7003535249016504
Validation loss 2.1779896020889282
Epoch 5/20
----------
Train loss 2.408356612378901
Validation loss 1.9691200256347656
Epoch 6/20
----------
Train loss 2.1932009133425625
Validation loss 1.7815261681874592
Epoch 7/20
----------
Train loss 1.9958446513522754
Validation loss 1.6047706604003906
Epoch 8/20
----------
Train loss 1.8286893313581294
Validation loss 1.4497205813725789
Epoch 9/20
----------
Train loss 1.675434486432509
Validation loss 1.3368101914723713
Epoch 10/20
----------
Train loss 1.539536096832969
Validation loss 1.2284695307413738
Epoch 11/20
----------
Train loss 1.4320017261938616
Validation loss 1.1178052425384521
Epoch 12/20
----------
Train loss 1.3361592482

In [None]:

model_path = "/content/t5_fine_tuned"

# The training loop saves the best-validation-loss checkpoint to `model_path`
# whenever the validation loss improves. The in-memory `model` holds the
# weights from the last epoch that ran, which may be worse (e.g. after early
# stopping). Saving it to `model_path` would overwrite the best checkpoint, so
# the last-epoch weights go to their own directory instead.
last_model_path = model_path + "_last"
model.save_pretrained(last_model_path)
tokenizer.save_pretrained(last_model_path)

('/content/t5_fine_tuned/tokenizer_config.json',
 '/content/t5_fine_tuned/special_tokens_map.json',
 '/content/t5_fine_tuned/spiece.model',
 '/content/t5_fine_tuned/added_tokens.json')

In [None]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the model AND the tokenizer from the fine-tuned directory (both were
# saved there during training), so the pair always matches and this cell does
# not depend on `model_name` from an earlier cell.
fine_tuned_dir = '/content/t5_fine_tuned'
model = T5ForConditionalGeneration.from_pretrained(fine_tuned_dir).to(device)
tokenizer = T5Tokenizer.from_pretrained(fine_tuned_dir)

input_texts = ["order a coffee", "tell a joke", "jump around", "turn off the alarm", "Hey Monica, How are you?"]

# Same length the training targets were padded/truncated to. Without an
# explicit limit, generate() uses its short default maximum length, which cut
# the sample outputs off mid-sentence.
max_output_tokens = 128

for input_text in input_texts:
    encoded = tokenizer(input_text, return_tensors='pt').to(device)
    input_ids = encoded.input_ids
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=encoded.attention_mask,
        max_length=max_output_tokens,
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {input_text}\nOutput: {generated_text}\n")



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Input: order a coffee
Output: Chandler's eyes light up as he orders a cup of the day's special

Input: tell a joke
Output: Chandler chuckles at the news, a joke that's a joke about nature

Input: jump around
Output: "Chandler's face shows the strain of a swimming

Input: turn off the alarm
Output: "Chandler unwraps the alarm, prompting an alarm bell." Monica:

Input: Hey Monica, How are you?
Output: "Monica:"Oh, how are you?"

