In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Fetch the pretrained T5-small tokenizer first, then the matching
# conditional-generation model, from the same Hub checkpoint.
tokenizer, model = (
    T5Tokenizer.from_pretrained("google-t5/t5-small"),
    T5ForConditionalGeneration.from_pretrained("google-t5/t5-small"),
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
import pandas as pd

In [None]:
# Load the (input_text, target_text) pairs. Both text columns are forced to
# `str` and pandas' default NA inference is switched off: otherwise literal
# word tokens such as "NA", "null", "nan" or "None" are parsed as float NaN,
# which later breaks the string concatenation / tokenizer call in the dataset.
df = pd.read_csv(
    "/content/output.csv",
    dtype={"input_text": str, "target_text": str},
    keep_default_na=False,
)

In [None]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,input_text,target_text
0,nine,thirty
1,nine thirty,am
2,nine thirty am,ko
3,nine thirty am ko,Sunday
4,nine thirty am ko Sunday,Brunch


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the rows shown by df.head() look like successive prefixes of
# one utterance, so a row-level random split may place prefixes of the same
# utterance in both splits - confirm whether a group-wise split is wanted.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes (input_text, target_text) rows.

    Each item is a dict of three 1-D tensors:

    * ``input_ids`` / ``attention_mask`` - encoding of the prompt
      ``"predict the next word: " + input_text``.
    * ``labels`` - encoding of ``target_text`` in which every padding
      position is replaced by -100, the index that the cross-entropy loss of
      Hugging Face seq2seq models ignores. Without this masking the loss is
      averaged over the padding positions as well.

    Args:
        dataframe: pandas DataFrame with ``input_text`` and ``target_text``
            columns; rows are addressed positionally.
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors (called with ``return_tensors="pt"``).
        max_length: length every sequence is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=64):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx`` and return its tensors."""
        # One positional lookup instead of one per column.
        row = self.data.iloc[idx]
        input_text = row['input_text']
        target_text = row['target_text']

        input_encoding = self.tokenizer("predict the next word: " + input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length)
        target_encoding = self.tokenizer(target_text, return_tensors="pt", padding="max_length", truncation=True, max_length=self.max_length)

        labels = target_encoding.input_ids.flatten()
        # Mask padding so that only real target tokens contribute to the loss.
        labels[target_encoding.attention_mask.flatten() == 0] = -100

        return {
            'input_ids': input_encoding.input_ids.flatten(),
            'attention_mask': input_encoding.attention_mask.flatten(),
            'labels': labels
        }


In [None]:
# Wrap both splits in the tokenizing dataset (default max_length).
train_dataset, test_dataset = (
    CustomDataset(frame, tokenizer) for frame in (train_df, test_df)
)

# Training hyperparameters.
# NOTE(review): the recorded training output below shows 3 epochs, so this
# value appears to have been edited after the last run - confirm.
epochs, batch_size, learning_rate = 2, 4, 1e-4

In [None]:
# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Relocate the model's parameters onto the chosen device.
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
# AdamW over all model parameters; the StepLR schedule multiplies the learning
# rate by 0.95 each time scheduler.step() is called.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
)
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer,
    step_size=1,
    gamma=0.95,
)

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np

# Average training loss of every completed epoch.
train_losses = []

# A shuffling DataLoader draws a new permutation on every pass, so a single
# instance can serve all epochs.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Training loop
for epoch in range(epochs):
    # from_pretrained() returns the model in eval mode (dropout disabled) and
    # the evaluation cell calls model.eval() as well, so training mode has to
    # be switched on explicitly at the start of every epoch.
    model.train()
    total_loss = 0.0

    # Wrap the train_loader with tqdm for the progress bar
    with tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as t:
        for batch in t:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            batch_loss = loss.item()  # one host sync, reused below
            total_loss += batch_loss

            loss.backward()
            optimizer.step()

            # Update the progress bar with the current loss
            t.set_postfix(loss=batch_loss)

    # Back to inference mode so later cells (evaluation, generation) run with
    # dropout disabled, exactly as they did before this cell enabled it.
    model.eval()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)  # Store the average loss for this epoch
    print(f'Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}')
    scheduler.step()  # decay the learning rate once per epoch

    # Save the model after each epoch
    torch.save(model.state_dict(), f'model_epoch_{epoch+1}.pt')


Epoch 1/3: 100%|██████████| 11359/11359 [14:31<00:00, 13.04batch/s, loss=0.0406]


Epoch 1/3, Average Loss: 0.0850


Epoch 2/3: 100%|██████████| 11359/11359 [14:47<00:00, 12.80batch/s, loss=0.0515]


Epoch 2/3, Average Loss: 0.0640


Epoch 3/3: 100%|██████████| 11359/11359 [15:01<00:00, 12.60batch/s, loss=0.116]


Epoch 3/3, Average Loss: 0.0543


In [None]:
# Plot the training loss
# The x-axis is derived from the losses that were actually recorded, not from
# the `epochs` setting: if `epochs` is edited after training, the two lengths
# differ and plt.plot() fails with a shape mismatch.
epoch_ticks = np.arange(1, len(train_losses) + 1)

plt.figure(figsize=(10, 5))
plt.plot(epoch_ticks, train_losses, label='Training Loss')
plt.title('Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.xticks(epoch_ticks)
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import math

def calculate_perplexity(model, dataloader, device):
    """Compute token-level perplexity of ``model`` over ``dataloader``.

    Perplexity is ``exp(total negative log-likelihood / number of scored
    label tokens)``. Each batch loss is weighted by the number of label
    tokens it covers, so a smaller final batch or batches with different
    amounts of masked padding do not skew the result (a plain mean of batch
    means would).

    Assumes ``outputs.loss`` is the mean cross-entropy over label positions
    that are not -100, which is the Hugging Face seq2seq convention.

    Args:
        model: model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` returning an object with a scalar ``loss``. It is
            switched to eval mode and left there.
        dataloader: iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: device the batch tensors are moved to.

    Returns:
        The perplexity as a float, or ``nan`` when there are no label tokens
        to score (e.g. an empty dataloader).
    """
    ignore_index = -100
    total_nll = 0.0
    total_tokens = 0

    model.eval()  # Set the model to evaluation mode

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Number of label positions that contribute to this batch's loss.
            n_tokens = int((labels != ignore_index).sum().item())
            if n_tokens == 0:
                # Nothing to score; a mean over zero tokens would be NaN.
                continue

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            # Convert the per-token mean back into a summed NLL.
            total_nll += outputs.loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        return float("nan")

    return math.exp(total_nll / total_tokens)

# Build an unshuffled loader over the held-out split and report the model's
# perplexity on it.
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size)
eval_perplexity = calculate_perplexity(
    model=model,
    dataloader=test_loader,
    device=device,
)
print(f"Evaluation Perplexity: {eval_perplexity:.2f}")


In [None]:
# Manually test the model
def test_model(input_text, model, tokenizer, max_length=64):
    # Tokenize the input text
    input_text = "predict the next word: " + input_text
    input_encoding = tokenizer(input_text, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)

    # Pass the tokenized input through the model
    input_ids = input_encoding.input_ids.to(device)
    attention_mask = input_encoding.attention_mask.to(device)

    with torch.no_grad():
        output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length, num_beams=4, early_stopping=True)

    # Decode the model output
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    return decoded_output

# Smoke-test the fine-tuned model on one short prefix and show what it predicts.
input_text = "nine thirty"
generated_output = test_model(
    input_text=input_text,
    model=model,
    tokenizer=tokenizer,
)
print("Generated Output:", generated_output)


Generated Output: baje
