In [2]:
import numpy as np
import pandas as pd

import os
import matplotlib.pyplot as plt

import re

from transformers import T5Tokenizer, T5ForConditionalGeneration




In [3]:
import torch
print(f'PyTorch version: {torch.__version__}')
print('*'*10)
print(f'_CUDA version: ')
!nvcc --version
print('*'*10)
print(f'CUDNN version: {torch.backends.cudnn.version()}')
print(f'Available GPU devices: {torch.cuda.device_count()}')
print(f'Device Name: {torch.cuda.get_device_name()}')

PyTorch version: 2.0.0+cu118
**********
_CUDA version: 
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Wed_Feb__8_05:53:42_Coordinated_Universal_Time_2023
Cuda compilation tools, release 12.1, V12.1.66
Build cuda_12.1.r12.1/compiler.32415258_0
**********
CUDNN version: 8700
Available GPU devices: 1
Device Name: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [4]:
import torch
device = torch.device('cuda:0') 
print(torch.cuda.is_available())

True


In [5]:
from pytorch_lightning.loggers import TensorBoardLogger
tb_logger = TensorBoardLogger('T5logs/', name = 'T5_local')

In [6]:
DIR = 'C:\\Users\\landon.morin\\OneDrive - The Trade Desk\\Documents\\JupyterAnalysis\\T5'
os.chdir(DIR)

In [7]:
df = pd.read_csv('final_quotes.csv')

In [8]:
# Test on quotes from dataset

In [9]:
from sklearn.model_selection import train_test_split
df = df[~df.quote.isna()]
df['inputs'] = df.apply(lambda x: "Write a quote about {} from the perspective of {}".format(x['tags'], x['auth']), axis=1)
df = df[df['quote'].apply(lambda x: len(x.split()) <= 50)]

df = df.sample(frac=0.5)
train_df, test_df = train_test_split(df, test_size = 0.2)
test_df, val_df = train_test_split(test_df, test_size=0.5)


In [9]:
df

Unnamed: 0,quote,auth,tags,inputs
1976000,"We must well grow the valuable seeds, which th...",Kim Jong-un,seeds,Write a quote about seeds from the perspective...
1602,It is an absolute human certainty that no one ...,"John Joseph Powell,",soulmates,Write a quote about soulmates from the perspec...
1826384,Puns are a form of humor with words.,Guillermo Cabrera Infante,puns,Write a quote about puns from the perspective ...
336591,"Every challenge, every adversity, contains wit...",Roy Bennett,opportunity quotes,Write a quote about opportunity quotes from th...
1057165,"There are about a dozen of these gardens, more...",Robert Fortune,extensive,Write a quote about extensive from the perspec...
...,...,...,...,...
2265158,One of the most effective tools that the Chene...,Viggo Mortensen,unpatriotic,Write a quote about unpatriotic from the persp...
2446729,The intellectual power is never at rest; it is...,Giordano Bruno,apprehension,Write a quote about apprehension from the pers...
1862668,I used to diet all the time as a kid. I starte...,Maye Musk,recommend,Write a quote about recommend from the perspec...
429686,You matter so much you're made of it.,Connor Chalfant,motivational,Write a quote about motivational from the pers...


In [13]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import datasets

tokenizer = T5Tokenizer.from_pretrained('t5-large')
input_length = 15
output_length = 50

class QuotesDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
        self.input_length = input_length
        self.output_lenght = output_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, index):
        inputs = self.dataframe.iloc[index]['inputs']
        output = self.dataframe.iloc[index]['quote']
        model_input = tokenizer(inputs, max_length=input_length, padding="max_length", truncation=True)
        quote = tokenizer(output, max_length=output_length, padding="max_length", truncation=True).input_ids

        labels_with_ignore_index = [label if label != 0 else -100 for label in quote]
        
        model_input["labels"] = np.array([labels_with_ignore_index])
        model_input["input_ids"] = np.array([model_input["input_ids"]])
        model_input['attention_mask'] = np.array([model_input["attention_mask"]])
        return model_input


train_ds = QuotesDataset(train_df)
test_ds = QuotesDataset(test_df)
val_ds = QuotesDataset(val_df)
dataset_dict = datasets.DatasetDict({'train': train_ds, 'test': test_ds, 'val': val_ds})

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [14]:
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate


train_dataloader = DataLoader(dataset_dict['train'], shuffle=True, batch_size=16)
test_dataloader = DataLoader(dataset_dict['test'], batch_size=8)
validation_dataloader = DataLoader(dataset_dict['val'], batch_size=8)


In [12]:
dataset_dict['train'][0]

{'input_ids': array([[8733,    3,    9, 5035,   81, 3251,   45,    8, 3503,   13,  472,
          53,    6,    1,    0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]]), 'labels': array([[ 6404,     8,  2249,    13,  1053,     6,   334,  3251,    19,
          165,   293, 19375,   535,   599, 12146, 13029,  2003,  2255,
         1836,  6952,  9428,    61,     1,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100]])}

In [15]:
from transformers import Adafactor, get_linear_schedule_with_warmup
import pytorch_lightning as pl

class quoteT5(pl.LightningModule):
    def __init__(self, lr=5e-3, num_train_epochs=3, warmup_steps=1000):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained("t5-small")
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):     
        outputs = self.model(input_ids=input_ids.squeeze(1), attention_mask=attention_mask.squeeze(1), labels=labels.squeeze(1).to(torch.int64))
        return outputs
    
    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss

        return loss
      
    def training_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        # logs metrics for each training_step,
        # and the average across the epoch
        self.log("training_loss", loss, on_epoch=True, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss, on_epoch=True, logger=True)

        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        return loss

    def configure_optimizers(self):
        # create optimizer
        optimizer = Adafactor(self.parameters(), relative_step=True, lr=None, warmup_init=True)
        # create learning rate scheduler
        
        return {"optimizer": optimizer}

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return validation_dataloader

    def test_dataloader(self):
        return test_dataloader

In [14]:
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning import Trainer

model = quoteT5()

early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)

trainer = Trainer(accelerator="gpu", devices=1, default_root_dir=DIR,  
                  callbacks=[early_stop_callback], logger=tb_logger, max_epochs=20)

trainer.fit(model)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3050 Ti Laptop GPU') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=20` reached.


In [15]:
torch.save(model, DIR + '/modelvf.pt')

In [16]:
trained = torch.load(DIR + '/modelvf.pt')
trained.to(device)

quoteT5(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_features=204

In [24]:
def generate_quote(request):

    inputs_encoding =  tokenizer(
        request,
        add_special_tokens=True,
        max_length= input_length,
        padding = 'max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
        )

    
    generate_ids = trained.model.generate(
        input_ids = inputs_encoding["input_ids"].to(device),
        attention_mask = inputs_encoding["attention_mask"].to(device),
        max_length = output_length,
        num_beams = 2,
        num_return_sequences = 1,
        no_repeat_ngram_size=3,
        early_stopping=True,
        )

    preds = [
        tokenizer.decode(gen_id,
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=True)
        for gen_id in generate_ids
    ]

    return "".join(preds)

In [30]:
generate_quote('write a quote about love from the perspective of Toni Morrison')

'Along with the idea of romantic love, she was introduced to another--physical beauty. Probably the most destructive ideas in the history of human thought. Both originated in envy, thrived in insecurity, and ended in disillusion.'

# Test



In [37]:
from tqdm import tqdm

test = test_df.sample(250)

In [88]:
def generate_quote(request, beam=4, ngram=3):

    inputs_encoding =  tokenizer(
        request,
        add_special_tokens=True,
        max_length= input_length,
        padding = 'max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt"
        )

    
    generate_ids = trained.model.generate(
        input_ids = inputs_encoding["input_ids"].to(device),
        attention_mask = inputs_encoding["attention_mask"].to(device),
        max_length = output_length,
        num_beams = beam,
        num_return_sequences = 1,
        no_repeat_ngram_size=ngram,
        early_stopping=True,
        )

    preds = [
        tokenizer.decode(gen_id,
        skip_special_tokens=True, 
        clean_up_tokenization_spaces=True)
        for gen_id in generate_ids
    ]


    max_length = 50
    stride = 512
    seq_len = inputs_encoding.input_ids.size(1)

    nlls = []
    prev_end_loc = 0
    for begin_loc in tqdm(range(0, seq_len, stride)):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = inputs_encoding.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = trained.model(input_ids, labels=target_ids)

            # loss is calculated using CrossEntropyLoss which averages over input tokens.
            # Multiply it with trg_len to get the summation instead of average.
            # We will take average over all the tokens to get the true average
            # in the last step of this example.
            neg_log_likelihood = outputs.loss * trg_len

        nlls.append(neg_log_likelihood)

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
     
    return ("".join(preds), ppl)

In [89]:
# trained = T5ForConditionalGeneration.from_pretrained('t5-large')
# trained.to(device)
import itertools
from nltk.translate.gleu_score import sentence_gleu
from nltk.translate.bleu_score import sentence_bleu

beam_widths = [2,3,4]
n_grams = [2,3,4]

results_no_tune = {}

for beam_width, n_gram in itertools.product(beam_widths, n_grams):
  results = []
  perplexities = []
  for idx, row in test.iterrows():
    result, perplexity = generate_quote(row['inputs'], beam=beam_width, ngram=n_gram)
    results.append(result)
    perplexities.append(float(perplexity))
      # Do something with result and perplexity
    if idx % 25 == 0:
      torch.cuda.empty_cache()
    generated_quotes = results
    test_set = test.quote
    reference_quotes = [quote.split() for quote in test_set]
    generated_quotes = [quote.split() for quote in generated_quotes]
    
    bleu = []
    gleu = []

    for i in range(len(generated_quotes)): 
      bleu.append(sentence_bleu([reference_quotes[i]], generated_quotes[i]))
      gleu.append(sentence_gleu([reference_quotes[i]], generated_quotes[i]))
    # store the results in the dictionary

    bleu = np.array(bleu)
    gleu = np.array(gleu)
    perplexity = np.array(perplexities)

    results_no_tune[str((beam_width, n_gram))] = {'GLEU': gleu.mean(), 'BLEU': bleu.mean(), 'PERPLEXITY': perplexity.mean()}
    
# analyze the results to determine the best combination of beam_width and n-gram
best_combination_gleu = max(results_no_tune, key=lambda x: results_no_tune[x]['GLEU'])
best_combination_bleu = max(results_no_tune, key=lambda x: results_no_tune[x]['BLEU'])
best_combination_perp = min(results_no_tune, key=lambda x: results_no_tune[x]['PERPLEXITY'])

  0%|          | 0/1 [00:00<?, ?it/s]
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 

In [90]:
results_no_tune

{'(2, 2)': {'GLEU': 0.08439026487425447,
  'BLEU': 0.05240492812975254,
  'PERPLEXITY': 4806.90025113678},
 '(2, 3)': {'GLEU': 0.0731742067528847,
  'BLEU': 0.041855033505065475,
  'PERPLEXITY': 11002.450739242553},
 '(2, 4)': {'GLEU': 0.08149212088051752,
  'BLEU': 0.05125370953836397,
  'PERPLEXITY': 11852.441572563172},
 '(3, 2)': {'GLEU': 0.07663820690054264,
  'BLEU': 0.04454969938001393,
  'PERPLEXITY': 12333.499516220092},
 '(3, 3)': {'GLEU': 0.08681621648699692,
  'BLEU': 0.05576562141546818,
  'PERPLEXITY': 9336.210585083008},
 '(3, 4)': {'GLEU': 0.08076999061874321,
  'BLEU': 0.05324421201795215,
  'PERPLEXITY': 3967.1731600875855},
 '(4, 2)': {'GLEU': 0.07991859088808258,
  'BLEU': 0.04935589625768718,
  'PERPLEXITY': 9401.864163276672},
 '(4, 3)': {'GLEU': 0.08302766605961837,
  'BLEU': 0.05153099092740363,
  'PERPLEXITY': 9319.938704208374},
 '(4, 4)': {'GLEU': 0.09321239557916301,
  'BLEU': 0.06509224866351843,
  'PERPLEXITY': 5011.496326423645}}

In [119]:
generate_quote(test.iloc[8].inputs, 2, 3)

  0%|          | 0/1 [00:00<?, ?it/s]


('She was the wand of a butterfly,and she was the queen.',
 tensor(46.7913, device='cuda:0'))

In [117]:
test.iloc[8]

quote     I love you because no two snowflakes are alike...
auth                                         Nikki Giovanni
tags                                                 poetry
inputs    Write a quote about poetry from the perspectiv...
Name: 185945, dtype: object

In [1]:
test.iloc[8]

NameError: name 'test' is not defined