In [None]:
# Colab setup: install the third-party packages imported by the next cell.
# NOTE(review): none of these installs is version-pinned, so a fresh runtime may
# resolve different releases than the run recorded below (which installed
# transformers 4.28.0) — consider pinning for reproducibility.
!pip install transformers
!pip install SentencePiece
!pip install datasets
!pip install pytorch_lightning


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m102.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.0
Looking in indexes: https://pypi.org/simple, htt

In [None]:
import numpy as np
import pandas as pd

import os
import matplotlib.pyplot as plt

import re
import itertools

import torch


from nltk.translate.gleu_score import corpus_gleu, sentence_gleu
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu


from transformers import T5Tokenizer, TFT5Model, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate


from torch.utils.data import Dataset, DataLoader
import datasets

from transformers import Adafactor, get_linear_schedule_with_warmup
import pytorch_lightning as pl

from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning import Trainer

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and numerical warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

from google.colab import drive
# Mount Google Drive so the project folder under /content/drive is reachable.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing, Cleaning, and Tokenizing

In [None]:
# Project folder on the mounted Drive; also used as the Trainer root directory
# and as the prefix for saved/loaded model files.
DIR = '/content/drive/MyDrive/Colab Notebooks/w266/Final_Project'
# Switch the working directory so relative paths (e.g. 'final_quotes.csv') resolve inside DIR.
os.chdir(DIR)

In [None]:
# Load the raw quotes table (relative to DIR); the cells below read its
# 'quote', 'tags' and 'auth' columns.
df = pd.read_csv('final_quotes.csv')

In [None]:
# Test on quotes from dataset

In [None]:
# Fixed seed so the 50% subsample and the train/test/val split are reproducible
# across kernel restarts (the evaluation cells below depend on `test_df`).
SEED = 42

# Drop rows without a quote, then keep quotes of at most 50 whitespace-separated words.
df = df[~df.quote.isna()]
# .copy() so the column assignment below writes to an independent frame rather
# than to a slice of the raw table.
df = df[df['quote'].str.split().apply(len) <= 50].copy()

# Build the text prompt fed to the model from each row's tags and author.
df['inputs'] = df.apply(lambda x: "Write a quote about {} from the perspective of {}".format(x['tags'], x['auth']), axis=1)

# Work on a random half of the data.
# NOTE: this cell reassigns `df`, so re-running it without reloading the CSV
# subsamples the already-subsampled frame again.
df = df.sample(frac=0.5, random_state=SEED)

# 80% train, then split the remaining 20% evenly into test and validation.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=SEED)


In [None]:
# Show the first test-split quote attributed to Toni Morrison.
# NOTE(review): .iloc[0] raises IndexError if the (random) test split contains no such row.
test_df[test_df['auth'] == 'Toni Morrison'].iloc[0]['quote']

"I think some aspects of writing can be taught. Obviously, you can't teach vision or talent. But you can help with comfort."

In [None]:


# Shared tokenizer used by the dataset class and by generate_quote.
# NOTE(review): the tokenizer comes from 't5-large' while quoteT5 loads "t5-base";
# presumably both checkpoints share one vocabulary — confirm.
tokenizer = T5Tokenizer.from_pretrained('t5-large')
# Maximum number of tokens kept for the prompt (longer prompts are truncated).
# NOTE(review): 15 tokens may cut off the author name at the end of
# "Write a quote about ... from the perspective of ..." — verify on real prompts.
input_length = 15
# Maximum number of tokens for target quotes and for generated output.
output_length = 50

class QuotesDataset(Dataset):
    """Map-style dataset that turns rows of a quotes DataFrame into T5 training examples.

    Each item is the tokenizer output for the row's ``inputs`` prompt, extended
    with a ``labels`` entry built from the row's ``quote``. Padding positions in
    the labels are replaced by -100 so the loss ignores them. Every array has a
    leading axis of size 1 (shape ``(1, length)``); ``quoteT5.forward`` removes
    it again with ``squeeze(1)``.

    Args:
        dataframe: DataFrame with ``inputs`` and ``quote`` columns.
        max_input_length: Token budget for the prompt. Defaults to the
            module-level ``input_length``.
        max_output_length: Token budget for the quote. Defaults to the
            module-level ``output_length``.
        text_tokenizer: Tokenizer to use. Defaults to the module-level
            ``tokenizer``.
    """

    def __init__(self, dataframe, max_input_length=None, max_output_length=None, text_tokenizer=None):
        self.dataframe = dataframe
        # Fall back to the notebook-level configuration when nothing is passed,
        # so QuotesDataset(df) behaves exactly as before.
        self.tokenizer = tokenizer if text_tokenizer is None else text_tokenizer
        self.input_length = input_length if max_input_length is None else max_input_length
        self.output_length = output_length if max_output_length is None else max_output_length
        # Backward-compatible alias for the original misspelled attribute name.
        self.output_lenght = self.output_length

    def __len__(self):
        """Return the number of rows (examples)."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the prompt and quote of row ``index`` and return the model inputs."""
        row = self.dataframe.iloc[index]
        model_input = self.tokenizer(
            row['inputs'], max_length=self.input_length, padding="max_length", truncation=True
        )
        quote_ids = self.tokenizer(
            row['quote'], max_length=self.output_length, padding="max_length", truncation=True
        ).input_ids

        # Mask padding with -100 (the ignore index of the cross-entropy loss).
        # Use the tokenizer's own pad id instead of assuming it is 0.
        pad_id = self.tokenizer.pad_token_id
        labels_with_ignore_index = [tok if tok != pad_id else -100 for tok in quote_ids]

        # Wrap each sequence in an extra leading axis -> shape (1, length).
        model_input["labels"] = np.array([labels_with_ignore_index])
        model_input["input_ids"] = np.array([model_input["input_ids"]])
        model_input['attention_mask'] = np.array([model_input["attention_mask"]])
        return model_input


# One QuotesDataset per split.
train_ds = QuotesDataset(train_df)
test_ds = QuotesDataset(test_df)
val_ds = QuotesDataset(val_df)
# Group the splits under the keys 'train' / 'test' / 'val' (read by the DataLoader cell).
# NOTE(review): the values are torch Datasets rather than `datasets.Dataset` objects, and the
# container is only indexed by key below — a plain dict would presumably suffice; confirm.
dataset_dict = datasets.DatasetDict({'train': train_ds, 'test': test_ds, 'val': val_ds})

In [None]:


# Batched loaders for the three splits. Only the training loader shuffles;
# the evaluation loaders share the same smaller batch size and worker count.
_eval_loader_kwargs = dict(batch_size=128, num_workers=2)

train_dataloader = DataLoader(
    dataset_dict['train'],
    batch_size=256,
    shuffle=True,
    num_workers=2,
)
test_dataloader = DataLoader(dataset_dict['test'], **_eval_loader_kwargs)
validation_dataloader = DataLoader(dataset_dict['val'], **_eval_loader_kwargs)


# The Model

In [None]:
class quoteT5(pl.LightningModule):
    """LightningModule that fine-tunes a T5 checkpoint to generate quotes.

    Args:
        lr: Stored as a hyperparameter only. The optimizer below uses
            Adafactor's relative-step schedule and does not read it.
        num_train_epochs: Stored as a hyperparameter only (the epoch budget is
            set on the Trainer).
        warmup_steps: Stored as a hyperparameter only.
        model_name: Hugging Face checkpoint to fine-tune. Defaults to
            "t5-base", the checkpoint that was previously hard-coded.

    Attributes:
        train_losses: Loss of every training batch, in order.
        val_losses: Loss of every validation batch, in order (sanity-check
            batches excluded).
    """

    def __init__(self, lr=5e-5, num_train_epochs=3, warmup_steps=1000, model_name="t5-base"):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.save_hyperparameters()

        self.train_losses = []
        self.val_losses = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped T5 model.

        The dataset emits tensors with an extra axis of size 1 at position 1;
        it is squeezed away here. ``labels`` is optional: when omitted the model
        is called without labels (previously this raised AttributeError on
        ``None.squeeze``).
        """
        if labels is not None:
            labels = labels.squeeze(1)
        outputs = self.model(
            input_ids=input_ids.squeeze(1),
            attention_mask=attention_mask.squeeze(1),
            labels=labels,
        )
        return outputs

    def common_step(self, batch, batch_idx):
        """Compute the loss for one batch (shared by train/val/test steps)."""
        outputs = self(**batch)
        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        """Log and record the training loss of one batch."""
        loss = self.common_step(batch, batch_idx)
        self.log("training_loss", loss)
        self.train_losses.append(loss.item())

        return loss

    def validation_step(self, batch, batch_idx):
        """Log and record the validation loss of one batch."""
        loss = self.common_step(batch, batch_idx)
        # "validation_loss" is the metric monitored by the EarlyStopping callback.
        self.log("validation_loss", loss, on_epoch=True)
        # Skip the Trainer's pre-training sanity-check batches so the recorded
        # curve only contains real validation passes.
        if not self.trainer.sanity_checking:
            self.val_losses.append(loss.item())

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss of one batch."""
        loss = self.common_step(batch, batch_idx)
        # Log it so that trainer.test() actually reports a metric.
        self.log("test_loss", loss)

        return loss

    def configure_optimizers(self):
        """Create the optimizer.

        Adafactor runs in relative-step mode with warm-up initialisation, so it
        derives its own learning rate (``lr=None``); no separate scheduler is
        configured.
        """
        optimizer = Adafactor(self.parameters(), relative_step=True, warmup_init=True, lr=None)
        return {"optimizer": optimizer}

    def train_dataloader(self):
        """Return the notebook-level training DataLoader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation DataLoader."""
        return validation_dataloader

    def test_dataloader(self):
        """Return the notebook-level test DataLoader."""
        return test_dataloader

In [None]:

# Fresh model with the default hyperparameters.
model = quoteT5()

# Stop once 'validation_loss' has not improved (decreased) for 3 validation runs.
early_stop_callback = EarlyStopping(
    monitor='validation_loss', mode='min', patience=3, strict=False, verbose=False
)

# Train on the GPU for at most 15 epochs; Lightning artefacts go under DIR.
trainer = Trainer(
    accelerator="gpu",
    default_root_dir=DIR,
    callbacks=[early_stop_callback],
    max_epochs=15,
)

# The dataloaders are provided by the LightningModule itself.
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

In [None]:

# Plot the recorded loss curves.
# Both lists hold one value per *batch* (not per epoch) and have different
# lengths, so each gets its own panel with an accurately labelled x-axis
# instead of sharing a single axis labelled "Epoch".
fig, (ax_train, ax_val) = plt.subplots(1, 2, figsize=(12, 4))

ax_train.plot(model.train_losses, label="Training loss")
ax_train.set(title="Training loss", xlabel="Training batch", ylabel="Loss")
ax_train.legend()

ax_val.plot(model.val_losses, label="Validation loss")
ax_val.set(title="Validation loss", xlabel="Validation batch", ylabel="Loss")
ax_val.legend()

fig.tight_layout()
plt.show()

In [None]:
# Make sure the target folder exists; torch.save does not create directories.
os.makedirs(os.path.join(DIR, 'model'), exist_ok=True)
# Pickle the whole LightningModule (architecture + weights). Loading it back
# requires the quoteT5 class to be defined in the loading session.
torch.save(model, DIR + '/model/model4-base-2.pt')

In [None]:
# `device` is used by the generation cells below; define it here so the notebook
# works from a fresh kernel and on CPU-only runtimes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load a fully pickled quoteT5 module. map_location lets a GPU-saved file load
# on whatever device is available.
# NOTE: torch.load unpickles arbitrary objects — only load files you created yourself.
# NOTE(review): this loads 'model4-large5.pt', not the 'model4-base-2.pt' file
# written by the save cell — confirm this is the intended checkpoint.
trained = torch.load(DIR + '/model/model4-large5.pt', map_location=device)

# Move to the device and switch to eval mode so dropout is disabled during
# generation and scoring. Assigning the result also avoids printing the
# multi-thousand-line module repr as cell output.
trained = trained.to(device).eval()

quoteT5(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 1024)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 1024)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=1024, out_features=1024, bias=False)
                (k): Linear(in_features=1024, out_features=1024, bias=False)
                (v): Linear(in_features=1024, out_features=1024, bias=False)
                (o): Linear(in_features=1024, out_features=1024, bias=False)
                (relative_attention_bias): Embedding(32, 16)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=1024, out_features=4096, bias=False)
                (wo): Linear(in_

In [None]:
# def generate_quote(request, beam=4, ngram=3):

#     inputs_encoding =  tokenizer(
#         request,
#         add_special_tokens=True,
#         max_length= input_length,
#         padding = 'max_length',
#         truncation='only_first',
#         return_attention_mask=True,
#         return_tensors="pt"
#         )

    
#     generate_ids = trained.model.generate(
#         input_ids = inputs_encoding["input_ids"].to(device),
#         attention_mask = inputs_encoding["attention_mask"].to(device),
#         max_length = output_length,
#         num_beams = beam,
#         num_return_sequences = 1,
#         no_repeat_ngram_size=ngram,
#         early_stopping=True,
#         )

#     preds = [
#         tokenizer.decode(gen_id,
#         skip_special_tokens=True, 
#         clean_up_tokenization_spaces=True)
#         for gen_id in generate_ids
#     ]

#     return "".join(preds) 

In [None]:

# def perplexity(sequence, target_sequence, beam=4, ngram=3):

#     # Tokenize the sequence
#     inputs = tokenizer(
#             sequence,
#             add_special_tokens=True,
#             max_length= input_length,
#             padding = 'max_length',
#             truncation='only_first',
#             return_attention_mask=True,
#             return_tensors="pt"
#             )

#     targets = tokenizer(
#         target_sequence,
#         add_special_tokens=True,
#         max_length=output_length,
#         padding='max_length',
#         truncation='only_first',
#         return_attention_mask=True,
#         return_tensors="pt"
#     )

#     # Generate output sequence using T5 model
#     output = trained.model.generate(
#             input_ids = inputs["input_ids"].to(device),
#             attention_mask = inputs["attention_mask"].to(device),
#             max_length = output_length,
#             num_beams = beam,
#             num_return_sequences = 1,
#             no_repeat_ngram_size=ngram,
#             early_stopping=True,
#             return_dict_in_generate=True,
#             output_scores=True
#             )

#     output_tokens = tokenizer.convert_ids_to_tokens(output.sequences[0].tolist())

#     # Calculate perplexity
#     input_ids = inputs['input_ids'].squeeze()
#     output_ids = torch.tensor(tokenizer.encode(output_tokens)).unsqueeze(0)
#     logits = trained.model(input_ids=inputs["input_ids"].to(device), 
#                           attention_mask=inputs["attention_mask"].to(device),
#                           decoder_input_ids=targets["input_ids"].to(device),
#                           decoder_attention_mask=targets["attention_mask"].to(device),
#                           labels=targets["input_ids"].to(device),
#                           output_hidden_states=True).logits
#     logits = logits[:, :-1, :].contiguous()
#     labels = targets["input_ids"][:, 1:].contiguous().to(device)
#     loss_fct = torch.nn.CrossEntropyLoss()
#     loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))

#     perplexity = torch.exp(loss).item()

    # return perplexity

In [None]:

# def perplexity(sequence, beam=4, ngram=3):

#     # Tokenize the sequence
#     inputs = tokenizer(
#             sequence,
#             add_special_tokens=True,
#             max_length= input_length,
#             padding = 'max_length',
#             truncation='only_first',
#             return_attention_mask=True,
#             return_tensors="pt"
#             )


#     output_ids = trained.model.generate(inputs['input_ids'].to(device),
#                                         attention_mask = inputs_encoding["attention_mask"].to(device),
#                                         max_length = output_length,
#                                         num_beams = beam,
#                                         num_return_sequences = 1,
#                                         no_repeat_ngram_size=ngram,
#                                         early_stopping=True,
#                                         )

#     # Decode the output sequence using the tokenizer
#     output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

#     # Calculate the perplexity of the decoder only
#     decoder_input_ids = output_ids[:, :-1]
#     decoder_target_ids = output_ids[:, 1:]
#     decoder_logits = trained.model(input_ids = decoder_input_ids, decoder_input_ids = decoder_input_ids).logits
#     decoder_probs = torch.nn.functional.softmax(decoder_logits, dim=-1)
#     perplexity = torch.exp(torch.nn.functional.cross_entropy(decoder_logits.view(-1, decoder_logits.shape[-1]), decoder_target_ids.view(-1), reduction='mean'))



#     return perplexity

# Grid Search For Hyperparameter Tuning of Large Model


In [None]:
# Evaluation subset: up to 250 random test rows. The size is capped at the
# number of available rows (sample() raises if asked for more), and the draw is
# seeded so the subset is the same on every run.
test = test_df.sample(n=min(250, len(test_df)), random_state=42)

In [None]:

# beam_widths = [2]
# n_grams = [2]

# results = {}

# for beam_width, n_gram in itertools.product(beam_widths, n_grams):
#     test['generated_quote'] = test['inputs'].apply(lambda x: generate_quote(x, beam_width, n_gram))
#     test['perplexity'] = test['inputs'].apply(lambda x: perplexity(x, beam_width, n_gram))
#     # calculate BLEU and ROUGE scores for the generated quotes
#     generated_quotes = test.generated_quote
#     test_set = test.quote
#     reference_quotes = [quote.split() for quote in test_set]
#     generated_quotes = [quote.split() for quote in generated_quotes]
    
#     bleu = []
#     gleu = []

#     for i in range(len(generated_quotes)): 
#       bleu.append(sentence_bleu([reference_quotes[i]], generated_quotes[i]))
#       gleu.append(sentence_gleu([reference_quotes[i]], generated_quotes[i]))
#     # store the results in the dictionary

#     bleu = np.array(bleu)
#     gleu = np.array(gleu)

#     results[str((beam_width, n_gram))] = {'GLEU': gleu.mean(), 'BLEU': bleu.mean(), 'PERPLEXITY': test['perplexity'].mean()}
    
# # analyze the results to determine the best combination of beam_width and n-gram
# best_combination_gleu = max(results, key=lambda x: results[x]['GLEU'])
# best_combination_bleu = max(results, key=lambda x: results[x]['BLEU'])
# best_combination_perp = min(results, key=lambda x: results[x]['PERPLEXITY'])


# Grid Search For Hyperparameter Tuning of Small Model, More Examples

In [None]:
from tqdm import tqdm

def generate_quote(request, beam=4, ngram=3):
    """Generate a quote for ``request`` and score it with the model's own perplexity.

    Uses the notebook-level ``tokenizer``, ``trained`` model, ``device``,
    ``input_length`` and ``output_length``.

    Args:
        request: Prompt text, e.g. "Write a quote about X from the perspective of Y".
        beam: Number of beams for beam search.
        ngram: Size of n-grams that may not repeat in the output.

    Returns:
        A tuple ``(text, ppl)``: ``text`` is the decoded generation and ``ppl``
        is a 0-dim tensor holding the perplexity of the generated tokens given
        the prompt (``exp`` of the mean token-level cross-entropy). ``ppl`` is
        NaN if the generation contains no scoreable token.
    """
    inputs_encoding = tokenizer(
        request,
        add_special_tokens=True,
        max_length=input_length,
        padding='max_length',
        truncation='only_first',
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = inputs_encoding["input_ids"].to(device)
    attention_mask = inputs_encoding["attention_mask"].to(device)

    with torch.no_grad():
        generate_ids = trained.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=output_length,
            num_beams=beam,
            num_return_sequences=1,
            no_repeat_ngram_size=ngram,
            early_stopping=True,
        )

        # Score the *generated* sequence conditioned on the prompt. The previous
        # implementation fed the padded prompt as both encoder input and labels
        # (a causal-LM recipe), so its value did not depend on the generated
        # text or on beam / ngram at all.
        # generate() output starts with the decoder start token; drop it to get
        # the labels, and mask padding so it does not count towards the loss.
        labels = generate_ids[:, 1:].clone()
        labels[labels == tokenizer.pad_token_id] = -100

        if (labels != -100).any():
            outputs = trained.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # outputs.loss is the mean negative log-likelihood per scored token.
            ppl = torch.exp(outputs.loss)
        else:
            # Nothing to score (empty generation).
            ppl = torch.tensor(float("nan"), device=device)

    preds = [
        tokenizer.decode(
            gen_id,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        for gen_id in generate_ids
    ]

    return ("".join(preds), ppl)

In [None]:
# trained = T5ForConditionalGeneration.from_pretrained('t5-large')
# trained.to(device)

# Decoding hyperparameters to sweep: beam width x no-repeat n-gram size.
beam_widths = [2, 3, 4]
n_grams = [2, 3, 4]

# Maps str((beam_width, n_gram)) -> mean GLEU / BLEU / perplexity over test_df.
# Keys are strings so the dict can be dumped to JSON.
results_no_tune = {}

# The reference quotes do not depend on the decoding parameters: tokenize once.
test_set = test_df.quote
reference_quotes = [quote.split() for quote in test_set]

for beam_width, n_gram in itertools.product(beam_widths, n_grams):
    results = []
    perplexities = []

    # Use the row *position* for the periodic cache flush; the DataFrame index
    # labels are arbitrary after sampling/splitting.
    for position, (_, row) in enumerate(test_df.iterrows()):
        result, perplexity = generate_quote(row['inputs'], beam=beam_width, ngram=n_gram)
        results.append(result)
        perplexities.append(float(perplexity))
        if position % 25 == 0:
            torch.cuda.empty_cache()

    # Score once per parameter combination, after all rows are generated.
    # (Previously this ran inside the row loop, re-scoring every earlier row
    # after each new generation.)
    generated_quotes = [quote.split() for quote in results]

    bleu = np.array([
        sentence_bleu([reference], hypothesis)
        for reference, hypothesis in zip(reference_quotes, generated_quotes)
    ])
    gleu = np.array([
        sentence_gleu([reference], hypothesis)
        for reference, hypothesis in zip(reference_quotes, generated_quotes)
    ])
    perplexity = np.array(perplexities)

    # Store the mean metrics for this combination.
    results_no_tune[str((beam_width, n_gram))] = {
        'GLEU': float(gleu.mean()),
        'BLEU': float(bleu.mean()),
        'PERPLEXITY': float(perplexity.mean()),
    }

# Pick the best combination per metric: higher is better for GLEU/BLEU,
# lower is better for perplexity.
best_combination_gleu = max(results_no_tune, key=lambda x: results_no_tune[x]['GLEU'])
best_combination_bleu = max(results_no_tune, key=lambda x: results_no_tune[x]['BLEU'])
best_combination_perp = min(results_no_tune, key=lambda x: results_no_tune[x]['PERPLEXITY'])

  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|       

In [None]:
import json

# Persist the grid-search metrics as JSON in the working directory.
with open('resultsT5large_newperp_full_test.json', 'w') as results_file:
    serialized = json.dumps(results_no_tune)
    results_file.write(serialized)

In [None]:
results_no_tune

In [None]:
import os
# Sends signal 9 (SIGKILL) to the current kernel process. All in-memory state
# (models, DataFrames, results) is lost, and any cell after this one needs the
# notebook to be re-run from the top.
os.kill(os.getpid(), 9)


In [None]:
# Scratch check: exponentiate a single value (presumably a mean loss, to read it as a perplexity — confirm).
torch.exp(torch.tensor(4.437067))

In [None]:
# Spot-check one prompt with beam=2 and ngram=4; displays the (text, perplexity) tuple returned by generate_quote.
generate_quote('Write a quote about olly from the perspective of Navid Negahban', 2, 4)

In [None]:
test

In [None]:
# NOTE(review): no visible cell adds a 'results' column to `test`, so this lookup
# presumably relies on state from a deleted or commented-out cell — confirm before keeping.
test.iloc[249]['results']