In [None]:
#@title
!pip install transformers
!pip install SentencePiece
!pip install datasets
!pip install pytorch_lightning


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1
Looking in indexes: https://pypi.org/simple, http

In [5]:
#@title
import numpy as np
import pandas as pd

import os
import matplotlib.pyplot as plt

import re
import itertools

import torch


from nltk.translate.gleu_score import corpus_gleu, sentence_gleu
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu


from transformers import T5Tokenizer, TFT5Model, T5ForConditionalGeneration
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate


from torch.utils.data import Dataset, DataLoader
import datasets

from transformers import Adafactor, get_linear_schedule_with_warmup
import pytorch_lightning as pl

from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning import Trainer

import warnings
# Blanket filter: hides every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Colab-only: mount Google Drive under /content/drive (the project folder lives there).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#@title
# Prefer the first CUDA GPU, but fall back to the CPU so this cell (and any code
# that later moves tensors to `device`) still works on a runtime without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

NameError: ignored

# Importing, Cleaning, and Tokenizing

In [6]:
#@title
# Project folder on the mounted Drive. It is also handed to the Trainer as its
# root directory and used as the base path when the model is saved.
DIR = '/content/drive/MyDrive/Colab Notebooks/w266/Final_Project'
# Make it the working directory so relative paths (such as the quotes CSV)
# resolve against the project folder.
os.chdir(DIR)

In [16]:
#@title
# Load the raw quotes table; the relative path is resolved against the current
# working directory. Later cells read its 'quote', 'tags' and 'auth' columns.
df = pd.read_csv('final_quotes.csv')

In [17]:
#@title
# Test on quotes from dataset

In [18]:
#@title
# Fixed seed so the 50% subsample and the train/test/validation split come out
# the same on every Restart & Run All (both were previously drawn unseeded).
SEED = 42

# NOTE: this cell rebinds `df`; re-running it without reloading the CSV would
# subsample the already-subsampled frame again.

# Keep rows that have a quote of at most 50 whitespace-separated words.
# `.str.len()` yields NaN for non-string cells (which the comparison then drops)
# instead of raising, and `.copy()` gives an independent frame so the column
# assignment below does not write into a slice of the loaded data.
df = df[~df.quote.isna()]
df = df[df['quote'].str.split().str.len() <= 50].copy()

# Build the text prompt for every row. Vectorised concatenation produces the same
# strings as a row-wise `str.format` call without a Python-level loop over rows.
df['inputs'] = (
    "Write a quote about " + df['tags'].astype(str)
    + " from the perspective of " + df['auth'].astype(str)
)

# Work on a reproducible half of the filtered data, split 80% / 10% / 10%.
df = df.sample(frac=0.5, random_state=SEED)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=SEED)


In [19]:
#@title
# Number of distinct quote strings in the training split
# (a missing value, if present, counts as one distinct value).
train_df['quote'].nunique(dropna=False)

413784

In [None]:
#@title
# Show the first held-out quote attributed to Toni Morrison.
test_df.loc[test_df['auth'] == 'Toni Morrison', 'quote'].iloc[0]

'I wrote my first novel because I wanted to read it.'

In [None]:
#@title


# T5 SentencePiece tokenizer. It is loaded from the same checkpoint name that the
# model cell uses ("t5-small") so tokenizer and model refer to one checkpoint.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Token budgets applied when encoding the prompt and the target quote.
# NOTE(review): the prompt template itself ("Write a quote about ... from the
# perspective of ...") already uses up a good share of 15 tokens, so long tags or
# author names are presumably truncated away -- confirm this limit is intended.
input_length = 15
output_length = 50

class QuotesDataset(Dataset):
    """Map-style dataset that turns one dataframe row into a tokenized T5 example.

    Every item is the tokenizer output for the row's ``inputs`` prompt, extended
    with a ``labels`` entry built from the row's ``quote``. ``input_ids``,
    ``attention_mask`` and ``labels`` are NumPy arrays with a leading axis of
    length 1; ``quoteT5.forward`` removes that axis with ``squeeze(1)``.
    """

    def __init__(self, dataframe):
        """Keep the source frame and the token budgets in effect at creation time.

        Args:
            dataframe: frame providing the ``inputs`` and ``quote`` columns.
        """
        self.dataframe = dataframe
        self.input_length = input_length
        self.output_length = output_length
        # Alias for the original (misspelled) attribute name, kept so that any
        # existing code reading it continues to work.
        self.output_lenght = self.output_length

    def __len__(self):
        """Return the number of rows, i.e. the number of examples."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize the prompt and target quote of the row at position ``index``.

        Returns:
            The tokenizer output for the prompt with ``input_ids``,
            ``attention_mask`` and ``labels`` replaced by / set to arrays of
            shape ``(1, length)``.
        """
        # Single positional lookup; both fields come from the same row.
        row = self.dataframe.iloc[index]

        # Use the lengths stored on the instance so a later rebinding of the
        # notebook-level constants cannot silently change an existing dataset.
        model_input = tokenizer(row['inputs'], max_length=self.input_length, padding="max_length", truncation=True)
        target_ids = tokenizer(row['quote'], max_length=self.output_length, padding="max_length", truncation=True).input_ids

        # Replace padding with -100 (the ignore index) so padded positions do not
        # count towards the loss. The pad id comes from the tokenizer rather than
        # being hard-coded.
        pad_id = tokenizer.pad_token_id
        labels_with_ignore_index = [token if token != pad_id else -100 for token in target_ids]

        model_input["labels"] = np.array([labels_with_ignore_index])
        model_input["input_ids"] = np.array([model_input["input_ids"]])
        model_input['attention_mask'] = np.array([model_input["attention_mask"]])
        return model_input


# Wrap each split in a QuotesDataset (train, then test, then validation) and
# collect the three datasets under the keys the dataloader cell looks up.
train_ds, test_ds, val_ds = (
    QuotesDataset(split_frame) for split_frame in (train_df, test_df, val_df)
)
dataset_dict = datasets.DatasetDict(
    {
        'train': train_ds,
        'test': test_ds,
        'val': val_ds,
    }
)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [None]:
#@title


# Training batches are reshuffled and larger (256); the evaluation loaders keep
# the dataset order and use batches of 128. All three use two worker processes.
train_dataloader = DataLoader(
    dataset_dict['train'],
    batch_size=256,
    shuffle=True,
    num_workers=2,
)
test_dataloader = DataLoader(
    dataset_dict['test'],
    batch_size=128,
    num_workers=2,
)
validation_dataloader = DataLoader(
    dataset_dict['val'],
    batch_size=128,
    num_workers=2,
)


# The Model

In [None]:
#@title
class quoteT5(pl.LightningModule):
    """Lightning wrapper that fine-tunes a pretrained T5 checkpoint on prompt -> quote pairs.

    The module takes its dataloaders from the notebook-level ``train_dataloader``,
    ``validation_dataloader`` and ``test_dataloader`` objects, logs its losses
    through Lightning, and additionally keeps every per-batch loss as a float in
    ``train_losses`` / ``val_losses`` for plotting.
    """

    def __init__(self, lr=5e-5, num_train_epochs=3, warmup_steps=1000, model_name="t5-small"):
        """Load the pretrained model and record the constructor arguments.

        Args:
            lr: recorded as a hyperparameter only; ``configure_optimizers`` does
                not read it (Adafactor is created with ``lr=None``).
            num_train_epochs: recorded as a hyperparameter only.
            warmup_steps: recorded as a hyperparameter only.
            model_name: checkpoint name passed to
                ``T5ForConditionalGeneration.from_pretrained``.
        """
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.save_hyperparameters()

        # One float per processed batch (not per epoch).
        self.train_losses = []
        self.val_losses = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped T5 model on one batch.

        Batches built from ``QuotesDataset`` items carry an extra axis of
        length 1 after the batch axis; it is removed here before the model call.

        Args:
            input_ids: prompt token ids.
            attention_mask: mask matching ``input_ids``.
            labels: target token ids, or ``None``.
                NOTE(review): without labels the wrapped model presumably needs
                decoder inputs supplied another way -- confirm before relying on
                a label-free call.

        Returns:
            The output object of the wrapped model.
        """
        # `labels` defaults to None, so only squeeze it when it was provided.
        if labels is not None:
            labels = labels.squeeze(1)
        outputs = self.model(
            input_ids=input_ids.squeeze(1),
            attention_mask=attention_mask.squeeze(1),
            labels=labels,
        )
        return outputs

    def common_step(self, batch, batch_idx):
        """Forward one batch (a mapping of keyword arguments for ``forward``) and return its loss."""
        outputs = self(**batch)
        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        """Compute, log and record the loss of one training batch."""
        loss = self.common_step(batch, batch_idx)
        # Logged with Lightning's defaults for training_step, i.e. once per step
        # (no epoch-level average is requested here).
        self.log("training_loss", loss)
        self.train_losses.append(loss.item())

        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log and record the loss of one validation batch."""
        loss = self.common_step(batch, batch_idx)
        # Epoch-level aggregate; this is the metric EarlyStopping monitors.
        self.log("validation_loss", loss, on_epoch=True)
        self.val_losses.append(loss.item())

        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log the loss of one test batch."""
        loss = self.common_step(batch, batch_idx)
        # Log the loss so that a test run actually reports a metric.
        self.log("test_loss", loss)

        return loss

    def configure_optimizers(self):
        """Create the Adafactor optimizer.

        ``relative_step=True`` with ``lr=None`` lets Adafactor derive its own
        step size, so no separate learning-rate scheduler is configured.
        """
        optimizer = Adafactor(self.parameters(), relative_step=True, warmup_init=True, lr=None)
        return {"optimizer": optimizer}

    def train_dataloader(self):
        """Return the notebook-level training dataloader."""
        return train_dataloader

    def val_dataloader(self):
        """Return the notebook-level validation dataloader."""
        return validation_dataloader

    def test_dataloader(self):
        """Return the notebook-level test dataloader."""
        return test_dataloader

In [None]:
#@title

# Fresh module built with its default constructor arguments.
model = quoteT5()

# Stop training once `validation_loss` has failed to improve (decrease) for
# 3 validation checks in a row.
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    mode='min',
    patience=3,
    strict=False,
    verbose=False,
)

# GPU training for at most 20 epochs; Lightning writes its artefacts below DIR.
trainer = Trainer(
    max_epochs=20,
    accelerator="gpu",
    default_root_dir=DIR,
    callbacks=[early_stop_callback],
)

# No dataloaders are passed: the module provides them through its own hooks.
trainer.fit(model)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

quoteT5(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 512)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 512)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=512, out_features=512, bias=False)
                (k): Linear(in_features=512, out_features=512, bias=False)
                (v): Linear(in_features=512, out_features=512, bias=False)
                (o): Linear(in_features=512, out_features=512, bias=False)
                (relative_attention_bias): Embedding(32, 8)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseActDense(
                (wi): Linear(in_features=512, out_features=2048, bias=False)
                (wo): Linear(in_features=204

In [None]:
#@title

# Plot the training and validation losses recorded by the fitted module.
# The original cell read them from `trained_part`, a name that is never defined
# in this notebook; `model` is the module that `trainer.fit` trained.
# Both lists receive one entry per batch (appended in training_step and
# validation_step), so the x axis is a batch index rather than an epoch, and the
# two curves contain a different number of points.
fig, ax = plt.subplots()
ax.plot(model.train_losses, label="Training loss")
ax.plot(model.val_losses, label="Validation loss")
ax.set_xlabel("Batch index")
ax.set_ylabel("Loss")
ax.set_title("Per-batch training and validation loss")
ax.legend()
plt.show()

In [None]:
#@title
# Persist the fitted module. The original cell saved `trained_part`, a name that
# is never defined in this notebook; `model` is the module trained above.
# The whole object is pickled (same file format as before), so loading the file
# later requires the `quoteT5` class to be defined.
save_path = os.path.join(DIR, 'model', 'model4-base-2final.pt')
# Create the target folder first so the save cannot fail on a missing directory.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(model, save_path)

In [None]:
#@title
