In [1]:
!pip install transformers -q

import numpy as np
import pandas as pd
import time
import torch
from torch import cuda
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset

from transformers import T5Tokenizer, T5ForConditionalGeneration

[K     |████████████████████████████████| 778kB 5.3MB/s 
[K     |████████████████████████████████| 890kB 35.0MB/s 
[K     |████████████████████████████████| 3.0MB 43.0MB/s 
[K     |████████████████████████████████| 1.1MB 46.3MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [2]:
# Architecture to fine-tune; one of: 'bart', 'distil_bart', 't5_small', 't5_base'
MODEL = 't5_small'

# Import only the tokenizer/model pair that belongs to the selected architecture.
if MODEL in ('bart', 'distil_bart'):
  from transformers import BartTokenizer, BartForConditionalGeneration
elif MODEL in ('t5_small', 't5_base'):
  from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
# Run-wide hyperparameters. 'epochs' and 'lr' are looked up by architecture name (MODEL).
configs = dict(
    batch_size=2,
    train_split=0.95,       # fraction of rows sampled for training
    epochs=dict(bart=5, distil_bart=10, t5_base=10, t5_small=1),
    lr=dict(bart=1e-6, distil_bart=5e-7, t5_base=1e-4, t5_small=1e-4),
    seed=42,
    max_review_len=600,     # token length reviews are padded/truncated to
    max_summary_len=30,     # token length summaries are padded/truncated to
)

# Architecture name -> pretrained checkpoint identifier passed to from_pretrained.
models = dict(
    bart='facebook/bart-large-xsum',
    distil_bart='sshleifer/distilbart-xsum-12-3',
    t5_base='t5-base',
    t5_small='t5-small',
)

In [None]:
# Load the cleaned reviews and keep only the model input and target columns.
df = pd.read_csv('cleaned_reviews.csv')
df = df[['reviewText','summary']]

# Every architecture gets the task prefix on the input text.
df.reviewText = 'summarize: ' + df.reviewText
if MODEL in ('t5_small', 't5_base'):
  # The T5 variants additionally get an explicit end-of-sequence marker
  # appended to both the input and the target.
  df.reviewText = df.reviewText + ' </s>'
  df.summary = df.summary + ' </s>'
df.head()

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'
!nvidia-smi

In [None]:
# Instantiate the pretrained model and its tokenizer from the checkpoint mapped to MODEL.
if MODEL in ('bart', 'distil_bart'):
  model = BartForConditionalGeneration.from_pretrained(models[MODEL])
  tokenizer = BartTokenizer.from_pretrained(models[MODEL])
elif MODEL in ('t5_small', 't5_base'):
  model = T5ForConditionalGeneration.from_pretrained(models[MODEL])
  tokenizer = T5Tokenizer.from_pretrained(models[MODEL])

# Move the weights onto the selected device before any training/inference.
model = model.to(device)

In [11]:
# Custom map-style dataset consumed by the DataLoaders
class BookReviewsDataset(Dataset):
    """Tokenize (review, summary) pairs on demand for a DataLoader.

    Args:
        dataframe: frame exposing ``reviewText`` and ``summary`` columns.
        tokenizer: object exposing ``batch_encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        input_len: fixed length every review is padded/truncated to.
        summ_len: fixed length every summary is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, input_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.input_len = input_len
        self.summ_len = summ_len
        self.summary = self.data.summary
        self.reviewText = self.data.reviewText

    def __len__(self):
        """Number of (review, summary) pairs."""
        return len(self.summary)

    def _encode(self, text, max_length):
        """Encode one string to fixed-length tensors of shape (1, max_length)."""
        return self.tokenizer.batch_encode_plus(
            [text],
            max_length=max_length,
            # 'max_length' padding replaces the deprecated pad_to_max_length=True
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the encoded pair stored at *position* ``index``.

        Returns:
            dict with ``input_ids`` and ``attn_mask`` for the review and
            ``target_ids`` for the summary, all ``torch.long``.
        """
        # Positional lookup (.iloc): samplers hand out positions 0..len-1, which
        # only coincide with index labels when the frame has a fresh RangeIndex.
        # Whitespace runs are collapsed to single spaces before tokenizing.
        review_text = ' '.join(str(self.reviewText.iloc[index]).split())
        summary_text = ' '.join(str(self.summary.iloc[index]).split())

        source = self._encode(review_text, self.input_len)
        target = self._encode(summary_text, self.summ_len)

        # squeeze(0) removes only the batch dimension added by batch_encode_plus,
        # so a sequence length of 1 is not collapsed as well.
        return {
            'input_ids': source['input_ids'].squeeze(0).to(dtype=torch.long),
            'attn_mask': source['attention_mask'].squeeze(0).to(dtype=torch.long),
            'target_ids': target['input_ids'].squeeze(0).to(dtype=torch.long),
        }

In [12]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one optimisation pass over ``loader`` with teacher forcing.

    Args:
        epoch: current epoch number; accepted for the caller's convenience and
            not used inside the function.
        tokenizer: only ``tokenizer.pad_token_id`` is read, to mask padding in the labels.
        model: seq2seq model whose forward accepts ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels`` and returns the loss as its first output.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``input_ids``, ``attn_mask`` and ``target_ids``.
        optimizer: optimizer stepped once per batch.

    Returns:
        None. The model is updated in place.
    """
    model.train()
    for data in loader:
        target_ids = data['target_ids'].to(device, dtype = torch.long)
        input_ids = data['input_ids'].to(device, dtype = torch.long)
        attn_mask = data['attn_mask'].to(device, dtype = torch.long)

        # Teacher forcing: the decoder sees tokens [0, n-1) and is scored on [1, n).
        # NOTE(review): this manual shift assumes every target starts with a token
        # the decoder should be fed first; the T5 tokenizer does not prepend one --
        # confirm the first summary token is actually being learned for T5 runs.
        decoder_input_ids = target_ids[:, :-1].contiguous()
        lm_labels = target_ids[:, 1:].clone().detach()
        # -100 makes the loss skip padded positions.
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        # `labels` is the supported keyword; the older `lm_labels` spelling is
        # deprecated and rejected by current Transformers releases.
        outputs = model(input_ids = input_ids,
                        attention_mask = attn_mask,
                        decoder_input_ids = decoder_input_ids,
                        labels = lm_labels)

        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [13]:
def make_predictions(epoch, tokenizer, model, device, loader, max_length=30, num_beams=2):
    """Generate a summary for every example in ``loader`` and decode the references.

    Args:
        epoch: accepted for the caller's convenience; not used inside the function.
        tokenizer: object exposing ``decode``, used for both generated and reference ids.
        model: model exposing ``eval()`` and ``generate(...)``.
        device: device the batch tensors are moved to.
        loader: iterable of batches with ``input_ids``, ``attn_mask`` and ``target_ids``.
        max_length: maximum length passed to ``generate`` (default 30, the value
            that used to be hard-coded here).
        num_beams: beam width passed to ``generate`` (default 2, as before).

    Returns:
        ``(model_generated_summaries, user_summaries)``: two lists of decoded
        strings, in loader order.
    """
    model.eval()
    model_generated_summaries = []
    user_summaries = []
    with torch.no_grad():
        for data in loader:
            target_ids = data['target_ids'].to(device, dtype = torch.long)
            input_ids = data['input_ids'].to(device, dtype = torch.long)
            attn_mask = data['attn_mask'].to(device, dtype = torch.long)

            generated_ids = model.generate(
                input_ids = input_ids,
                attention_mask = attn_mask,
                max_length = max_length,
                num_beams = num_beams,
                repetition_penalty = 2.5,
                length_penalty = 1.8,
                early_stopping = True
                )

            # Decode both sides the same way so the strings are directly comparable.
            model_generated_summaries.extend(
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for g in generated_ids
            )
            user_summaries.extend(
                tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                for t in target_ids
            )
    return model_generated_summaries, user_summaries

In [None]:
# Seed torch and numpy and request deterministic cuDNN kernels for repeatable runs.
torch.manual_seed(configs['seed'])
np.random.seed(configs['seed'])
torch.backends.cudnn.deterministic = True

# Sample the training rows but KEEP their original index labels for now: the
# validation set is "everything that was not sampled", and df.drop() matches on
# those labels. Resetting the index before the drop would remove the first
# len(train) rows of df instead, leaving validation rows that also sit in training.
train_dataset = df.sample(frac=configs['train_split'], random_state=configs['seed'])
validation_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

# The two splits must partition df exactly.
assert len(train_dataset) + len(validation_dataset) == len(df)

#training data: tokenized on the fly and served in shuffled mini-batches
training_set = BookReviewsDataset(
    dataframe=train_dataset,
    tokenizer=tokenizer,
    input_len=configs['max_review_len'],
    summ_len=configs['max_summary_len'],
)
train_params = dict(batch_size=configs['batch_size'], shuffle=True, num_workers=0)
training_loader = DataLoader(training_set, **train_params)

#validation data: same encoding, but kept in frame order (no shuffling)
validation_set = BookReviewsDataset(
    dataframe=validation_dataset,
    tokenizer=tokenizer,
    input_len=configs['max_review_len'],
    summ_len=configs['max_summary_len'],
)
validation_params = dict(batch_size=configs['batch_size'], shuffle=False, num_workers=0)
validation_loader = DataLoader(validation_set, **validation_params)

#Adam over every model weight, using the learning rate configured for MODEL
optimizer = torch.optim.Adam(model.parameters(), lr=configs['lr'][MODEL])

#fine-tune for the number of epochs configured for MODEL
for epoch in range(configs['epochs'][MODEL]):
    train(epoch=epoch,
          tokenizer=tokenizer,
          model=model,
          device=device,
          loader=training_loader,
          optimizer=optimizer)

#persist the fine-tuned weights so they can be reloaded later
model.save_pretrained('./saved_{}_model/'.format(MODEL))

#summarize the validation set and store generated vs. user-written summaries side by side
model_generated_summaries, user_summaries = make_predictions(
    epoch, tokenizer, model, device, validation_loader
)
gen_summaries_df = pd.DataFrame({
    'model_generated_summaries': model_generated_summaries,
    'user_summaries': user_summaries,
})
gen_summaries_df.to_csv(MODEL+'_predictions.csv')

In [None]:
def display_data(row_num):
    """Print the user summary, then the generated summary, for one row of gen_summaries_df.

    Args:
        row_num: index label of the row in ``gen_summaries_df`` to show.
    """
    divider = '============================================='
    sections = (
        ('User Summary: ', 'user_summaries'),
        ('Fine Tuned Generated Summary: ', 'model_generated_summaries'),
    )
    for heading, column in sections:
        # Each section: spacer, heading, the text itself, then a divider line.
        print('\n')
        print(heading)
        with pd.option_context('display.max_colwidth', 200):
            print(gen_summaries_df[column][row_num])
        print(divider)

In [None]:
# Show the first ten predictions, or fewer when the validation set is smaller,
# so a short prediction frame does not raise a KeyError.
for i in range(min(10, len(gen_summaries_df))):
  display_data(i)