In [1]:
from IPython.display import clear_output

!pip install transformers
!pip install datasets
!pip install torchtext
!pip3 install tensorflow_text
!pip3 install urllib3==1.25.4
!pip install sentencepiece

clear_output()

In [2]:
import os
import re
import torch
import pprint
import torch.nn as nn
import numpy as np
from transformers import T5Model, T5TokenizerFast, T5ForConditionalGeneration
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from pathlib import Path
from tqdm import tqdm 

In [3]:
!wget https://www.dropbox.com/s/c59eku24hjwo5xy/GYAFC_Corpus.zip && unzip GYAFC_Corpus.zip

clear_output()

In [4]:
train_formal, train_informal = [], []
valid_formal, valid_informal = [], []
test_formal, test_informal = [], []
with open('GYAFC_Corpus/Entertainment_Music/train/formal', 'r', encoding='utf-8') as file:
    train_formal = file.readlines()
with open('GYAFC_Corpus/Entertainment_Music/train/informal', 'r', encoding='utf-8') as file:
    train_informal = file.readlines()
    
with open('GYAFC_Corpus/Entertainment_Music/tune/formal', 'r', encoding='utf-8') as file:
    valid_formal = file.readlines()
with open('GYAFC_Corpus/Entertainment_Music/tune/informal.ref0', 'r', encoding='utf-8') as file:
    valid_informal = file.readlines()
    
with open('GYAFC_Corpus/Entertainment_Music/test/formal', 'r', encoding='utf-8') as file:
    test_formal = file.readlines()
with open('GYAFC_Corpus/Entertainment_Music/test/informal.ref0', 'r', encoding='utf-8') as file:
    test_informal = file.readlines()

train_formal = [re.sub('\n', '', x) for x in train_formal]
train_informal = [re.sub('\n', '', x) for x in train_informal]
valid_formal = [re.sub('\n', '', x) for x in valid_formal]
valid_informal = [re.sub('\n', '', x) for x in valid_informal]
test_formal = [re.sub('\n', '', x) for x in test_formal]
test_informal = [re.sub('\n', '', x) for x in test_informal]

delimiter = ' >>> '
traindata = [x + delimiter + y for x, y in zip(train_formal, train_informal)]
validdata = [x + delimiter + y for x, y in zip(valid_formal, valid_informal)]
testdata = [x + delimiter + y for x, y in zip(test_formal, test_informal)]

# with open('train.txt', 'w') as f:
#     f.write("\n".join(train))

# with open('valid.txt', 'w') as f:
#     f.write("\n".join(valid))

# with open('test.txt', 'w') as f:
#     f.write("\n".join(test))

In [5]:
class T5Data(torch.utils.data.Dataset):
  """
  Creating a custom dataset for reading the dataset and 
  loading it into the dataloader to pass it to the neural network for finetuning the model

  """

  def __init__(self, tokenizer, texts):
    self.tokenizer = tokenizer
    self.texts = texts

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, index):
    source_text = self.texts[index].split(' >>> ')[0]
    target_text = self.texts[index].split(' >>> ')[-1]

    source = self.tokenizer.encode_plus(
        source_text,
        max_length=130,
        add_special_tokens=True,
        pad_to_max_length=True,
        truncation=True,
        padding="max_length",
        return_tensors='pt'
        )
    target = self.tokenizer.encode_plus(
        target_text,
        max_length=130,
        add_special_tokens=True,
        pad_to_max_length=True,
        truncation=True,
        padding="max_length",
        return_tensors='pt'
        )

    source_ids = source['input_ids'].squeeze()
    source_mask = source['attention_mask'].squeeze()
    target_ids = target['input_ids'].squeeze()
    target_mask = target['attention_mask'].squeeze()

    return {
        'source_ids': source_ids.to(dtype=torch.long), 
        'source_mask': source_mask.to(dtype=torch.long), 
        'target_ids': target_ids.to(dtype=torch.long),
        'target_ids_y': target_ids.to(dtype=torch.long)
    }

In [6]:
def train(epoch, tokenizer, model, device, loader, optimizer, scheduler):
    
    model.train()
    losses = []
    for batch in tqdm(loader):
        optimizer.zero_grad()

        y = batch['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        
        ids = batch['source_ids'].to(device, dtype = torch.long)
        mask = batch['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]

        
        loss.backward()
        optimizer.step()
        scheduler.step()

        losses.append(loss.item())

    return losses

In [7]:
def validate(epoch, tokenizer, model, device, loader):

    model.eval()
    predictions, gts = [], []
    with torch.no_grad():
        for batch in tqdm(loader):
            y = batch['target_ids'].to(device, dtype=torch.long)
            ids = batch['source_ids'].to(device, dtype=torch.long)
            mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask, 
                max_length=150, 
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
                )
            
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True)for t in y]

            predictions.extend(preds)
            gts.extend(target)

    return predictions, gts

In [8]:
num_epochs = 3

In [9]:
from transformers import AdamW, get_linear_schedule_with_warmup

#set seeds
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
#set device
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

# model = T5ForConditionalGeneration.from_pretrained('t5-base')
# model = model.to(device)
# tokenizer = T5TokenizerFast.from_pretrained('t5-base')

tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws")  
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws")
model = model.to(device)

#datasets
trainset = T5Data(tokenizer=tokenizer, texts=traindata)
valset = T5Data(tokenizer=tokenizer, texts=validdata)

#dataloaders
trainloader = torch.utils.data.DataLoader(trainset, batch_size=16, num_workers=2, shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=16, num_workers=2, shuffle=False)


#optimizer
# optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-5, weight_decay=1e-2)
# epochs = 3
# learning_rate = 5e-4
# warmup_steps = 1e2
# epsilon = 1e-8

optimizer = AdamW(model.parameters(), lr=5e-4, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=len(trainloader)*num_epochs)

for epoch in range(num_epochs):
    epoch_loss = train(epoch, tokenizer, model, device, trainloader, optimizer, scheduler)
    print(f'Epoch: {epoch+1}/{num_epochs}, loss: {round(np.mean(epoch_loss), 3)}')

    preds, gts = validate(epoch, tokenizer, model, device, valloader)
    print('Generated text:')
    print(f'Generated: {preds[-1]}')
    print(f'Original: {gts[-1]}')

    print('epoch end' + '%'*30)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1208.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1786.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=25.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=891689022.0, style=ProgressStyle(descri…




100%|██████████| 3288/3288 [54:21<00:00,  1.01it/s]
  0%|          | 0/148 [00:00<?, ?it/s]

Epoch: 1/3, loss: 2.022


100%|██████████| 148/148 [01:53<00:00,  1.31it/s]
  0%|          | 0/3288 [00:00<?, ?it/s]

Generated text:
Generated: I liked everything except Big Ben blwoing up.
Original: I personally didnt like Big Ben blowing up, but otherwise i liked it.
epoch end%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


100%|██████████| 3288/3288 [54:30<00:00,  1.01it/s]
  0%|          | 0/148 [00:00<?, ?it/s]

Epoch: 2/3, loss: 1.595


100%|██████████| 148/148 [01:46<00:00,  1.39it/s]
  0%|          | 0/3288 [00:00<?, ?it/s]

Generated text:
Generated: I liked everything except Big Ben blwoing up.
Original: I personally didnt like Big Ben blowing up, but otherwise i liked it.
epoch end%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


100%|██████████| 3288/3288 [54:25<00:00,  1.01it/s]
  0%|          | 0/148 [00:00<?, ?it/s]

Epoch: 3/3, loss: 1.347


100%|██████████| 148/148 [01:46<00:00,  1.40it/s]

Generated text:
Generated: I liked everything except Big Ben blwoing up.
Original: I personally didnt like Big Ben blowing up, but otherwise i liked it.
epoch end%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





In [10]:
model.save_pretrained('./T5_paraphrase_new_music')
tokenizer.save_pretrained('./T5_paraphrase_new_music')

('./T5_paraphrase_new_music/tokenizer_config.json',
 './T5_paraphrase_new_music/special_tokens_map.json',
 './T5_paraphrase_new_music/spiece.model',
 './T5_paraphrase_new_music/added_tokens.json',
 './T5_paraphrase_new_music/tokenizer.json')

In [11]:
!cp -r T5_paraphrase_new_music -d drive/MyDrive/T5/

In [22]:
#set seeds
torch.manual_seed(42)
np.random.seed(42)
torch.backends.cudnn.deterministic = True
#set device
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'

model = T5ForConditionalGeneration.from_pretrained('Vamsi/T5_Paraphrase_Paws')
model = model.to(device)
tokenizer = T5TokenizerFast.from_pretrained('Vamsi/T5_Paraphrase_Paws')

In [23]:
#datasets
trainset = T5Data(tokenizer=tokenizer, texts=traindata)
valset = T5Data(tokenizer=tokenizer, texts=validdata)
testset = T5Data(tokenizer=tokenizer, texts=testdata)

In [None]:
model = T5ForConditionalGeneration.from_pretrained('drive/MyDrive/T5/T5_paraphrase_new')
model = model.to(device)
tokenizer = T5TokenizerFast.from_pretrained('drive/MyDrive/T5/T5_paraphrase_new')

In [24]:
model.eval()

results = []
for sentence in tqdm(testset):
    generated_ids = model.generate(
        input_ids=sentence['source_ids'].unsqueeze(0).cuda().long(), 
        top_k=5,
        top_p=0.75,
        temperature=10.,
        do_sample=True,
        num_beams=10,
        ).squeeze(0)

    results.append(
        tokenizer.decode(
            generated_ids,
            clean_up_tokenization_spaces=True,
            skip_special_tokens=True
            )
        )

100%|██████████| 1082/1082 [07:35<00:00,  2.38it/s]


In [26]:
with open('results_t5_music_zeroshot.txt', 'w') as file:
    file.write('\n'.join(x for x in results))

In [27]:
!cp  results_t5_music_zeroshot.txt drive/MyDrive/T5