In [None]:
import re
import random
import subprocess 
import gc
import csv
import os.path
from math import ceil

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
from torch.utils.data import DataLoader

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
from transformers import Trainer, TrainingArguments, TrainerCallback
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

## Some basic utilities for GPU management

In [None]:
# plundered from: https://github.com/huggingface/transformers/issues/1742
def show_gpu(msg):
    """
    Print the memory utilisation of the first GPU listed by ``nvidia-smi``.

    The printed text is a blank line, then ``msg``, then the used/total ratio
    as a percentage followed by the raw used and total figures exactly as
    ``nvidia-smi`` reports them (queried with ``nounits``).

    ref: https://discuss.pytorch.org/t/access-gpu-memory-usage-in-pytorch/3192/4
    """
    def first_gpu_value(field):
        # One nvidia-smi invocation per field. The output carries one line per
        # GPU; only the first line is parsed.
        raw = subprocess.check_output(
            ['nvidia-smi', f'--query-gpu={field}', '--format=csv,nounits,noheader'],
            encoding='utf-8',
        )
        first_line = raw.strip().split('\n')[0]
        return int(first_line)

    used = first_gpu_value('memory.used')
    total = first_gpu_value('memory.total')
    fraction = used / total
    summary = f'{100*fraction:2.1f}% ({used} out of {total})'
    print('\n' + msg, summary)

In [None]:
def clean_up_gpu():
    """
    Release unused GPU memory and print usage before and after.

    Python garbage collection runs first so that tensors kept alive only by
    reference cycles are destroyed; ``torch.cuda.empty_cache()`` can then
    release the cached blocks those tensors were occupying. In the opposite
    order those blocks are still in use when the cache is emptied and stay
    reserved until the next call.
    """
    show_gpu('Preclean: ')
    gc.collect()
    torch.cuda.empty_cache()
    show_gpu('Postclean: ')

In [None]:
def reset_model(model, pretrained_name="t5-base"):
    """
    Discard ``model`` and load a fresh pretrained seq2seq model onto ``device``.

    Args:
        model: the model to discard.
        pretrained_name: checkpoint identifier handed to
            ``AutoModelForSeq2SeqLM.from_pretrained``. Defaults to "t5-base",
            which is what this function always loaded before.

    Returns:
        The newly loaded model, moved to the notebook-level ``device``.

    Note:
        Use it as ``model = reset_model(model)``. The caller's variable still
        references the old model while this function runs, so the old weights
        can only be freed once that variable is rebound to the return value.
    """
    # ``del`` removes only this function's local reference to the old model.
    del model
    clean_up_gpu()
    return AutoModelForSeq2SeqLM.from_pretrained(pretrained_name).to(device)

In [None]:
# Device that models and tokenized inputs in this notebook are moved to via `.to(device)`.
device = 'cuda'
# Print the GPU memory reading at this point of the notebook.
show_gpu('Initial use:')

## Experimental harness

In [None]:
def read_poetry_into_lines(filename):
    """
    Read a poetry text file and return its lines as lists of words.

    Punctuation (every character that is neither a word character nor
    whitespace) is removed first. Blank lines are dropped, and so are lines
    for which ``str.isupper()`` is true (for example all-capital headings).

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one entry per kept line; each entry is the list of
        whitespace-separated words on that line.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, 'rt') as file:
        text = file.read()
    text = re.sub(r'[^\w\s]', '', text)
    # split() without an argument collapses runs of whitespace. Removing
    # punctuation can leave double spaces ("a - b" -> "a  b"), which
    # split(' ') would turn into empty-string "words".
    return [
        line.split()
        for line in text.split('\n')
        if line.strip() and not line.isupper()
    ]

In [None]:
# Load the sonnets as per-line word lists and record the longest line, measured in words.
shake_sonnet_lines = read_poetry_into_lines('./data/shakespeare-sonnets.txt')
print('Number of lines: ', len(shake_sonnet_lines))
# NOTE(review): this is a word count, yet it is later passed as `max_length`
# to generation and tokenizer padding, which presumably count tokens — confirm
# that lines are not being truncated.
max_line_length = max(len(words) for words in shake_sonnet_lines)
print('Max length of line: ', max_line_length)

In [None]:
def output_once_off(model, tokenizer, skip_special_tokens=False, command="generate Shakespeare", number_words=3):
    """
    Generate one line from a prompt of random vocabulary tokens and print it.

    Args:
        model: seq2seq model exposing ``generate``.
        tokenizer: tokenizer matching ``model``.
        skip_special_tokens: forwarded to ``tokenizer.decode``.
        command: task prefix placed before the random words. The default
            keeps the prompt this function has always used.
        number_words: how many random vocabulary entries form the prompt.

    NOTE(review): the training pairs built by ``assemble_paired_noise_lines``
    default to the prefix "shakespeare" with five random words, while this
    preview defaults to "generate Shakespeare" with three — confirm whether
    the preview should be called with the training format.
    """
    # get_vocab() is part of the common tokenizer API; the `.vocab` attribute
    # is not defined on every tokenizer class.
    vocabulary = list(tokenizer.get_vocab().keys())
    random_words = random.sample(vocabulary, number_words)
    prompt = f"{command}: {' '.join(random_words)}"
    tokenized_random = tokenizer(prompt, padding=True, return_tensors="pt").to(device)
    model_output = model.generate(**tokenized_random, max_length=max_line_length)
    decoded = tokenizer.decode(model_output[0], skip_special_tokens=skip_special_tokens)
    print('Random words: ', random_words, ' and output: ', decoded)

In [None]:
def assemble_paired_noise_lines(tokenizer, command="shakespeare", number_pairs=1, input_length=5, add_eos_token=False):
    """
    Build ``number_pairs`` (noise prompt, sonnet line) training pairs.

    Each prompt is ``"{command}: "`` followed by ``input_length`` random
    vocabulary entries; each target is a randomly sampled line from
    ``shake_sonnet_lines`` joined back into a string. Lines are sampled
    without replacement within a pass; when more pairs are requested than
    there are lines, further passes are made over the corpus.

    Args:
        tokenizer: tokenizer supplying the vocabulary and ``eos_token``.
        command: task prefix for every prompt.
        number_pairs: number of pairs to produce.
        input_length: number of random vocabulary entries per prompt.
        add_eos_token: when true, ``tokenizer.eos_token`` is appended to each
            target line as a final word.

    Returns:
        ``(generated_noise, actual_lines)``: two lists of strings of equal length.

    Raises:
        ValueError: if pairs are requested but no sonnet lines are loaded.
    """
    generated_noise = []
    actual_lines = []
    max_sample = len(shake_sonnet_lines)
    if number_pairs <= 0:
        return generated_noise, actual_lines
    if max_sample == 0:
        raise ValueError("shake_sonnet_lines is empty; cannot assemble any pairs")

    # Materialise the vocabulary once instead of once per generated pair.
    vocabulary = list(tokenizer.get_vocab().keys())
    number_loops = ceil(number_pairs / max_sample)
    for _pass in range(number_loops):
        # The last pass only needs whatever is still missing.
        loop_pairs = min(max_sample, number_pairs - len(actual_lines))
        random_set = [random.sample(vocabulary, input_length) for _pair in range(loop_pairs)]
        generated_noise.extend(f"{command}: {' '.join(random_words)}" for random_words in random_set)
        lines_in_sample = random.sample(shake_sonnet_lines, loop_pairs)
        if add_eos_token:
            lines_in_sample = [line + [tokenizer.eos_token] for line in lines_in_sample]
        actual_lines.extend(' '.join(line) for line in lines_in_sample)
    return generated_noise, actual_lines

In [None]:
class NoiseToShakeDataset(torch.utils.data.Dataset):
    """
    Dataset pairing noise prompts with sonnet lines.

    Item ``idx`` is a dict with the prompt under ``'src_texts'``, the paired
    line under ``'tgt_texts'`` and the index under ``'id'``.
    """

    def __init__(self, generated_noise, paired_lines, tokenizer=None):
        """
        Args:
            generated_noise: list of prompt strings.
            paired_lines: list of target strings, index-aligned with the prompts.
            tokenizer: optional tokenizer used by ``collate_fn``. When omitted,
                ``collate_fn`` falls back to the notebook-level ``tokenizer``.
        """
        self.generated_noise = generated_noise
        self.sonnet_lines = paired_lines
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        return {'tgt_texts': self.sonnet_lines[idx], 'src_texts': self.generated_noise[idx], 'id': idx}

    def __len__(self):
        return len(self.generated_noise)

    def collate_fn(self, batch):
        """Tokenize a list of items into one seq2seq batch and return it."""
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer
        # Items are keyed 'tgt_texts' (see __getitem__); the encoded batch must
        # be returned for a DataLoader to receive it.
        return active_tokenizer.prepare_seq2seq_batch(
            [x["src_texts"] for x in batch],
            tgt_texts=[x["tgt_texts"] for x in batch],
            return_tensors="pt"
        )

In [None]:
def read_dataset(number_pairs, tokenizer):
    """
    Load a previously saved set of noise/line pairs, if one exists.

    The file is looked up at
    ``./data/paired_noise_lines_{tokenizer.name_or_path}_{number_pairs}.csv``
    and is expected to hold a header row followed by two-column rows.

    Args:
        number_pairs: pair count that is part of the file name.
        tokenizer: object whose ``name_or_path`` is part of the file name.

    Returns:
        ``(True, inputs, labels)`` when pairs were loaded, otherwise
        ``(False, None, None)``, both when the file is missing and when it
        contains no data rows.

    Raises:
        ValueError: if a data row does not have exactly two columns.
    """
    file_path = f"./data/paired_noise_lines_{tokenizer.name_or_path}_{number_pairs}.csv"
    if not os.path.isfile(file_path):
        return False, None, None

    with open(file_path, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        rows = [row for row in reader if row]  # csv.reader yields [] for blank lines

    # A header-only (or empty) file holds nothing usable: report a cache miss
    # instead of failing while unpacking zero columns.
    if not rows:
        return False, None, None

    inputs, labels = [], []
    for row in rows:
        if len(row) != 2:
            raise ValueError(f"{file_path}: expected 2 columns per row, got {len(row)}")
        inputs.append(row[0])
        labels.append(row[1])

    return True, inputs, labels

In [None]:
def write_dataset(number_pairs, tokenizer, inputs, labels):
    """
    Save noise/line pairs as a two-column CSV with an ``input,label`` header.

    The file is written to
    ``./data/paired_noise_lines_{tokenizer.name_or_path}_{number_pairs}.csv``,
    replacing any existing file at that path.

    Args:
        number_pairs: pair count that is part of the file name.
        tokenizer: object whose ``name_or_path`` is part of the file name.
        inputs: prompt strings.
        labels: target strings, index-aligned with ``inputs``. Pairs are
            formed with ``zip``, so surplus items in the longer list are
            not written.
    """
    input_path = f"./data/paired_noise_lines_{tokenizer.name_or_path}_{number_pairs}.csv"

    # Create ./data, plus any sub-directory implied by a name_or_path that
    # contains "/", so that open() does not fail on a missing directory.
    os.makedirs(os.path.dirname(input_path), exist_ok=True)

    with open(input_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['input', 'label'])
        writer.writerows(zip(inputs, labels))

In [None]:
def assemble_dataset(tokenizer, number_pairs, check_for_saved=False, write_generated=False, tokenizer_name=None):
    """
    Return a ``NoiseToShakeDataset`` of ``number_pairs`` pairs.

    Args:
        tokenizer: tokenizer used both for generating pairs and for naming
            the cache file.
        number_pairs: number of pairs wanted.
        check_for_saved: try to load previously saved pairs first.
        write_generated: save pairs to disk when they had to be generated.
        tokenizer_name: not used by this function.

    Freshly generated pairs always have the EOS token appended to the labels.
    """
    inputs = labels = None
    found_saved = False

    if check_for_saved:
        found_saved, inputs, labels = read_dataset(number_pairs, tokenizer)

    if not found_saved:
        # Nothing usable on disk (or not asked to look): generate new pairs.
        inputs, labels = assemble_paired_noise_lines(tokenizer, number_pairs=number_pairs, add_eos_token=True)
        if write_generated:
            write_dataset(number_pairs, tokenizer, inputs, labels)

    return NoiseToShakeDataset(inputs, labels)

In [None]:
class NoiseToShakeDataCollator:
    """
    Collator that tokenizes ``NoiseToShakeDataset`` items into a model batch.

    The returned dict holds ``input_ids``, ``attention_mask`` and ``labels``.
    Padding positions in ``labels`` are set to -100, the index that PyTorch's
    cross-entropy loss ignores by default, so that padding does not
    contribute to the training loss.
    """

    def __init__(self, tokenizer, data_args=None, max_length=None):
        """
        Args:
            tokenizer: tokenizer providing ``prepare_seq2seq_batch``.
            data_args: stored on the instance; not used by this class.
            max_length: length every sequence is padded to. When omitted, the
                notebook-level ``max_line_length`` is used.
        """
        self.data_args = data_args
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch):
        encoded = self._encode(batch)
        labels = encoded["labels"]

        # Mask label padding so the loss is computed on real tokens only.
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is not None:
            labels = labels.masked_fill(labels == pad_token_id, -100)

        # decoder_input_ids are deliberately not supplied here.
        # NOTE(review): the model is expected to derive them from `labels` — confirm.
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            "labels": labels,
        }

    def _encode(self, batch):
        """Tokenize sources and targets, padded to a fixed length, as PyTorch tensors."""
        max_length = self.max_length if self.max_length is not None else max_line_length
        return self.tokenizer.prepare_seq2seq_batch(
            [x["src_texts"] for x in batch],
            tgt_texts=[x["tgt_texts"] for x in batch],
            padding="max_length",
            max_length=max_length,
            return_tensors="pt"
        )

In [None]:
class PrintExampleCallback(TrainerCallback):
    """Trainer callback that prints one freshly generated example at the end of every epoch."""

    def on_epoch_end(self, args, state, control, model=None, tokenizer=None, **kwargs):
        # Both arguments default to None. A missing one would make the preview
        # raise and abort training, so report it and carry on instead.
        if model is None or tokenizer is None:
            print('PrintExampleCallback: model or tokenizer not provided, skipping example')
            return
        output_once_off(model, tokenizer)

## Running experiments

In [None]:
def run_experiment(
    model_base, 
    tokenizer_base, 
    pretrained_name,
    training_args,
    number_training_pairs=2000,
    number_validation_pairs=100,
    add_eos_token_to_labels=False, 
    verbose=False
):
    """
    Load a pretrained model, build the noise-to-sonnet datasets and fine-tune.

    Args:
        model_base: class exposing ``from_pretrained`` for the model.
        tokenizer_base: class exposing ``from_pretrained`` for the tokenizer.
        pretrained_name: checkpoint identifier loaded for both model and
            tokenizer.
        training_args: arguments object handed to ``Seq2SeqTrainer``.
        number_training_pairs: size of the training set.
        number_validation_pairs: size of the validation set.
        add_eos_token_to_labels: only affects the sample pair printed when
            ``verbose`` is true. NOTE(review): the training and validation
            sets come from ``assemble_dataset``, which always appends the EOS
            token — confirm whether this flag was meant to control that.
        verbose: print a generated example before and after training, plus
            one sample pair and dataset item.

    Returns:
        ``(model, tokenizer)`` after training.
    """
    # Load the checkpoint the caller asked for rather than a fixed "t5-base".
    model = model_base.from_pretrained(pretrained_name).to(device)
    tokenizer = tokenizer_base.from_pretrained(pretrained_name)
    clean_up_gpu()
    show_gpu('Loaded model, current GP use:')

    if verbose:
        output_once_off(model, tokenizer)
        noise, line = assemble_paired_noise_lines(tokenizer, add_eos_token=add_eos_token_to_labels)
        print(noise, line)
        print(NoiseToShakeDataset(noise, line)[0])

    # Datasets are cached on disk per tokenizer name and pair count.
    train_dataset = assemble_dataset(tokenizer, number_pairs=number_training_pairs, check_for_saved=True, write_generated=True)
    val_dataset = assemble_dataset(tokenizer, number_pairs=number_validation_pairs, check_for_saved=True, write_generated=True)

    clean_up_gpu()
    show_gpu('After composing datasets: ')

    trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=NoiseToShakeDataCollator(tokenizer=tokenizer),
        callbacks=[PrintExampleCallback()]
    )

    trainer.train()

    if verbose:
        output_once_off(model, tokenizer)

    return model, tokenizer

In [None]:
# Training configuration handed to run_experiment, which forwards it to Seq2SeqTrainer.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                 # presumably the number of steps between log entries — confirm
    adafactor=True,                   # presumably selects the Adafactor optimizer — confirm
    learning_rate=1e-4
)

In [None]:
# Run the experiment on "t5-base" with 20,000 training pairs (validation size
# left at its default) and verbose sample output enabled; keep the resulting
# model and tokenizer at notebook level.
model, tokenizer = run_experiment(
    model_base=AutoModelForSeq2SeqLM, 
    tokenizer_base=AutoTokenizer, 
    pretrained_name="t5-base", 
    training_args=training_args, 
    number_training_pairs=20000,
    add_eos_token_to_labels=True, 
    verbose=True
)