Notebook adapted from:  
https://medium.com/askdata/train-t5-for-text-summarization-a1926f52d281  
https://colab.research.google.com/drive/14_A2kM8sOVpzwHn-0pMbfnD2htzI2Nte

# 0. Set up environment

In [1]:
%%script false  --no-raise-error
# The `%%script false` line above turns this whole cell into a no-op.
# Comment out that first line to install the transformers dev branch and datasets.
# You may want to create a dedicated conda env for this, since this is a dev branch.

# Clone transformers, move the checkout under ../software, and install the
# `t5-fp16-no-nans` branch from source.
!git clone https://github.com/huggingface/transformers
%mv transformers ../software/transformers
%cd ../software/transformers
!git checkout t5-fp16-no-nans
!pip install . --upgrade
!pip install datasets
# Go back to ../../notebooks (assumes this notebook is run from a `notebooks/` folder).
%cd ../../notebooks

In [2]:
import os

import datasets
import torch
import numpy as np
import pandas as pd
from sklearn import model_selection
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorWithPadding
from tqdm.notebook import tqdm
from torch import nn

# Seed passed to format_data.format_data and to Dataset.shuffle further down.
SEED = 2557
# Expose only GPUs 6 and 7 to CUDA; the training cell later forces `_n_gpu = 2` to match.
# NOTE(review): this is set after `import torch`; presumably it still takes effect
# because CUDA has not been initialized yet at this point — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '6,7'

In [3]:
import transformers
# Report the library versions used for this run (transformers first, then torch).
for lib in (transformers, torch):
    print(lib.__version__)

4.6.0.dev0
1.8.1


Let's use Weights & Biases for tracking

In [4]:
import wandb
# Authenticate this kernel with Weights & Biases.
wandb.login()

# Sets the WANDB_LOG_MODEL environment variable for this kernel.
# NOTE(review): presumably this makes the Trainer's wandb integration upload the
# saved model to the run — confirm against the transformers/wandb docs.
%env WANDB_LOG_MODEL=true

[34m[1mwandb[0m: Currently logged in as: [33mbryanli[0m (use `wandb login --relogin` to force relogin)


env: WANDB_LOG_MODEL=true


In [5]:
# Run switches. DO_TRAIN gates the wandb run, the model loading and the fine-tuning cells below.
DO_TRAIN = True
# NOTE(review): DO_EVAL is not referenced by any cell visible in this notebook —
# confirm whether the evaluation cells were meant to be guarded by it.
DO_EVAL = True

In [6]:
# Switch the working directory to the sibling `glucose` project folder; the paths
# and the `scripts` imports below are resolved relative to it.
%cd ../glucose/

# Absolute project root, captured right after the %cd above.
GLUCOSE_DIR = os.getcwd()
TRAIN_PATH = os.path.join(GLUCOSE_DIR, 'data_final/GLUCOSE_training_data_final.csv')
TEST_PATH = os.path.join(GLUCOSE_DIR, 'data_final/nov27_key_final_copy.csv')

# Project-local helpers.
from scripts import format_data
# NOTE(review): wildcard import. get_src_tgt_len, encode, generate_from_sentence,
# generate_from_dataset and decode_seqs used below presumably come from here — confirm,
# and consider importing them by name.
from scripts.train import *

/mnt/nlpgridio3/data/bryanli/projects/stories/glucose


In [7]:
# Pretrained checkpoint to fine-tune; swap in the commented line for the smaller model.
model_size = 't5-large'
# model_size = 't5-base'

# Size tag derived from the checkpoint name, e.g. 't5-large' -> '_large'.
suffix = f"_{model_size.split('-')[-1]}"

# Experiment id; it is handed to format_data.format_data to choose the data formatting.
exp_num = '0'
EXP_NAME = 'exp' + exp_num + suffix
# Everything produced by this experiment is written under outputs/<EXP_NAME>.
OUTPUT_DIR = '/'.join([GLUCOSE_DIR, 'outputs', EXP_NAME])
MODEL_DIR = '/'.join([OUTPUT_DIR, 'model'])

# Setting 1: Generation
Here, we frame the task as a generation problem.

Let's load the data and format it for our experiments. The options for `exp_num` are:

- `'0'`: same as the original task
- `'1'`: input = dim + precontext, output = target sentence
- `'2a'`: input = dim + precontext, output = original output (with generalized and contextualized)
- `'2b'`: input = dim + precontext + \<mask_sent\> + postcontext, output = original output

In [8]:
# Training file: formatted and split into a train frame, a validation frame and ids_val.
df_train, df_val, ids_val = format_data.format_data(
    TRAIN_PATH, exp_num, split_val=True, seed=SEED
)
# Test file: no validation split is requested, so only the first returned frame is kept.
df_test, _, _ = format_data.format_data(
    TEST_PATH, exp_num, split_val=False, seed=SEED, is_test=True
)

# Persist ids_val (presumably the ids of the validation examples), one per line.
os.makedirs(OUTPUT_DIR, exist_ok=True)
with open(OUTPUT_DIR + '/ids_val.txt', 'w') as ids_file:
    ids_file.write(''.join(f'{idx}\n' for idx in ids_val))

for split_name, frame in (('train', df_train), ('validation', df_val)):
    print(f"size of {split_name}: {len(frame)}")

size of train: 273952
size of validation: 30147


In [9]:
# Peek at the first validation row to check which columns the formatting produced.
df_val.iloc[0]

experiment_id              a2bb14bd-9094-4dcc-93a5-bb93b1c308a5__4
story_id                      a2bb14bd-9094-4dcc-93a5-bb93b1c308a5
input            #1: my daughter had to do a science project. s...
specific         My daughter has  a  science project >Causes/En...
general          Someone_B (who is Someone_A's child) has a sci...
output           My daughter has a science project >Causes/Enab...
Name: 10, dtype: object

In [10]:
# Print the first training example in full, then display the input/output columns.
ex = df_train.iloc[0]
for field in ('input', 'output'):
    print(f'{field}: {ex[field]}')
df_train.loc[:, ['input', 'output']]

input: #1: It was bedtime at our house. Two of the three kids hit the pillow and fall asleep. The third is a trouble maker. For two hours he continues to get out of bed and want to play. *Finally he becomes tired and falls asleep.*
output: The third kid continues to get out of bed and wants to play >Causes/Enables> The kid finally becomes tired and falls asleep ** Someone_A doesn't want to go to sleep >Causes/Enables> Someone_A finally falls asleep


Unnamed: 0,input,output
0,#1: It was bedtime at our house. Two of the th...,The third kid continues to get out of bed and ...
1,#3: It was bedtime at our house. Two of the th...,The third kid is in bed >Enables> The kid fina...
2,#1: Sara was invited to the football game. She...,Sara goes to a football game >Causes/Enables> ...
3,#3: Sara was invited to the football game. She...,Sara is at a football game >Enables> Her team ...
4,#5: Sara was invited to the football game. She...,The game is over >Enables> Sara's team wins **...
...,...,...
304094,#3: Julie was on the couch when she saw her mo...,Julie and her mom are at home >Enables> Julie'...
304095,#4: Julie was on the couch when she saw her mo...,Julie's mom possess(es) sandwich ingredients >...
304096,#6: Julie was on the couch when she saw her mo...,Julie's mom agrees to prepare a sandwich for J...
304097,#7: Julie was on the couch when she saw her mo...,Julie's mom agrees to prepare a sandwich for J...


## Set up wandb

In [11]:
# Start a W&B run named after the experiment; skipped entirely when not training.
if DO_TRAIN:
    wandb.init(project="glucose_final", name=EXP_NAME)

[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


## Tokenization

In [12]:
# Tokenizer that matches the chosen pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_size)

# Experiment 2b puts a <mask_sent> marker in its inputs, so only then is the marker
# registered as an additional special token.
if exp_num == '2b':
    special_tokens_dict = dict(additional_special_tokens=['<mask_sent>'])
    add_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [13]:
# Wrap each pandas frame in a datasets.Dataset (train, validation, test — in that order).
ds_train, ds_val, ds_test = (
    datasets.Dataset.from_pandas(frame) for frame in (df_train, df_val, df_test)
)

In [None]:
# Lengths computed from the training inputs/outputs with the tokenizer; both values are
# forwarded to `encode` in the cell below.
# NOTE(review): presumably the maximum tokenized source/target lengths — confirm in scripts.train.
max_source, max_target = get_src_tgt_len(df_train['input'], df_train['output'], tokenizer)
print(max_source, max_target)

In [None]:
# Display the test dataset as it is before encoding.
ds_test

In [None]:
# Number of rows handed to `encode` per call.
BATCH_SIZE_ENCODE = 512

# Keyword arguments forwarded unchanged to `encode` for every split.
kwargs = {'max_source': max_source, 'max_target': max_target, 'tokenizer': tokenizer}
ds_train, ds_val, ds_test = (
    split.map(encode, batched=True, batch_size=BATCH_SIZE_ENCODE, fn_kwargs=kwargs)
    for split in (ds_train, ds_val, ds_test)
)

In [None]:
# Columns returned as torch tensors for train/validation; the test split is
# formatted without the 'labels' column.
COLS_TO_FORMAT = ['input_ids', 'labels', 'attention_mask']
for split in (ds_train, ds_val):
    split.set_format(type='torch', columns=COLS_TO_FORMAT)
ds_test.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
# Sanity check: decode the first validation example back to text (input, blank line, label).
first_val = ds_val[0]
print(tokenizer.decode(first_val['input_ids']), end='\n\n')
print(tokenizer.decode(first_val['labels']))

In [None]:
# Persist the encoded splits inside the experiment's output directory.
for split_name, split in (('ds_train', ds_train), ('ds_val', ds_val), ('ds_test', ds_test)):
    split.save_to_disk(f'{OUTPUT_DIR}/{split_name}')

## Load pretrained model

In [None]:
if DO_TRAIN:
    # Pretrained weights are downloaded to / read from a fixed cache directory.
    model = T5ForConditionalGeneration.from_pretrained(
        model_size,
        cache_dir='/nlp/data/bryanli/.cache',
    )
    # Experiment 2b added a token to the tokenizer, so the embeddings are resized to match.
    if exp_num == '2b':
        model.resize_token_embeddings(len(tokenizer))

    # Seeded shuffles of the train and validation splits, used by the Trainer below.
    ds_train_shuffled, ds_val_shuffled = (
        split.shuffle(seed=SEED) for split in (ds_train, ds_val)
    )

## Finetune

If using `t5-large`, you will need to be on nlpgpu03, which has 24 GB vRAM per GPU. The `t5-large` model needs ~10 GB alone + the size of batches.

With either `t5-large` or `t5-base`, we find that we need to train for far fewer epochs than described in the paper (which says "500K steps" without clearly specifying what that means) for validation loss to stop decreasing.

In [None]:
if DO_TRAIN:
    # Release the reference to any Trainer left over from an earlier run of this cell.
    trainer = None
    # Hyperparameters are taken from the GLUCOSE and T5 papers when possible.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=12,
        # prediction_loss_only=True, # If only the loss (and no other metrics) needs to be computed, setting this to True will use less RAM
        evaluation_strategy='steps', # Run evaluation every eval_steps (unset here, so it falls back to logging_steps)
        save_steps=1000, # How often to save a checkpoint
        logging_steps=1000, # How often to log loss to wandb
        save_total_limit=10, # Number of maximum checkpoints to save
        remove_unused_columns=True, # Removes useless columns from the dataset
        run_name=EXP_NAME, # Wandb run name
        load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
        metric_for_best_model="eval_loss", # Use loss to evaluate best model.
        greater_is_better=False, # Best model is the one with the lowest loss, not highest.
        eval_accumulation_steps=10,
#         fp16=True # doesn't work for t5-large yet
    )
    # Adafactor with an explicit learning rate: relative-step sizing, warmup init and
    # parameter scaling are all switched off.
    optimizer = transformers.Adafactor(model.parameters(), lr=0.0001,
                                   relative_step=False, warmup_init=False, scale_parameter=False,
                                   decay_rate=0.0, clip_threshold=1.0)
    # No scheduler is supplied here; the Trainer is left to create its own.
    scheduler = None

    # Only under fp16 is a padding collator built (pads to a multiple of 8); otherwise
    # None is passed and the Trainer falls back to its default collator.
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
    print("padding to multiple of 8 for fp16" if data_collator else "not fp16")

#     transformers.logging.set_verbosity_info()

    trainer = Trainer(
        model=model,
        args=training_args,
        # Previously the collator was built above but never handed to the Trainer.
        data_collator=data_collator,
        train_dataset=ds_train_shuffled,
        eval_dataset=ds_val_shuffled,
        optimizers=(optimizer, scheduler)
    )
    # Stop once eval_loss has failed to improve for 5 consecutive evaluations.
    trainer.add_callback(transformers.EarlyStoppingCallback(early_stopping_patience=5))

    # Override of a private attribute: forces the GPU count to 2, matching the two
    # devices listed in CUDA_VISIBLE_DEVICES at the top of the notebook.
    trainer.args._n_gpu = 2
    trainer.train()
    trainer.save_model(MODEL_DIR)

In [None]:
# Helper for freeing GPU memory: asks PyTorch to release the cached CUDA memory it is
# no longer using. Run it manually between training and evaluation if memory is tight.
torch.cuda.empty_cache()

# Evaluation

In [None]:
# Evaluate a specific intermediate checkpoint rather than the final saved model
# (f'{OUTPUT_DIR}/model'); edit the directory name here to evaluate a different one.
MODEL_DIR = '/'.join([OUTPUT_DIR, 'checkpoint-10000'])
# Load the fine-tuned weights and move the model to the GPU.
model_ft = T5ForConditionalGeneration.from_pretrained(MODEL_DIR).cuda()

In [None]:
# Re-format the validation split so its torch tensors are returned on the GPU,
# where the fine-tuned model now lives.
ds_val.set_format(type='torch', columns=COLS_TO_FORMAT, device='cuda')

In [None]:
# Smoke-test generation on two hand-written prompts that differ only in their leading "#N:" tag.
for prompt in (
    "#1: I went for a steak dinner. I invited my roommate.",
    "#6: I went for a steak dinner. I invited my roommate.",
):
    print(generate_from_sentence(model_ft, tokenizer, prompt))

# Generate for the whole validation split, then decode the generated sequences to text.
preds = generate_from_dataset(model_ft, tokenizer, ds_val, batch_size=128)
preds_decoded = decode_seqs(preds, tokenizer, True)

In [None]:
# Decode the validation inputs and gold labels back to text and store them next to the
# predictions. decode_seqs is called as (sequences, tokenizer, flag), matching the call
# that decodes `preds`; the tokenizer argument was previously missing here.
# NOTE(review): the flag is presumably skip_special_tokens — confirm in scripts.train.
if exp_num == '2b':
    # The flag is False for 2b (presumably so <mask_sent> survives decoding); everything
    # from the first '</s>' onwards is then cut off.
    sources_decoded = decode_seqs(ds_val['input_ids'], tokenizer, False)
    sources_decoded = [x.split('</s>', 1)[0] for x in sources_decoded]
else:
    sources_decoded = decode_seqs(ds_val['input_ids'], tokenizer, True)
labels_decoded = decode_seqs(ds_val['labels'], tokenizer, True)

# NOTE(review): the `df_val.iloc[0]` preview shown earlier for exp_num '0' lists no
# 'target' column — confirm it exists for the experiment being evaluated.
output = pd.DataFrame({'input': sources_decoded, 'output_true': labels_decoded, 'output_pred': preds_decoded, 'target': df_val['target']})
output.to_csv(OUTPUT_DIR + "/predictions_val.csv")

In [None]:
# Print every 100th validation prediction, NUM_TO_PRINT examples in total.
NUM_TO_PRINT = 5
for i in range(NUM_TO_PRINT):
    row = i * 100
    ex = output.iloc[row]
    print(f'EX {row}')
    # Input, gold and predicted text are each followed by a blank line.
    for label, column in (('INPUT:  ', 'input'), ('GOLD:   ', 'output_true'), ('PRED:   ', 'output_pred')):
        print(label, ex[column], '\n')
    print('TARGET: ', ex['target'])
    print('-' * 50, '\n')