Notebook adapted from:  
https://medium.com/askdata/train-t5-for-text-summarization-a1926f52d281  
https://colab.research.google.com/drive/14_A2kM8sOVpzwHn-0pMbfnD2htzI2Nte

# 0. Set up environment

In [1]:
import os
import torch
import numpy as np
import pandas as pd

from sklearn import model_selection
from torch import nn

from transformers import AutoTokenizer
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
import datasets

# Seed shared by the story-level train/val split and the DataFrame shuffles further down.
SEED = 2557
# Experiment tag: it is reused for the W&B run name and for the output directory.
EXP_NAME = 'exp1'

In [2]:
%%script false  --no-raise-error
# Not executed: the `%%script false` line above turns this cell into a no-op.
# Kept as a record of the two packages the notebook needs.
!pip install transformers
!pip install datasets

Let's use Weights & Biases for tracking

In [3]:
import wandb
# Authenticate up front so the later wandb.init() and Trainer logging can report to W&B.
wandb.login()

%env WANDB_LOG_MODEL=true

[34m[1mwandb[0m: Currently logged in as: [33mbryanli[0m (use `wandb login --relogin` to force relogin)


env: WANDB_LOG_MODEL=true


In [4]:
%cd ../glucose/

# The `%cd ../glucose/` above changed the working directory, so cwd is used as the data root.
GLUCOSE_DIR = os.getcwd()
# Train and test files both live under t5_data/ (note the different extensions: .tsv vs .txt).
TRAIN_PATH = os.path.join(GLUCOSE_DIR, 't5_data/t5_training_data.tsv')
TEST_PATH = os.path.join(GLUCOSE_DIR, 't5_data/t5_test_data.txt')

/mnt/nlpgridio3/data/bryanli/projects/stories/glucose


In [5]:
# Column names are supplied explicitly for both tab-separated files.
T5_HEADER = ['input', 'output']
df_train_orig = pd.read_csv(TRAIN_PATH, sep='\t', names=T5_HEADER)
df_test_orig = pd.read_csv(TEST_PATH, sep='\t', names=T5_HEADER)
# Only the training inputs get a leading '#'.
# NOTE(review): presumably the test file already carries the '#' prefix - confirm against the data.
df_train_orig['input'] = '#' + df_train_orig['input']

# Data Preprocessing


In [None]:
def get_story_ids(story_col):
    '''
    Maps every distinct story text to an integer id, numbered in order of first appearance.

    Args:
        story_col (pd.Series): story text, one entry per row

    Returns:
        pd.Series: the id of each row's story, aligned with the index of `story_col`
    '''
    distinct_stories = story_col.unique()
    id_by_story = dict(zip(distinct_stories, range(len(distinct_stories))))
    return story_col.map(id_by_story)

def make_df(X_input):
    '''
    Creates an intermediate df, used for later formatting of input/output. Assigns a unique `story_id` to each story.

    Each input is parsed as `<dim>: <text before>*<target sentence>*<text after>`.

    Args:
        X_input (pd.Series): input field of T5 GLUCOSE dataset

    Returns:
        pd.DataFrame: one row per input with columns `dim`, `story_before`, `target`,
            `story_after`, `story` (the three parts concatenated, i.e. without the `*`
            markers) and `story_id`.
    '''
    # `n` is passed by keyword: pandas 2.0 made every `str.split` argument after `pat`
    # keyword-only, so the positional form raises a TypeError there.
    X_split = X_input.str.split(': ', n=1, expand=True)
    dim, story = X_split[0], X_split[1]
    # At most two splits, so only the first pair of '*' delimits the target sentence.
    selected_split = story.str.split('*', n=2, expand=True)
    story_before, target_sentence, story_after = selected_split[0], selected_split[1], selected_split[2]
    story = story_before + target_sentence + story_after
    story_id = get_story_ids(story)
    d = {'dim': dim, 'story_before': story_before, 'target': target_sentence, 'story_after': story_after, 'story': story, 'story_id': story_id}
    df = pd.DataFrame(d)
    return df

In [None]:
# Only the raw `input` strings are parsed here; the `output` column of df_train_orig is not carried over.
df_train = make_df(df_train_orig['input'])

Next, we split the dataset into train/val sets. We ensure that stories are not shared between the splits by randomly selecting 10% of `story_id` fields for validation.

In [None]:
# Split on story ids rather than rows, so every row of a given story lands in the same split.
story_ids = df_train['story_id'].unique()
ids_train, ids_val = model_selection.train_test_split(story_ids, test_size=.1, random_state=SEED)

# Select the rows belonging to each group of story ids.
in_train = df_train['story_id'].isin(ids_train)
in_val = df_train['story_id'].isin(ids_val)
df_train1 = df_train.loc[in_train]
df_val1 = df_train.loc[in_val]

# Experiment 1: Generation
Here, we frame the task as a generation problem.

In [None]:
def get_in_out_df(df):
    '''
    Builds the `input`/`output` columns for the next-sentence generation task.

    Args:
        df (pd.DataFrame): frame with `dim`, `story_before`, `target` and `story_after` columns

    Returns:
        pd.DataFrame: the kept rows with a fresh 0..n-1 index (the previous index is preserved
            in an `index` column), plus `input` = "<dim>: <stripped story_before>" and
            `output` = `target`.
    '''
    # for the next-sentence task, rows with no text before or no text after the target are excluded
    has_before = df['story_before'] != ''
    has_after = df['story_after'] != ''
    kept = df[has_before & has_after].reset_index()

    prompt = kept['dim'] + ': ' + kept['story_before'].str.strip()
    return kept.assign(input=prompt, output=kept['target'])

The task set up is  
input = #<dim\>: <story up to the target sentence\>  
output = <next sentence\> 

In [None]:
# Build the input/output columns for each split, then shuffle its rows with a fixed seed.
# NOTE(review): both names are rebound from their own previous value, so this cell is not
# re-runnable on its own: a second run would call reset_index() on a frame that already
# has an `index` column. Re-run the split cell first.
df_train1 = get_in_out_df(df_train1).sample(frac=1, random_state=SEED)
df_val1 = get_in_out_df(df_val1).sample(frac=1, random_state=SEED)

In [None]:
df_train1

## Set up wandb

In [None]:
# Name passed to wandb.init for this run, derived from the experiment tag.
WANDB_NAME = f'glucose_{EXP_NAME}'
wandb.init(name=WANDB_NAME)

## Tokenization

In [None]:
# Tokenizer of the same 't5-base' checkpoint that is loaded for fine-tuning below.
tokenizer = AutoTokenizer.from_pretrained('t5-base')

# Location for tokenized datasets; only referenced by the save/load lines that are currently disabled.
TOK_SAVE_DIR = f"{GLUCOSE_DIR}/t5_data/tokenized/"

In [None]:
# Convert the shuffled pandas frames to `datasets.Dataset` objects so they can be tokenized with `.map`.
ds_train = datasets.Dataset.from_pandas(df_train1)
ds_val = datasets.Dataset.from_pandas(df_val1)

In [None]:
def get_src_tgt_len(source_text, target_text):
    '''
    Finds the longest tokenized length among the source texts and among the target texts.

    Tokenization uses the notebook-level `tokenizer` with truncation and padding switched
    off, so the reported lengths are those of the full sequences.

    Args:
        source_text: iterable of source strings
        target_text: iterable of target strings

    Returns:
        tuple: (max_source, max_target), in number of token ids
    '''
    def _longest(texts):
        # Length of the longest `input_ids` list; stays 0 when there are none.
        encoded = tokenizer(list(texts), truncation=False, padding=False)
        return max((len(ids) for ids in encoded['input_ids']), default=0)

    return _longest(source_text), _longest(target_text)

# Lengths are measured on the training split only; `encode` below pads to these limits and
# truncates anything longer, which also applies to the validation split.
max_source, max_target = get_src_tgt_len(df_train1['input'], df_train1['output'])
print(max_source, max_target)

In [None]:
# %%script false --no-raise-error

def encode(batch):
    '''
    Tokenizes one batch for `Dataset.map(..., batched=True)`.

    Inputs are padded/truncated to `max_source` tokens and outputs to `max_target` tokens;
    the output token ids are stored under `labels`.

    NOTE(review): the labels keep their padding token ids. Hugging Face seq2seq models only
    ignore label positions set to -100, so padding presumably contributes to the loss -
    confirm whether that is intended. Masking it here would also require the evaluation
    cells to map -100 back to the pad id before decoding the labels.

    Args:
        batch: mapping with `input` and `output` lists of strings

    Returns:
        The tokenizer output for the inputs (`input_ids`, `attention_mask`) plus `labels`.
    '''
    def _fixed_length(texts, limit):
        return tokenizer(texts, padding='max_length', truncation=True, max_length=limit)

    encoded = _fixed_length(batch['input'], max_source)
    encoded['labels'] = _fixed_length(batch['output'], max_target)['input_ids']
    return encoded

# Number of examples handed to `encode` per call by `Dataset.map`.
BATCH_SIZE_ENCODE = 512

# Tokenize both splits; `map` adds the input_ids / attention_mask / labels columns.
ds_train = ds_train.map(encode, batched=True, batch_size=BATCH_SIZE_ENCODE)
ds_val = ds_val.map(encode, batched=True, batch_size=BATCH_SIZE_ENCODE)

# NOTE(review): a later cell switches the same three columns to 'torch' format, so this
# 'numpy' format looks redundant unless the save_to_disk lines below are re-enabled - confirm.
ds_train.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
ds_val.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])

# ds_train.save_to_disk(f'{TOK_SAVE_DIR}/train')
# ds_val.save_to_disk(f'{TOK_SAVE_DIR}/val')


In [None]:
%%script false --no-raise-error 
# Not executed: the `%%script false` line above disables this cell. It would reload the
# tokenized splits from TOK_SAVE_DIR instead of re-running the tokenization.
ds_train = datasets.load_from_disk(f'{TOK_SAVE_DIR}/train')
ds_val = datasets.load_from_disk(f'{TOK_SAVE_DIR}/val')

In [None]:
# Expose only the model inputs and labels, as torch tensors, for the Trainer.
COLS_TO_FORMAT = ['input_ids', 'labels', 'attention_mask']
ds_train.set_format(type='torch', columns=COLS_TO_FORMAT)
ds_val.set_format(type='torch', columns=COLS_TO_FORMAT)

## Load pretrained model

In [None]:
# Start from the pretrained 't5-base' checkpoint (same name as the tokenizer).
model = T5ForConditionalGeneration.from_pretrained('t5-base')

In [None]:
# os.environ["WANDB_WATCH"] = "false"
# NOTE(review): hard-coded, machine-specific GPU ids. This runs after torch was imported and
# the model was loaded; presumably it still takes effect because CUDA has not been
# initialised yet at this point - confirm, or move it to the top of the notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = '5,6,7'

# Checkpoints go under OUTPUT_DIR; the final fine-tuned model is saved to MODEL_DIR.
OUTPUT_DIR = f'{GLUCOSE_DIR}/outputs/{EXP_NAME}'
MODEL_DIR =  f'{OUTPUT_DIR}/model'

## Finetune

In [None]:
%%capture cap

# Fine-tuning configuration: evaluate, log and checkpoint every 1000 steps, and keep the
# checkpoint with the lowest evaluation loss.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=2,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=24,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the more vRAM used)
    # prediction_loss_only=True, # If I need to compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.0001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=1000, # How often to save a checkpoint
    save_total_limit=4, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes useless columns from the dataset
    run_name=EXP_NAME, # Wandb run name
    logging_steps=1000, # How often to log loss to wandb
    eval_steps=1000, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to log also the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False, # Best model is the one with the lowest loss, not highest.
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_val,

)
# NOTE(review): this overrides a private TrainingArguments attribute after the Trainer was
# built; 3 matches the three ids in CUDA_VISIBLE_DEVICES ('5,6,7') - confirm it is still needed.
trainer.args._n_gpu = 3
trainer.train()
# Persist the final model (the best checkpoint, given load_best_model_at_end=True) to MODEL_DIR.
trainer.save_model(MODEL_DIR)

In [None]:
print(cap)

In [None]:
ds_val

# Evaluation

In [None]:
# NOTE(review): MODEL_DIR is rebound to '<OUTPUT_DIR>/model_lr1e-4', while the fine-tuning
# cell saved to '<OUTPUT_DIR>/model'. A fresh top-to-bottom run only works if this directory
# already exists (e.g. a renamed earlier run) - confirm which checkpoint is intended.
MODEL_DIR =  f'{OUTPUT_DIR}/model_lr1e-4'
model_ft = T5ForConditionalGeneration.from_pretrained(MODEL_DIR)
# model_pre = T5ForConditionalGeneration.from_pretrained('t5-base')

In [None]:
# Inference-only arguments: just the output location and evaluation batching.
pred_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_eval_batch_size=8,
    remove_unused_columns=True,
    eval_accumulation_steps=1
)

trainer = Trainer(model=model_ft, args=pred_args)

preds, labels, *_ = trainer.predict(ds_val)
# `trainer.predict` may hand the predictions back as a tuple with the logits first, and a
# tuple has no `.argmax`; accept both a tuple/list and a bare array.
logits = preds[0] if isinstance(preds, (tuple, list)) else preds
# Greedy token choice per position (axis 2 is argmax-ed, presumably the vocabulary axis).
# NOTE(review): these come from a forward pass that was given the labels, not from
# `model.generate` - confirm this is the intended notion of "generated text".
preds_tokens = logits.argmax(axis=2)

# Iterate the validation dataset itself (`ds_val`); no `val_dataset` is defined in this notebook.
decoded_sources = [tokenizer.decode(row['input_ids']) for row in ds_val]
decoded_preds = [tokenizer.decode(pred) for pred in preds_tokens]
decoded_labels = [tokenizer.decode(label) for label in labels]

output = pd.DataFrame({'Source Text': decoded_sources, 'Target Text': decoded_labels, 'Generated Text': decoded_preds})
# Write next to the checkpoints; the directory constant is `OUTPUT_DIR` (upper case).
output.to_excel(OUTPUT_DIR + "/predictions.xlsx")

In [None]:
# `preds` is indexed as a tuple here; element 0 is argmax-ed over axis 2
# (presumably (examples, positions, vocabulary) logits - confirm).
preds_tokens = preds[0].argmax(axis=2)

# Decode every validation input back to text. The per-row debug print was removed: it
# emitted one line per validation example.
decoded_sources = [tokenizer.decode(row['input_ids']) for row in ds_val]



In [None]:

# Turn predicted and reference token ids back into text, one string per validation row.
decoded_preds = list(map(tokenizer.decode, preds_tokens))
decoded_labels = list(map(tokenizer.decode, labels))

# One spreadsheet row per example: source, reference target, model prediction.
output = pd.DataFrame({
    'Source Text': decoded_sources,
    'Target Text': decoded_labels,
    'Generated Text': decoded_preds,
})
output.to_excel(OUTPUT_DIR + "/predictions.xlsx")

In [None]:
decoded_labels