<a href="https://colab.research.google.com/github/murdo25/DeepLearningZoo/blob/master/t5_for_poly_expansion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install nlp
!pip install tqd
!pip install transformers
!pip install sentencepiece


Collecting nlp
[?25l  Downloading https://files.pythonhosted.org/packages/09/e3/bcdc59f3434b224040c1047769c47b82705feca2b89ebbc28311e3764782/nlp-0.4.0-py3-none-any.whl (1.7MB)
[K     |▏                               | 10kB 20.1MB/s eta 0:00:01[K     |▍                               | 20kB 26.9MB/s eta 0:00:01[K     |▋                               | 30kB 31.7MB/s eta 0:00:01[K     |▉                               | 40kB 22.4MB/s eta 0:00:01[K     |█                               | 51kB 16.4MB/s eta 0:00:01[K     |█▏                              | 61kB 15.0MB/s eta 0:00:01[K     |█▍                              | 71kB 12.5MB/s eta 0:00:01[K     |█▋                              | 81kB 13.4MB/s eta 0:00:01[K     |█▉                              | 92kB 14.0MB/s eta 0:00:01[K     |██                              | 102kB 12.8MB/s eta 0:00:01[K     |██▏                             | 112kB 12.8MB/s eta 0:00:01[K     |██▍                             | 122kB 12.8MB/s eta

In [9]:
from nlp import Dataset

def clean(data_path):
    open_file = open(data_path, 'r').readlines()

    lines = []
    # Strips the newline character 
    for line in open_file: 
        line = line.strip()
        lines.append(line)
    return lines

def build_dataset(data_file):

    lines = clean(data_file)

    datapoints = {}
    datapoints['input_text']= []
    datapoints['target_text']= []

    for line in lines:

        # Split the first half  and second half
        input_text, target_text = line.split('=')

        # Construct positive example 
        datapoints['input_text'].append(input_text)
        datapoints['target_text'].append(target_text)

    assert len(datapoints['target_text']) == len(datapoints['input_text']), "incorrect data distribution"

    # from nlp import Dataset
    return Dataset.from_dict(datapoints)


In [10]:
import tqdm
import torch
import nlp
from transformers import T5Tokenizer
import json
import dataclasses
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional
import numpy as np

tokenizer = T5Tokenizer.from_pretrained('t5-small')


In [11]:
# tokenize the examples
def convert_to_features(example_batch):
    input_encodings  = tokenizer.batch_encode_plus(example_batch['input_text'], pad_to_max_length=True, max_length=128)
    target_encodings = tokenizer.batch_encode_plus(example_batch['target_text'], pad_to_max_length=True, max_length=129)

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }

    return encodings


# train_dataset = build_dataset('half_train_set.txt')
valid_dataset = build_dataset('validation_set_10k.txt')


# map convert_to_features batch wise
# train_dataset = train_dataset.map(convert_to_features, batched=True)
valid_dataset = valid_dataset.map(convert_to_features, batched=True, load_from_cache_file=False)

# set the tensor type and the columns which the dataset should return
columns = ['input_ids', 'target_ids', 'attention_mask', 'target_attention_mask']
# train_dataset.set_format(type='torch', columns=columns)
valid_dataset.set_format(type='torch', columns=columns)

# cach the dataset, so we can load it directly for training

# torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')


from transformers import T5ForConditionalGeneration, T5Tokenizer, EvalPrediction
from transformers import (
    HfArgumentParser,
    DataCollator,
    Trainer,
    TrainingArguments,
    set_seed,
)


logger = logging.getLogger(__name__)

from data_classes import T2TDataCollator, ModelArguments, DataTrainingArguments


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))




In [None]:
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    # we will load the arguments from a json file, 
    #make sure you save the arguments in at ./args.json
    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath('args.json'))

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    model = T5ForConditionalGeneration.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset  = torch.load(data_args.train_file_path)
    valid_dataset = torch.load(data_args.valid_file_path)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=T2TDataCollator(),
    )
    
    # Training
    if training_args.do_train:
        loss = trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        print("loss: ", loss)
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))
    
        results.update(eval_output)
    
    return results

args_dict = {
  "num_cores": 8,
  'training_script': 'train_t5_squad.py',
  "model_name_or_path": 't5-small',
  "max_len": 512 ,
  "target_max_len": 16,
  "output_dir": './models/gpu',
  "overwrite_output_dir": True,
  "per_gpu_train_batch_size": 8,
  "per_gpu_eval_batch_size": 8,
  "gradient_accumulation_steps": 4,
  "learning_rate": 1e-4,
  "tpu_num_cores": 8,
  "do_train": True,
  "num_train_epochs": 32
}

with open('args.json', 'w') as f:
  json.dump(args_dict, f)

"""Start training!"""
main()

01/14/2021 06:39:05 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir=./models/gpu, overwrite_output_dir=True, do_train=True, do_eval=None, do_predict=False, evaluation_strategy=EvaluationStrategy.NO, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, gradient_accumulation_steps=4, eval_accumulation_steps=None, learning_rate=0.0001, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=32, max_steps=-1, lr_scheduler_type=SchedulerType.LINEAR, warmup_steps=0, logging_dir=runs/Jan14_06-39-04_575ba0fc85ca, logging_first_step=False, logging_steps=500, save_steps=500, save_total_limit=None, no_cuda=False, seed=42, fp16=False, fp16_opt_level=O1, fp16_backend=auto, local_rank=-1, tpu_num_cores=8, tpu_metrics_debug=False, debug=False, dataloader_drop_last=False, eval_steps=500, dataloader_num_workers=0, past_index=-1, run_name=./models/gpu, disable_tqdm=False, remove_unus

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…

01/14/2021 06:39:05 - INFO - filelock -   Lock 140612341154928 released on /root/.cache/huggingface/transformers/fe501e8fd6425b8ec93df37767fcce78ce626e34cc5edc859c662350cf712e41.406701565c0afd9899544c1cb8b93185a76f00b31e5ce7f6e18bbaef02241985.lock





01/14/2021 06:39:06 - INFO - filelock -   Lock 140612340275352 acquired on /root/.cache/huggingface/transformers/fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…

01/14/2021 06:39:08 - INFO - filelock -   Lock 140612340275352 released on /root/.cache/huggingface/transformers/fee5a3a0ae379232608b6eed45d2d7a0d2966b9683728838412caccc41b4b0ed.ddacdc89ec88482db20c676f0861a336f3d0409f94748c209847b49529d73885.lock





Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
  return function(data_struct)


Step,Training Loss
500,1.3771
1000,0.9891
1500,0.8877
2000,0.8155
2500,0.7618
3000,0.7115
3500,0.671
4000,0.6403
4500,0.6053
5000,0.5849


RuntimeError: ignored

In [13]:
# F1: https://en.wikipedia.org/wiki/F-score

## SQuAD evaluation script. Modifed slightly for this notebook
from data_classes import ModelArguments
from transformers import HfArgumentParser
parser = HfArgumentParser(ModelArguments)

from collections import Counter
import string
import re
import argparse
import json
import sys
import os
import torch
import nlp
from transformers import T5ForConditionalGeneration, T5Tokenizer, set_seed 
from tqdm.auto import tqdm
from os import listdir
set_seed(42)


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def exact_match_score(prediction, ground_truth):
    return (normalize_answer(prediction) == normalize_answer(ground_truth))


def evaluate(gold_answers, predictions):
    f1 = exact_match = total = 0

    for ground_truth, prediction in zip(gold_answers, predictions):
      total += 1
      exact_match += exact_match_score(prediction, ground_truth)
      f1 += f1_score(prediction, ground_truth)
    
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}

def clean(result):
    result = result.replace("<pad>","")
    result = result.replace("</s>", "")
    result = result.strip()
    result = result.lower()
    return result

# model_path = "models/gpu/checkpoint-11000"
model_path = "models/gpu/"
checkpoints = "models/gpu/"

print(listdir(checkpoints))

for checkpoint in listdir(checkpoints):
    # print("checkpoint:", checkpoint)
    if(checkpoint.split("-")[0] != "checkpoint"):
        continue
    # if(checkpoint.split("-")[1] != "83000"):
    #     continue

    print(checkpoint.split("-")[1])

    print("checkpoint:", checkpoint)

    model = T5ForConditionalGeneration.from_pretrained(model_path + checkpoint).to('cuda')

    tokenizer = T5Tokenizer.from_pretrained('t5-small')

    valid_dataset = torch.load('valid_data.pt')
    dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=256)

    answers = []
    for batch in dataloader:
      outs = model.generate(input_ids=batch['input_ids'].to('cuda'), 
                            attention_mask=batch['attention_mask'].to('cuda'),
                            max_length=16,
                            early_stopping=True)
      outs = [tokenizer.decode(ids) for ids in outs]
      answers.extend(outs)

    predictions = []
    references = []
    for ref, pred in zip(valid_dataset, answers):
      predictions.append(clean(pred))
      references.append(clean(tokenizer.decode(ref['target_ids'])))

    print(checkpoint, evaluate(references, predictions))

['.ipynb_checkpoints', 'checkpoint-20000']
20000
checkpoint: checkpoint-20000


  return function(data_struct)


checkpoint-20000 {'exact_match': 85.56144385561444, 'f1': 85.56144385561444}


In [None]:
!zip -r checkpoint-83000.zip models/gpu/checkpoint-83000

  adding: models/gpu/checkpoint-83000/ (stored 0%)
  adding: models/gpu/checkpoint-83000/config.json (deflated 63%)
  adding: models/gpu/checkpoint-83000/trainer_state.json (deflated 88%)
  adding: models/gpu/checkpoint-83000/pytorch_model.bin (deflated 9%)
  adding: models/gpu/checkpoint-83000/training_args.bin (deflated 46%)
  adding: models/gpu/checkpoint-83000/optimizer.pt (deflated 7%)
  adding: models/gpu/checkpoint-83000/scheduler.pt (deflated 49%)


In [None]:
!rm checkpoint-83000.zip

In [12]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)

Mounted at /content/gdrive


In [None]:
!cp checkpoint-83000.zip gdrive/MyDrive/colab/

In [None]:
!ls -lt gdrive/MyDrive/colab

ls: cannot access 'gdrive/MyDrive/colab': No such file or directory
