<a href="https://colab.research.google.com/github/ngdodd/transformers/blob/master/ngdodd_cse576_synthetic_data_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install dependencies

In [None]:
!pip install fastBPE regex requests sacremoses subword_nmt omegaconf hydra-core datasets==1.1.2

Collecting fastBPE
  Downloading https://files.pythonhosted.org/packages/e1/37/f97181428a5d151501b90b2cebedf97c81b034ace753606a3cda5ad4e6e2/fastBPE-0.1.0.tar.gz
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 12.2MB/s 
[?25hCollecting subword_nmt
  Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl
Collecting omegaconf
  Downloading https://files.pythonhosted.org/packages/e5/f6/043b6d255dd6fbf2025110cea35b87f4c5100a181681d8eab496269f0d5b/omegaconf-2.0.5-py3-none-any.whl
Collecting hydra-core
[?25l  Downloading https://files.pythonhosted.org/packages/f0/1f/7f502b9e37596164111655861370b08626f46f9e4524433c354f472765d4/hydra_core-1.0.4-py3-none-any.whl (122kB)
[K     |████████████████████████████████| 

Python script for conversion and backtranslation. Run this to make the functions available.

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 23 14:37:21 2020

@author: nickg
"""
import os
import json
import torch
import argparse
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
#from transformers import MarianMTModel, MarianTokenizer

# Global NMT models used by backtranslate(); both stay None until init_bt_models() assigns them.
en2de = None # Placeholder for the English->German model from fairseq: https://github.com/pytorch/fairseq
de2en = None # Placeholder for the German->English model from fairseq: https://github.com/pytorch/fairseq

def backtranslate(txts):
    """Backtranslate `txts` from English to German and back to English.

    The input is passed to ``en2de.translate`` and that result is passed to
    ``de2en.translate``; whatever the second call returns is returned unchanged.

    Args:
        txts: The text(s) to round-trip. Callers in this file pass a list of strings.

    Returns:
        The result of the German->English translation of the intermediate output.

    Raises:
        RuntimeError: If the global models have not been loaded yet.
    """
    if en2de is None or de2en is None:
        # Fail with an actionable message instead of "'NoneType' has no attribute 'translate'".
        raise RuntimeError("Backtranslation models are not loaded; call init_bt_models() first.")
    return de2en.translate(en2de.translate(txts))

#dest_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-ROMANCE')
#dest_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-ROMANCE')
#src_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ROMANCE-en')
#src_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ROMANCE-en')

#def translate(txts, model, tokenizer, dest="de"):    
#    tokenized_txts = tokenizer.prepare_seq2seq_batch([f"{txt}" if dest=="en" else f">>{dest}<< {txt}" for txt in txts])
#    return tokenizer.batch_decode(model.generate(**tokenized_txts), skip_special_tokens=True)

#def backtranslate(txts, src="en", dest="de"):
#    translated_txts = translate(txts, dest_model, dest_tokenizer, dest=dest)
#    return translate(translated_txts, src_model, src_tokenizer, dest=src)

def init_bt_models():
    """Load the fairseq English<->German translation models into the module globals.

    Populates the globals ``en2de`` and ``de2en`` with the
    ``transformer.wmt19.en-de.single_model`` and ``transformer.wmt19.de-en.single_model``
    checkpoints obtained through ``torch.hub.load`` (Moses tokenizer, fastBPE).

    The call is idempotent: if both globals are already set it returns immediately,
    so processing several splits in one session does not rebuild the models each time.
    """
    global en2de
    global de2en

    if en2de is not None and de2en is not None:
        print("Backtranslation models already initialized, reusing them...")
        return

    print("\nBuilding English->German model, this may take a couple of minutes...")
    en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model', tokenizer='moses', bpe='fastbpe')

    print("\nBuilding German->English model, this may take a couple of minutes...")
    de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en.single_model', tokenizer='moses', bpe='fastbpe')

    # The models are only used for inference: switch to evaluation mode so dropout is
    # disabled, as the fairseq torch.hub examples do before translating.
    en2de.eval()
    de2en.eval()

    if torch.cuda.is_available():
        print("CUDA enabled for NMT backtranslation models...")
        en2de.cuda()
        de2en.cuda()

def backtranslate_quail_entry(entry):
    """Backtranslate the question, context and answer options of one quail entry in place.

    Args:
        entry: Dict with a string under 'question', a string under 'context' and a
            list of strings under 'answers'. The dict is modified in place.

    Returns:
        None.
    """
    # The question and context are wrapped in one-element lists for backtranslate(),
    # so unwrap the single result; otherwise the entry would end up holding a
    # one-element list where a string is expected.
    entry['question'] = backtranslate([entry['question']])[0]
    entry['context'] = backtranslate([entry['context']])[0]
    entry['answers'] = list(backtranslate(entry['answers']))
    
def write_jsonl_entry(entry, jsonl_file):
    """Append `entry` to `jsonl_file` as a single JSON-encoded line.

    No backtranslation happens here; the entry is serialized exactly as given.

    Args:
        entry: JSON-serializable object (the callers in this file pass dicts).
        jsonl_file: Writable text file object.
    """
    serialized = json.dumps(entry)
    jsonl_file.write(serialized)
    jsonl_file.write('\n')

def backtranslate_dataset(path, split, batch_size, bt_output_file, n_samples):
    """Backtranslate a quail-format JSONL file in batches and write the result as JSONL.

    The file at `path` is loaded in `n_samples // batch_size` consecutive slices of
    `batch_size` entries. For every slice the questions, contexts and (flattened)
    answer options are each sent to `backtranslate` as one batch, then every entry is
    written to `bt_output_file` with those three fields replaced.

    Args:
        path: Path of the JSONL file to read.
        split: Name of the split being processed; used for the progress message only.
        batch_size: Number of entries translated per batch.
        bt_output_file: Writable text file object receiving one JSON line per entry.
        n_samples: Number of entries in the file at `path`.

    Note:
        The trailing `n_samples % batch_size` entries are not processed; the count is
        reported in the progress message.
    """
    print("Loading data to backtranslate from {}...".format(path))
    n_batches = n_samples//batch_size
    print("\nBacktranslating {} samples in {} split using {} batches of size {}. Ignoring {} samples...".format(n_samples, split, n_batches, batch_size, n_samples-(n_batches*batch_size)))
    if n_batches == 0:
        return

    # A single local JSON file is exposed by the `json` loader under the "train" split
    # regardless of which dataset split it was derived from, so the slices always
    # address "train".
    batch_slices = ["train[{}:{}]".format(batch_size*k, batch_size*(k+1)) for k in range(n_batches)]
    batches = load_dataset('json', data_files=path, split=batch_slices)

    for batch in tqdm(batches):
        entries = list(batch)
        bt_question_batch = backtranslate([entry['question'] for entry in entries])
        bt_context_batch = backtranslate([entry['context'] for entry in entries])

        # Flatten all answer options into one batch, entry by entry, so they can be
        # regrouped afterwards using each entry's own number of options.
        flat_answers = [ans_option for entry in entries for ans_option in entry['answers']]
        bt_flat_answers = backtranslate(flat_answers) if flat_answers else []

        # Iterate over the backtranslated batch and write entries to file
        offset = 0
        for itr, entry in enumerate(entries):
            n_options = len(entry['answers'])
            bt_entry = dict(entry,
                            question=bt_question_batch[itr],
                            context=bt_context_batch[itr],
                            answers=list(bt_flat_answers[offset:offset + n_options]))
            offset += n_options
            json.dump(bt_entry, bt_output_file)
            bt_output_file.write('\n')
    
# A two-for-one data formatter for both swag and hellaswag datasets.
# Splits for swag: train, val
# Splits for hellaswag: train, validation
def swag2quail(split, with_backtranslation, prefix=""):
    """Convert a swag or hellaswag split to quail-format JSONL.

    swag is read from ``swag/<split>.csv``; hellaswag (``prefix == "hella"``) is loaded
    with ``load_dataset('hellaswag')``. Every entry is written to
    ``swag/<prefix>swag_<split>.jsonl``. When `with_backtranslation` is true, a copy
    with backtranslated question, context and answers is also written to
    ``swag/<prefix>swag_<split>_backtranslated.jsonl``; otherwise that file is left
    untouched, so a previous backtranslation run is not wiped.

    Args:
        split: Name of the split to convert.
        with_backtranslation: Whether to also write the backtranslated file. The
            translation models must already be loaded.
        prefix: "hella" selects hellaswag; anything else selects the swag csv.
    """
    is_hella = prefix=="hella"
    if is_hella:
        records = load_dataset('hellaswag')[split]
        id_key, context_key, question_key, label_key = 'source_id', 'ctx_a', 'ctx_b', 'label'

        def field(record, key):
            return record[key]

        def ending(record, k):
            return record['endings'][k]
    else:
        # iterrows() yields (index, row) pairs; only the row is needed.
        records = (row for _, row in pd.read_csv("swag/{}.csv".format(split)).iterrows())
        # Column positions in the swag csv (id, context, question, label) — taken from
        # the original indices; TODO confirm against the csv header.
        id_key, context_key, question_key, label_key = 2, 4, 5, 11

        def field(record, position):
            value = record.iloc[position]
            # Unwrap NumPy scalars, which json cannot serialize, into plain Python values.
            return value.item() if hasattr(value, 'item') else value

        def ending(record, k):
            return record['ending{}'.format(k)]

    path = "swag/{}swag_{}.jsonl".format(prefix, split)
    bt_path = "swag/{}swag_{}_backtranslated.jsonl".format(prefix, split)
    os.makedirs("swag", exist_ok=True)

    # Only open (and thereby truncate) the backtranslated file when it will be written.
    bt_f = open(bt_path, mode='w', encoding='utf-8') if with_backtranslation else None
    try:
        with open(path, mode='w', encoding='utf-8') as f:
            for swag_entry in tqdm(records):
                quail_entry = {"id": field(swag_entry, id_key),
                               "context": field(swag_entry, context_key),
                               "question": field(swag_entry, question_key),
                               "question_type": 'Subsequent_state',
                               "answers": [ending(swag_entry, k) for k in range(4)],
                               "correct_answer_id": field(swag_entry, label_key) }
                write_jsonl_entry(quail_entry, f)

                if bt_f is not None:
                    # Translate all text fields of the entry as one batch, then split
                    # the result back into question / context / answers.
                    translated = backtranslate([quail_entry['question'], quail_entry['context']] + quail_entry['answers'])
                    bt_entry = dict(quail_entry,
                                    question=translated[0],
                                    context=translated[1],
                                    answers=list(translated[2:]))
                    write_jsonl_entry(bt_entry, bt_f)
    finally:
        if bt_f is not None:
            bt_f.close()
         
# Convert cosmos_qa to quail format. Questions for which the correct answer contains
# "None of the above" are unanswerable questions in this dataset.
# Splits: train, validation
def cosmos2quail(split, with_backtranslation, batch_size=64):
    """Convert a cosmos_qa split to quail-format JSONL, optionally backtranslating it.

    Every entry is written to ``cosmos_qa/cosmos_qa_<split>.jsonl``. When
    `with_backtranslation` is true, that file is then handed to
    `backtranslate_dataset`, which writes
    ``cosmos_qa/cosmos_qa_<split>_backtranslated.jsonl``; otherwise the backtranslated
    file is left untouched, so a previous backtranslation run is not wiped.

    Args:
        split: Name of the cosmos_qa split to convert.
        with_backtranslation: Whether to also produce the backtranslated file. The
            translation models must already be loaded.
        batch_size: Batch size forwarded to `backtranslate_dataset`.
    """
    def is_unanswerable(e):
        return "None of the above" in e["answer{}".format(e['label'])]

    cosmos = load_dataset('cosmos_qa')[split]

    path = "cosmos_qa/cosmos_qa_{}.jsonl".format(split)
    bt_path = "cosmos_qa/cosmos_qa_{}_backtranslated.jsonl".format(split)
    os.makedirs("cosmos_qa", exist_ok=True)

    with open(path, mode='w', encoding='utf-8') as f:
        print("Converting data to quail format...")
        for cosmos_entry in tqdm(cosmos):
            quail_entry = {"id": cosmos_entry['id'],
                           "context": cosmos_entry['context'],
                           "question": cosmos_entry['question'],
                           "question_type": 'Unanswerable' if is_unanswerable(cosmos_entry) else 'Causality',
                           "answers": [cosmos_entry['answer{}'.format(k)] for k in range(4)],
                           "correct_answer_id": cosmos_entry['label'] }
            write_jsonl_entry(quail_entry, f)

    # The converted file is closed (and therefore fully flushed to disk) at this point;
    # backtranslate_dataset re-reads it from `path`, so it must not still be buffered.
    if with_backtranslation:
        with open(bt_path, mode='w', encoding='utf-8') as bt_f:
            backtranslate_dataset(path, split, batch_size, bt_f, len(cosmos))
    
def process_dataset(dataset, split, with_backtranslation=False):
    """Convert one split of a supported dataset to quail format.

    Dataset names containing 'swag' are handled by `swag2quail` (the text before
    'swag' is passed on as the prefix, e.g. 'hella' for 'hellaswag'); names containing
    'cosmos_qa' are handled by `cosmos2quail`. Any other name is reported and skipped.

    Args:
        dataset: Dataset name.
        split: Split name forwarded to the converter.
        with_backtranslation: If true, the backtranslation models are initialized
            before converting and the converter is asked to backtranslate.
    """
    print("Processing {}[{}]{}...".format(dataset, split, " and applying backtranslation" if with_backtranslation else ''))

    # Pick the converter first so an unknown dataset name is rejected before the
    # slow model initialization is started.
    if 'swag' in dataset:
        def convert():
            swag2quail(split, with_backtranslation, dataset.split('swag')[0])
    elif 'cosmos_qa' in dataset:
        def convert():
            cosmos2quail(split, with_backtranslation)
    else:
        print("Unknown dataset: {}".format(dataset))
        return

    if with_backtranslation:
        print("Initializing backtranslation models...")
        init_bt_models()

    convert()

Convert the cosmos_qa `train` and `validation` splits to quail format and apply backtranslation:

In [None]:
# Create the output directory; unlike `!mkdir`, this does not complain when the cell is re-run.
os.makedirs('cosmos_qa', exist_ok=True)

for cosmos_split in ('train', 'validation'):
    process_dataset(dataset='cosmos_qa', split=cosmos_split, with_backtranslation=True)

Sandbox: scratch cell for trying out batched loading of the converted cosmos_qa training file.

In [None]:
# Load the converted cosmos_qa training file as a list of consecutive slices, one per batch.
# 394 batches of 64 cover the first 394 * 64 = 25216 entries of the file.
batch_size = 64
n_batches = 394
split = 'train'
dataset = load_dataset(
    'json',
    data_files='cosmos_qa/cosmos_qa_train.jsonl',
    split=[
        f"{split}[{start}:{start + batch_size}]"
        for start in range(0, batch_size * n_batches, batch_size)
    ],
)

Using custom data configuration default


Downloading and preparing dataset json/default-ea1472831332e1f0 (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /root/.cache/huggingface/datasets/json/default-ea1472831332e1f0/0.0.0/fb88b12bd94767cb0cc7eedcd82ea1f402d2162addc03a37e81d4f8dc7313ad9...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-ea1472831332e1f0/0.0.0/fb88b12bd94767cb0cc7eedcd82ea1f402d2162addc03a37e81d4f8dc7313ad9. Subsequent calls will reuse this data.
