In [1]:
from platform import python_version

python_version()

'3.9.1'

In [2]:
import evaluate
import os
import re
import string
import torch
import unicodedata

In [3]:
import gradio as gr
import numpy as np
import pandas as pd

In [4]:
from transformers import AdamW, get_scheduler
from transformers import RealmRetriever, RealmTokenizerFast, RealmForOpenQA, RealmConfig

from transformers.models.realm.retrieval_realm import convert_tfrecord_to_np

In [5]:
from datasets import load_dataset, get_dataset_infos
from pathlib import Path
from tqdm.auto import tqdm
from typing import List

In [6]:
from torch.utils.data import DataLoader

In [9]:
wiki_data = load_dataset("wikipedia", "20220301.en")

Downloading and preparing dataset wikipedia/20220301.en to /home/patrick/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/20.3G [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /home/patrick/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
wiki_data

In [8]:
sample_docs = pd.read_json('data/wiki-raw/enwiki-20211220/AD/wiki_00', lines=True)
print(f"Before filter: {sample_docs.describe()}")

# we want to filter and keep only docs that have non-empty text fields.
sample_docs = sample_docs[sample_docs['text'].str.len() > 0]
sample_docs.describe()
print(f"After filter: {sample_docs.describe()}")

Before filter:                  id         revid
count     73.000000  7.300000e+01
mean   32669.767123  1.282680e+08
std       30.284540  3.286560e+08
min    32616.000000  3.200500e+04
25%    32645.000000  3.461395e+06
50%    32672.000000  9.784415e+06
75%    32695.000000  3.533295e+07
max    32720.000000  1.060874e+09
After filter:                  id         revid
count     61.000000  6.100000e+01
mean   32669.737705  1.527047e+08
std       30.090365  3.548528e+08
min    32616.000000  3.200500e+04
25%    32644.000000  5.280263e+06
50%    32673.000000  1.241690e+07
75%    32695.000000  3.962001e+07
max    32718.000000  1.060874e+09


In [7]:
# We need to go from a dataframe to an array of text
def get_documents(path: str) -> List[str]:
    df = pd.read_json(path_or_buf=path, lines=True)

    # check that it's read in
    print(f"Read in: {len(df)} lines from jsonl file: {path}")
    print(f"Dataframe columns: {df.columns}")

    # filter rows with empty text
    try:
        df = df[df['text'].str.len() > 0]
        print(f"After filtering rows with empty text, df size: {len(df)}")
    except KeyError as ke:
        return []

    # return only the "text" column to be used with the retriever
    return df.text.to_list()

In [8]:
# for each file in the directory, we need to pull its content
def get_directory_contents(dir_name: str):
    dir_data = list()
    for sub_dir_name in os.listdir(dir_name):
        files = os.listdir(os.path.join(dir_name + '/' + sub_dir_name))
        for _file in files:
            dir_data += get_documents(os.path.join(dir_name + '/' + sub_dir_name+'/' + _file))
    return dir_data


# return all evidence texts as a numpy ndarray
def get_evidence_text():
    return np.asarray(get_directory_contents("data/wiki-raw/sample-enwiki-20211220"))

In [None]:
evidence_texts = get_evidence_text()


In [9]:

# we need to save this an dopen it again, for memory saving purposes
np.savetxt('data/wiki-raw/database/evidence_texts.txt', evidence_texts, fmt='%s')

In [12]:
print(evidence_texts.shape)

(15832,)


In [9]:
# Read in sample database texts if they've already been processed
evidence_texts = np.loadtxt('data/wiki-raw/database/evidence_texts.txt', dtype=str, delimiter='\n')

In [10]:
print(evidence_texts.shape)

(852051,)


In [9]:
# Is it possible to initialize a realm retriever with a block of texts?

# 1. Let's find some texts
evidence_texts = np.asarray(sample_docs.text.to_list())

In [10]:
evidence_texts.shape

(61,)

In [11]:
# 2. Retriever from sample evidence texts
tokenizer = RealmTokenizerFast.from_pretrained("google/realm-cc-news-pretrained-encoder")

retriever = RealmRetriever(evidence_texts, tokenizer)

In [12]:
model = RealmForOpenQA.from_pretrained("google/realm-cc-news-pretrained-encoder", retriever=retriever)


Some weights of the model checkpoint at google/realm-cc-news-pretrained-encoder were not used when initializing RealmForOpenQA: ['realm.encoder.layer.1.attention.output.dense.bias', 'realm.encoder.layer.5.attention.output.dense.weight', 'realm.encoder.layer.7.attention.output.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.bias', 'realm.encoder.layer.0.attention.self.key.bias', 'realm.encoder.layer.8.attention.self.value.weight', 'realm.encoder.layer.10.attention.self.value.weight', 'realm.encoder.layer.0.attention.output.dense.weight', 'cls.predictions.bias', 'realm.encoder.layer.0.attention.self.value.bias', 'realm.encoder.layer.6.attention.self.value.weight', 'realm.encoder.layer.6.attention.self.value.bias', 'realm.encoder.layer.0.output.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'realm.encoder.layer.6.attention.self.key.weight', 'realm.embeddings.LayerNorm.weight', 'realm.encoder.layer.9.attention.self.key.bias', 'realm.encoder.layer.10.intermediate.dense.w

In [9]:
# Load (natural questions) dataset for use during fine-tuning the model

def load_natural_questions():
    MAX_TOKEN_SIZE = 5
    def filter_fn(example):
        """
            Remove answers with length > 5
        """
        for short_ans in example['annotations.short_answers']:
            if len(short_ans) != 0:
                for i in range(len(short_ans['text'])):
                    if short_ans['end_token'][i] - short_ans['start_token'][i] < MAX_TOKEN_SIZE:
                        return True
        return False

    def map_fn(example):
        """ Unify dataset structures """
        return {
            'question': example['question.text'],
            'answers': [answer['text'] for answer in example['annotations.short_answers']]
        }

    dataset = load_dataset("natural_questions",
                           # cache_dir="/run/user/1000/gvfs/smb-share:server=thecloud.local,share=elements_25a3-1",  # NAS device, I don't have enough space locally
                           beam_runner='DirectRunner')  # use default cache location

    # get rid of unused columns and flatten the structure
    train_dev_set = dataset['train'].train_test_split(test_size=0.2, shuffle=False)

    # create train/test/val splits
    train_set = train_dev_set['train'].remove_columns(['id', 'document']).flatten()
    test_set = train_dev_set['test'].remove_columns(['id', 'document']).flatten()
    eval_set = dataset['validation'].remove_columns(['id', 'document']).flatten()

    # Perform filtering and mapping
    train_set_filtered = train_set.filter(filter_fn).map(map_fn)
    test_set_filtered = test_set.filter(filter_fn).map(map_fn)
    eval_set_filtered = eval_set.filter(filter_fn).map(map_fn)

    # After this, an example in this dataset should contain the following columns:
    #     example['question']
    #     example['answers'][num_answers]

    return train_set_filtered, test_set_filtered, eval_set_filtered

In [9]:
# function to load zsRE qa dataset
def load_zsre():
    MAX_TOKEN_SIZE = 5

    def set_subject_text(example):
        """
        Put the subject of the context into the text
            Example: {'question': "What team does XXX belong to?",
                      ...
                      'subject': 'Travis Harmonic'
                      'answers': ['New York Islanders']
                      }
            Into: {'question': "What team does Travis Harmonic belong to?",
                   'subject': 'Travis Harmonic'
                   'answers': ['New York Islanders']}
        """
        return {'question': example['question'].replace('XXX', example['subject']),
                'answers': example['answers']}

    def filter_empty_ans(example):
        """
        Filter questions that have no answers
        """
        return len(example['answers']) > 0

    dataset = load_dataset("qa_zre")  # cache_dir='/tmp/'

    print(f"{get_dataset_infos('qa_zre')}")

    # remove unused columns
    train_set = dataset['train'].remove_columns(['relation', 'context']).flatten()
    test_set = dataset['test'].remove_columns(['relation', 'context']).flatten()
    eval_set = dataset['validation'].remove_columns(['relation', 'context']).flatten()

    # map cleaning (we remove the subject column after fixing the answers)
    train_filtered = train_set.filter(filter_empty_ans).map(set_subject_text).remove_columns(['subject'])
    test_filtered = test_set.filter(filter_empty_ans).map(set_subject_text).remove_columns(['subject'])
    eval_filtered = eval_set.filter(filter_empty_ans).map(set_subject_text).remove_columns(['subject'])

    # Final dataset should look like:
    #  {'question': 'some text',
    #   'answers': [number of answers]

    return train_filtered, test_filtered, eval_filtered # there's no inflection point t the reward

In [10]:
# Define a data collator to use with model
# fixme: prefer torch.utils.data.DataCollator
class DataCollator(object):
    def __init__(self, _tokenizer):
        # self.args = args
        self.tokenizer = _tokenizer

    def __call__(self, batch):
        example = batch[0]
        question = example['question']
        answer_texts = []

        # format the answers properly
        for ans in example['answers']:
            answer_texts += [ans] if isinstance(ans, str) else ans

        # remove duplicates
        answer_texts = list(set(answer_texts))
        if len(answer_texts) != 0:
            answer_ids = self.tokenizer(
                answer_texts,
                add_special_tokens=False,
                return_token_type_ids=False,
                return_attention_mask=False,
            ).input_ids
        else:
            answer_ids = [[-1]]

        return question, answer_texts, answer_ids



In [50]:
# get natural questions dataset for fine-tuning
train_set, test_set, eval_set = load_zsre()
print(f"Train size: {len(train_set)}\nTest size: {len(test_set)}\nValidation size: {len(eval_set)}")

Found cached dataset qa_zre (/home/patrick/.cache/huggingface/datasets/qa_zre/default/0.1.0/9ad49793a90e5078eb59cf29a88c0b0e893635bbb797a054ac70ab55165bf453)


  0%|          | 0/3 [00:00<?, ?it/s]



{'default': DatasetInfo(description='A dataset reducing relation extraction to simple reading comprehension questions\n', citation='@inproceedings{levy-etal-2017-zero,\n    title = "Zero-Shot Relation Extraction via Reading Comprehension",\n    author = "Levy, Omer  and\n      Seo, Minjoon  and\n      Choi, Eunsol  and\n      Zettlemoyer, Luke",\n    booktitle = "Proceedings of the 21st Conference on Computational Natural Language Learning ({C}o{NLL} 2017)",\n    month = aug,\n    year = "2017",\n    address = "Vancouver, Canada",\n    publisher = "Association for Computational Linguistics",\n    url = "https://www.aclweb.org/anthology/K17-1034",\n    doi = "10.18653/v1/K17-1034",\n    pages = "333--342",\n}\n', homepage='http://nlp.cs.washington.edu/zeroshot', license='', features={'relation': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'subject': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'answers': Sequence(feature=Valu

  0%|          | 0/8400 [00:00<?, ?ba/s]

  0%|          | 0/4200000 [00:00<?, ?ex/s]

  0%|          | 0/120 [00:00<?, ?ba/s]

  0%|          | 0/60000 [00:00<?, ?ex/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/3000 [00:00<?, ?ex/s]

Train size: 4200000
Test size: 60000
Validation size: 3000


In [39]:
# Let's work with the pretrained version of the model
def get_pretrained_realm():
    # set activation function to use
    config = RealmConfig(hidden_act='gelu_new')
    tokenizer = RealmTokenizerFast.from_pretrained("google/realm-cc-news-pretrained-embedder", do_lower_case=True)

    # load the new wiki data converted to tf_record blocks to use with the retriever
    block_records = convert_tfrecord_to_np("/home/patrick/realm/data/wiki-raw/database/blocks.tfr",
                                           # num_block_records=config.num_block_records)  # default value is too big for local machine. Uncomment when running on instance
                                           num_block_records=50000)
    print(f"Size of block records: {len(block_records)}")
    _retriever = RealmRetriever(block_records, tokenizer)

    _model = RealmForOpenQA(config=config, retriever=_retriever)

    return _model


In [None]:
# test convert tfrecord function
block_records = convert_tfrecord_to_np("/home/patrick/realm/data/wiki-raw/database/blocks.tfr", num_block_records=50)
print(block_records)
del block_records

In [14]:
def get_realm_for_openqa(config=None):
    if config is None:
        config = RealmConfig(hidden_act='gelu_new')

    _retriever = RealmRetriever.from_pretrained('google/realm-orqa-nq-openqa')
    _model = RealmForOpenQA.from_pretrained('google/realm-orqa-nq-openqa', retriever=_retriever, config=config)

    _model.eval()

    return _model


In [40]:
# Run on zsre data
# model = get_realm_for_openqa()
model = get_pretrained_realm()

Size of block records: 50000


In [30]:
# what's the default value for num_block_records?
RealmConfig().num_block_records
# this is too big for

13353718

In [41]:
retriever = model.retriever
tokenizer = model.retriever.tokenizer

In [24]:
tokenizer = RealmTokenizerFast.from_pretrained("google/realm-cc-news-pretrained-embedder", do_lower_case=True)
tokenizer("What is autism?", return_tensors='pt')

{'input_ids': tensor([[  101,  2054,  2003, 19465,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [52]:
# let's test out our model
#
question = "Who was the Upper Canada rebellion leader?"
question_ids = tokenizer([question], return_tensors='pt')

answer="W L Mackenzie"

answer_ids = tokenizer([answer],
                       add_special_tokens=False,
                       return_token_type_ids=False,
                       return_attention_mask=False).input_ids
answer_ids

[[1059, 1048, 11407]]

In [53]:
reader_output, predicted_ans_ids = model(**question_ids,
                                         answer_ids=answer_ids,
                                         return_dict=False)


In [56]:

predicted_answer = tokenizer.decode(predicted_ans_ids)
print(f"Ans: {predicted_answer}")
reader_output.loss.backward()

Ans: thus enabling their integration within the wider society from


In [57]:
reader_output.loss

tensor(0., grad_fn=<MeanBackward0>)

In [59]:

data_collator = DataCollator(_tokenizer=tokenizer)

train_data_loader = DataLoader(dataset=train_set, batch_size=1, shuffle=True, collate_fn=data_collator)
eval_data_loader = DataLoader(dataset=eval_set, batch_size=1, shuffle=False, collate_fn=data_collator)

# Define a training params, followed by the training loop
num_epochs = 2
num_training_steps = num_epochs * len(train_data_loader)

optim = AdamW(model.parameters(), lr=5e-3)

lr_scheduler = get_scheduler("linear",
                             optimizer=optim,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

progress_bar = tqdm(range(num_training_steps))


  0%|          | 0/8400000 [00:00<?, ?it/s]

In [60]:
# Normalize answer text strings for metric computation
def normalize_answer(text: str):
    """
    Normalize answer
    """
    text = unicodedata.normalize('NFD', text)

    def remove_articles(t: str):
        return re.sub(r"\b(a|an|the)\b", " ", t)

    def white_space_fix(t: str):
        return " ".join(t.split())

    def remove_punct(t: str):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in t if ch not in exclude)

    def lower(t: str):
         return t.lower()

    return white_space_fix(remove_articles(remove_punct(lower(text))))

In [61]:
# test normalize text function

print(f"Normalize: 'a JambalaYa' => '{normalize_answer('JambalaYa')}'")

Normalize: 'a JambalaYa' => 'jambalaya'


In [62]:
# Let's define some metrics training
def compute_metrics(labels, predicted_ans, _reader_output):
    # Exact match
    em = torch.index_select(torch.index_select(_reader_output.reader_correct,
                                               dim=0,
                                               index=_reader_output.block_idx),
                            dim=1,
                            index=_reader_output.candidate)

    def _true_em(_predicted_ans, references):
        return torch.tensor(max([normalize_answer(_predicted_ans) == normalize_answer(ref) for ref in references]))

    true_em = _true_em(predicted_ans, labels)

    eval_metric = dict(exact_match=em[0][0],
                       official_exact_match=true_em,
                       reader_oracle=torch.any(_reader_output.reader_correct))

    # Top k results
    for k in (5, 10, 20, 50):
        eval_metric[f"top_{k}_match"] = torch.any(_reader_output.retriever_correct[:k])

    return eval_metric



In [12]:


model = get_realm_for_openqa()
retriever = model.retriever
tokenizer = model.tokenizer



NameError: name 'get_realm_for_openqa' is not defined

In [64]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

model.to(device)


Device: cuda


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 14.56 GiB total capacity; 198.80 MiB already allocated; 21.50 MiB free; 202.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# define train loop
for epoch in range(num_epochs):
    # set to train mode
    model.train()

    for batch in train_data_loader:
        optim.zero_grad()

        question, answer_texts, answer_ids = batch

        question_ids = tokenizer(question, return_tensors='pt')
        reader_output, predicted_ans_ids = model(input_ids=question_ids.to(device),
                                                 answer_ids=answer_ids,
                                                 return_dict=False)

        predicted_answer = tokenizer.decode(predicted_ans_ids)

        reader_output.loss.backward()

        # clip gradients?

        optim.step()
        lr_scheduler.step()
        # print(f"Epoch: ")
        progress_bar.update(1)

    model.eval()

    metrics = []

    for batch in tqdm(eval_data_loader):
        question, ans_texts, ans_ids = batch

        question_ids = tokenizer(question, return_tensors='pt').input_ids
        with torch.no_grad():
            outputs = model(input_ids=question_ids.to(device),
                            answer_ids=ans_ids,
                            return_dict=True)

            pred_answer = tokenizer.decode(outputs.predicted_answer_ids)
            metrics.append(compute_metrics(ans_texts, pred_answer, outputs.reader_output))

    stack_metrics = {metric_key: torch.stack((*map(lambda metric: metric[metric_key], metrics), )) for metric_key in metrics[0].keys()}
    print(f"Saving model at checkpoint: {epoch}")
    model.save_pretrained()

