# Get text data from LXMERT pretraining data
This notebook extracts the text used for text-only pre-training of `bert-base-uncased`.

First, download the `mscoco_train.json`, `mscoco_nominival.json`, `vgnococo.json` and `mscoco_minival.json` data files as described in https://github.com/airsplay/lxmert#pre-training and put them under `models/data/lxmert`.

In [None]:
# Switch to the LXMERT data directory so the bare file names used in the cells below resolve.
%cd ../../data/lxmert

In [18]:
import json
import numpy as np

In [10]:
# Tasks whose sentences are questions that come with answer labels.
TASKS_WITH_LABELS = {"vqa", "gqa", "visual7w"}
# Every task name expected in the data: the question tasks plus the caption-only ones.
TASK_NAMES = TASKS_WITH_LABELS | {"mscoco", "vg"}

In [51]:
def save_mlm_text_from_lxmert_datafile(datafile, to_file):
    """Append the text found in an LXMERT data file to a JSON-lines file.

    ``datafile`` is loaded as a JSON list of examples. Each example maps task
    names to lists of sentences under ``"sentf"`` and, for the tasks listed in
    ``TASKS_WITH_LABELS``, to a parallel list of ``{answer: score}`` dicts
    under ``"labelf"``. One ``{"text": ...}`` JSON object is appended to
    ``to_file`` per line:

    * Tasks in ``TASKS_WITH_LABELS``: the sentence followed by its
      highest-scoring answer (capitalized), with a trailing "." added unless
      the text already ends in "." or "!". Sentences with no answers, or whose
      combined text is blank, are skipped.
    * All other tasks: the sentence with surrounding whitespace stripped.

    Args:
        datafile: Path of the LXMERT JSON file to read.
        to_file: Path of the JSON-lines file to append to.

    Raises:
        AssertionError: If an example contains a task name not in
            ``TASK_NAMES``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(datafile, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(to_file, "a", encoding="utf-8") as f:
        for example in data:
            for task_name, sentences in example["sentf"].items():
                assert task_name in TASK_NAMES, (
                    "unknown task name: " + repr(task_name)
                )

                # captions are just added as they are
                if task_name not in TASKS_WITH_LABELS:
                    for caption in sentences:
                        f.write(json.dumps({"text": caption.strip()}) + "\n")
                    continue

                # questions should have their answers appended
                for ex_ix, question in enumerate(sentences):
                    ans_alternatives = example["labelf"][task_name][ex_ix]
                    # for some reason, some questions don't have answers in
                    # the data. skip them
                    if not ans_alternatives:
                        continue
                    # Highest-scoring answer; ties resolve to the first one.
                    best_ans = max(ans_alternatives, key=ans_alternatives.get)
                    text = (question + " " + best_ans.capitalize()).strip()
                    # A blank question with a blank answer leaves nothing to
                    # write (and nothing to index for the punctuation check).
                    if not text:
                        continue
                    if not text.endswith((".", "!")):
                        text += "."
                    f.write(json.dumps({"text": text}) + "\n")

In [53]:
# Build the training corpus from the nominival, train and VG (no COCO) splits.
OUT_FILE = "train_mlm.jsonl"
with open(OUT_FILE, "w"):
    pass  # start from an empty file, since the extraction function appends
save_mlm_text_from_lxmert_datafile(datafile="mscoco_nominival.json", to_file=OUT_FILE)
save_mlm_text_from_lxmert_datafile(datafile="mscoco_train.json", to_file=OUT_FILE)
save_mlm_text_from_lxmert_datafile(datafile="vgnococo.json", to_file=OUT_FILE)

In [54]:
# Build the validation corpus from the minival split.
OUT_FILE = "val_mlm.jsonl"
with open(OUT_FILE, "w"):
    pass  # start from an empty file, since the extraction function appends
save_mlm_text_from_lxmert_datafile(datafile="mscoco_minival.json", to_file=OUT_FILE)