In [443]:
%load_ext autoreload
%autoreload 2

import os
import argparse
import json
import re
import codecs
import shutil
import io
import tempfile
import torch
import numpy as np
import pandas as pd
import fairseq
from fairseq.data.encoders.gpt2_bpe import GPT2BPE, GPT2BPEConfig
from fairseq.tasks import TASK_REGISTRY
from sentencepiece import SentencePieceProcessor
from fairseq.binarizer import VocabularyDatasetBinarizer, FileBinarizer, AlignmentDatasetBinarizer, BinarizeSummary
from fairseq.data import data_utils
from fairseq.data import Dictionary
from fairseq.data import StripTokenDataset, AppendTokenDataset, TruncateDataset, RandomCropDataset, AppendTokenDataset, PrependTokenDataset, ConcatDataset, PadDataset, TokenBlockDataset, \
    MonolingualDataset, LanguagePairDataset, MaskTokensDataset, NumelDataset, ConcatSentencesDataset, NestedDictionaryDataset, RawLabelDataset

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
TASK_REGISTRY.keys()

dict_keys(['multilingual_masked_lm', 'translation', 'translation_lev', 'translation_multi_simple_epoch', 'speech_unit_modeling', 'hubert_pretraining', 'multilingual_translation', 'language_modeling', 'masked_lm', 'audio_pretraining', 'audio_finetuning', 'multilingual_language_modeling', 'speech_to_text', 'simul_speech_to_text', 'simul_text_to_text', 'legacy_masked_lm', 'sentence_prediction', 'translation_from_pretrained_xlm', 'text_to_speech', 'translation_from_pretrained_bart', 'denoising', 'multilingual_denoising', 'frm_text_to_speech', 'sentence_prediction_adapters', 'online_backtranslation', 'cross_lingual_lm', 'sentence_ranking', 'semisupervised_translation', 'speech_to_speech', 'dummy_lm', 'dummy_masked_lm', 'dummy_mt'])

In [3]:
TASK_REGISTRY

{'multilingual_masked_lm': fairseq.tasks.multilingual_masked_lm.MultiLingualMaskedLMTask,
 'translation': fairseq.tasks.translation.TranslationTask,
 'translation_lev': fairseq.tasks.translation_lev.TranslationLevenshteinTask,
 'translation_multi_simple_epoch': fairseq.tasks.translation_multi_simple_epoch.TranslationMultiSimpleEpochTask,
 'speech_unit_modeling': fairseq.tasks.speech_ulm_task.SpeechUnitLanguageModelingTask,
 'hubert_pretraining': fairseq.tasks.hubert_pretraining.HubertPretrainingTask,
 'multilingual_translation': fairseq.tasks.multilingual_translation.MultilingualTranslationTask,
 'language_modeling': fairseq.tasks.language_modeling.LanguageModelingTask,
 'masked_lm': fairseq.tasks.masked_lm.MaskedLMTask,
 'audio_pretraining': fairseq.tasks.audio_pretraining.AudioPretrainingTask,
 'audio_finetuning': fairseq.tasks.audio_finetuning.AudioFinetuningTask,
 'multilingual_language_modeling': fairseq.tasks.multilingual_language_modeling.MultilingualLanguageModelingTask,
 'spee

In [4]:
len(TASK_REGISTRY)

32

# Pipeline Building

## Translation data

<p> Japanese English dataset  bsd_ja_en</p>

In [6]:
dataset_path = "/mnt/dl/NLP/bsd_ja_en/data/"

In [7]:
translation_task = TASK_REGISTRY["translation"]

In [14]:
en_spm = SentencePieceProcessor(os.path.join(dataset_path, "sentencepiece/train.en.m"))
ja_spm = SentencePieceProcessor(os.path.join(dataset_path, "sentencepiece/train.ja.m"))

In [28]:
new_path = os.path.join(dataset_path, "translation")
os.makedirs(new_path, exist_ok=True)
for name in ["train", "validation", "test"]:
    with open(os.path.join(dataset_path, f"{name}.en")) as src, open(os.path.join(dataset_path, f"{name}.ja")) as tgt, \
        open(os.path.join(new_path, f"{name}.en-ja.en"), "w") as new_src, open(os.path.join(new_path, f"{name}.en-ja.ja"), "w") as new_tgt:
        for line in src:
            new_src.write(" ".join(en_spm.EncodeAsPieces(line)) + "\n")
        for line in tgt:
            new_tgt.write(" ".join(ja_spm.EncodeAsPieces(line)) + "\n")

In [34]:
en_dict = translation_task.build_dictionary([os.path.join(new_path, f"train.en-ja.en")], 
                                            workers=8)

In [36]:
len(en_dict.symbols)

1064

In [46]:
en_dict.get_count(6)

8777

In [51]:
en_dict.index("bos")

3

In [50]:
en_dict[3]

'<unk>'

In [54]:
en_dict.symbols[3]

'<unk>'

In [55]:
en_dict[6]

'▁'

In [57]:
en_dict[2]

'</s>'

In [58]:
ja_dict = translation_task.build_dictionary([os.path.join(new_path, f"train.en-ja.ja")], 
                                            workers=8)

In [59]:
ja_dict[2]

'</s>'

In [60]:
ja_dict[0]

'<s>'

In [61]:
ja_dict[1]

'<pad>'

In [62]:
ja_dict[2]

'</s>'

In [63]:
ja_dict[410]

'次'

In [64]:
ja_dict.index('次')

410

In [78]:
en_dict.save(os.path.join(new_path, "en_dict.en-ja.en"))

In [66]:
en_ja_trans_savepath = os.path.join(new_path, "data-bin")
os.makedirs(en_ja_trans_savepath, exist_ok=True)

In [72]:
parser = argparse.ArgumentParser()

In [75]:
parser.add_argument("--dataset_impl")

_StoreAction(option_strings=['--dataset_impl'], dest='dataset_impl', nargs=None, const=None, default=None, type=None, choices=None, help=None, metavar=None)

In [76]:
args = parser.parse_args("--dataset_impl mmap".split(" ") )

In [77]:
args.dataset_impl

'mmap'

In [80]:
binarizer = VocabularyDatasetBinarizer(en_dict) 

In [81]:
binarizer

<fairseq.binarizer.VocabularyDatasetBinarizer at 0x7f0dc17002e0>

In [85]:
binarizer.binarize_line("▁You ' re ▁ very ▁we l com e .", BinarizeSummary())

tensor([133,  12,  54,   6, 164,  37, 113, 190,  15,   4,   2],
       dtype=torch.int32)

In [87]:
" ".join(en_dict[i] for i in binarizer.binarize_line("▁You ' re ▁ very ▁we l com e .", BinarizeSummary()))

"▁You ' re ▁ very ▁we l com e . </s>"

In [88]:
# Build binary dataset
# train.en-ja.en.bin and  train.en-ja.en.idx
final_summary = FileBinarizer.multiprocess_dataset(
        "/mnt/dl/NLP/bsd_ja_en/data/translation/train.en-ja.en",
        args.dataset_impl,
        binarizer,
        "/mnt/dl/NLP/bsd_ja_en/data/translation/data-bin/train.en-ja.en",
        vocab_size=len(en_dict),
        num_workers=8,
    )

In [90]:
final_summary.num_seq

20000

In [92]:
# Load dataset for training 
en_dataset = data_utils.load_indexed_dataset("/mnt/dl/NLP/bsd_ja_en/data/translation/data-bin/train.en-ja.en",
                                             dictionary=en_dict, dataset_impl=args.dataset_impl)


2023-11-17 20:54:32 | INFO | fairseq.data.data_utils | loaded 20,000 examples from: /mnt/dl/NLP/bsd_ja_en/data/translation/data-bin/train.en-ja.en


In [95]:
en_dataset[0]

tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
         11, 447, 180,   4,   2])

In [97]:
en_dict.string(en_dataset[0])

"▁So ▁let ' s ▁p re t end ▁we ▁have ▁to ▁export ▁a ▁product ▁to ▁Japan ▁today ."

In [98]:
en_dict.encode_line(en_dict.string(en_dataset[0]))

tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
         11, 447, 180,   4,   2], dtype=torch.int32)

In [117]:
en_dict.eos_index, en_dict[2], en_dict.eos()

(2, '</s>', 2)

In [118]:
d1 = StripTokenDataset(en_dataset, en_dict.eos())

In [119]:
en_dataset[0], d1[0]

(tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
          11, 447, 180,   4,   2]),
 tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
          11, 447, 180,   4]))

In [121]:
trunc_d1 = TruncateDataset(d1, 5)

In [122]:
trunc_d1[0]

tensor([ 98, 170,  12,   5,  85])

In [123]:
trunc_d1 = TruncateDataset(d1, 0)

In [124]:
trunc_d1[0]

tensor([], dtype=torch.int64)

In [128]:
append_d1 = AppendTokenDataset(d1, en_dict.eos())

In [129]:
append_d1[0]

tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
         11, 447, 180,   4,   2])

In [132]:
prepend_d1 = PrependTokenDataset(d1, en_dict.bos())

In [133]:
prepend_d1[0]

tensor([  0,  98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14,
        202,  11, 447, 180,   4])

In [134]:
crop_d1 = RandomCropDataset(d1,  5)

In [137]:
crop_d1[0], d1[0]

(tensor([  5,  85,  54,  10, 343]),
 tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
          11, 447, 180,   4]))

In [138]:
crop_d1[0], d1[0]

(tensor([  5,  85,  54,  10, 343]),
 tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
          11, 447, 180,   4]))

In [139]:
crop_d1[1], d1[1]

(tensor([165,  46,  13,  16]), tensor([165,  46,  13,  16]))

In [140]:
crop_d1[56], d1[56]

(tensor([  7, 245,  75,  64,  92]),
 tensor([229,   7, 245,  75,  64,  92,   4]))

In [141]:
len(en_dataset)

20000

In [143]:
concat_d1 = ConcatDataset([en_dataset, en_dataset])

In [144]:
len(concat_d1)

40000

In [145]:
concat_d1[0]

tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
         11, 447, 180,   4,   2])

In [146]:
concat_d1[20000]

tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
         11, 447, 180,   4,   2])

In [153]:
pad_d1 = PadDataset(concat_d1, en_dict.pad(), left_pad=False, pad_length=50)

In [154]:
pad_d1[0]

tensor([ 98, 170,  12,   5,  85,  54,  10, 343,  37,  36,  11, 794,  14, 202,
         11, 447, 180,   4,   2])

## Language Modeling
<p> Wiki-103 </p>

In [158]:
lm_dst_path = "/mnt/dl/NLP/wikitext-103-v1/wikitext-103/subword_bpe/"
wiki_base_fname = "wiki.%s.tokens"

In [157]:
lm_task = TASK_REGISTRY["language_modeling"]

In [166]:
lm_dict = lm_task.build_dictionary([os.path.join(lm_dst_path, wiki_base_fname % "train")], workers=8, padding_factor=8)

In [168]:
len(lm_dict)

41144

In [170]:
lm_vocab_binarizer = VocabularyDatasetBinarizer(lm_dict)

In [172]:
lm_bin_dst_path = os.path.join(lm_dst_path, "data-bin")
os.makedirs(lm_bin_dst_path, exist_ok=True)

In [173]:
lm_file_binarizer = FileBinarizer.multiprocess_dataset(
    os.path.join(lm_dst_path, wiki_base_fname % "train"),
    dataset_impl=args.dataset_impl,
    binarizer=lm_vocab_binarizer, 
    output_prefix=os.path.join(lm_bin_dst_path, wiki_base_fname % "train"),
    vocab_size=len(lm_dict),
    num_workers=8
    )

In [174]:
lm_dict.save(os.path.join(lm_dst_path, "wiki.vocab"))

In [175]:
wiki_dataset = data_utils.load_indexed_dataset(os.path.join(lm_bin_dst_path, wiki_base_fname % "train"))

2023-11-18 02:42:55 | INFO | fairseq.data.data_utils | loaded 1,801,350 examples from: /mnt/dl/NLP/wikitext-103-v1/wikitext-103/subword_bpe/data-bin/wiki.train.tokens


In [176]:
len(wiki_dataset)

1801350

In [177]:
wiki_dataset[0]

tensor([2])

In [178]:
wiki_dataset[1]

tensor([   12, 33606,  6456, 15155,  1309,    12,     2])

In [180]:
lm_dict.string(wiki_dataset[0])

''

In [179]:
lm_dict.string(wiki_dataset[1])

'= Valky@@ ria Chronicles III ='

In [182]:
lm_dict.string(wiki_dataset[3])

'Sen@@ jō no Valky@@ ria 3 : <unk> Chronicles ( Japanese : 戦@@ 場@@ の@@ ヴ@@ ァ@@ ル@@ キ@@ ュ@@ リ@@ ア@@ 3 , lit . Valky@@ ria of the Battlefield 3 ) , commonly referred to as Valky@@ ria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Medi@@ a@@ .@@ Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valky@@ ria series . Emp@@ lo@@ ying the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nam@@ eless " , a penal military unit serving the nation of Gal@@ lia during the Second Europ@@ an War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " .'

In [184]:
wiki_dataset.sizes

array([  1,   7,   1, ..., 166,  14,   1], dtype=int32)

In [197]:
wiki_block_dataset = TokenBlockDataset(wiki_dataset, sizes=wiki_dataset.sizes, 
                                       block_size=16, pad=lm_dict.pad(), 
                                       eos=lm_dict.eos(), include_targets=True, 
                                       break_mode="none") 

In [198]:
len(wiki_block_dataset)

6925646

In [202]:
# source, item, past_target
wiki_block_dataset[0]

(tensor([    2,     2,    12, 33606,  6456, 15155,  1309,    12,     2,     2,
          6082, 30056,   129, 33606,  6456,    92]),
 tensor([    2,    12, 33606,  6456, 15155,  1309,    12,     2,     2,  6082,
         30056,   129, 33606,  6456,    92,    45]),
 tensor([    1,     2,     2,    12, 33606,  6456, 15155,  1309,    12,     2,
             2,  6082, 30056,   129, 33606,  6456]))

In [203]:
torch.hstack([wiki_dataset[0], wiki_dataset[1], wiki_dataset[2], wiki_dataset[3]])

tensor([    2,    12, 33606,  6456, 15155,  1309,    12,     2,     2,  6082,
        30056,   129, 33606,  6456,    92,    45,     3, 15155,    25,   523,
           45, 40160, 40063, 37383, 39585, 37913, 36050, 39860, 39862, 37864,
        38910,    92,     5,  6820,     6, 33606,  6456,     7,     4, 30939,
           92,    24,     5,  2390,  1356,     9,    19, 33606,  6456, 15155,
         1309,   880,   747,     5,    26,    11,  8547,   315,    15,   619,
          309,    84,   485,    23,  6390,     8, 17065,   611,  2266, 14573,
           21,     4,  2552, 18918,     6, 13789,    10,   237,   337,    10,
          747,     5,    31,    26,     4,   246,    84,    10,     4, 33606,
         6456,   126,     6, 22740,  1149,  3999,     4,   163,  7273,     7,
         8547,     8,   938,    15,    63,  2766,    19,    47,  9595,     5,
            4,   353,   705,  3451,     9,     4,    42,    84,     8,  2116,
            4,    13, 12860,  8246,    13,     5,    11, 34100, 

In [204]:
wiki_block_dataset[1]

(tensor([   45,     3, 15155,    25,   523,    45, 40160, 40063, 37383, 39585,
         37913, 36050, 39860, 39862, 37864, 38910]),
 tensor([    3, 15155,    25,   523,    45, 40160, 40063, 37383, 39585, 37913,
         36050, 39860, 39862, 37864, 38910,    92]),
 tensor([   92,    45,     3, 15155,    25,   523,    45, 40160, 40063, 37383,
         39585, 37913, 36050, 39860, 39862, 37864]))

In [206]:
wiki_train_dataset = MonolingualDataset(wiki_block_dataset, sizes=wiki_block_dataset.sizes,
                                        src_vocab=lm_dict, tgt_vocab=lm_dict, 
                                        shuffle=False, targets=["future"], 
                                        add_bos_token=False, fixed_pad_length=None, 
                                        pad_to_bsz=None
                                        )

In [207]:
wiki_train_dataset[0]

{'id': 0,
 'source': tensor([    2,     2,    12, 33606,  6456, 15155,  1309,    12,     2,     2,
          6082, 30056,   129, 33606,  6456,    92]),
 'target': tensor([    2,    12, 33606,  6456, 15155,  1309,    12,     2,     2,  6082,
         30056,   129, 33606,  6456,    92,    45])}

In [208]:
wiki_train_dataset[1]

{'id': 1,
 'source': tensor([   45,     3, 15155,    25,   523,    45, 40160, 40063, 37383, 39585,
         37913, 36050, 39860, 39862, 37864, 38910]),
 'target': tensor([    3, 15155,    25,   523,    45, 40160, 40063, 37383, 39585, 37913,
         36050, 39860, 39862, 37864, 38910,    92])}

In [212]:
# Shuffle
wiki_train_shuffle_dataset = MonolingualDataset(wiki_block_dataset, sizes=wiki_block_dataset.sizes,
                                        src_vocab=lm_dict, tgt_vocab=lm_dict, 
                                        shuffle=True, targets=["future"], 
                                        add_bos_token=False, fixed_pad_length=None, 
                                        pad_to_bsz=None
                                        )

In [214]:
wiki_train_shuffle_dataset[10]

{'id': 10,
 'source': tensor([   2,   16,   84,  143,  478,   10,  307,    5, 2904,   72,   11,  225,
         1676,    7,    4,  139]),
 'target': tensor([  16,   84,  143,  478,   10,  307,    5, 2904,   72,   11,  225, 1676,
            7,    4,  139, 1109])}

In [217]:
wiki_block_dataset[10]

(tensor([   2,   16,   84,  143,  478,   10,  307,    5, 2904,   72,   11,  225,
         1676,    7,    4,  139]),
 tensor([  16,   84,  143,  478,   10,  307,    5, 2904,   72,   11,  225, 1676,
            7,    4,  139, 1109]),
 tensor([   1,    2,   16,   84,  143,  478,   10,  307,    5, 2904,   72,   11,
          225, 1676,    7,    4]))

In [220]:
wiki_train_shuffle_dataset[-1], len(wiki_train_shuffle_dataset[-1]["source"])

({'id': -1,
  'source': tensor([  642,    19,   552,   419,    18, 17159,    17,   538,   309,    84,
          10405,     6,     2]),
  'target': tensor([   19,   552,   419,    18, 17159,    17,   538,   309,    84, 10405,
              6,     2,     2])},
 13)

## Masked Language
Wiki 103 with sentencepiece

In [224]:
wiki_raw_dst_path = '/mnt/dl/NLP/wikitext-103-v1/wikitext-103'
wiki_pieces_dst_path = '/mnt/dl/NLP/wikitext-103-v1/wikitext-103/sentencepieces/'
wiki_bin_dst_path = '/mnt/dl/NLP/wikitext-103-v1/wikitext-103/sentencepieces/data-bin'
os.makedirs(wiki_bin_dst_path, exist_ok=True)

In [272]:
gpt2_bpe_cfg = GPT2BPEConfig(gpt2_encoder_json=os.path.join(wiki_pieces_dst_path, "encoder.json"),
                             gpt2_vocab_bpe=os.path.join(wiki_pieces_dst_path, "vocab.bpe"))

In [273]:
gpt2_bpe = GPT2BPE(gpt2_bpe_cfg)

In [274]:
gpt2_bpe.encode("Hello")

'15496'

In [236]:
# Transform dataset
for split in ["train", "valid", "test"]:
    print(f"Transforming {split} dataset")
    with (open(os.path.join(wiki_raw_dst_path, wiki_base_fname % split), "r") as rf, 
          open(os.path.join(wiki_pieces_dst_path, wiki_base_fname % split), "w") as wf):
        for i, line in enumerate(rf):
            line = line.strip()
            new_line = gpt2_bpe.encode(line)
            print(new_line, file=wf)

Transforming train dataset
Transforming valid dataset
Transforming test dataset


In [237]:
masked_lm_task = TASK_REGISTRY["masked_lm"]

In [238]:
masked_lm_dict = masked_lm_task.load_dictionary(os.path.join(wiki_pieces_dst_path, "dict.txt"))

In [240]:
masked_lm_vocab_binarizer = VocabularyDatasetBinarizer(masked_lm_dict)

In [241]:
masked_lm_binarizer = FileBinarizer.multiprocess_dataset(os.path.join(wiki_pieces_dst_path, "wiki.train.tokens"),
                                                         dataset_impl=args.dataset_impl, binarizer=masked_lm_vocab_binarizer,
                                                         output_prefix=os.path.join(wiki_bin_dst_path, wiki_base_fname % "train"),
                                                         vocab_size=len(masked_lm_dict), num_workers=10)

In [242]:
mask_idx = masked_lm_dict.add_symbol("<mask>")

In [243]:
mask_idx

50264

In [246]:
masked_lm_dict.count[mask_idx]

1

In [247]:
masked_lm_dataset = data_utils.load_indexed_dataset(os.path.join(wiki_bin_dst_path, wiki_base_fname % "train"), 
                                                    masked_lm_dict, dataset_impl=args.dataset_impl)

2023-11-18 06:43:11 | INFO | fairseq.data.data_utils | loaded 1,801,350 examples from: /mnt/dl/NLP/wikitext-103-v1/wikitext-103/sentencepieces/data-bin/wiki.train.tokens


In [248]:
masked_lm_dataset.sizes

array([  1,   8,   1, ..., 172,  15,   1], dtype=int32)

In [249]:
masked_lm_dataset[0]

tensor([2])

In [254]:
masked_lm_dataset[1]

tensor([ 5214,   468, 44068,  6374, 41674,  6395,  5457,     2])

In [250]:
sample_size = 16

In [251]:
masked_lm_token_dataset = TokenBlockDataset(masked_lm_dataset, masked_lm_dataset.sizes, 
                                            block_size=sample_size - 1, pad=masked_lm_dict.pad(), 
                                            eos=masked_lm_dict.eos(), break_mode="none")

In [253]:
masked_lm_token_dataset[0], len(masked_lm_token_dataset[0])

(tensor([    2,  5214,   468, 44068,  6374, 41674,  6395,  5457,     2,     2,
         24365,   267, 38183,   117,   468]),
 15)

In [255]:
# bos as [CLS] for bert models
masked_lm_token_cls_dataset = PrependTokenDataset(masked_lm_token_dataset, masked_lm_dict.bos())

In [256]:
masked_lm_token_cls_dataset[0], len(masked_lm_token_cls_dataset[0])

(tensor([    0,     2,  5214,   468, 44068,  6374, 41674,  6395,  5457,     2,
             2, 24365,   267, 38183,   117,   468]),
 16)

In [258]:
masked_lm_dataset_src, masked_lm_dataset_tgt = MaskTokensDataset.apply_mask(masked_lm_token_cls_dataset, 
                                                                            vocab=masked_lm_dict,
                                                                            pad_idx=masked_lm_dict.pad(),
                                                                            mask_idx=mask_idx,
                                                                            seed=0,
                                                                            mask_prob=0.15,
                                                                            leave_unmasked_prob= 0.1,
                                                                            random_token_prob= 0.1,
                                                                            freq_weighted_replacement=False,
                                                                            mask_whole_words=None,
                                                                            mask_multiple_length= 1,
                                                                            mask_stdev= .0,
                                                                            )

In [261]:
masked_lm_dataset_src[0]

tensor([    0,     2,  5214,   468, 44068, 50264, 41674,  6395,  5457,     2,
            2, 24365, 50264, 38183,   117,   468])

In [265]:
masked_lm_dict.string(masked_lm_dataset_src[0])

'28 569 18354 <mask> 17740 6711 796 10445 <mask> 13090 645 569'

In [266]:
masked_lm_dict.encode_line('28 569 18354 <mask> 17740 6711 796 10445 <mask> 13090 645 569')

tensor([ 5214,   468, 44068, 50264, 41674,  6395,  5457, 24365, 50264, 38183,
          117,   468,     2], dtype=torch.int32)

In [275]:
gpt2_bpe.decode('28 569 18354 <mask> 17740 6711 796 10445 <mask> 13090 645 569')

'= Valky<mask> Chronicles III =Sen<mask>ō no V'

In [276]:
gpt2_bpe.decode(masked_lm_dict.string(masked_lm_token_cls_dataset[0]))

'= Valkyria Chronicles III =Senjō no V'

In [277]:
masked_lm_dataset_src[0], masked_lm_dataset_tgt[0]

(tensor([    0,     2,  5214,   468, 44068, 50264, 41674,  6395,  5457,     2,
             2, 24365, 50264, 38183,   117,   468]),
 tensor([   1,    1,    1,    1,    1, 6374,    1,    1,    1,    1,    1,    1,
          267,    1,    1,    1]))

In [278]:
masked_lm_token_cls_dataset[0]

tensor([    0,     2,  5214,   468, 44068,  6374, 41674,  6395,  5457,     2,
            2, 24365,   267, 38183,   117,   468])

In [282]:
NumelDataset(masked_lm_dataset_src, )[20]

16

In [283]:
NumelDataset(masked_lm_dataset_src, )[len(masked_lm_dataset_src) - 1]

11

In [298]:
# Try break mode in token block used for pretraining
masked_lm_token_dataset = TokenBlockDataset(masked_lm_dataset, masked_lm_dataset.sizes, 
                                            block_size=sample_size - 1, pad=masked_lm_dict.pad(), 
                                            eos=masked_lm_dict.eos(), break_mode="complete")

In [299]:
masked_lm_token_dataset[0]

tensor([    2,  5214,   468, 44068,  6374, 41674,  6395,  5457,     2,     2])

In [300]:
masked_lm_token_dataset[1]

tensor([24365,   267, 38183,   117,   468, 44068,  6374,   155,  4832, 28696,
         6435, 15698, 41674,    36,  2898,  4832, 47416, 23133, 18164, 42393,
        21402, 20024, 48018, 50033, 49080, 49587, 49432, 48947, 49017,   246,
         2156,  6474,   479,   468, 44068,  6374,     9,     5, 36954,   155,
         4839,  2156, 10266,  4997,     7,    25,   468, 44068,  6374, 41674,
         6395,   751,  1429,  2156,    16,    10, 15714,   774,   787,    12,
         1039,   816,   569,   177,  2226,    30, 43561,     8,  2454,     4,
        36753,    13,     5, 15592, 39435,   479, 30939,    11,   644,  1466,
           11,  1429,  2156,    24,    16,     5,   371,   177,    11,     5,
          468, 44068,  6374,   651,   479, 23564,   154,     5,   276, 24904,
            9, 15714,     8,   588,   787,    12,  1039,    86, 23841,    25,
           63, 20193,  2156,     5,   527,  1237, 12980,     7,     5,    78,
          177,     8,  3905,     5,    22,  8603, 13802,    22, 

In [301]:
masked_lm_token_dataset[2]

tensor([  133,   177,   880,   709,    11,  1824,  2156,  3406,    81,    10,
          739,  4745,     9,     5,   173,   626,    15,   468, 44068,  6374,
        41674,  3082,   479,   616,    24, 12544,     5,  2526,  1575,     9,
            5,   651,  2156,    24,    67, 12796,  1533, 11431,  2156,   215,
           25,   442,     5,   177,    55, 36341,    13,   651, 19298,   479,
        35177,  6004, 28696,  6435, 15698,  8768,   267,  1438,     8, 17964,
        15225, 23552, 17040, 36066,   258,  1835,    31,   986, 11410,  2156,
          552,    19,   468, 44068,  6374, 41674,  3082,   736, 29072,  3592,
        10548,  6498,   479,    83,   739,   165,     9,  6737,  7521,     5,
         8543,   479,    20,   177,   128,    29,  1273,  4782,    21, 26115,
           30,   392,   128,   282,   479,     2])

In [302]:
masked_lm_token_dataset[2].size()

torch.Size([106])

In [303]:
masked_lm_token_cls_dataset2 = PrependTokenDataset(masked_lm_token_dataset, masked_lm_dict.bos())

In [304]:
masked_lm_token_cls_dataset2[0]

tensor([    0,     2,  5214,   468, 44068,  6374, 41674,  6395,  5457,     2,
            2])

In [305]:
masked_lm_token_cls_dataset2[1]

tensor([    0, 24365,   267, 38183,   117,   468, 44068,  6374,   155,  4832,
        28696,  6435, 15698, 41674,    36,  2898,  4832, 47416, 23133, 18164,
        42393, 21402, 20024, 48018, 50033, 49080, 49587, 49432, 48947, 49017,
          246,  2156,  6474,   479,   468, 44068,  6374,     9,     5, 36954,
          155,  4839,  2156, 10266,  4997,     7,    25,   468, 44068,  6374,
        41674,  6395,   751,  1429,  2156,    16,    10, 15714,   774,   787,
           12,  1039,   816,   569,   177,  2226,    30, 43561,     8,  2454,
            4, 36753,    13,     5, 15592, 39435,   479, 30939,    11,   644,
         1466,    11,  1429,  2156,    24,    16,     5,   371,   177,    11,
            5,   468, 44068,  6374,   651,   479, 23564,   154,     5,   276,
        24904,     9, 15714,     8,   588,   787,    12,  1039,    86, 23841,
           25,    63, 20193,  2156,     5,   527,  1237, 12980,     7,     5,
           78,   177,     8,  3905,     5,    22,  8603, 13802, 

In [306]:
masked_lm_dataset_src, masked_lm_dataset_tgt = MaskTokensDataset.apply_mask(masked_lm_token_cls_dataset2, 
                                                                            vocab=masked_lm_dict,
                                                                            pad_idx=masked_lm_dict.pad(),
                                                                            mask_idx=mask_idx,
                                                                            seed= 0,
                                                                            mask_prob=0.15,
                                                                            leave_unmasked_prob= 0.1,
                                                                            random_token_prob= 0.1,
                                                                            freq_weighted_replacement=False,
                                                                            mask_whole_words=None,
                                                                            mask_multiple_length= 1,
                                                                            mask_stdev= .0,
                                                                            )

In [307]:
masked_lm_dataset_src[0]

tensor([    0,     2,  5214,   468, 50264,  6374, 41674,  6395,  5457,     2,
            2])

In [309]:
# During training this segment of text will be cut off to fit the transformer input size
masked_lm_dataset_src[1]

tensor([    0, 24365,   267, 38183,   117,   468, 44068,  6374,   155,  4832,
        28696,  6435, 15698, 41674,    36,  2898,  4832, 47416, 23133, 18164,
        42393, 21402, 20024, 48018, 50033, 49080, 49587, 49432, 50264, 50264,
          246, 50264,  6474,   479,   468, 44068,  6374,     9,     5, 36954,
          155,  4839,  2156, 10266,  4997,     7,    25,   468, 44068, 50264,
        41674, 50264,   751,  1429,  2156,    16,    10, 15714,   774,   787,
           12,  1039,   816, 50264,   177,  2226,    30, 43561,     8,  2454,
            4, 36753,    13,     5, 15592, 39435,   479, 30939,    11,   644,
        50264,    11,  1429,  2156,    24, 50264,     5,   371,   177,    11,
            5,   468, 44068, 50264,   651,   479, 23564, 50264,     5,   276,
        24904,     9, 15714,     8, 50264,   787,    12,  1039, 50264, 23841,
           25,    63, 20193,  2156,     5,   527,  1237, 50264,     7,     5,
           78, 50264, 50264,  3905,     5,    22,  8603, 50264, 

In [324]:
lm_dict[0]

'<s>'

RACE finetuning. Sentence Ranking

In [367]:
## Process race files
race_raw_path = "/mnt/dl/NLP/RACE"
race_raw_processed = "/mnt/dl/NLP/RACE/processed"
os.makedirs(race_raw_processed, exist_ok=True)
splits = ["train"]
levels = ["high", "middle"]
for split in splits:
    os.makedirs(os.path.join(race_raw_processed, split), exist_ok=True)
    split_path = os.path.join(race_raw_path, split)
    save_inputs = dict()
    for col in ["input0", "input1", "input2", "input3", "input4", "label"]:
        save_inputs[col] = codecs.open(os.path.join(race_raw_processed, split, col + ".txt"), "w")
    samples = []
    for level in levels:
        level_path = os.path.join(split_path, level)
        for fname in sorted(os.listdir(level_path), key=lambda x: int(x.replace(".txt", ""))):
            curr_file = os.path.join(level_path, fname)
            with open(curr_file, encoding="utf-8") as f:
                contents = json.load(f)
                answers = contents["answers"]
                questions = contents["questions"]
                article = contents["article"].replace("\n", " ")
                for i, answer in enumerate(answers):
                    label = ord(answer) - ord("A")
                    question = questions[i]
                    options = contents["options"][i]
                    print(article, file=save_inputs["input0"])
                    print(str(label), file=save_inputs["label"])
                    for j, option in enumerate(options):
                        if "_" in question:
                            qa = question.replace("_", option)
                        else:
                            qa = " ".join([question, option])
                        qa = re.sub("\s+", " ", qa)
                        if qa[-1] == qa[-3] == ".":
                            qa = qa[:-3] + qa[-2: len(qa)]
                        print(qa, file=save_inputs[f"input{j+1}"])
    for _, f in save_inputs.items():
        f.close()


In [368]:
race_data_path = "/mnt/dl/NLP/RACE/processed/sentencepiece/"
os.makedirs(race_data_path, exist_ok=True)

In [373]:
gpt2_bpe_cfg = GPT2BPEConfig(gpt2_encoder_json=os.path.join(race_data_path, "encoder.json"), 
                             gpt2_vocab_bpe=os.path.join(race_data_path, "vocab.bpe"))
gpt2_bpe = GPT2BPE(cfg=gpt2_bpe_cfg)
for _, fname in save_inputs.items():
    fname = fname.name
    print(fname)
    if "label" in fname:
        shutil.copyfile(fname, os.path.join(race_data_path, "train.label.txt"))
        continue
    with open(fname) as rf, open(os.path.join(race_data_path, f"train.{os.path.basename(fname)}"), "w") as wf:
        for line in rf:
            print(gpt2_bpe.encode(line), file=wf)       

/mnt/dl/NLP/RACE/processed/train/input0.txt
/mnt/dl/NLP/RACE/processed/train/input1.txt
/mnt/dl/NLP/RACE/processed/train/input2.txt
/mnt/dl/NLP/RACE/processed/train/input3.txt
/mnt/dl/NLP/RACE/processed/train/input4.txt
/mnt/dl/NLP/RACE/processed/train/label.txt


In [374]:
# Skipping this building scheme
task_race_lm = TASK_REGISTRY["sentence_ranking"]

In [378]:
task_race_dict = Dictionary.load(os.path.join(race_data_path, "dict.txt"))

In [379]:
mask_idx = task_race_dict.add_symbol("<mask>")

In [381]:
mask_idx

50264

In [382]:
race_data_path

'/mnt/dl/NLP/RACE/processed/sentencepiece/'

In [384]:
task_race_files = list(filter(lambda x: "input" in x, os.listdir(race_data_path)))
task_race_files.sort()
task_race_files

['train.input0.txt',
 'train.input1.txt',
 'train.input2.txt',
 'train.input3.txt',
 'train.input4.txt']

In [385]:
task_race_bin_path = os.path.join(race_data_path, "data-bin")
os.makedirs(task_race_bin_path, exist_ok=True)
binarizer = VocabularyDatasetBinarizer(task_race_dict)
for fname in task_race_files:
    FileBinarizer.multiprocess_dataset(
        os.path.join(race_data_path, fname), dataset_impl=args.dataset_impl,
        binarizer=binarizer, 
        output_prefix=os.path.join(task_race_bin_path, fname.replace(".txt", "")),
        vocab_size=len(task_race_dict),
        num_workers=8
        )

In [392]:
# Load dataset
datasets = dict()
for i in range(5):
    datasets[f"input{i}"] = data_utils.load_indexed_dataset(os.path.join(task_race_bin_path, f"train.input{i}"),
                                                            dictionary=task_race_dict, dataset_impl=args.dataset_impl)

2023-11-19 04:44:11 | INFO | fairseq.data.data_utils | loaded 87,866 examples from: /mnt/dl/NLP/RACE/processed/sentencepiece/data-bin/train.input0
2023-11-19 04:44:11 | INFO | fairseq.data.data_utils | loaded 87,866 examples from: /mnt/dl/NLP/RACE/processed/sentencepiece/data-bin/train.input1
2023-11-19 04:44:11 | INFO | fairseq.data.data_utils | loaded 87,866 examples from: /mnt/dl/NLP/RACE/processed/sentencepiece/data-bin/train.input2
2023-11-19 04:44:11 | INFO | fairseq.data.data_utils | loaded 87,866 examples from: /mnt/dl/NLP/RACE/processed/sentencepiece/data-bin/train.input3
2023-11-19 04:44:11 | INFO | fairseq.data.data_utils | loaded 87,866 examples from: /mnt/dl/NLP/RACE/processed/sentencepiece/data-bin/train.input4


In [393]:
datasets["input0"]

<fairseq.data.indexed_dataset.MMapIndexedDataset at 0x7f0dfde1b700>

In [394]:
datasets["input0"].sizes

array([381, 381, 381, ..., 240, 240, 240], dtype=int32)

In [395]:
datasets["input0"][0]

tensor([ 2387,  1623,    16,    10,  2421, 14172,  5961,     4,    91,  6138,
            7,   356,    23,   383,     8,     7,  2842,   106,     4,    91,
         3829,     7,  8933,   850,   227,     5,   276,  1964,    11,   430,
         6464,     4,    91,    74,   393,   206,     9,  2159,   932,   396,
          546,   198,    11,   484,   430,  6464,     4,   374,     5,    97,
          865,     6,    38,   437,    45,    10, 14172,  5961,     4,    38,
          206,  3482,    16, 15305,     8, 26262,     4,   318,    38,   101,
          402,     8,    38,    33,   615,   418,     7,   185,    24,     6,
           38,   907,    24,    23,   683,     4,    38,   393,   356,   198,
           13,    10,   205,   425,    50,    10,   357,   432,     4,  1525,
          768,   127,  1623,     8,    38,   393,   213,  3482,   561,     4,
        22008,  3482,   561,    74,    28,   350,  8661,    13,   258,     9,
          201,     4,   520,    24,   606,     7,  3482,     6, 

In [396]:
input0 = datasets["input0"]

In [398]:
# Prepend eos to input 0
input0 = PrependTokenDataset(input0, task_race_dict.eos())

In [407]:
input_q = []
q_inp_length = 128
max_positions = 512
for i in range(1, 5):
    input_dataset = datasets[f"input{i}"]
    # Prepend bos
    input_dataset = PrependTokenDataset(input_dataset, task_race_dict.bos())
    # Shorten the question input
    input_dataset = TruncateDataset(input_dataset, q_inp_length)
    # Concat the questions + articles
    input_dataset = ConcatSentencesDataset(input_dataset, input0)
    # Truncation for limiting in the networks 
    input_dataset = TruncateDataset(input_dataset, max_positions)
    input_q.append(input_dataset)

In [409]:
datasets["input1"][0]

tensor([  133,  1623,  3829,  3482,   142,    37,    34,   203,   418,   479,
        50118,     2])

In [408]:
input_q[0][0]

tensor([    0,   133,  1623,  3829,  3482,   142,    37,    34,   203,   418,
          479, 50118,     2,     2,  2387,  1623,    16,    10,  2421, 14172,
         5961,     4,    91,  6138,     7,   356,    23,   383,     8,     7,
         2842,   106,     4,    91,  3829,     7,  8933,   850,   227,     5,
          276,  1964,    11,   430,  6464,     4,    91,    74,   393,   206,
            9,  2159,   932,   396,   546,   198,    11,   484,   430,  6464,
            4,   374,     5,    97,   865,     6,    38,   437,    45,    10,
        14172,  5961,     4,    38,   206,  3482,    16, 15305,     8, 26262,
            4,   318,    38,   101,   402,     8,    38,    33,   615,   418,
            7,   185,    24,     6,    38,   907,    24,    23,   683,     4,
           38,   393,   356,   198,    13,    10,   205,   425,    50,    10,
          357,   432,     4,  1525,   768,   127,  1623,     8,    38,   393,
          213,  3482,   561,     4, 22008,  3482,   561,    74, 

In [412]:
# Strip the token eos from the question and merge with article is a better solution
StripTokenDataset(datasets["input1"], task_race_dict.eos())[0]

tensor([  133,  1623,  3829,  3482,   142,    37,    34,   203,   418,   479,
        50118])

In [419]:
race_dataset = {f"net_input{idx}": input_q[idx - 1] for idx in range(1, 5)}

In [420]:
race_dataset = NestedDictionaryDataset(race_dataset)

In [422]:
race_dataset[0]

OrderedDict([('net_input1',
              tensor([    0,   133,  1623,  3829,  3482,   142,    37,    34,   203,   418,
                        479, 50118,     2,     2,  2387,  1623,    16,    10,  2421, 14172,
                       5961,     4,    91,  6138,     7,   356,    23,   383,     8,     7,
                       2842,   106,     4,    91,  3829,     7,  8933,   850,   227,     5,
                        276,  1964,    11,   430,  6464,     4,    91,    74,   393,   206,
                          9,  2159,   932,   396,   546,   198,    11,   484,   430,  6464,
                          4,   374,     5,    97,   865,     6,    38,   437,    45,    10,
                      14172,  5961,     4,    38,   206,  3482,    16, 15305,     8, 26262,
                          4,   318,    38,   101,   402,     8,    38,    33,   615,   418,
                          7,   185,    24,     6,    38,   907,    24,    23,   683,     4,
                         38,   393,   356,   198,   

In [431]:
race_dataset = {f"net_input{idx}": {"tokens": PadDataset( input_q[idx - 1], task_race_dict.pad(), 
                                                         left_pad=False, ),
                                    "sizes": NumelDataset(input_q[idx - 1])} for idx in range(1, 5)}

In [432]:
race_dataset = NestedDictionaryDataset(race_dataset, 
                                       sizes=[np.maximum.reduce([iq.sizes for iq in input_q])])

In [433]:
race_dataset[0]

OrderedDict([('net_input1.tokens',
              tensor([    0,   133,  1623,  3829,  3482,   142,    37,    34,   203,   418,
                        479, 50118,     2,     2,  2387,  1623,    16,    10,  2421, 14172,
                       5961,     4,    91,  6138,     7,   356,    23,   383,     8,     7,
                       2842,   106,     4,    91,  3829,     7,  8933,   850,   227,     5,
                        276,  1964,    11,   430,  6464,     4,    91,    74,   393,   206,
                          9,  2159,   932,   396,   546,   198,    11,   484,   430,  6464,
                          4,   374,     5,    97,   865,     6,    38,   437,    45,    10,
                      14172,  5961,     4,    38,   206,  3482,    16, 15305,     8, 26262,
                          4,   318,    38,   101,   402,     8,    38,    33,   615,   418,
                          7,   185,    24,     6,    38,   907,    24,    23,   683,     4,
                         38,   393,   356,   

In [446]:
race_task_labels = [int(x) for x in open(os.path.join(race_data_path, "train.label.txt"))]

In [449]:
label_dataset = RawLabelDataset(race_task_labels)

In [450]:
label_dataset[0]

2

In [451]:
race_dataset = {f"net_input{idx}": {"tokens": PadDataset( input_q[idx - 1], task_race_dict.pad(), 
                                                         left_pad=False, ),
                                    "sizes": NumelDataset(input_q[idx - 1])} for idx in range(1, 5)}
race_dataset.update({"target": label_dataset})

In [452]:
race_dataset = NestedDictionaryDataset(race_dataset, 
                                       sizes=[np.maximum.reduce([iq.sizes for iq in input_q])])

In [453]:
race_dataset[0]

OrderedDict([('net_input1.tokens',
              tensor([    0,   133,  1623,  3829,  3482,   142,    37,    34,   203,   418,
                        479, 50118,     2,     2,  2387,  1623,    16,    10,  2421, 14172,
                       5961,     4,    91,  6138,     7,   356,    23,   383,     8,     7,
                       2842,   106,     4,    91,  3829,     7,  8933,   850,   227,     5,
                        276,  1964,    11,   430,  6464,     4,    91,    74,   393,   206,
                          9,  2159,   932,   396,   546,   198,    11,   484,   430,  6464,
                          4,   374,     5,    97,   865,     6,    38,   437,    45,    10,
                      14172,  5961,     4,    38,   206,  3482,    16, 15305,     8, 26262,
                          4,   318,    38,   101,   402,     8,    38,    33,   615,   418,
                          7,   185,    24,     6,    38,   907,    24,    23,   683,     4,
                         38,   393,   356,   

<strong> For other GLUE tasks like rte cola same procedures as RACE </strong>
<br />
<strong> Network input:  [BOS] Question [EOS] Article</strong>

GLUE: RTE Sentence prediction

In [436]:
rte_raw_dataset = os.path.join("/mnt/dl/NLP/GLUE/RTE/")

In [438]:
rte_processed_dataset = os.path.join(rte_raw_dataset, "processed")
os.makedirs(rte_processed_dataset, exist_ok=True)
rte_bin_dataset = os.path.join(rte_processed_dataset, "data-bin")
os.makedirs(rte_bin_dataset, exist_ok=True)


In [461]:
df = pd.read_csv(os.path.join(rte_raw_dataset, "train.tsv"), delimiter="\t")[["sentence1", "sentence2", "label"]]
df

Unnamed: 0,sentence1,sentence2,label
0,No Weapons of Mass Destruction Found in Iraq Yet.,Weapons of Mass Destruction Found in Iraq.,not_entailment
1,"A place of sorrow, after Pope John Paul II die...",Pope Benedict XVI is the new leader of the Rom...,entailment
2,Herceptin was already approved to treat the si...,Herceptin can be used to treat breast cancer.,entailment
3,"Judie Vivian, chief executive at ProMedica, a ...",The previous name of Ho Chi Minh City was Saigon.,entailment
4,A man is due in court later charged with the m...,Paul Stewart Hutchinson is accused of having s...,not_entailment
...,...,...,...
2485,There is none. They found as many weapons in t...,Weapons of mass destruction found in Iraq.,not_entailment
2486,"Dr. Eric Goosby, a pioneer in the fight agains...",Pepfar is committed to fighting AIDS.,entailment
2487,"NASA's Saturn exploration spacecraft, Cassini ...",Titan is the fifteenth of Saturn's known satel...,not_entailment
2488,Brooklyn Borough Hall featured a Who's Who in ...,The Brooklyn Book Festival is held in Brooklyn...,entailment


In [462]:
rte_targets = df["label"].unique()
rte_targets

array(['not_entailment', 'entailment', nan], dtype=object)

In [465]:
df = df[~(df["label"].isna())]

In [466]:
rte_targets = df["label"].unique()
rte_targets

array(['not_entailment', 'entailment'], dtype=object)

In [467]:
rte_targets = dict(zip(rte_targets, range(len(rte_targets))))
rte_targets

{'not_entailment': 0, 'entailment': 1}

In [468]:
df["label"] = df["label"].apply(lambda x: rte_targets[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].apply(lambda x: rte_targets[x])


In [469]:
df

Unnamed: 0,sentence1,sentence2,label
0,No Weapons of Mass Destruction Found in Iraq Yet.,Weapons of Mass Destruction Found in Iraq.,0
1,"A place of sorrow, after Pope John Paul II die...",Pope Benedict XVI is the new leader of the Rom...,1
2,Herceptin was already approved to treat the si...,Herceptin can be used to treat breast cancer.,1
3,"Judie Vivian, chief executive at ProMedica, a ...",The previous name of Ho Chi Minh City was Saigon.,1
4,A man is due in court later charged with the m...,Paul Stewart Hutchinson is accused of having s...,0
...,...,...,...
2485,There is none. They found as many weapons in t...,Weapons of mass destruction found in Iraq.,0
2486,"Dr. Eric Goosby, a pioneer in the fight agains...",Pepfar is committed to fighting AIDS.,1
2487,"NASA's Saturn exploration spacecraft, Cassini ...",Titan is the fifteenth of Saturn's known satel...,0
2488,Brooklyn Borough Hall featured a Who's Who in ...,The Brooklyn Book Festival is held in Brooklyn...,1


In [470]:
rte_processed_dataset

'/mnt/dl/NLP/GLUE/RTE/processed'

In [471]:
gpt2_bpe_cfg = GPT2BPEConfig(gpt2_encoder_json=os.path.join(rte_processed_dataset, "encoder.json"), 
                             gpt2_vocab_bpe=os.path.join(rte_processed_dataset, "vocab.bpe"))
gpt2_bpe = GPT2BPE(cfg=gpt2_bpe_cfg)

In [477]:
df["input0"] = df["sentence1"].apply(lambda x: gpt2_bpe.encode(str(x)))
df["input1"] = df["sentence2"].apply(lambda x: gpt2_bpe.encode(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["input0"] = df["sentence1"].apply(lambda x: gpt2_bpe.encode(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["input1"] = df["sentence2"].apply(lambda x: gpt2_bpe.encode(x))


In [478]:
df

Unnamed: 0,sentence1,sentence2,label,input0,input1
0,No Weapons of Mass Destruction Found in Iraq Yet.,Weapons of Mass Destruction Found in Iraq.,0,2949 18944 286 5674 25034 4062 287 3908 6430 13,41818 286 5674 25034 4062 287 3908 13
1,"A place of sorrow, after Pope John Paul II die...",Pope Benedict XVI is the new leader of the Rom...,1,32 1295 286 24140 11 706 13258 1757 3362 2873 ...,46172 28697 49090 318 262 649 3554 286 262 799...
2,Herceptin was already approved to treat the si...,Herceptin can be used to treat breast cancer.,1,9360 984 259 373 1541 6325 284 2190 262 6639 3...,9360 984 259 460 307 973 284 2190 9296 4890 13
3,"Judie Vivian, chief executive at ProMedica, a ...",The previous name of Ho Chi Minh City was Saigon.,1,26141 494 25313 666 11 4039 4640 379 1041 9921...,464 2180 1438 286 9544 21380 1855 71 2254 373 ...
4,A man is due in court later charged with the m...,Paul Stewart Hutchinson is accused of having s...,0,32 582 318 2233 287 2184 1568 5047 351 262 512...,12041 13671 43792 318 5371 286 1719 21512 257 ...
...,...,...,...,...,...
2485,There is none. They found as many weapons in t...,Weapons of mass destruction found in Iraq.,0,1858 318 4844 13 1119 1043 355 867 3777 287 42...,41818 286 2347 8166 1043 287 3908 13
2486,"Dr. Eric Goosby, a pioneer in the fight agains...",Pepfar is committed to fighting AIDS.,1,6187 13 7651 1514 418 1525 11 257 29570 287 26...,47 538 16370 318 5364 284 4330 20408 13
2487,"NASA's Saturn exploration spacecraft, Cassini ...",Titan is the fifteenth of Saturn's known satel...,0,29998 338 23135 13936 16807 11 14154 5362 837 ...,51 18642 318 262 5515 20283 286 23135 338 1900...
2488,Brooklyn Borough Hall featured a Who's Who in ...,The Brooklyn Book Festival is held in Brooklyn...,1,45534 6213 48114 4789 8096 257 5338 338 5338 2...,464 12232 4897 11117 318 2714 287 12232 48114 ...


In [506]:
for inp_type in ["input0", "input1", "label"]:
    with open(os.path.join(rte_processed_dataset, f"train.{inp_type}.txt"), "w") as f:
        for line in df[inp_type]:
            print(line, file=f)   

In [482]:
rte_dict = Dictionary.load(os.path.join(rte_processed_dataset, "dict.txt"))

In [483]:
rte_dict

<fairseq.data.dictionary.Dictionary at 0x7f0d854e82b0>

In [484]:
rte_dict.add_symbol("<mask>")

50264

In [485]:
rte_binarizer = VocabularyDatasetBinarizer(rte_dict)

In [508]:
for inp_type in ["input0", "input1", "label"]:
    input_file = os.path.join(rte_processed_dataset, f"train.{inp_type}.txt")
    print(input_file)
    FileBinarizer.multiprocess_dataset(num_workers=4, dataset_impl=args.dataset_impl, binarizer=rte_binarizer, 
                                       output_prefix=os.path.join(rte_bin_dataset, f"train.{inp_type}"),
                                       vocab_size=len(rte_dict), input_file=input_file)

/mnt/dl/NLP/GLUE/RTE/processed/train.input0.txt
/mnt/dl/NLP/GLUE/RTE/processed/train.input1.txt
/mnt/dl/NLP/GLUE/RTE/processed/train.label.txt


In [521]:
input0 = data_utils.load_indexed_dataset(os.path.join(rte_bin_dataset, f"train.input0"), 
                                         dictionary=rte_dict,
                                         dataset_impl=args.dataset_impl)

2023-11-19 07:08:09 | INFO | fairseq.data.data_utils | loaded 2,489 examples from: /mnt/dl/NLP/GLUE/RTE/processed/data-bin/train.input0


In [522]:
gpt2_bpe.decode(rte_dict.string(input0[0]))

'No Weapons of Mass Destruction Found in Iraq Yet.'

In [523]:
input1 = data_utils.load_indexed_dataset(os.path.join(rte_bin_dataset, f"train.input1"), 
                                         dictionary=rte_dict,
                                         dataset_impl=args.dataset_impl
                                         )

2023-11-19 07:08:10 | INFO | fairseq.data.data_utils | loaded 2,489 examples from: /mnt/dl/NLP/GLUE/RTE/processed/data-bin/train.input1


In [524]:
input1[0]

tensor([48637,     9,  5370, 43207, 11911,    11,  3345,     4,     2])

In [525]:
input0 = PrependTokenDataset(input0, rte_dict.bos())

In [526]:
input1 = PrependTokenDataset(input1, rte_dict.eos())

In [532]:
rte_tokens = ConcatSentencesDataset(StripTokenDataset(input0, rte_dict.eos()), input1)

In [533]:
rte_tokens[0]

tensor([    0,  3084, 28054,     9,  5370, 43207, 11911,    11,  3345,  3507,
            4,     2, 48637,     9,  5370, 43207, 11911,    11,  3345,     4,
            2])

In [534]:
rte_tokens = TruncateDataset(rte_tokens, 512)

In [535]:
rte_tokens[0]

tensor([    0,  3084, 28054,     9,  5370, 43207, 11911,    11,  3345,  3507,
            4,     2, 48637,     9,  5370, 43207, 11911,    11,  3345,     4,
            2])

## Multilanguage Modeling

# Steps:
1. Binarize
2. Load dictionary for each language
3. Add token __{lang}__ to each dictionary