In [1]:
%load_ext autoreload
%autoreload 2

import json

import fairseq
from fairseq.data.encoders.characters import Characters
from fairseq.data.encoders.bytes import Bytes
from fairseq.data.encoders.fastbpe import fastBPE, fastBPEConfig
from fairseq.data.encoders.gpt2_bpe import GPT2BPE, GPT2BPEConfig

In [2]:
# Character-level encoder from fairseq.data.encoders.characters (no arguments needed).
ch = Characters()

In [3]:
# Sample sentence fed to every encoder in the cells that follow.
text = 'Hello World'

In [4]:
# Output: the characters separated by single spaces, with the original space shown as '▁'.
ch.encode(text)

'H e l l o ▁ W o r l d'

In [5]:
# Round-trip check: decoding the encoded string gives back the original text.
ch.decode(ch.encode(text))

'Hello World'

In [6]:
# Byte-level encoder from fairseq.data.encoders.bytes (no arguments needed).
byt = Bytes()

In [7]:
# For this ASCII-only sample the output is identical to the Characters output.
# NOTE(review): a non-ASCII sample would presumably show where the two differ — not exercised here.
byt.encode(text)

'H e l l o ▁ W o r l d'

In [8]:
# Build a fastBPE encoder from a BPE codes file on local disk.
# NOTE(review): the path is absolute and machine-specific; the notebook cannot be
# re-run elsewhere unless this file exists at the same location.
bpe_cfg = fastBPEConfig(bpe_codes="/mnt/dl/fairseq/Translation/bpe.40000")
# Constructing the encoder loads the codes file (see the log lines printed below).
bpe = fastBPE(bpe_cfg)

Loading codes from /mnt/dl/fairseq/Translation/bpe.40000 ...
Read 40001 codes from the codes file.


In [9]:
# Output: 'Hello' is split into 'H@@' + 'ello', while 'World' stays a single token.
bpe.encode(text)

'H@@ ello World'

In [10]:
# Round-trip check: decoding joins the 'H@@ ello' pieces back into the original text.
bpe.decode(bpe.encode(text))

'Hello World'

In [11]:
# Configuration for the GPT-2 BPE encoder: one JSON file (token -> id table) and
# one merges file, both read from local disk.
# NOTE(review): absolute, machine-specific paths; the same two paths are repeated
# in later cells that open the files directly.
gpt2_bpe_cfg = GPT2BPEConfig(gpt2_encoder_json='/mnt/dl/fairseq/Language_Model/wikitext-103-v1/encoder.json',
                            gpt2_vocab_bpe='/mnt/dl/fairseq/Language_Model/wikitext-103-v1/vocab.bpe')

In [12]:
# Instantiate the GPT-2 BPE encoder from the configuration built above.
gpt2_bpe = GPT2BPE(gpt2_bpe_cfg)

In [13]:
# Output: a space-separated string of integer token ids (two ids for this sample).
gpt2_bpe.encode(text)

'15496 2159'

In [14]:
# Load the GPT-2 token -> id table. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open until garbage collection.
with open('/mnt/dl/fairseq/Language_Model/wikitext-103-v1/encoder.json', 'r', encoding='utf-8') as encoder_file:
    encoder_json = json.load(encoder_file)


In [15]:
# Inspect the token -> id table loaded from encoder.json.
# NOTE(review): this displays the whole mapping; a slice of the items would keep
# the saved notebook small.
encoder_json

{'!': 0,
 '"': 1,
 '#': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '<': 27,
 '=': 28,
 '>': 29,
 '?': 30,
 '@': 31,
 'A': 32,
 'B': 33,
 'C': 34,
 'D': 35,
 'E': 36,
 'F': 37,
 'G': 38,
 'H': 39,
 'I': 40,
 'J': 41,
 'K': 42,
 'L': 43,
 'M': 44,
 'N': 45,
 'O': 46,
 'P': 47,
 'Q': 48,
 'R': 49,
 'S': 50,
 'T': 51,
 'U': 52,
 'V': 53,
 'W': 54,
 'X': 55,
 'Y': 56,
 'Z': 57,
 '[': 58,
 '\\': 59,
 ']': 60,
 '^': 61,
 '_': 62,
 '`': 63,
 'a': 64,
 'b': 65,
 'c': 66,
 'd': 67,
 'e': 68,
 'f': 69,
 'g': 70,
 'h': 71,
 'i': 72,
 'j': 73,
 'k': 74,
 'l': 75,
 'm': 76,
 'n': 77,
 'o': 78,
 'p': 79,
 'q': 80,
 'r': 81,
 's': 82,
 't': 83,
 'u': 84,
 'v': 85,
 'w': 86,
 'x': 87,
 'y': 88,
 'z': 89,
 '{': 90,
 '|': 91,
 '}': 92,
 '~': 93,
 '¡': 94,
 '¢': 95,
 '£': 96,
 '¤': 97,
 '¥': 98,
 '¦': 99,
 '§': 100

In [16]:
# Read the BPE merge table as a list of tuples, one per line, by splitting each
# stripped line on single spaces.
# - The file is opened as UTF-8 explicitly (it contains non-ASCII symbols such as
#   'Ġ'), matching how encoder.json is opened, instead of relying on the locale.
# - The context manager closes the handle once the list is built.
with open('/mnt/dl/fairseq/Language_Model/wikitext-103-v1/vocab.bpe', 'r', encoding='utf-8') as vocab_file:
    # Skip the first line (presumably a version header — confirm against the file).
    next(vocab_file, None)
    vocab = [tuple(line.strip().split(" ")) for line in vocab_file]

In [17]:
# Inspect the merge pairs parsed from vocab.bpe.
# NOTE(review): this displays the entire list; `vocab[:20]` would be enough to
# show the structure.
vocab

[('Ġ', 't'),
 ('Ġ', 'a'),
 ('h', 'e'),
 ('i', 'n'),
 ('r', 'e'),
 ('o', 'n'),
 ('Ġt', 'he'),
 ('e', 'r'),
 ('Ġ', 's'),
 ('a', 't'),
 ('Ġ', 'w'),
 ('Ġ', 'o'),
 ('e', 'n'),
 ('Ġ', 'c'),
 ('i', 't'),
 ('i', 's'),
 ('a', 'n'),
 ('o', 'r'),
 ('e', 's'),
 ('Ġ', 'b'),
 ('e', 'd'),
 ('Ġ', 'f'),
 ('in', 'g'),
 ('Ġ', 'p'),
 ('o', 'u'),
 ('Ġa', 'n'),
 ('a', 'l'),
 ('a', 'r'),
 ('Ġt', 'o'),
 ('Ġ', 'm'),
 ('Ġo', 'f'),
 ('Ġ', 'in'),
 ('Ġ', 'd'),
 ('Ġ', 'h'),
 ('Ġan', 'd'),
 ('i', 'c'),
 ('a', 's'),
 ('l', 'e'),
 ('Ġt', 'h'),
 ('i', 'on'),
 ('o', 'm'),
 ('l', 'l'),
 ('en', 't'),
 ('Ġ', 'n'),
 ('Ġ', 'l'),
 ('s', 't'),
 ('Ġ', 're'),
 ('v', 'e'),
 ('Ġ', 'e'),
 ('r', 'o'),
 ('l', 'y'),
 ('Ġb', 'e'),
 ('Ġ', 'g'),
 ('Ġ', 'T'),
 ('c', 't'),
 ('Ġ', 'S'),
 ('i', 'd'),
 ('o', 't'),
 ('Ġ', 'I'),
 ('u', 't'),
 ('e', 't'),
 ('Ġ', 'A'),
 ('Ġ', 'is'),
 ('Ġ', 'on'),
 ('i', 'm'),
 ('a', 'm'),
 ('o', 'w'),
 ('a', 'y'),
 ('a', 'd'),
 ('s', 'e'),
 ('Ġth', 'at'),
 ('Ġ', 'C'),
 ('i', 'g'),
 ('Ġf', 'or'),
 ('a', 'c'),
 ('Ġ

In [18]:
# Reminder of the sample sentence before looking its words up in the encoder table.
text

'Hello World'

In [19]:
# Is the whole word 'Hello' a single entry in the token -> id table?
'Hello' in encoder_json

True

In [20]:
# The id of 'Hello' equals the first id produced by gpt2_bpe.encode(text).
encoder_json['Hello']

15496

In [21]:
# The id of plain 'World' is NOT the second id (2159) produced by gpt2_bpe.encode(text);
# the following cells track down which entry that id belongs to.
encoder_json['World']

10603

In [22]:
# Same sample sentence again, shown next to its encoding in the next cell.
text

'Hello World'

In [23]:
# Re-display the GPT-2 encoding for comparison with the table lookups around it.
gpt2_bpe.encode(text)

'15496 2159'

In [24]:
# The lowercase variant has its own id, so entries are case-sensitive.
encoder_json['world']

6894

In [25]:
# The 'Ġ'-prefixed entry yields 2159, the second id produced by gpt2_bpe.encode(text).
encoder_json['ĠWorld']

2159

In [26]:
# Reverse lookup by linear scan: collect every key whose id is 2159.
[k for k, v in encoder_json.items() if v == 2159]

['ĠWorld']

In [27]:
# Code point of the leading 'Ġ' marker character.
ord('ĠWorld'[0])

288

In [28]:
# Confirm that code point 288 renders as 'Ġ'.
chr(288)

'Ġ'

In [29]:
# Code point of an ordinary space, for comparison: 288 - 32 = 256.
# Presumably 'Ġ' is the tokenizer's stand-in for a leading space — TODO confirm
# against the GPT-2 byte-to-unicode table.
ord(' ')

32

In [30]:
# A single word that is split into three token ids; the cells below map each id
# back to its string piece.
gpt2_bpe.encode("Achievement")

'32 24957 434'

In [31]:
def get_symbol(idx, encoder=None):
    """Return the first token string whose id equals ``idx``.

    Args:
        idx: Token id to look up.
        encoder: Optional token -> id mapping to search. When omitted, the
            notebook-level ``encoder_json`` table is used, so existing
            ``get_symbol(idx)`` calls behave as before.

    Returns:
        The first key, in the mapping's iteration order, whose value equals
        ``idx``.

    Raises:
        IndexError: If no key maps to ``idx``. The exception type is the same
            as before; only the message is more descriptive.
    """
    mapping = encoder_json if encoder is None else encoder
    # Return on the first hit rather than building a list of every match and
    # then taking element 0.
    for symbol, token_id in mapping.items():
        if token_id == idx:
            return symbol
    raise IndexError(f"no symbol with id {idx!r} in encoder mapping")

In [32]:
# First id of the "Achievement" encoding -> 'A'.
get_symbol(32)

'A'

In [33]:
# Second id of the "Achievement" encoding -> 'chieve'.
get_symbol(24957)

'chieve'

In [34]:
# Third id of the "Achievement" encoding -> 'ment'.
get_symbol(434)

'ment'