In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np

In [2]:
class Bert(nn.Module):

    def __init__(self):
        super().__init__()
        self.encoding = tiktoken.get_encoding("r50k_base")
        self.emb_size = self.encoding.n_vocab
        self.emb_channels = 128
        self.max_token_length = 512
        self.emb = nn.Embedding(self.emb_size, self.emb_channels)
        self.positional_encoding = nn.Parameter(torch.randn(self.max_token_length, self.emb_channels))
        

In [3]:
from datasets import load_dataset

dataset = load_dataset('wikipedia', '20220301.en', split='train[:1%]')

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [44]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [51]:
sentence_pairs = []

In [52]:

for i in range(len(dataset[:10])):
    text = dataset[i]['text']
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents]
    for s1, s2 in zip(sentences, sentences[1:]):
        sentence_pairs.append((s1, s2))

In [128]:
import random

def mask(sentence):
    for i in range(len(sentence)):
        prob = random.random()
        if prob <= 0.15:
            sub_prob = random.random()
            if sub_prob <= 0.8:
                sentence[i] = tokenizer.mask_token_id
            elif sub_prob > 0.8 or sub_prob <= 0.9:
                sentence[i] = int(random.random() * (tokenizer.vocab_size-1))
    return tokenizer

In [129]:
import random

def generate_bert_input():
    prob = random.random()
    first_sent_idx = int(random.random() * len(sentence_pairs))
    isNext = True
    if prob > 0.5:
        second_sent_idx = first_sent_idx+1
    else:
        second_sent_idx = int(random.random() * len(sentence_pairs))
        isNext = False
    
    first_sent = tokenizer(sentence_pairs[first_sent_idx][0])['input_ids']
    second_sent = tokenizer(sentence_pairs[second_sent_idx][1])['input_ids']

    masked_first_sent = mask(first_sent)
    masked_second_sent = mask(second_sent)

    return {'first_sent': first_sent, 'second_sent': second_sent, 'isNext': isNext, 'masked_first_sent': masked_first_sent, 'masked_second_sent': masked_second_sent}

generate_bert_input()

{'first_sent': [103,
  2012,
  3696,
  100,
  1024,
  8511,
  17151,
  21493,
  10748,
  1998,
  9504,
  1999,
  2060,
  12440,
  103,
  100,
  1024,
  20604,
  3661,
  15669,
  8458,
  1010,
  2013,
  2029,
  1996,
  2206,
  9255,
  2761,
  18547,
  1155,
  1155,
  1024,
  3306,
  3661,
  6541,
  12078,
  2013,
  2029,
  1996,
  2206,
  4144,
  18547,
  103,
  1180,
  1024,
  102],
 'second_sent': [101,
  2214,
  2009,
  103,
  16457,
  1010,
  2029,
  2003,
  1996,
  26722,
  1997,
  103,
  3763,
  1037,
  1024,
  2448,
  2594,
  3661,
  2019,
  6342,
  2480,
  1010,
  103,
  2763,
  12153,
  2013,
  2214,
  2009,
  27072,
  1037,
  1024,
  24224,
  3661,
  17207,
  2050,
  1013,
  5176,
  1219,
  1219,
  1024,
  7508,
  3661,
  1037,
  2100,
  2497,
  9798,
  9537,
  1015,
  2060,
  15066,
  3964,
  3329,
  20564,
  7604,
  6327,
  6971,
  2381,
  1997,
  1996,
  12440,
  103,
  3937,
  3763,
  103,
  12710,
  4144,
  102],
 'isNext': True,
 'masked_first_sent': BertTokenizerFast(na

In [42]:
to_ids = tokenizer(dataset[0]['text'][:100])['input_ids']

print(tokenizer.convert_ids_to_tokens([1012]))

tokenizer.conv



['.']


AttributeError: 'BertTokenizerFast' object has no attribute 'conv'