In [5]:
import os
from pathlib import Path
import torch
import re
import random
from datasets import load_dataset
import transformers, datasets
from tokenizers import BertWordPieceTokenizer # to be replaced by own?
from transformers import BertTokenizer # to be replaced by own?
from torch.utils.data import Dataset, DataLoader
import itertools
import math
import torch.nn.functional as F
import numpy as np
from torch.optim import Adam
from sklearn.model_selection import train_test_split

In [8]:
# hyperparameters for BERT base
if __name__ == '__main__':
    VOCAB_SIZE = 30000
    N_SEGMENTS = 3
    MAX_LENGTH = 512
    EMBEDDING_DIM = 768
    N_LAYERS = 12
    ATTENTION_HEADS = 12
    DROPOUT = 0.1

In [9]:
# helper function to read lines from the text file as a generator
def load_lines(filename):
    with open(filename, 'r') as file:
        for line in file:
            yield line.strip()

In [27]:
# helper function to read lines from the text file as a generator
def load_line_pairs(filename):
    with open(filename, 'r') as file:
        prev_line = None
        for line in file:
            curr_line = line.strip()
            if prev_line is not None:
                yield [prev_line, curr_line]
            prev_line = curr_line

In [28]:
# helper function to apply transformation if provided
def apply_transform(sample):
        if transformation is not None:
            return transformation(sample)
        else:
            return sample

In [29]:
# TO DO (copied from https://medium.com/data-and-beyond/complete-guide-to-building-bert-model-from-sratch-3e6562228891)
class BERTDataset(Dataset):
    def __init__(self, data_pair, tokenizer, seq_len=64):

        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.corpus_lines = len(data_pair)
        self.lines = data_pair

    def __len__(self):
        return self.corpus_lines

    def __getitem__(self, item):

        # Step 1: get random sentence pair, either negative or positive (saved as is_next_label)
        t1, t2, is_next_label = self.get_sent(item)

        # Step 2: replace random words in sentence with mask / random words
        t1_random, t1_label = self.random_word(t1)
        t2_random, t2_label = self.random_word(t2)

        # Step 3: Adding CLS and SEP tokens to the start and end of sentences
         # Adding PAD token for labels
        t1 = [self.tokenizer.vocab['[CLS]']] + t1_random + [self.tokenizer.vocab['[SEP]']]
        t2 = t2_random + [self.tokenizer.vocab['[SEP]']]
        t1_label = [self.tokenizer.vocab['[PAD]']] + t1_label + [self.tokenizer.vocab['[PAD]']]
        t2_label = t2_label + [self.tokenizer.vocab['[PAD]']]

        # Step 4: combine sentence 1 and 2 as one input
        # adding PAD tokens to make the sentence same length as seq_len
        segment_label = ([1 for _ in range(len(t1))] + [2 for _ in range(len(t2))])[:self.seq_len]
        bert_input = (t1 + t2)[:self.seq_len]
        bert_label = (t1_label + t2_label)[:self.seq_len]
        padding = [self.tokenizer.vocab['[PAD]'] for _ in range(self.seq_len - len(bert_input))]
        bert_input.extend(padding), bert_label.extend(padding), segment_label.extend(padding)

        output = {"bert_input": bert_input,
                  "bert_label": bert_label,
                  "segment_label": segment_label,
                  "is_next": is_next_label}

        return {key: torch.tensor(value) for key, value in output.items()}

    def random_word(self, sentence):
        tokens = sentence.split()
        output_label = []
        output = []

        # 15% of the tokens would be replaced
        for i, token in enumerate(tokens):
            prob = random.random()

            # remove cls and sep token
            token_id = self.tokenizer(token)['input_ids'][1:-1]

            if prob < 0.15:
                prob /= 0.15

                # 80% chance change token to mask token
                if prob < 0.8:
                    for i in range(len(token_id)):
                        output.append(self.tokenizer.vocab['[MASK]'])

                # 10% chance change token to random token
                elif prob < 0.9:
                    for i in range(len(token_id)):
                        output.append(random.randrange(len(self.tokenizer.vocab)))

                # 10% chance change token to current token
                else:
                    output.append(token_id)

                output_label.append(token_id)

            else:
                output.append(token_id)
                for i in range(len(token_id)):
                    output_label.append(0)

        # flattening
        output = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output]))
        output_label = list(itertools.chain(*[[x] if not isinstance(x, list) else x for x in output_label]))
        assert len(output) == len(output_label)
        return output, output_label

    def get_sent(self, index):
        '''return random sentence pair'''
        t1, t2 = self.get_corpus_line(index)

        # negative or positive pair, for next sentence prediction
        if random.random() > 0.5:
            return t1, t2, 1
        else:
            return t1, self.get_random_line(), 0

    def get_corpus_line(self, item):
        '''return sentence pair'''
        return self.lines[item][0], self.lines[item][1]

    def get_random_line(self):
        '''return random single sentence'''
        return self.lines[random.randrange(len(self.lines))][1]

In [31]:
# TO DO -> this might apply all the data transformations and embedding before turning it into a dataset, might even serve as a dataloader 
def load_data(dataset='<dataset>', transformation=None, n_train=None, n_test=None):
    # define directory for the chosen dataset
    #data = '/home/space/datasets'+ dataset # for cluster
    data = './datasets/pretraining/' + dataset
    
    line_generator = load_line_pairs(data)  # load lines
    return line_generator
    
    # if transformation = finetuning(): TO DO

In [32]:
# DELETE example usage:
train_data = load_data(dataset='dataset_test.txt', n_train=None, n_test=None)

# Create BERTDataset and DataLoader
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
MAX_LEN = 128

train_dataset = BERTDataset(train_data, seq_len=MAX_LEN, tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, pin_memory=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

TypeError: object of type 'generator' has no len()