# Use this notebook to prepare the dataset offline; once it is prepared, copy it to the GPU server and start training.

In [1]:
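# install required libraries (run once per environment)
# %pip install datasets transformers yake spacy pandas torch
# !python -m spacy download en_core_web_sm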

In [2]:
import collections
import math
import random

import pandas as pd
import spacy
import torch
import yake
from datasets import DatasetDict, load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

In [3]:
# Load the tab-separated parallel corpus.
df = pd.read_csv("train.tsv", sep="\t")

In [4]:
# Normalise the German column name, drop rows with missing values,
# then trim surrounding whitespace from both text columns.
df = df.rename(columns={"German-de": "German"}).dropna()
for text_column in ("English", "German"):
    df[text_column] = df[text_column].apply(str.strip)

In [5]:
# Persist the cleaned pairs as parquet, which the `datasets` library loads directly.
parquet_path = "train.parquet"
df.to_parquet(parquet_path)

In [6]:
dataset = load_dataset("parquet", data_files="train.parquet")
# pandas only materialises the index as an `__index_level_0__` column when it is
# not a plain RangeIndex (e.g. after dropna() removed rows). Drop it only when it
# is really there, so this cell does not fail on data without missing values.
if "__index_level_0__" in dataset["train"].column_names:
    dataset = dataset.remove_columns("__index_level_0__")

In [7]:
# Hold out 10% of the sentence pairs for evaluation. The fixed seed makes the
# split identical on every run, so the saved train/test sets are reproducible.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

In [8]:
# Display the resulting splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['English', 'German'],
        num_rows: 1754716
    })
    test: Dataset({
        features: ['English', 'German'],
        num_rows: 194969
    })
})


In [9]:
# Pretrained checkpoint whose tokenizer is used for both languages.
base_model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Length of the combined (German + English) sequence.
max_len = 128
# Each sentence, English or German, is tokenized to this many tokens (half of max_len).
max_len_english = 64

# True: mask randomly chosen words; False: mask YAKE keywords.
mask_random = False
# Fraction of the words in a sentence requested as keywords.
m_ratio = 0.15

In [10]:
def tokenize_sentence(txt, tokenizer, max_length=None):
    """
    Tokenize a single sentence to a fixed length.

    Args:
        txt: the sentence to tokenize.
        tokenizer: tokenizer callable exposing an ``is_fast`` attribute.
        max_length: number of tokens to pad/truncate to; defaults to the
            module-level ``max_len_english``.

    Returns:
        The tokenizer output. For fast tokenizers it additionally carries a
        "word_ids" entry giving, for each token, the index of the word it came
        from (None for special/padding tokens).
    """
    if max_length is None:
        max_length = max_len_english
    result = tokenizer(txt, max_length=max_length, padding='max_length', truncation=True)
    # word_ids() is only asked for under the is_fast guard, so a tokenizer
    # that is not fast no longer fails here before the guard is evaluated.
    if tokenizer.is_fast:
        result["word_ids"] = result.word_ids()
    return result

def get_word_mapping(tok):
    """
    Group token positions by the word they belong to.

    Reads tok["word_ids"] and returns a defaultdict(list) whose keys are
    running word numbers (0, 1, 2, ... in order of appearance) and whose values
    are the positions of the tokens making up that word. Tokens whose word id
    is None are skipped.
    """
    word_to_tokens = collections.defaultdict(list)
    word_position = -1
    last_seen = None
    for token_pos, wid in enumerate(tok["word_ids"]):
        if wid is None:
            continue
        # A change of word id starts the next word.
        if wid != last_seen:
            last_seen, word_position = wid, word_position + 1
        word_to_tokens[word_position].append(token_pos)
    return word_to_tokens

def get_pos_tags(doc):
    """
    Collect the tags of the content words in a parsed sentence.

    Returns a dict mapping token text to its ``tag_`` for every token that is
    not a stop word, punctuation or whitespace, is not listed in ``stop_words``
    and whose tag is listed in ``lst_pos_tags``.

    NOTE(review): ``stop_words`` and ``lst_pos_tags`` are module-level names
    that are not defined in the visible part of this notebook - confirm they
    exist before calling the masking helpers with add_pos=True.
    """
    return {
        token.text: token.tag_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space or token.text.lower() in stop_words)
        if token.tag_ in lst_pos_tags
    }

def get_mask_phrases(txt, tok, mapping, add_pos):
    """
    Choose phrases (a keyword plus its neighbouring words) to mask.

    Args:
        txt: the sentence that was tokenized.
        tok: tokenizer output for ``txt``; must provide ``word_to_chars``.
        mapping: word number -> token positions, as built by get_word_mapping.
        add_pos: when True and YAKE returns fewer keywords than requested, the
            keyword list is topped up with words returned by get_pos_tags.

    Returns:
        (mask, mask_words): the selected word numbers and the matching
        lower-cased words.

    When the module-level ``mask_random`` is set, a random ``m_ratio`` share of
    the words is returned instead (single words, not phrases).
    """
    if mask_random:
        # Use the shared m_ratio setting rather than a second hard-coded 0.15.
        n_sample = math.ceil(m_ratio * len(mapping))
        mask = random.sample(range(len(mapping)), n_sample)
        mask_words = []
        for idx in mask:
            start, end = tok.word_to_chars(idx)
            mask_words.append(txt[start:end].lower())
        return mask, mask_words

    # Strip the tokenizer's special-token strings before keyword extraction.
    yake_doc = txt.replace(tokenizer.eos_token, "")
    yake_doc = yake_doc.replace(tokenizer.bos_token, "")
    yake_doc = yake_doc.strip()
    max_keyword = max(3, math.ceil(m_ratio * len(mapping)))
    keywords = custom_kw_extractor.extract_keywords(yake_doc)[:max_keyword]
    lst_kw = [kw[0].lower() for kw in keywords]
    if add_pos and len(lst_kw) < max_keyword:
        n = max_keyword - len(lst_kw)
        for w in get_pos_tags(nlp(txt)):
            # Compare in lower case: lst_kw only holds lower-cased entries, so a
            # case-sensitive test would let duplicates use up the budget.
            w = w.lower()
            if w not in lst_kw:
                lst_kw.append(w)
                n = n - 1
                if n == 0:
                    break

    mask = []
    mask_words = []
    prev_word = None   # last unmasked word; pulled into the phrase when a keyword follows
    prev_id = None
    after_keyword = False   # True when the previous word was a keyword
    for idx in mapping:
        start, end = tok.word_to_chars(idx)
        word = txt[start:end].lower()
        is_keyword = word in lst_kw
        if is_keyword or after_keyword:
            if prev_word is not None:
                mask.append(prev_id)
                mask_words.append(prev_word)
            mask.append(idx)
            mask_words.append(word)
            prev_word = None
            # Only a real keyword extends the phrase to the following word.
            after_keyword = is_keyword
        else:
            prev_word = word
            prev_id = idx
            after_keyword = False
    return mask, mask_words


def get_mask_words(txt, tok, mapping, add_pos):
    """
    Choose the individual words to mask in a sentence.

    Args:
        txt: the sentence that was tokenized.
        tok: tokenizer output for ``txt``; must provide ``word_to_chars``.
        mapping: word number -> token positions, as built by get_word_mapping.
        add_pos: when True and YAKE returns fewer keywords than requested, the
            keyword list is topped up with words returned by get_pos_tags.

    Returns:
        (mask, mask_words): the selected word numbers and the matching
        lower-cased words.

    With the module-level ``mask_random`` set, a random ``m_ratio`` share of
    the words is picked; otherwise the words matching a YAKE keyword are picked.
    """
    if mask_random:
        # Use the shared m_ratio setting rather than a second hard-coded 0.15.
        n_sample = math.ceil(m_ratio * len(mapping))
        mask = random.sample(range(len(mapping)), n_sample)
        mask_words = []
        for idx in mask:
            start, end = tok.word_to_chars(idx)
            mask_words.append(txt[start:end].lower())
        return mask, mask_words

    # Strip the tokenizer's special-token strings before keyword extraction.
    yake_doc = txt.replace(tokenizer.eos_token, "")
    yake_doc = yake_doc.replace(tokenizer.bos_token, "")
    yake_doc = yake_doc.strip()
    max_keyword = max(3, math.ceil(m_ratio * len(mapping)))
    keywords = custom_kw_extractor.extract_keywords(yake_doc)[:max_keyword]
    lst_kw = [kw[0].lower() for kw in keywords]
    if add_pos and len(lst_kw) < max_keyword:
        n = max_keyword - len(lst_kw)
        for w in get_pos_tags(nlp(txt)):
            # Compare in lower case: lst_kw only holds lower-cased entries, so a
            # case-sensitive test would let duplicates use up the budget.
            w = w.lower()
            if w not in lst_kw:
                lst_kw.append(w)
                n = n - 1
                if n == 0:
                    break

    mask = []
    mask_words = []
    for idx in mapping:
        start, end = tok.word_to_chars(idx)
        word = txt[start:end].lower()
        if word in lst_kw:
            mask.append(idx)
            mask_words.append(word)
    return mask, mask_words

def get_masked_tokens(tokenizer, tok, mapping, mask):
    """
    Replace the tokens of the selected words with the mask token.

    Works on a copy of tok["input_ids"]. Returns (input_ids, labels) where
    labels is -100 everywhere except at the masked positions, which hold the
    token id that was replaced.
    """
    masked_ids = list(tok["input_ids"])
    targets = [-100 for _ in masked_ids]
    # Every token position belonging to one of the selected words, in order.
    positions = (pos for word_id in mask for pos in mapping[word_id])
    for pos in positions:
        targets[pos] = masked_ids[pos]
        masked_ids[pos] = tokenizer.mask_token_id
    return masked_ids, targets

def prepare_features(df):
    """
    Build the model inputs for one English/German sentence pair.

    ``df`` is a single example with 'English' and 'German' entries. The English
    sentence is tokenized and its keyword tokens are masked; the German
    sentence is tokenized unchanged. Returns a dict with
    "input_ids" (German tokens followed by the masked English tokens) and
    "label" (-100 for every German position, followed by the English labels).
    """
    english_text = df['English']

    # English side: tokenize, pick the words to mask, then mask their tokens.
    english_tokens = tokenize_sentence(english_text, tokenizer)
    english_word_map = get_word_mapping(english_tokens)
    masked_word_ids, _ = get_mask_words(english_text, english_tokens, english_word_map, False)
    english_ids, english_labels = get_masked_tokens(
        tokenizer, english_tokens, english_word_map, masked_word_ids
    )

    # German side: context only, so none of its positions carries a label.
    german_ids = tokenize_sentence(df['German'], tokenizer)['input_ids']
    ignored = [-100 for _ in german_ids]

    return {
        "input_ids": german_ids + english_ids,
        "label": ignored + english_labels,
    }

def collate_mlm_data(features, pad_token_id=None, max_length=None):
    """
    Collate function: turn a list of prepared features into padded tensors.

    Args:
        features: list of dicts with "input_ids" and "label" lists.
        pad_token_id: id used for padding; defaults to the module-level
            tokenizer's ``pad_token_id``.
        max_length: length every row is padded to; defaults to the
            module-level ``max_len``.

    Returns:
        dict with "input_ids", "attn_mask" and "labels" as torch.long tensors.
        The attention mask is 0 at every padding token, including the padding
        the tokenizer already placed inside a feature, and 1 elsewhere.

    Raises:
        ValueError: if a feature is longer than ``max_length``.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    if max_length is None:
        max_length = max_len

    lst_input_ids = []
    lst_labels = []
    lst_attn_mask = []
    for feature in features:
        # Work on copies so the caller's feature lists are never modified.
        input_ids = list(feature["input_ids"])
        labels = list(feature["label"])
        n_pad = max_length - len(input_ids)
        if n_pad < 0:
            raise ValueError(
                f"feature has {len(input_ids)} tokens, more than max_length={max_length}"
            )
        input_ids.extend([pad_token_id] * n_pad)
        labels.extend([-100] * n_pad)
        # Derive the mask from the token ids: features arrive already padded by
        # the tokenizer, so a length-based mask would attend to those pads.
        lst_attn_mask.append([0 if token == pad_token_id else 1 for token in input_ids])
        lst_input_ids.append(input_ids)
        lst_labels.append(labels)

    batch = {}
    batch["input_ids"] = torch.tensor(lst_input_ids, dtype=torch.long)
    batch["attn_mask"] = torch.tensor(lst_attn_mask, dtype=torch.long)
    batch["labels"] = torch.tensor(lst_labels, dtype=torch.long)
    return batch

In [11]:
# YAKE settings used to pick the important keywords of a sentence.
language = "en"
max_ngram_size = 1            # single-word keywords only
top_n = 20                    # keywords returned per sentence
windowSize = 1
deduplication_algo = 'seqm'
deduplication_threshold = 0.9
custom_kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=top_n,
    features=None,
)

# spaCy English pipeline, used by the POS-tag fallback of the masking helpers.
nlp = spacy.load("en_core_web_sm")

In [12]:
# Tokenize and mask every sentence pair in both splits (one example at a time).
tokenized_dataset = dataset.map(function=prepare_features)

Map:   0%|          | 0/1754716 [00:00<?, ? examples/s]
Map:   0%|          | 0/194969 [00:00<?, ? examples/s]


In [14]:
# Write the full tokenized dataset to disk.
tokenized_output_dir = "tokenized_dataset"
tokenized_dataset.save_to_disk(tokenized_output_dir)

Saving the dataset (0/7 shards):   0%|          | 0/1754716 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards):   0%|          | 0/194969 [00:00<?, ? examples/s]


In [15]:
# Extract a smaller sample from the tokenized dataset.
# The small sample is used to test the approach; once it shows promise, the full dataset is used.
# The sample is taken from `tokenized_dataset`, the only tokenized dataset this
# notebook builds, so the cell runs on a fresh kernel.
TRAIN_SAMPLE_SIZE = 50000
TEST_SAMPLE_SIZE = 5000
# select() takes the first N rows by index; min() keeps it valid for smaller splits.
train = tokenized_dataset['train'].select(range(min(TRAIN_SAMPLE_SIZE, len(tokenized_dataset['train']))))
test = tokenized_dataset['test'].select(range(min(TEST_SAMPLE_SIZE, len(tokenized_dataset['test']))))
tokenized_dataset_sample_batch = DatasetDict({"train":train, "test":test})
tokenized_dataset_sample_batch.save_to_disk("tokenized_dataset_sample_batch")

Filter:   0%|          | 0/1754716 [00:00<?, ? examples/s]
Filter:   0%|          | 0/194969 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards):   0%|          | 0/50000 [00:00<?, ? examples/s]
Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]
