In [None]:
!pip install transformers --quiet

[K     |████████████████████████████████| 2.3MB 8.2MB/s 
[K     |████████████████████████████████| 3.3MB 49.1MB/s 
[K     |████████████████████████████████| 901kB 34.1MB/s 
[K     |████████████████████████████████| 235kB 8.1MB/s 
[K     |████████████████████████████████| 245kB 28.0MB/s 
[K     |████████████████████████████████| 112kB 35.7MB/s 
[?25h

In [None]:
## Mount Google Drive under /content/drive so the project files are reachable
## (google.colab is only available inside Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
## Project root on the mounted Drive (placeholder path). The working directory
## is switched to it so that later relative paths resolve from here.
root_dir = "/content/drive/My Drive/<Name>/"
os.chdir(root_dir)

In [None]:
import torch
import numpy as np
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

In [None]:
## Folder containing the project's Python modules, given relative to the current working directory.
code_dir = "./src/" 

In [None]:
## Switch into the source folder before importing the project modules
## (presumably so that they are importable from the working directory).
## The target is anchored at root_dir: with the bare relative "./src/" the cell
## fails when it is re-run, because the working directory is then already ./src/.
os.chdir(os.path.join(root_dir, code_dir))
import config, dataset, model, engine
import new_dataset ## our masking scheme 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=2952532.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1434601.0, style=ProgressStyle(descript…




In [None]:
def process_data(fil_text: str, fil_lid: str, encoding: str = "utf-8"):
    """
    Read a sentence file and its parallel language-ID tag file.

    Both files are read whole and split on newlines, so each returned
    element is one raw line (lines are NOT split into tokens here).
    Line i of ``fil_lid`` is expected to hold the tags for line i of
    ``fil_text``.

    Args:
        fil_text (str): Path of the file with the code-mix sentences
        fil_lid (str): Path of the file with the parallel language-ID tags
        encoding (str): Text encoding used to read both files
    Return:
        texts (list), lidtags (list): one string per line of each file
    """

    with open(fil_text, "r", encoding=encoding) as fh:
        data = fh.read()

    print("Read data file")

    with open(fil_lid, "r", encoding=encoding) as fh:
        tags = fh.read()

    print("Read tags file")

    data = data.split("\n")
    tags = tags.split("\n")

    # A file that ends with a newline produces one empty trailing element;
    # drop only that element. Unconditionally slicing off the last entry
    # would lose a real example whenever the file has no final newline.
    if data and data[-1] == "":
        data.pop()
    if tags and tags[-1] == "":
        tags.pop()

    return data, tags

In [None]:
## Folder that holds the train / val / test files (placeholder path).
data_dir = "/content/drive/My Drive/<Name>"

## Return to the project root directory.
os.chdir(root_dir)

## (sentence file, language-id tag file) for every split.
## NOTE(review): the file name is appended to data_dir verbatim - no "/" is
## inserted between the two parts; confirm the real names carry the separator.
train_text_file, train_lid_file = data_dir + "<file_name>", data_dir + "<file_name>"
val_text_file, val_lid_file = data_dir + "<file_name>", data_dir + "<file_name>"
test_text_file, test_lid_file = data_dir + "<file_name>", data_dir + "<file_name>"

In [None]:
## Read the sentence / tag files of the three splits, in train, val, test order.
(train_texts, train_lidtags), (val_texts, val_lidtags), (test_texts, test_lidtags) = (
    process_data(text_path, lid_path)
    for text_path, lid_path in (
        (train_text_file, train_lid_file),
        (val_text_file, val_lid_file),
        (test_text_file, test_lid_file),
    )
)

Read data file
Read tags file
Read data file
Read tags file
Read data file
Read tags file


## Training Custom Tokenizer [(HuggingFace)](https://huggingface.co/docs/tokenizers/python/latest/quicktour.html)


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

### Train the tokenizer

In [None]:
## Initialize a tokenizer backed by a Byte-Pair Encoding (BPE) model.
## NOTE(review): BPE() is created without an unk_token although "<unk>" is
## registered as a special token by the trainer - confirm whether
## BPE(unk_token="<unk>") was intended.
tokenizer = Tokenizer(BPE())

In [None]:
## Pre-tokenize the input into words before BPE is applied.
## NOTE(review): per the tokenizers documentation, Whitespace also separates
## punctuation from word characters; WhitespaceSplit is the whitespace-only
## variant - confirm which one is wanted.
tokenizer.pre_tokenizer = Whitespace()

In [None]:
## Initialize the tokenizer trainer (specify the special tokens to include).
## With this tokenizer the special tokens end up as ids 0-4 in the order listed.
trainer = BpeTrainer(vocab_size=30000, show_progress=True, special_tokens=["<s>", "</s>", "<pad>", "<unk>", "<mask>"])

## BpeTrainer options (kept as comments instead of a bare string literal, which
## the notebook would echo back as the cell's output):
##
## vocab_size (int, optional) - The size of the final vocabulary, including all tokens and alphabet.
##
## min_frequency (int, optional) - The minimum frequency a pair should have in order to be merged.
##
## show_progress (bool, optional) - Whether to show progress bars while training.
##
## special_tokens (List[Union[str, AddedToken]], optional) - A list of special tokens the model should know of.
##
## limit_alphabet (int, optional) - The maximum different characters to keep in the alphabet.
##
## initial_alphabet (List[str], optional) - A list of characters to include in the initial alphabet, even if not
##     seen in the training dataset. If the strings contain more than one character, only the first one is kept.
##
## continuing_subword_prefix (str, optional) - A prefix to be used for every subword that is not a beginning-of-word.
##
## end_of_word_suffix (str, optional) - A suffix to be used for every subword that is a end-of-word.

'\nvocab_size (int, optional) – The size of the final vocabulary, including all tokens and alphabet.\n\nmin_frequency (int, optional) – The minimum frequency a pair should have in order to be merged.\n\nshow_progress (bool, optional) – Whether to show progress bars while training.\n\nspecial_tokens (List[Union[str, AddedToken]], optional) – A list of special tokens the model should know of.\n\nlimit_alphabet (int, optional) – The maximum different characters to keep in the alphabet.\n\ninitial_alphabet (List[str], optional) – A list of characters to include in the initial alphabet, even if not seen in the training dataset. If the strings contain more than one character, only the first one is kept.\n\ncontinuing_subword_prefix (str, optional) – A prefix to be used for every subword that is not a beginning-of-word.\n\nend_of_word_suffix (str, optional) – A suffix to be used for every subword that is a end-of-word. \n'

In [None]:
## Train the tokenizer on the data: only the training sentence file is passed
## (as a one-element list of paths), together with the trainer configured above.
tokenizer.train(files=[train_text_file], trainer=trainer)

In [None]:
## Sanity check: size of the vocabulary held by the tokenizer after training.
tokenizer.get_vocab_size()

30000

### Save / Load Tokenizer

In [None]:
## Save / load the custom tokenizer.
## As written the cell saves the tokenizer to a JSON file on Drive; to reuse a
## previously saved tokenizer, run the commented-out Tokenizer.from_file line instead.
tokenizer.save("/content/drive/My Drive/<name>/tokenizer-cm.json")
# tokenizer = Tokenizer.from_file("/content/drive/My Drive/<name>/tokenizer-cm.json")

In [None]:
## Check some examples: decode a handful of ids, keeping special tokens visible in the output
tokenizer.decode([382, 1, 2, 3, 4], skip_special_tokens=False)

'aaj </s> <pad> <unk> <mask>'

In [None]:
## Specify the mask token id to be used in MLM training:
## encode the literal '<mask>', take the first id of the result and store it
## as an attribute on the tokenizer object.
tokenizer.mask_token_id = tokenizer.encode('<mask>').ids[0]

In [None]:
## Confirm the special token ids by encoding all five special tokens in one string
tokenizer.encode("<s> </s> <pad> <unk> <mask>").ids

[0, 1, 2, 3, 4]

In [None]:
## Example: token ids and tokens for a short sentence
## (the fused word 'honahai' is there to show the sub-word split)
output = tokenizer.encode("ye toh honahai")
print(output.ids, output.tokens)

[75, 326, 861, 65] ['ye', 'toh', 'hona', 'hai']


In [None]:
## Attach the language -> id mapping to the tokenizer object
## (english is 0, hindi is 1)
tokenizer.lang2id = {"en": 0, "hi": 1}

In [None]:
## Display the language -> id mapping stored on the tokenizer
tokenizer.lang2id

{'en': 0, 'hi': 1}

In [None]:
## Pre-processing and creating dataset object. 
train_texts, train_lidtags = process_data(train_text_file, train_lid_file)
# train_dataset = dataset.LIDdataset(train_texts, train_lidtags, tokenizer)
train_dataset = dataset.LIDdataset(train_texts[:329214], train_lidtags[:329214], tokenizer)

Read data file
Read tags file


In [None]:
## Number of lines read from the training sentence file
len(train_texts)

9876425

### Comparison of masking schemes (simple vs. weighted)


In [None]:
## Dataset built with the modified masking scheme from new_dataset (weighted sampling frequency).
## It receives the same tokenizer object as the dataset built from the `dataset` module.
train_dataset_new = new_dataset.LIDdataset(train_texts, train_lidtags, tokenizer)

In [None]:
## Dataset built with the original `dataset` module (simple masking), using the same tokenizer object.
train_dataset_old = dataset.LIDdataset(train_texts, train_lidtags, tokenizer)

In [None]:
import ast  # stdlib; imported here so the cell stays self-contained

## Inspect one training example: its sub-word tokens next to the language ids
## the weighted-masking dataset assigns to them.
sample_idx = 312

## Each entry of train_texts is the string form of a Python list of words.
## ast.literal_eval parses literals only, whereas eval would execute whatever
## expression the data file happens to contain.
words = ast.literal_eval(train_texts[sample_idx])
tokenized_text = tokenizer.encode(" ".join(word.strip() for word in words)).tokens
print(tokenized_text)
## The slice skips position 0, which holds <s> in the item's input_ids.
print(train_dataset_new[sample_idx]['langs'].tolist()[1:len(tokenized_text)+1])

['wish', 'u', 'a', 'very', 'long', 'life', 'spna', 'g', 'bcz', 'i', 'm', 'your', 'big', 'fan']
[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0]


In [None]:
## Full item (attention_mask, input_ids, ...) the weighted-masking dataset returns for example 312
train_dataset_new[312]

{'en': 0.15, 'hi': 0.21428571428571427}


{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([    0,   506,    39,    19,   207,  1126,   550, 17696,    25,  2443,
            27,    31,   259,   389,   291,     1,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     

In [None]:
## Full item (attention_mask, input_ids, ...) the simple-masking dataset returns for example 10
train_dataset_old[10]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([   0,  382,    4, 9902,  416,   59,  444,    4,  436,   71,   35,    1,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0

In [None]:
import random 

In [None]:
import ast  # stdlib; parses the stored list literals without executing them

nb_exps = 1000 ## number of experiments
## Look the mask id up instead of hard-coding it (it is 4 for this tokenizer).
mask_id = tokenizer.token_to_id("<mask>")

## Draw 5 distinct random example indices (index 0 is never drawn).
for sample_no in random.sample(range(1, len(train_texts)), 5):

    ## Tokenize the space-joined words rather than the raw entry of train_texts:
    ## the raw entry is a list literal, so its brackets, quotes and commas would
    ## be counted as tokens and the counters would no longer line up with the
    ## sentence.
    words = ast.literal_eval(train_texts[sample_no])
    tokenized_text = tokenizer.encode(" ".join(word.strip() for word in words)).tokens
    token_len = len(tokenized_text)
    old_count = [0]*token_len
    new_count = [0]*token_len

    for _ in range(nb_exps): ## Loop over the number of experiments
        ## One draw per dataset and per experiment; language ids and input ids
        ## of the simple scheme come from the same item.
        item_old = train_dataset_old[sample_no] ## Simple masking
        item_new = train_dataset_new[sample_no] ## Weighted masking
        lang_ids = item_old['langs'].tolist()
        old = item_old['input_ids'].tolist()
        new = item_new['input_ids'].tolist()

        ## Position 0 holds <s>, so token i sits at position i+1. The upper
        ## bound is clamped in case an item is shorter than the token list.
        last_pos = min(token_len, len(old) - 1, len(new) - 1)
        for idx in range(1, last_pos + 1):
            if old[idx] == mask_id: ## Token masked by the simple scheme
                old_count[idx-1] += 1
            if new[idx] == mask_id: ## Token masked by the weighted scheme
                new_count[idx-1] += 1

    print(tokenized_text)

    ## Display results (difference in masking probability)
    print(f"{'Token':^12} | {'Lang Id':^7} | {'Old Prob':^12} | {'New Prob':^10} |")
    print("-"*70)
    for token, lang_id, n_old, n_new in zip(tokenized_text, lang_ids[1:], old_count, new_count):
        print(f"{token:^12} | {lang_id:^7} | {n_old/nb_exps:^12} | {n_new/nb_exps:^10} |")


['ha', 'ha', 'kejriwal', 'the', 'great', 'indian', 'unk', 'show', 'aap', 'comes', 'on', 'tv', 'to', 'get', 'd', 'trp', 'by', 'saying', 'anything', 'completely', 'mockery', 'stuff', 'some', 'time']
   Token     | Lang Id |   Old Prob   |  New Prob  |
----------------------------------------------------------------------
     ha      |    1    |    0.131     |   0.165    |
     ha      |    1    |     0.11     |   0.188    |
  kejriwal   |    1    |    0.113     |   0.204    |
    the      |    0    |    0.146     |    0.14    |
   great     |    0    |    0.125     |   0.122    |
   indian    |    0    |    0.123     |   0.108    |
    unk      |    0    |    0.135     |   0.127    |
    show     |    0    |    0.119     |   0.133    |
    aap      |    1    |    0.117     |   0.187    |
   comes     |    0    |    0.115     |   0.134    |
     on      |    0    |    0.131     |   0.092    |
     tv      |    1    |    0.124     |   0.194    |
     to      |    0    |    0.122     |   0

In [None]:
## Debug: mask counts of the last sampled example next to the ids obtained by
## encoding its raw train_texts entry (the list-literal string, not the joined words).
old_count, tokenizer.encode(train_texts[sample_no]).ids

([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [48, 59, 45, 5, 78261, 45, 5, 8504, 45, 5, 2763, 45, 5, 19, 45, 5, 73797, 47])

In [None]:
## Simple masking: decode example 123 with the special tokens left visible.
## Positions after </s> hold id 0, which is also the id of <s>, so the
## padding shows up as a run of <s> in the decoded text.
ids_simple = train_dataset_old[123]['input_ids'].tolist()
tokenizer.decode(ids_simple, skip_special_tokens=False)

'<s> truck driver ko <mask> padega 3 live ko bach s dis </s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s>'

In [None]:
## Weighted masking: decode example 123 with the special tokens left visible.
## Positions after </s> hold id 0, which is also the id of <s>, so the
## padding shows up as a run of <s> in the decoded text.
ids_weighted = train_dataset_new[123]['input_ids'].tolist()
tokenizer.decode(ids_weighted, skip_special_tokens=False)

'<s> truck driver ko manna padega 3 live ko <mask> s dis </s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s> <s>'

In [None]:
## Raw entry of example 123: the sentence is stored as the string form of a list of words
train_texts[123]

"['truck', 'driver', 'ko', 'manna', 'padega', '3', 'live', 'ko', 'bach', 's', 'dis']"