API's Needed

In [1]:
!pip install transformers sentencepiece datasets

[0m

Imports

In [2]:
# Hugging face transformers to download pretrained model and tokenizer
import transformers
# Hugging face datasets to download the dataset
import datasets
# Pytorch for tensor
import torch
# For ploting the graph
import matplotlib.pyplot as plt
# Basic arithmatic operations
import numpy as np
# To show the progress bar
import tqdm
# For data handling
import itertools
# For testing model
import copy
# Specific functions from the libraries
from datasets import load_dataset
from transformers import AdamW, AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import get_linear_schedule_with_warmup

# Model
Here we download the pre-trained model and tokenizer

In [3]:
# model repository
model_repo = 'google/mt5-base'
# download mt5 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_repo)
# download model
model= AutoModelForSeq2SeqLM.from_pretrained(model_repo)
# puts model onto GPU
model = model.cuda()

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

# Dataset
Here we will be defining the dataset and downloading it

In [4]:
# Loading the dataset
dataset = load_dataset('alt')
# split the dataset into train validation and test
train_dataset = dataset['train']
validation_dataset = dataset['validation']
test_dataset = dataset['test']

Downloading readme:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/31.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.79M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/18088 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1019 [00:00<?, ? examples/s]

Adding the special tokens

In [5]:
# add language token mapping to the tokenizer
LANG_TOKEN_MAPPING = {
    'en' : '<en>',
    'fil' : '<fil>',
    'hi' : '<hi>',
    'id' : '<id>',
    'ja' : '<ja>', 
}
# create a dict of the dict
special_tokens = { 'additional_special_tokens': list(LANG_TOKEN_MAPPING.values()) }
# add special tokens to the tokenizer
tokenizer.add_special_tokens(special_tokens)
# resize the token embeddings layer to correct size
model.resize_token_embeddings(len(tokenizer))

Embedding(250105, 768)

# Data Handling
Functions to handle the data

In [6]:
# tokenizes and numericalizes input string
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  target_lang_token = lang_token_map[target_lang]

  # Tokenize and add special tokens
  input_ids = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)

  return input_ids[0]

# tokenizes and numericalizes target string
def encode_target_str(text, tokenizer, seq_len):
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  
  return token_ids[0]

# get all translations between all permutations of pairs of languages
def get_all_translation_data(translations, lang_token_map,
                            tokenizer, seq_length=20):
  input_ids = []
  target_ids = []
  
  langs = list(lang_token_map.keys())
  for input_lang, target_lang in itertools.permutations(langs, 2):
    input_text = translations[input_lang]
    target_text = translations[target_lang]
    
    if input_text is None or target_text is None:
        return None, None
    
    input_ids.append(encode_input_str(input_text, target_lang, tokenizer, seq_length, 
                                    lang_token_map))
    
    target_ids.append(encode_target_str(target_text, tokenizer, seq_length))
  
  return input_ids, target_ids

# generator function
def get_full_dataloader(dataset, lang_token_map, tokenizer, batch_size=32, num_workers=8):
    # get translations from the dataset
    dataset = train_dataset['translation']
    # intialize array
    data = []
    for example in dataset:
        # get translations for all permuations of languages
        input_id, target_id = get_all_translation_data(example, lang_token_map, tokenizer)
        # case where nothing is returned
        if input_id is None or target_id is None:
            continue
        # add the list of target and inputs 
        list_of_dicts = list(map(lambda x, y: {'input_ids': x, 'target_ids': y}, input_id, target_id))
        data = data + list_of_dicts
    # load dataset into a dataloader
    loader = torch.utils.data.DataLoader(data, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    # return the dataloader
    return loader

In [7]:
# tokenizes and numericalizes input string
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  target_lang_token = lang_token_map[target_lang]

  # Tokenize and add special tokens
  input_ids = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)

  return input_ids[0]

# tokenizes and numericalizes target string
def encode_target_str(text, tokenizer, seq_len):
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  
  return token_ids[0]

# improvement would be do this for all permutations of languages 
# or at least more than once per example
def format_translation_data(translations, lang_token_map,
                            tokenizer, seq_length=20):
  # choose 2 random languages for i/o
  langs = list(lang_token_map.keys())
  input_lang, target_lang = np.random.choice(langs, size=2, replace=False)
  input_text = translations[input_lang]
  target_text = translations[target_lang]
  
  if input_text is None or target_text is None:
    return None, None
  
  input_ids = encode_input_str(input_text, target_lang, tokenizer, seq_length, 
                                lang_token_map)
  
  target_ids = encode_target_str(target_text, tokenizer, seq_length)
  
  return input_ids, target_ids

# gets a random batch of translations b/w two languages
def transform_batch(batch, lang_token_map, tokenizer, seq_length=20):
  input_ids = []
  target_ids = []
  
  for example in batch['translation']:
      input_id, target_id = format_translation_data(example, lang_token_map, tokenizer)
      
      if input_id is not None:
          input_ids.append(input_id)
          target_ids.append(target_id)
  
  input_ids = torch.stack(input_ids).cuda()
  target_ids = torch.stack(target_ids).cuda()
  
  return input_ids, target_ids

# generator function
def get_data_generator(dataset, lang_token_map, tokenizer, batch_size=32):
  dataset = dataset.shuffle()
  
  for i in range(0, len(dataset), batch_size):
      batch = dataset[i:i+batch_size]
      yield transform_batch(batch, lang_token_map, tokenizer)

def get_dataloader(dataset, lang_token_map, tokenizer, batch_size=32):
  dataset = dataset.shuffle()
  dataset = dataset.map(lambda batch: transform_batch(batch, lang_token_map, tokenizer), batched=True)
  dataset.set_format(type='torch', columns=['input_ids', 'target_ids'])
  data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)
  
  return data_loader



# Training

Evaluation function for validation

In [8]:
def eval_model(model, gdataset, max_iters=8):
  test_generator = get_data_generator(gdataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)
  eval_losses = []
  for i, (input_batch, label_batch) in enumerate(test_generator):
    if i >= max_iters:
      break

    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)
    eval_losses.append(model_out.loss.item())

  return np.mean(eval_losses)

In [9]:
# Hyperparameters
EPOCHS = 5
batch_size = 64
learning_rate = 5e-3
n_batches = np.ceil(len(train_dataset) * 20 / batch_size)
total_steps = n_batches * EPOCHS
print_freq = int(total_steps / 100)
checkpoint_freq = int(total_steps / 33)
n_warmup_steps = int(0.01 * total_steps)

In [10]:
print(print_freq, checkpoint_freq, n_warmup_steps)

282 856 282


In [11]:
# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
schedular = get_linear_schedule_with_warmup(optimizer, n_warmup_steps, total_steps)



Training the model using transfer learning

In [12]:
loss_i = []

best_loss = float("inf")

for epoch in range(EPOCHS):
    # Turn parallelism on
    os.environ["TOKENIZER_PARALLELISM"] = "true"
    # Randomize data order, need to figure out a faster way to do this
    loader = get_full_dataloader(train_dataset, LANG_TOKEN_MAPPING, tokenizer, batch_size, num_workers=8)
    
    for i, batch in tqdm.tqdm(enumerate(loader), total = n_batches):
        inputs, targets = batch['input_ids'].cuda(), batch['target_ids'].cuda()
        
        # Zero gradients
        optimizer.zero_grad()
        # Forward pass (computes outputs and loss)
        output = model(input_ids=inputs, labels=targets)
        loss = output.loss
        # Back propagation (computes gradients)
        loss.backward()
        # Optimization and scheduling
        optimizer.step()
        # Adjust every 100 batches
        if(i+1) % 250 == 0:
            loss_i.append(loss.item())
            schedular.step()
        # prints training updates
        if (i+1) % print_freq == 0:
            print(f'Epoch: {epoch + 1}, Batch: {i+1}/{n_batches}, Loss: {loss.item()}, LR: {schedular.get_last_lr()[0]}')
        
        if (i + 1) % checkpoint_freq == 0:
            test_loss = eval_model(copy.deepcopy(model), test_dataset)
            if(test_loss < best_loss):
                print('Saving model with test loss of {:.3f}'.format(test_loss))
                torch.save(model.state_dict(), 'mt5_translator_best.pt')
                best_loss = test_loss
# Save the final model
torch.save(model.state_dict(), 'mt5_translator_final2.pt')
plt.plot(loss_i)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch: 1, Batch: 282/5653.0, Loss: 12.140501022338867, LR: 1.773049645390071e-05


 10%|▉         | 564/5653.0 [03:20<30:30,  2.78it/s]

Epoch: 1, Batch: 564/5653.0, Loss: 6.736814975738525, LR: 3.546099290780142e-05


 15%|█▍        | 846/5653.0 [05:01<29:59,  2.67it/s]

Epoch: 1, Batch: 846/5653.0, Loss: 4.800169944763184, LR: 5.319148936170213e-05


 15%|█▌        | 855/5653.0 [05:05<28:55,  2.76it/s]

Saving model with test loss of 5.003


 20%|█▉        | 1128/5653.0 [06:49<28:03,  2.69it/s] 

Epoch: 1, Batch: 1128/5653.0, Loss: 4.242094993591309, LR: 7.092198581560284e-05


 25%|██▍       | 1410/5653.0 [08:32<26:22,  2.68it/s]

Epoch: 1, Batch: 1410/5653.0, Loss: 3.6015923023223877, LR: 8.865248226950355e-05


 30%|██▉       | 1692/5653.0 [10:15<24:18,  2.72it/s]

Epoch: 1, Batch: 1692/5653.0, Loss: 3.2940170764923096, LR: 0.00010638297872340425


 30%|███       | 1711/5653.0 [10:22<23:58,  2.74it/s]

Saving model with test loss of 3.337


 35%|███▍      | 1974/5653.0 [12:04<22:38,  2.71it/s]  

Epoch: 1, Batch: 1974/5653.0, Loss: 3.2979331016540527, LR: 0.00012411347517730497


 40%|███▉      | 2256/5653.0 [13:46<20:55,  2.70it/s]

Epoch: 1, Batch: 2256/5653.0, Loss: 2.944849729537964, LR: 0.00015957446808510637


 45%|████▍     | 2538/5653.0 [15:30<19:20,  2.68it/s]

Epoch: 1, Batch: 2538/5653.0, Loss: 2.7269887924194336, LR: 0.0001773049645390071


 45%|████▌     | 2567/5653.0 [15:40<19:05,  2.69it/s]

Saving model with test loss of 2.902


 50%|████▉     | 2820/5653.0 [17:18<17:15,  2.74it/s]  

Epoch: 1, Batch: 2820/5653.0, Loss: 2.7522852420806885, LR: 0.0001950354609929078


 55%|█████▍    | 3102/5653.0 [19:01<15:43,  2.70it/s]

Epoch: 1, Batch: 3102/5653.0, Loss: 2.521615505218506, LR: 0.0002127659574468085


 60%|█████▉    | 3384/5653.0 [20:43<14:01,  2.70it/s]

Epoch: 1, Batch: 3384/5653.0, Loss: 2.5124526023864746, LR: 0.0002304964539007092


 61%|██████    | 3423/5653.0 [20:57<13:29,  2.76it/s]

Saving model with test loss of 2.715


 65%|██████▍   | 3666/5653.0 [22:32<12:12,  2.71it/s]  

Epoch: 1, Batch: 3666/5653.0, Loss: 2.2595038414001465, LR: 0.00024822695035460994


 70%|██████▉   | 3948/5653.0 [24:14<10:32,  2.70it/s]

Epoch: 1, Batch: 3948/5653.0, Loss: 2.3360979557037354, LR: 0.00026595744680851064


 75%|███████▍  | 4230/5653.0 [25:57<08:45,  2.71it/s]

Epoch: 1, Batch: 4230/5653.0, Loss: 2.1400771141052246, LR: 0.00028368794326241134


 76%|███████▌  | 4279/5653.0 [26:14<08:12,  2.79it/s]

Saving model with test loss of 2.501


 80%|███████▉  | 4512/5653.0 [27:45<06:55,  2.75it/s]

Epoch: 1, Batch: 4512/5653.0, Loss: 2.2176709175109863, LR: 0.00031914893617021275


 85%|████████▍ | 4794/5653.0 [29:27<05:17,  2.71it/s]

Epoch: 1, Batch: 4794/5653.0, Loss: 1.9755065441131592, LR: 0.0003368794326241135


 90%|████████▉ | 5076/5653.0 [31:10<03:37,  2.65it/s]

Epoch: 1, Batch: 5076/5653.0, Loss: 2.210618257522583, LR: 0.0003546099290780142


 91%|█████████ | 5135/5653.0 [31:31<03:08,  2.75it/s]

Saving model with test loss of 2.450


 95%|█████████▍| 5358/5653.0 [32:57<01:48,  2.72it/s]

Epoch: 1, Batch: 5358/5653.0, Loss: 1.9281156063079834, LR: 0.0003723404255319149


100%|█████████▉| 5640/5653.0 [34:40<00:04,  2.74it/s]

Epoch: 1, Batch: 5640/5653.0, Loss: 2.172295093536377, LR: 0.0003900709219858156


100%|█████████▉| 5651/5653.0 [34:44<00:00,  2.71it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been u

Epoch: 2, Batch: 282/5653.0, Loss: 1.8628990650177002, LR: 0.00040780141843971637


 10%|▉         | 564/5653.0 [03:19<30:43,  2.76it/s]

Epoch: 2, Batch: 564/5653.0, Loss: 1.956827163696289, LR: 0.000425531914893617


 15%|█▍        | 846/5653.0 [05:00<29:11,  2.74it/s]

Epoch: 2, Batch: 846/5653.0, Loss: 1.9920841455459595, LR: 0.0004432624113475178


 15%|█▌        | 855/5653.0 [05:03<28:34,  2.80it/s]

Saving model with test loss of 2.386


 20%|█▉        | 1128/5653.0 [06:48<27:42,  2.72it/s] 

Epoch: 2, Batch: 1128/5653.0, Loss: 1.6439886093139648, LR: 0.0004609929078014184


 25%|██▍       | 1410/5653.0 [08:30<26:46,  2.64it/s]

Epoch: 2, Batch: 1410/5653.0, Loss: 1.6560373306274414, LR: 0.0004787234042553192


 30%|██▉       | 1692/5653.0 [10:12<24:03,  2.74it/s]

Epoch: 2, Batch: 1692/5653.0, Loss: 1.664449691772461, LR: 0.0004964539007092199


 30%|███       | 1711/5653.0 [10:19<23:41,  2.77it/s]

Saving model with test loss of 2.338


 35%|███▍      | 1974/5653.0 [12:00<22:30,  2.73it/s]  

Epoch: 2, Batch: 1974/5653.0, Loss: 1.7545270919799805, LR: 0.0005141843971631205


 40%|███▉      | 2256/5653.0 [13:43<20:37,  2.74it/s]

Epoch: 2, Batch: 2256/5653.0, Loss: 1.6978660821914673, LR: 0.000549645390070922


 45%|████▍     | 2538/5653.0 [15:25<19:11,  2.71it/s]

Epoch: 2, Batch: 2538/5653.0, Loss: 1.6895267963409424, LR: 0.0005673758865248227


 50%|████▉     | 2820/5653.0 [17:09<17:31,  2.69it/s]

Epoch: 2, Batch: 2820/5653.0, Loss: 1.6187365055084229, LR: 0.0005851063829787234


 55%|█████▍    | 3102/5653.0 [18:51<15:43,  2.70it/s]

Epoch: 2, Batch: 3102/5653.0, Loss: 1.4885663986206055, LR: 0.0006028368794326241


 60%|█████▉    | 3384/5653.0 [20:33<14:05,  2.68it/s]

Epoch: 2, Batch: 3384/5653.0, Loss: 1.7999649047851562, LR: 0.0006205673758865249


 61%|██████    | 3423/5653.0 [20:47<13:47,  2.69it/s]

Saving model with test loss of 2.333


 65%|██████▍   | 3666/5653.0 [22:22<12:34,  2.63it/s]  

Epoch: 2, Batch: 3666/5653.0, Loss: 1.5468676090240479, LR: 0.0006382978723404255


 70%|██████▉   | 3948/5653.0 [24:04<10:30,  2.71it/s]

Epoch: 2, Batch: 3948/5653.0, Loss: 1.4913159608840942, LR: 0.0006560283687943263


 75%|███████▍  | 4230/5653.0 [25:46<09:07,  2.60it/s]

Epoch: 2, Batch: 4230/5653.0, Loss: 1.5673550367355347, LR: 0.000673758865248227


 80%|███████▉  | 4512/5653.0 [27:29<06:57,  2.73it/s]

Epoch: 2, Batch: 4512/5653.0, Loss: 1.6758511066436768, LR: 0.0007092198581560284


 85%|████████▍ | 4794/5653.0 [29:11<05:19,  2.69it/s]

Epoch: 2, Batch: 4794/5653.0, Loss: 1.3846328258514404, LR: 0.0007269503546099291


 90%|████████▉ | 5076/5653.0 [30:54<03:31,  2.73it/s]

Epoch: 2, Batch: 5076/5653.0, Loss: 1.4943244457244873, LR: 0.0007446808510638298


 95%|█████████▍| 5358/5653.0 [32:37<01:48,  2.71it/s]

Epoch: 2, Batch: 5358/5653.0, Loss: 1.5182760953903198, LR: 0.0007624113475177305


100%|█████████▉| 5640/5653.0 [34:19<00:04,  2.70it/s]

Epoch: 2, Batch: 5640/5653.0, Loss: 1.3708231449127197, LR: 0.0007801418439716312


100%|█████████▉| 5651/5653.0 [34:24<00:00,  2.74it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been u

Epoch: 3, Batch: 282/5653.0, Loss: 1.0134413242340088, LR: 0.000797872340425532


 10%|▉         | 564/5653.0 [03:20<30:50,  2.75it/s]

Epoch: 3, Batch: 564/5653.0, Loss: 1.2907130718231201, LR: 0.0008156028368794327


 15%|█▍        | 846/5653.0 [05:00<29:22,  2.73it/s]

Epoch: 3, Batch: 846/5653.0, Loss: 1.1860761642456055, LR: 0.0008333333333333333


 20%|█▉        | 1128/5653.0 [06:44<27:27,  2.75it/s] 

Epoch: 3, Batch: 1128/5653.0, Loss: 1.0621111392974854, LR: 0.000851063829787234


 25%|██▍       | 1410/5653.0 [08:26<26:18,  2.69it/s]

Epoch: 3, Batch: 1410/5653.0, Loss: 1.201399564743042, LR: 0.0008687943262411348


 30%|██▉       | 1692/5653.0 [10:08<24:19,  2.71it/s]

Epoch: 3, Batch: 1692/5653.0, Loss: 1.1840527057647705, LR: 0.0008865248226950355


 35%|███▍      | 1974/5653.0 [11:52<22:42,  2.70it/s]

Epoch: 3, Batch: 1974/5653.0, Loss: 1.2507548332214355, LR: 0.0009042553191489362


 40%|███▉      | 2256/5653.0 [13:34<20:54,  2.71it/s]

Epoch: 3, Batch: 2256/5653.0, Loss: 1.2636125087738037, LR: 0.0009397163120567376


 45%|████▍     | 2538/5653.0 [15:16<19:10,  2.71it/s]

Epoch: 3, Batch: 2538/5653.0, Loss: 1.0686026811599731, LR: 0.0009574468085106384


 50%|████▉     | 2820/5653.0 [16:59<17:12,  2.75it/s]

Epoch: 3, Batch: 2820/5653.0, Loss: 1.0814588069915771, LR: 0.000975177304964539


 55%|█████▍    | 3102/5653.0 [18:41<15:55,  2.67it/s]

Epoch: 3, Batch: 3102/5653.0, Loss: 1.0995972156524658, LR: 0.0009929078014184398


 60%|█████▉    | 3384/5653.0 [20:24<14:01,  2.70it/s]

Epoch: 3, Batch: 3384/5653.0, Loss: 1.2643203735351562, LR: 0.0010106382978723404


 65%|██████▍   | 3666/5653.0 [22:07<12:04,  2.74it/s]

Epoch: 3, Batch: 3666/5653.0, Loss: 1.1373530626296997, LR: 0.001028368794326241


 70%|██████▉   | 3948/5653.0 [23:49<10:22,  2.74it/s]

Epoch: 3, Batch: 3948/5653.0, Loss: 1.199615716934204, LR: 0.001046099290780142


 75%|███████▍  | 4230/5653.0 [25:32<08:46,  2.70it/s]

Epoch: 3, Batch: 4230/5653.0, Loss: 1.1200330257415771, LR: 0.0010638297872340426


 80%|███████▉  | 4512/5653.0 [27:16<06:59,  2.72it/s]

Epoch: 3, Batch: 4512/5653.0, Loss: 1.0683257579803467, LR: 0.001099290780141844


 85%|████████▍ | 4794/5653.0 [28:58<05:17,  2.71it/s]

Epoch: 3, Batch: 4794/5653.0, Loss: 1.0612446069717407, LR: 0.0011170212765957447


 90%|████████▉ | 5076/5653.0 [30:40<03:38,  2.64it/s]

Epoch: 3, Batch: 5076/5653.0, Loss: 0.9015735387802124, LR: 0.0011347517730496454


 95%|█████████▍| 5358/5653.0 [32:24<01:52,  2.63it/s]

Epoch: 3, Batch: 5358/5653.0, Loss: 1.0412013530731201, LR: 0.001152482269503546


100%|█████████▉| 5640/5653.0 [34:06<00:04,  2.62it/s]

Epoch: 3, Batch: 5640/5653.0, Loss: 1.0229183435440063, LR: 0.001170212765957447


100%|█████████▉| 5651/5653.0 [34:11<00:00,  2.75it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been u

Epoch: 4, Batch: 282/5653.0, Loss: 0.8185732960700989, LR: 0.0011879432624113475


 10%|▉         | 564/5653.0 [03:20<30:36,  2.77it/s]

Epoch: 4, Batch: 564/5653.0, Loss: 0.8239415287971497, LR: 0.0012056737588652482


 15%|█▍        | 846/5653.0 [05:00<29:03,  2.76it/s]

Epoch: 4, Batch: 846/5653.0, Loss: 0.715688169002533, LR: 0.0012234042553191488


 20%|█▉        | 1128/5653.0 [06:43<27:33,  2.74it/s] 

Epoch: 4, Batch: 1128/5653.0, Loss: 0.743182361125946, LR: 0.0012411347517730497


 25%|██▍       | 1410/5653.0 [08:25<26:14,  2.69it/s]

Epoch: 4, Batch: 1410/5653.0, Loss: 0.7897540926933289, LR: 0.0012588652482269504


 30%|██▉       | 1692/5653.0 [10:06<25:29,  2.59it/s]

Epoch: 4, Batch: 1692/5653.0, Loss: 0.7315928936004639, LR: 0.001276595744680851


 35%|███▍      | 1974/5653.0 [11:50<22:18,  2.75it/s]

Epoch: 4, Batch: 1974/5653.0, Loss: 0.6624656915664673, LR: 0.0012943262411347516


 40%|███▉      | 2256/5653.0 [13:31<20:41,  2.74it/s]

Epoch: 4, Batch: 2256/5653.0, Loss: 0.6945327520370483, LR: 0.0013297872340425532


 45%|████▍     | 2538/5653.0 [15:13<19:16,  2.69it/s]

Epoch: 4, Batch: 2538/5653.0, Loss: 0.691102921962738, LR: 0.001347517730496454


 50%|████▉     | 2820/5653.0 [16:56<17:20,  2.72it/s]

Epoch: 4, Batch: 2820/5653.0, Loss: 0.7127852439880371, LR: 0.0013652482269503547


 60%|█████▉    | 3384/5653.0 [20:19<13:41,  2.76it/s]

Epoch: 4, Batch: 3384/5653.0, Loss: 0.6967794895172119, LR: 0.0014007092198581562


 65%|██████▍   | 3666/5653.0 [22:01<12:12,  2.71it/s]

Epoch: 4, Batch: 3666/5653.0, Loss: 0.735650897026062, LR: 0.0014184397163120568


 70%|██████▉   | 3948/5653.0 [23:43<10:20,  2.75it/s]

Epoch: 4, Batch: 3948/5653.0, Loss: 0.773854672908783, LR: 0.0014361702127659577


 75%|███████▍  | 4230/5653.0 [25:24<08:50,  2.68it/s]

Epoch: 4, Batch: 4230/5653.0, Loss: 0.7362936735153198, LR: 0.0014539007092198581


 80%|███████▉  | 4512/5653.0 [27:07<06:58,  2.72it/s]

Epoch: 4, Batch: 4512/5653.0, Loss: 0.6771976947784424, LR: 0.0014893617021276596


 85%|████████▍ | 4794/5653.0 [28:49<05:12,  2.75it/s]

Epoch: 4, Batch: 4794/5653.0, Loss: 0.6197512149810791, LR: 0.0015070921985815603


 87%|████████▋ | 4895/5653.0 [29:25<04:31,  2.79it/s]

: 