In [1]:
import logging

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("log.txt"),
        logging.StreamHandler()
    ]
)

In [2]:
from transformers import AutoTokenizer
import multiprocessing


tokenizer = AutoTokenizer.from_pretrained(f'/home/mohammad/Tokenizers/tokenizers/custom_bert_tokenizer')
num_proc = multiprocessing.cpu_count()
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")


The max length for the tokenizer is: 512


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('text_col.csv')

In [5]:
text_list = list(df['text'].values)

In [6]:
text_list = [str(text) for text in text_list]

In [7]:
raw_data = {'text':text_list}

In [8]:
num_proc = multiprocessing.cpu_count()

In [9]:
def group_texts(examples):
    tokenized_inputs = tokenizer(
       examples['text'], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

In [10]:
from datasets import Dataset

In [11]:
dataset = Dataset.from_dict(raw_data)

In [12]:
dataset = dataset.train_test_split(
                        test_size=0.2)

In [13]:
tokenized_datasets = dataset.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)

Map (num_proc=24):   0%|          | 0/3724 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/931 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets = tokenized_datasets.shuffle(seed=34)

In [15]:
from transformers import Trainer, TrainingArguments

2023-08-06 05:04:27,855 [INFO] Created a temporary directory at /tmp/tmpv62grc_2
2023-08-06 05:04:27,856 [INFO] Writing /tmp/tmpv62grc_2/_remote_module_non_scriptable.py


In [16]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [17]:
vocab_size = 30522
max_length = 512

In [18]:
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

Generate config GenerationConfig {
  "_from_model_config": true,
  "pad_token_id": 0,
  "transformers_version": "4.31.0"
}



In [19]:
# initialize the data collator, randomly masking 20% (default is 15%) of the tokens for the Masked Language
# Modeling (MLM) task
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
)

In [20]:
model_path = 'models/'

In [25]:
training_args = TrainingArguments(
    "test_trainer",
    evaluation_strategy="epoch",  # to evaluate model and get metrics after each epoch
    logging_strategy="epoch",  # to log metrics after each epoch
    save_strategy="epoch",  # to save model after each epoch
    per_device_train_batch_size=8,
    learning_rate=2e-2,
    num_train_epochs=3,   
    logging_dir='./logs', 
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [26]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 3724
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
        num_rows: 931
    })
})

In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
)

In [28]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3,724
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Training with DataParallel so batch size has been adjusted to: 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 699
  Number of trainable parameters = 109,514,298
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,8.2331,7.051541
2,7.021,7.004931
3,6.978,6.982363


The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 931
  Batch size = 16
Saving model checkpoint to test_trainer/checkpoint-233
Configuration saved in test_trainer/checkpoint-233/config.json
Configuration saved in test_trainer/checkpoint-233/generation_config.json
Model weights saved in test_trainer/checkpoint-233/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 931
  Batch size = 16
Saving model checkpoint to test_trainer/checkpoint-466
Configuration saved in t

TrainOutput(global_step=699, training_loss=7.410704082003308, metrics={'train_runtime': 552.3585, 'train_samples_per_second': 20.226, 'train_steps_per_second': 1.265, 'total_flos': 2940524168601600.0, 'train_loss': 7.410704082003308, 'epoch': 3.0})

In [40]:
training_history=trainer.state.log_history

In [61]:
valid_losses = []
train_losses = []
train_time = 0.0
epochs = []
lr = []
for history_dict in training_history:
    
    try:
        if 'eval_loss' in history_dict.keys():
            valid_loss = history_dict['eval_loss']
            valid_losses.append(valid_loss)
        elif 'loss' in history_dict.keys():
            train_loss = history_dict['loss']
            epochs.append(history_dict['epoch'])
            train_losses.append(train_loss)
            lr.append(history_dict['learning_rate'])
        elif 'train_runtime' in history_dict.keys():
            train_time = history_dict['train_runtime']
    except Exception as e:
        print(f'Something error {e}')

In [63]:
valid_losses,train_losses,train_time,epochs,lr

([7.051540851593018, 7.0049309730529785, 6.982363224029541],
 [8.2331, 7.021, 6.978],
 552.3585,
 [1.0, 2.0, 3.0],
 [0.013333333333333332, 0.006666666666666666, 0.0])

In [64]:
train_times = [train_time/len(valid_losses)]*len(valid_losses)

In [73]:
history = {'epochs':epochs,'train_losses':train_losses,'valid_losses':valid_losses,'train_times':train_times}

In [74]:
df_history = pd.DataFrame(history)

In [76]:
df_history.to_csv('logs.csv')

In [89]:
!ls 

config.json		optimizer.pt	   rng_state.pth  trainer_state.json
generation_config.json	pytorch_model.bin  scheduler.pt   training_args.bin


In [11]:
from transformers import AutoTokenizer,AutoModel,BertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained(f'/home/mohammad/Tokenizers/tokenizers/custom_bert_tokenizer')

In [12]:
model1 = BertForMaskedLM.from_pretrained('../ali_bert/checkpoint-932')

In [13]:
examples = "club nacional de [MASK] miho conoci como nacional ta club mas grandi di futbol di montevidéu uruguay fundá dia 14 di mei 1899 club ta resultado di union entre uruguay athletic montevideo football club uruguay athletic tabata un club di bario la union cual no mester wordo confundi cu uruguay athletic club cu tabata hunga den prome division actualmente nacional ta hunga den liga profesional mas halto na uruguay algun futbolista ku tabata hunga pa nacional ta luis suarez uruguay sebastian abreu uruguay atilio garcia argentina hugo de león uruguay nicolás lodeiro uruguay héctor scarone uruguay julio cesar dely valdéz panama fernando muslera uruguay titulos campeon nashonal liga profesional di uruguay 45 1902 1903 1912 1915 1916 1917 1919 1920"

In [14]:
from transformers import pipeline

In [15]:
p1=pipeline('fill-mask', model=model1, tokenizer=tokenizer,device=model1.device)

In [16]:
p1.predict(examples)

[{'score': 0.07844258844852448,
  'token': 602,
  'token_str': 'di',
  'sequence': 'club nacional de di miho conoci como nacional ta club mas grandi di futbol di montevideu uruguay funda dia 14 di mei 1899 club ta resultado di union entre uruguay athletic montevideo football club uruguay athletic tabata un club di bario la union cual no mester wordo confundi cu uruguay athletic club cu tabata hunga den prome division actualmente nacional ta hunga den liga profesional mas halto na uruguay algun futbolista ku tabata hunga pa nacional ta luis suarez uruguay sebastian abreu uruguay atilio garcia argentina hugo de leon uruguay nicolas lodeiro uruguay hector scarone uruguay julio cesar dely valdez panama fernando muslera uruguay titulos campeon nashonal liga profesional di uruguay 45 1902 1903 1912 1915 1916 1917 1919 1920'},
 {'score': 0.046052489429712296,
  'token': 607,
  'token_str': 'ku',
  'sequence': 'club nacional de ku miho conoci como nacional ta club mas grandi di futbol di monte