In [1]:
tokenizer_id="bert-base-waspak-2023-papia"

In [2]:
!huggingface-cli whoami

m-aliabbas1


In [3]:
from huggingface_hub import HfApi

user_id = HfApi().whoami()["name"]

In [4]:
from transformers import AutoTokenizer
import multiprocessing


tokenizer = AutoTokenizer.from_pretrained(f'{user_id}/{tokenizer_id}')
num_proc = multiprocessing.cpu_count()
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")


The max length for the tokenizer is: 512


In [5]:
import pandas as pd

In [6]:
df = pd.read_csv('text_col.csv')

In [7]:
text_list = list(df['text'].values)

In [8]:
text_list = [str(text) for text in text_list]

In [9]:
raw_data = {'text':text_list}

In [10]:
num_proc = multiprocessing.cpu_count()

In [11]:
def group_texts(examples):
    tokenized_inputs = tokenizer(
       examples['text'], return_special_tokens_mask=True, truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

In [12]:
from datasets import Dataset

In [13]:
dataset = Dataset.from_dict(raw_data)

In [14]:
tokenized_datasets = dataset.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)

Map (num_proc=24):   0%|          | 0/4655 [00:00<?, ? examples/s]

In [15]:
tokenized_datasets = tokenized_datasets.shuffle(seed=34)

In [16]:
from transformers import Trainer, TrainingArguments

In [17]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [18]:
vocab_size = 30522
max_length = 512

In [19]:
model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
model = BertForMaskedLM(config=model_config)

Generate config GenerationConfig {
  "_from_model_config": true,
  "pad_token_id": 0,
  "transformers_version": "4.31.0"
}



In [20]:
# initialize the data collator, randomly masking 20% (default is 15%) of the tokens for the Masked Language
# Modeling (MLM) task
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
)

In [21]:
model_path = 'models/'

In [22]:
training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,      
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=10, # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=64,  # evaluation batch size
    logging_steps=1000,             # evaluate, log and save model checkpoints every 1000 step
    save_steps=1000,
    load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    save_total_limit=3,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

using `logging_steps` to initialize `eval_steps` to 1000
Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
# initialize the trainer and pass everything to it
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets
)

In [26]:
import torch

# Check if CUDA is available and print status
print("Cuda available: ", torch.cuda.is_available())

# Get the current device and print it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Current device: ", device)

Cuda available:  True
Current device:  cuda


In [27]:
# train the model
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask. If special_tokens_mask are not expected by `BertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4,655
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Training with DataParallel so batch size has been adjusted to: 20
  Total train batch size (w. parallel, distributed & accumulation) = 160
  Gradient Accumulation steps = 8
  Total optimization steps = 290
  Number of trainable parameters = 109,514,298


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=290, training_loss=6.778239493534483, metrics={'train_runtime': 1814.8243, 'train_samples_per_second': 25.65, 'train_steps_per_second': 0.16, 'total_flos': 1.3243150156032e+16, 'train_loss': 6.778239493534483, 'epoch': 9.96})

In [54]:
text_list[0]

'club nacional de football miho conoci como nacional ta club mas grandi di futbol di montevidéu uruguay fundá dia 14 di mei 1899 club ta resultado di union entre uruguay athletic montevideo football club uruguay athletic tabata un club di bario la union cual no mester wordo confundi cu uruguay athletic club cu tabata hunga den prome division actualmente nacional ta hunga den liga profesional mas halto na uruguay algun futbolista ku tabata hunga pa nacional ta luis suarez uruguay sebastian abreu uruguay atilio garcia argentina hugo de león uruguay nicolás lodeiro uruguay héctor scarone uruguay julio cesar dely valdéz panama fernando muslera uruguay titulos campeon nashonal liga profesional di uruguay 45 1902 1903 1912 1915 1916 1917 1919 1920 1922 1923 1924 1933 1934 1939 1940 1941 1942 1943 1946 1947 1950 1952 1955 1956 1957 1963 1966 1969 1970 1971 1972 1977 1980 1983 1992 1998 2000 2001 2002 2005 2005 06 2008 09 2010 11 2011 12 2014 15 2016 campeon kontinental sur amerika cup 3 1971 

In [50]:
# or simply use pipeline
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer,device=model.device)

In [57]:
example = "'club nacional de football miho [MASK] como nndá dia 14 dy"
for prediction in fill_mask(example):
  print(prediction)

{'score': 0.12758685648441315, 'token': 602, 'token_str': 'di', 'sequence': 'club nacional de football miho di como nnda dia 14 dy'}
{'score': 0.04371142014861107, 'token': 606, 'token_str': 'ta', 'sequence': 'club nacional de football miho ta como nnda dia 14 dy'}
{'score': 0.03310277685523033, 'token': 620, 'token_str': 'un', 'sequence': 'club nacional de football miho un como nnda dia 14 dy'}
{'score': 0.018927378579974174, 'token': 628, 'token_str': 'den', 'sequence': 'club nacional de football miho den como nnda dia 14 dy'}
{'score': 0.01463568489998579, 'token': 625, 'token_str': 'na', 'sequence': 'club nacional de football miho na como nnda dia 14 dy'}


In [42]:
!nvidia-smi

Mon Jul 24 17:05:18 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090         Off| 00000000:01:00.0 Off |                  N/A |
|  0%   45C    P8               25W / 370W|  20745MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090         Off| 00000000:08:0