In [1]:
import logging

# Setup logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("log.txt"),
        logging.StreamHandler()
    ]
)

In [54]:
tokenizer.vocab_size

50257

In [2]:
from transformers import *
import multiprocessing


tokenizer = GPT2TokenizerFast.from_pretrained("./gpt2_tokenizer_papia")
num_proc = multiprocessing.cpu_count()
print(f"The max length for the tokenizer is: {tokenizer.model_max_length}")


2023-08-06 07:59:13,083 [INFO] Created a temporary directory at /tmp/tmpwrc0o52g
2023-08-06 07:59:13,083 [INFO] Writing /tmp/tmpwrc0o52g/_remote_module_non_scriptable.py
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


The max length for the tokenizer is: 1024


In [3]:
import pandas as pd

In [4]:
df = pd.read_csv('text_col.csv')

In [5]:
text_list = list(df['text'].values)

In [6]:
text_list = [str(text) for text in text_list]

In [7]:
raw_data = {'text':text_list}

In [8]:
num_proc = multiprocessing.cpu_count()

In [9]:
def group_texts(examples):
    tokenized_inputs = tokenizer(
       examples['text'], truncation=True, max_length=tokenizer.model_max_length
    )
    return tokenized_inputs

In [10]:
from datasets import Dataset

In [11]:
dataset = Dataset.from_dict(raw_data)

In [12]:
dataset = dataset.train_test_split(
                        test_size=0.2)

In [13]:
tokenized_datasets = dataset.map(group_texts, batched=True, remove_columns=["text"], num_proc=num_proc)

Map (num_proc=24):   0%|          | 0/3724 [00:00<?, ? examples/s]

Map (num_proc=24):   0%|          | 0/931 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets = tokenized_datasets.shuffle(seed=34)

In [15]:
from transformers import Trainer, TrainingArguments

In [16]:
from datasets import *
from transformers import *
from tokenizers import *
import os
import json

In [17]:
vocab_size = tokenizer.vocab_size
max_length = 1024

In [18]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

In [19]:
tokenizer.pad_token = tokenizer.eos_token

In [20]:
model = AutoModelForCausalLM.from_pretrained("gpt2")

loading configuration file config.json from cache at /home/mohammad/.cache/huggingface/hub/models--gpt2/snapshots/11c5a3d5811f50298f278a704980280950aedb10/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transfor

In [21]:
model_path = 'models/'

In [22]:
training_args = TrainingArguments(
    "test_gpt_trainer",
    evaluation_strategy="epoch",  # to evaluate model and get metrics after each epoch
    logging_strategy="epoch",  # to log metrics after each epoch
    save_strategy="epoch",  # to save model after each epoch
    per_device_train_batch_size=4,
    learning_rate=2e-2,
    num_train_epochs=1,   
    logging_dir='./logs', 
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [23]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 3724
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 931
    })
})

In [24]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
)

In [25]:
trainer.train()

***** Running training *****
  Num examples = 3,724
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Training with DataParallel so batch size has been adjusted to: 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 466
  Number of trainable parameters = 124,439,808
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,7.7571,6.892411


***** Running Evaluation *****
  Num examples = 931
  Batch size = 16
Saving model checkpoint to test_gpt_trainer/checkpoint-466
Configuration saved in test_gpt_trainer/checkpoint-466/config.json
Configuration saved in test_gpt_trainer/checkpoint-466/generation_config.json
Model weights saved in test_gpt_trainer/checkpoint-466/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=466, training_loss=7.757086888914968, metrics={'train_runtime': 577.0809, 'train_samples_per_second': 6.453, 'train_steps_per_second': 0.808, 'total_flos': 1946103054336000.0, 'train_loss': 7.757086888914968, 'epoch': 1.0})

In [33]:
training_history=trainer.state.log_history

In [27]:
valid_losses = []
train_losses = []
train_time = 0.0
epochs = []
lr = []
for history_dict in training_history:
    
    try:
        if 'eval_loss' in history_dict.keys():
            valid_loss = history_dict['eval_loss']
            valid_losses.append(valid_loss)
        elif 'loss' in history_dict.keys():
            train_loss = history_dict['loss']
            epochs.append(history_dict['epoch'])
            train_losses.append(train_loss)
            lr.append(history_dict['learning_rate'])
        elif 'train_runtime' in history_dict.keys():
            train_time = history_dict['train_runtime']
    except Exception as e:
        print(f'Something error {e}')

In [28]:
valid_losses,train_losses,train_time,epochs,lr

([6.892411231994629], [7.7571], 577.0809, [1.0], [0.0])

In [29]:
train_times = [train_time/len(valid_losses)]*len(valid_losses)

In [30]:
history = {'epochs':epochs,'train_losses':train_losses,'valid_losses':valid_losses,'train_times':train_times}

In [31]:
df_history = pd.DataFrame(history)

In [32]:
df_history.to_csv('logs.csv')

In [34]:
!ls 

exp_bert_pretraining.ipynb	   test_text_file.txt
exp_data_cleaning_and_eda.ipynb    test_trainer
exp_openai_gpt2_pretraining.ipynb  text_col.csv
exp_roberta.ipynb		   text_file.txt
exp_tiny_bert_tokenizer.ipynb	   Tokenizer-GPT2.ipynb
gpt2_tokenizer_papia		   Tokenizer.ipynb
log				   tokenizer_roberta_papia
logs.csv			   tokenizer_tiny_bert_papia
log.txt				   Untitled1.ipynb
models				   Untitled.ipynb
ouput				   WaspakBERTo
results				   WaspakBERTo1
test_gpt_trainer		   Waspak_papi_BERTo


In [35]:
from transformers import AutoTokenizer,AutoModel,BertForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("./gpt2_tokenizer_papia")

loading file vocab.json
loading file merges.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [37]:
!ls 

checkpoint-466


In [51]:
# model1 = BertForMaskedLM.from_pretrained()
p1 = pipeline("text-generation",model='./test_gpt_trainer/checkpoint-466',tokenizer='./gpt2_tokenizer_papia')

loading configuration file ./test_gpt_trainer/checkpoint-466/config.json
Model config GPT2Config {
  "_name_or_path": "./test_gpt_trainer/checkpoint-466",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.31.0",
  "use_cache

In [52]:
examples = "club nacional de football "

In [48]:
from transformers import pipeline

In [53]:
p1.predict(examples)

Generate config GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 50256,
  "do_sample": true,
  "eos_token_id": 50256,
  "max_length": 50,
  "transformers_version": "4.31.0"
}

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'club nacional de football  un na ku den di di por tin di ku e na di di di riba parti ta di un poblashon di ku di ku esaki e bai pais ku e tin ku den ku ta e na otro grandi komo ku ta cu ku'}]