In [1]:
data = []

from json import loads as load_json

with open("../data.json", 'r') as raw:
    data = load_json(raw.read())

In [2]:
from datasets import Dataset

ds = Dataset.from_list(data).shuffle(seed = 42) \
                            .train_test_split(test_size = 0.2)

In [3]:
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

bin c:\Users\ms2k\.conda\envs\ml\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 7601
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1901
    })
})

In [5]:
def tokenize(data):
    tokens = tokenizer(data['text'], padding='max_length', truncation=True, return_tensors='pt')
    tokens = {key: tensor.to('cuda') for key, tensor in tokens.items()}
    return tokens

train_ds = ds['train'].map(tokenize, batched=True)
test_ds  = ds['test'] .map(tokenize, batched=True)

Map:   0%|          | 0/7601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1901 [00:00<?, ? examples/s]

In [6]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to('cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [8]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
training_args = TrainingArguments(
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 3,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    output_dir = './output/bert1',
    load_best_model_at_end = True
)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience = 3, early_stopping_threshold = 0.01)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = test_ds,
    tokenizer = tokenizer,
    data_collator = data_collator,
    callbacks = [ early_stopping_callback ]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [10]:
trainer.train()

  0%|          | 0/1428 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6973721981048584, 'eval_runtime': 69.5127, 'eval_samples_per_second': 27.348, 'eval_steps_per_second': 1.712, 'epoch': 1.0}
{'loss': 0.7037, 'learning_rate': 3.2492997198879555e-05, 'epoch': 1.05}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6940053701400757, 'eval_runtime': 62.0003, 'eval_samples_per_second': 30.661, 'eval_steps_per_second': 1.919, 'epoch': 2.0}
{'loss': 0.7011, 'learning_rate': 1.4985994397759103e-05, 'epoch': 2.1}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6931052803993225, 'eval_runtime': 61.2481, 'eval_samples_per_second': 31.038, 'eval_steps_per_second': 1.943, 'epoch': 3.0}
{'train_runtime': 11211.8193, 'train_samples_per_second': 2.034, 'train_steps_per_second': 0.127, 'train_loss': 0.7014518972872352, 'epoch': 3.0}


TrainOutput(global_step=1428, training_loss=0.7014518972872352, metrics={'train_runtime': 11211.8193, 'train_samples_per_second': 2.034, 'train_steps_per_second': 0.127, 'train_loss': 0.7014518972872352, 'epoch': 3.0})

In [12]:
def save_best_model(model, output_dir, metrics):
    if metrics['eval_loss'] < save_best_model.best_eval_loss:
        save_best_model.best_eval_loss = metrics['eval_loss']
        model.save_pretrained(output_dir)

save_best_model.best_eval_loss = float('inf')

In [13]:
metrics = trainer.evaluate()
save_best_model(model, training_args.output_dir, metrics)

  0%|          | 0/119 [00:00<?, ?it/s]

In [14]:
trainer.evaluate()

  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6931052803993225,
 'eval_runtime': 61.5282,
 'eval_samples_per_second': 30.896,
 'eval_steps_per_second': 1.934,
 'epoch': 3.0}