In [1]:
data = []

from json import loads as load_json

with open("../data.json", 'r') as raw:
    data = load_json(raw.read())

In [2]:
from datasets import Dataset

ds = Dataset.from_list(data).shuffle(seed = 42) \
                            .train_test_split(test_size = 0.2)

In [3]:
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments, IntervalStrategy, EarlyStoppingCallback

tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

bin c:\Users\ms2k\.conda\envs\ml\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 7601
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1901
    })
})

In [5]:
def tokenize(data):
    return tokenizer(data['text'], padding = 'max_length', truncation = True)

train_ds = ds['train'].map(tokenize, batched=True)
test_ds  = ds['test'] .map(tokenize, batched=True)

Map:   0%|          | 0/7601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1901 [00:00<?, ? examples/s]

In [6]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to('cuda')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [8]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
def save_best_model(model, output_dir, metrics):
    if metrics['eval_loss'] < save_best_model.best_eval_loss:
        save_best_model.best_eval_loss = metrics['eval_loss']
        model.save_pretrained(output_dir)

save_best_model.best_eval_loss = float('inf')

In [10]:
training_args = TrainingArguments(
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 8,
    weight_decay = 0.01,
    warmup_steps = 0.1,
    gradient_accumulation_steps = 1,
    adam_epsilon = 1e-8,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    save_total_limit = 1,
    output_dir = './output/bert',
    load_best_model_at_end = True
)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience = 3, early_stopping_threshold = 0.01)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = test_ds,
    tokenizer = tokenizer,
    data_collator = data_collator,
    callbacks = [ early_stopping_callback ]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
trainer.train()

  0%|          | 0/3808 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.587614893913269, 'eval_runtime': 47.4247, 'eval_samples_per_second': 40.085, 'eval_steps_per_second': 2.509, 'epoch': 1.0}
{'loss': 0.6354, 'learning_rate': 1.737440584048951e-05, 'epoch': 1.05}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6067901253700256, 'eval_runtime': 47.4611, 'eval_samples_per_second': 40.054, 'eval_steps_per_second': 2.507, 'epoch': 2.0}
{'loss': 0.5661, 'learning_rate': 1.474828645710234e-05, 'epoch': 2.1}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5718623399734497, 'eval_runtime': 47.4421, 'eval_samples_per_second': 40.07, 'eval_steps_per_second': 2.508, 'epoch': 3.0}
{'loss': 0.5394, 'learning_rate': 1.2122167073715172e-05, 'epoch': 3.15}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5407180786132812, 'eval_runtime': 47.4003, 'eval_samples_per_second': 40.105, 'eval_steps_per_second': 2.511, 'epoch': 4.0}
{'loss': 0.5004, 'learning_rate': 9.496047690328004e-06, 'epoch': 4.2}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5444613099098206, 'eval_runtime': 47.3388, 'eval_samples_per_second': 40.157, 'eval_steps_per_second': 2.514, 'epoch': 5.0}
{'loss': 0.4655, 'learning_rate': 6.869928306940834e-06, 'epoch': 5.25}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6131202578544617, 'eval_runtime': 47.3893, 'eval_samples_per_second': 40.115, 'eval_steps_per_second': 2.511, 'epoch': 6.0}
{'loss': 0.4427, 'learning_rate': 4.243808923553665e-06, 'epoch': 6.3}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6043345928192139, 'eval_runtime': 47.3616, 'eval_samples_per_second': 40.138, 'eval_steps_per_second': 2.513, 'epoch': 7.0}
{'train_runtime': 19597.0319, 'train_samples_per_second': 3.103, 'train_steps_per_second': 0.194, 'train_loss': 0.5140376554674604, 'epoch': 7.0}


TrainOutput(global_step=3332, training_loss=0.5140376554674604, metrics={'train_runtime': 19597.0319, 'train_samples_per_second': 3.103, 'train_steps_per_second': 0.194, 'train_loss': 0.5140376554674604, 'epoch': 7.0})

In [12]:
metrics = trainer.evaluate()
save_best_model(model, training_args.output_dir, metrics)

  0%|          | 0/119 [00:00<?, ?it/s]

In [14]:
trainer.evaluate()

  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5407180786132812,
 'eval_runtime': 51.586,
 'eval_samples_per_second': 36.851,
 'eval_steps_per_second': 2.307,
 'epoch': 7.0}