In [1]:
data = []

from json import loads as load_json

with open("../data.json", 'r') as raw:
    data = load_json(raw.read())

In [2]:
from datasets import Dataset

ds = Dataset.from_list(data).shuffle(seed = 42) \
                            .train_test_split(test_size = 0.2)

In [3]:
from transformers import DistilBertTokenizerFast, Trainer, TrainingArguments, IntervalStrategy, EarlyStoppingCallback

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased', num_labels=2)

bin c:\Users\ms2k\.conda\envs\ml\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 7601
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1901
    })
})

In [5]:
def tokenize(data):
    tokens = tokenizer(data['text'], padding = 'max_length', truncation = True, return_tensors='pt')
    tokens = {key: tensor.to('cuda') for key, tensor in tokens.items()}
    return tokens

train_ds = ds['train'].map(tokenize, batched=True)
test_ds  = ds['test'] .map(tokenize, batched=True)

Map:   0%|          | 0/7601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1901 [00:00<?, ? examples/s]

In [6]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 2).to('cuda')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [8]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [9]:
def save_best_model(model, output_dir, metrics):
    if metrics['eval_loss'] < save_best_model.best_eval_loss:
        save_best_model.best_eval_loss = metrics['eval_loss']
        model.save_pretrained(output_dir)

save_best_model.best_eval_loss = float('inf')

In [10]:
training_args = TrainingArguments(
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    num_train_epochs = 8,
    weight_decay = 0.01,
    warmup_steps = 0.1,
    gradient_accumulation_steps = 1,
    adam_epsilon = 1e-8,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    save_total_limit = 1,
    output_dir = './output/distilbert2',
    load_best_model_at_end = True
)

early_stopping_callback = EarlyStoppingCallback(early_stopping_patience = 3, early_stopping_threshold = 0.01)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_ds,
    eval_dataset = test_ds,
    tokenizer = tokenizer,
    data_collator = data_collator,
    callbacks = [ early_stopping_callback ]
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [11]:
trainer.train()

  0%|          | 0/3808 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.6079826354980469, 'eval_runtime': 16.0347, 'eval_samples_per_second': 118.555, 'eval_steps_per_second': 7.421, 'epoch': 1.0}
{'loss': 0.6312, 'learning_rate': 1.737440584048951e-05, 'epoch': 1.05}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.596157968044281, 'eval_runtime': 16.0647, 'eval_samples_per_second': 118.334, 'eval_steps_per_second': 7.408, 'epoch': 2.0}
{'loss': 0.5879, 'learning_rate': 1.474828645710234e-05, 'epoch': 2.1}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5523921251296997, 'eval_runtime': 16.0671, 'eval_samples_per_second': 118.317, 'eval_steps_per_second': 7.406, 'epoch': 3.0}
{'loss': 0.5535, 'learning_rate': 1.2122167073715172e-05, 'epoch': 3.15}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5483282208442688, 'eval_runtime': 16.3188, 'eval_samples_per_second': 116.491, 'eval_steps_per_second': 7.292, 'epoch': 4.0}
{'loss': 0.522, 'learning_rate': 9.496047690328004e-06, 'epoch': 4.2}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5598933100700378, 'eval_runtime': 16.0843, 'eval_samples_per_second': 118.19, 'eval_steps_per_second': 7.399, 'epoch': 5.0}
{'loss': 0.4822, 'learning_rate': 6.869928306940834e-06, 'epoch': 5.25}


  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5691394209861755, 'eval_runtime': 16.0649, 'eval_samples_per_second': 118.333, 'eval_steps_per_second': 7.407, 'epoch': 6.0}
{'train_runtime': 1214.9652, 'train_samples_per_second': 50.049, 'train_steps_per_second': 3.134, 'train_loss': 0.5441846072840757, 'epoch': 6.0}


TrainOutput(global_step=2856, training_loss=0.5441846072840757, metrics={'train_runtime': 1214.9652, 'train_samples_per_second': 50.049, 'train_steps_per_second': 3.134, 'train_loss': 0.5441846072840757, 'epoch': 6.0})

In [12]:
metrics = trainer.evaluate()
save_best_model(model, training_args.output_dir, metrics)

  0%|          | 0/119 [00:00<?, ?it/s]

In [13]:
trainer.evaluate()

  0%|          | 0/119 [00:00<?, ?it/s]

{'eval_loss': 0.5483282208442688,
 'eval_runtime': 15.8955,
 'eval_samples_per_second': 119.593,
 'eval_steps_per_second': 7.486,
 'epoch': 6.0}