In [15]:
data = []

from json import loads as load_json

with open("../data.json", 'r') as raw:
    data = load_json(raw.read())

In [16]:
from datasets import Dataset

ds = Dataset.from_list(data).shuffle(seed = 42) \
                            .train_test_split(test_size = 0.2)

In [17]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [18]:
ds

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 7601
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1901
    })
})

In [19]:
import torch

def tokenize(data):
    tokenized_data = tokenizer(data['text'], padding='max_length', truncation=True, return_tensors='pt')
    tokenized_data = {key: tensor.to('cuda') for key, tensor in tokenized_data.items()}
    return tokenized_data

train_ds = ds['train'].map(tokenize, batched=True)
test_ds  = ds['test'] .map(tokenize, batched=True)

Map:   0%|          | 0/7601 [00:00<?, ? examples/s]

Map:   0%|          | 0/1901 [00:00<?, ? examples/s]

In [20]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 2).to('cuda')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
import os
os.environ['WANDB_DISABLED'] = 'true'

In [22]:
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    output_dir='./output/distilbert',
    num_train_epochs=3,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_ds
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [23]:
trainer.train()

  0%|          | 0/2853 [00:00<?, ?it/s]

{'loss': 0.6999, 'learning_rate': 4.12372940764108e-05, 'epoch': 0.53}
{'loss': 0.697, 'learning_rate': 3.247458815282159e-05, 'epoch': 1.05}
{'loss': 0.6949, 'learning_rate': 2.3711882229232387e-05, 'epoch': 1.58}
{'loss': 0.6946, 'learning_rate': 1.4949176305643184e-05, 'epoch': 2.1}
{'loss': 0.6742, 'learning_rate': 6.186470382053978e-06, 'epoch': 2.63}
{'train_runtime': 583.2408, 'train_samples_per_second': 39.097, 'train_steps_per_second': 4.892, 'train_loss': 0.6836356113887109, 'epoch': 3.0}


TrainOutput(global_step=2853, training_loss=0.6836356113887109, metrics={'train_runtime': 583.2408, 'train_samples_per_second': 39.097, 'train_steps_per_second': 4.892, 'train_loss': 0.6836356113887109, 'epoch': 3.0})

In [24]:
results = trainer.evaluate(test_ds)
print(results)

  0%|          | 0/238 [00:00<?, ?it/s]

{'eval_loss': 0.6007010340690613, 'eval_runtime': 16.5958, 'eval_samples_per_second': 114.547, 'eval_steps_per_second': 14.341, 'epoch': 3.0}
