In [None]:
!pip install -qq transformers datasets accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import transformers

transformers.__version__

'4.35.2'

In [None]:
import torch
import numpy as np

In [None]:
# load dataset
from datasets import load_dataset

In [None]:
D = load_dataset('mnist')

In [None]:
# data configuration
D

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
})

In [None]:
# mini-batch format
D['train'][0:2]

{'image': [<PIL.PngImagePlugin.PngImageFile image mode=L size=28x28>,
  <PIL.PngImagePlugin.PngImageFile image mode=L size=28x28>],
 'label': [5, 0]}

In [None]:
# convert PIL to numpy 1d array
def preprocess(batch):
    """Flatten each image of a batch into a 1-D vector scaled by 1/255.

    Args:
        batch: mapping whose 'image' entry is a sequence of array-like images.

    Returns:
        dict with a single key, 'input_ids', holding one flattened float
        array per input image (in the same order).
    """
    flattened = []
    for picture in batch['image']:
        pixels = np.array(picture).flatten()
        flattened.append(pixels / 255.)
    return {'input_ids': flattened}

In [None]:
D = D.map(preprocess, batched=True)

In [None]:
# D now also contains the input_ids column
D

DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'input_ids'],
        num_rows: 60000
    })
    test: Dataset({
        features: ['image', 'label', 'input_ids'],
        num_rows: 10000
    })
})

In [None]:
# set format for pytorch
D.set_format('torch', columns=['input_ids', 'label'])

In [None]:
# image is not included
D['test'][0:2]

{'label': tensor([7, 2]),
 'input_ids': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]])}

In [None]:
# model output class
# https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/output#transformers.utils.ModelOutput
from transformers.modeling_outputs import TokenClassifierOutput


TokenClassifierOutput(logits=torch.tensor([[[0.,1.,2.]],[[1.,2.,3.]]]))

TokenClassifierOutput(loss=None, logits=tensor([[[0., 1., 2.]],

        [[1., 2., 3.]]]), hidden_states=None, attentions=None)

In [None]:
class SimpleLinearModel(torch.nn.Module):
    """Single linear layer that returns its logits in a TokenClassifierOutput.

    Args:
        in_features: length of each flattened input vector. Defaults to 784,
            the size of a flattened 28x28 image.
        num_classes: number of logits produced per example. Defaults to 10.
    """

    def __init__(self, in_features=784, num_classes=10):
        super().__init__()
        self.linear = torch.nn.Linear(in_features, num_classes)

    def forward(self, input_ids, **kwargs):
        """Compute logits for a batch of flattened inputs.

        Args:
            input_ids: float tensor of shape (N, in_features).
            **kwargs: ignored; accepted so extra mini-batch keys do not raise.

        Returns:
            TokenClassifierOutput whose `logits` has shape (N, 1, num_classes).
        """
        logits = self.linear(input_ids)  # (N, num_classes)
        # Insert a length-1 axis in the middle: (N, num_classes) -> (N, 1, num_classes).
        logits = logits[:, None, :]

        return TokenClassifierOutput(logits=logits)


In [None]:
# model forward test
# Seed before constructing the model so the randomly initialised weights (and
# therefore these logits and the subsequent training run) are reproducible
# after a kernel restart.
torch.manual_seed(42)
model = SimpleLinearModel()
outputs = model(D['train'][0:2]['input_ids'])
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[ 4.2112e-02, -6.7938e-02, -1.8749e-01,  2.3568e-01, -8.8057e-02,
          -3.2648e-01, -1.0699e-01, -1.6436e-01,  2.1771e-01, -2.0494e-01]],

        [[ 2.7809e-01,  1.7507e-02,  9.8914e-02,  1.7241e-01, -6.8430e-02,
           8.5607e-03, -2.5012e-04, -1.0976e-01,  1.3989e-02, -7.2853e-02]]],
       grad_fn=<SliceBackward0>), hidden_states=None, attentions=None)

In [None]:
# Trainer
# https://huggingface.co/docs/transformers/quicktour#trainer---a-pytorch-optimized-training-loop

# TrainingArguments
# https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/trainer#transformers.TrainingArguments
from transformers import TrainingArguments

batch_size = 32
# Number of full mini-batches in one pass over the training split, so the
# training loss is logged once per epoch, in step with the per-epoch evaluation.
# Clamped to 1 so a dataset smaller than one batch does not yield 0.
# NOTE(review): this equals optimizer steps per epoch only for single-device
# training without gradient accumulation -- confirm for other setups.
logging_steps = max(1, len(D['train']) // batch_size)
# NOTE(review): the recorded `trainer.train()` output in this notebook lists
# 10 epochs (18750 steps), so it was produced with a different
# num_train_epochs than the value below; re-run to refresh it.
training_args = TrainingArguments(
                    output_dir='training',
                    num_train_epochs=3,
                    learning_rate=0.001,
                    per_device_train_batch_size=batch_size,
                    per_device_eval_batch_size=batch_size,
                    weight_decay=0.01,
                    evaluation_strategy = 'epoch',
                    logging_steps=logging_steps,
                    # Key under which the targets appear in each mini-batch the
                    # Trainer feeds to the model. By default the Trainer derives
                    # this from the model's forward() arguments containing
                    # "label"; SimpleLinearModel.forward declares none, so it
                    # must be given explicitly for evaluation to see the labels.
                    # NOTE(review): the dataset column is 'label'; presumably the
                    # default data collator renames it to 'labels' -- confirm.
                    label_names = ['labels']
                )

In [None]:
# Trainer
# https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/trainer#transformers.Trainer
from transformers import Trainer


In [None]:
import torch.nn.functional as F

# Subclassing Trainer to override compute_loss
# https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/trainer
class MyTrainer(Trainer):
    """Trainer that computes the loss with a caller-supplied criterion.

    Args:
        loss_function: callable taking (log_probabilities, labels) and returning
            a scalar loss tensor, e.g. ``torch.nn.NLLLoss()``.
        **kwargs: forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, loss_function, **kwargs):
        super().__init__(**kwargs)
        self.loss_function = loss_function

    # https://github.com/huggingface/transformers/blob/v4.36.1/src/transformers/trainer.py#L2741
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the loss for one mini-batch.

        Args:
            model: module called as ``model(**inputs)``; its output must expose
                ``logits`` of shape (N, 1, num_classes).
            inputs: mini-batch dict containing a "labels" entry; that entry is
                removed from the dict before the model is called.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: unused; accepted because newer Trainer versions
                pass this argument to ``compute_loss``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Drop only the singleton middle axis: (N, 1, C) -> (N, C). A bare
        # squeeze() would also remove the batch axis when N == 1 and make
        # log_softmax(dim=1) fail.
        log_probs = F.log_softmax(outputs.logits.squeeze(1), dim=1)
        # Use the criterion injected at construction instead of a hard-coded one.
        loss = self.loss_function(log_probs, labels)
        return (loss, outputs) if return_outputs else loss


In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute classification accuracy from a Trainer prediction object.

    Args:
        pred: object exposing ``predictions`` (array of logits whose last axis
            indexes the classes) and ``label_ids`` (array of true class ids).

    Returns:
        dict with one key, 'accuracy'.
    """
    # The model emits logits with an extra singleton axis, so argmax yields
    # shape (N, 1); flatten it so predictions line up element-wise with the
    # 1-D labels instead of relying on implicit reshaping inside the metric.
    preds = pred.predictions.argmax(-1).reshape(-1)
    acc = accuracy_score(pred.label_ids, preds)
    return {'accuracy': acc}

In [None]:
# Assemble the custom trainer: model, hyper-parameters, data splits, metric
# callback, and the loss criterion handed to MyTrainer.
trainer = MyTrainer(
    model=model,
    args=training_args,
    train_dataset=D['train'],
    eval_dataset=D['test'],
    compute_metrics=compute_metrics,
    loss_function=torch.nn.NLLLoss(),
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2648,0.265214,0.9269
2,0.253,0.265317,0.9256
3,0.2578,0.264443,0.9262
4,0.2548,0.267734,0.9239
5,0.2483,0.263641,0.9268
6,0.2516,0.260789,0.9279
7,0.2456,0.260697,0.9281
8,0.2405,0.261282,0.9269
9,0.2404,0.260177,0.9277
10,0.238,0.26058,0.9271


TrainOutput(global_step=18750, training_loss=0.24996466349283855, metrics={'train_runtime': 104.2104, 'train_samples_per_second': 5757.584, 'train_steps_per_second': 179.925, 'total_flos': 0.0, 'train_loss': 0.24996466349283855, 'epoch': 10.0})

In [None]:
preds_output = trainer.predict(D['test'])

In [None]:
preds_output.metrics

{'test_loss': 0.2605796456336975,
 'test_accuracy': 0.9271,
 'test_runtime': 1.881,
 'test_samples_per_second': 5316.388,
 'test_steps_per_second': 166.403}

In [None]:
# Arg-max over the class axis (axis 2), then drop only the singleton middle
# axis. A bare squeeze() would also drop the example axis when there is a
# single prediction and leave a 0-d scalar instead of a length-1 array.
y_preds = np.argmax(preds_output.predictions, axis=2).squeeze(axis=1)