# 1. 

# 2. Imports

```Python
#!pip uninstall mp2024pkg -y
#!pip install git+https://github.com/guebin/mp2024pkg.git
```

In [2]:
import os
os.environ["WANDB_MODE"] = "offline"

In [4]:
import pandas as pd
import numpy as np
import datasets 
import transformers
import torch
import torchvision
import torch.utils
import evaluate
from rich import print as rprint
from mp2024pkg import show, tab

# 3. `data_collator` 이해

## A. 외우세요 $(\star\star\star)$

`-` `data_collator`를 잘 설계하는 방법: `trainer_input`과 `model`이 주어졌을때 `data_collator`는 아래의 코드가 동작하도록 설계하면 된다. 

```Python
trainer_input = ~~~
model = ~~~~ 
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x
) # 이 과정에서 model이 cuda로 감 
_batched_data = batch_maker.get_test_dataloader(trainer_input) # 이 과정에서 trainer_input이 cuda로 감
batched_data = list(_batched_data)
single_batch = batched_data[0]
model.to("cpu") # 경우에 따라 생략해야할수도있음
model(**data_collator(single_batch))
````

`-` 위의 코드가 오류없이 실행되었다면 아래의 코드를 사용할 수 있다.

```Python
trainer = transformers.Trainer(
    model = model,
    data_collator = data_collator
)
trainer.predict(trainer_input)
```

> 이걸 어떻게 알았냐고요? 코드뜯어봤습니다.. $\to$ 숙제

:::{.callout-important}
코랩사용자의 경우 아래와 같이 wandb(Weights & Biases) 로그인을 요구하는 문제가 있습니다. 
```bash
wandb: WARNING The `run_name` is currently set to the same value as `TrainingArguments.output_dir`. If this was not intended, please specify a different run name by setting the `TrainingArguments.run_name` parameter.
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:
```
이를 해결하기 위해서는 아래의 코드를 코랩처음에 실행하면 됩니다. 

```Python
import os
os.environ["WANDB_MODE"] = "offline"
```
:::

:::{.callout-note}

주의: `data`의 type이 꼭 `Dataset` 일 필요는 없다..
:::

## B. IMDB -- 복습

ref: <https://huggingface.co/docs/transformers/tasks/sequence_classification>

*1. 데이터준비: `"guebin/imdb-tiny"` $\to$ `trainer_input`*

In [5]:
imdb = datasets.load_dataset("guebin/imdb-tiny")
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") 
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
tokenized_imdb = imdb.map(preprocess_function,batched=True)
trainer_input = tokenized_imdb['train']

*2. 모델준비: `"distilbert/distilbert-base-uncased"` $\to$`model`*

In [7]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


*3. 데이터콜렉터: `DataCollatorWithPadding()` $\to$ `data_collator`*

In [8]:
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tenso

---

데이터콜렉터가 올바로 설정되었는지 체크하고, 적당한 `trainer`를 만들어 

```Python
trainer.predict(trainer_input)
```

이 정상동작하는지 확인하라. 

`(풀이)`

In [9]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x,
) # 이 과정에서 model이 cuda로 감 
_batched_data = batch_maker.get_test_dataloader(trainer_input) # 이 과정에서 trainer_input이 cuda로 감
batched_data = list(_batched_data)
single_batch = batched_data[0]
model.to("cpu") # 경우에 따라 생략해야할수도있음
model(**data_collator(single_batch))

SequenceClassifierOutput(loss=tensor(0.6714, grad_fn=<NllLossBackward0>), logits=tensor([[-0.0168, -0.0577],
        [ 0.0035, -0.0523],
        [-0.0044, -0.0638],
        [ 0.0191, -0.0579],
        [ 0.0050, -0.0271],
        [ 0.0182, -0.0258],
        [ 0.0243, -0.0059],
        [ 0.0056, -0.0080]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

- 잘 돌아감..
- 잘 설계된 data_collator라는 의미
- 이걸 이용해서 진짜 trainer를 만들자.. 

In [10]:
trainer = transformers.Trainer(
    model = model,
    data_collator = data_collator
)
out = trainer.predict(trainer_input)
out 

PredictionOutput(predictions=array([[-0.01682485, -0.05772787],
       [ 0.00350759, -0.05234354],
       [-0.0044021 , -0.06378944],
       [ 0.01906627, -0.05794427],
       [ 0.00501909, -0.02714018],
       [ 0.0182183 , -0.02577444],
       [ 0.02433074, -0.00588998],
       [ 0.00561093, -0.00798136],
       [ 0.015525  , -0.01578641],
       [ 0.03086514, -0.00874608]], dtype=float32), label_ids=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), metrics={'test_loss': 0.6722059845924377, 'test_model_preparation_time': 0.0015, 'test_runtime': 0.2214, 'test_samples_per_second': 45.168, 'test_steps_per_second': 9.034})

`#`

`-`  관찰1: 여기에서 `batched_data[-1]`은 하나의 배치를 의미, 그런데 모델의 입력으로 사용하기에는 형식이 맞지 않음

In [23]:
#batched_data[-1] -- 안될것같은 형식

In [24]:
model.to("cpu") # 경우에 따라 생략해야할수도있음
model(**batched_data[-1])

TypeError: DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
  )
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) argument after ** must be a mapping, not list

`-` 관찰2: 여기에서 `data_collator(batched_data[-1])` 역시 하나의 배치를 의미. 이번에는 형식이 잘 맞음. 

In [27]:
#data_collator(batched_data[-1]) -- 될 것 같은 형식

In [26]:
model.to("cpu") # 경우에 따라 생략해야할수도있음
model(**data_collator(batched_data[-1]))

SequenceClassifierOutput(loss=tensor(0.6756, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0155, -0.0158],
        [ 0.0309, -0.0087]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

:::{.callout-note}
### `data_collator` -- 심화이해

아래의 형식으로 정리된 배치화된 자료가 있다고 하자. (주의: `batched_data`는 항상 list비슷한 오브젝트이어야함)

```Python
batched_data = [batch_1, batch_2, ...,batch_n]
```

`data_collator` 는 각각의 `single_batch`, 즉 `batch_1`, `batch_2` 등을 `model`이 처리가능한 형태로 "형식"을 맞춰주는 역할을 한다. 즉 아래가 실행되도록 만들어주는 역할을 한다. 

```Python
model(**data_collator(batch_1))
```
:::

:::{.callout-note}
### `trainer`와 `model`의 자료처리과정 비교

***#. `model`의 자료처리과정*** 

-코드: `model.forward(model_input)`

-처리과정: `model_input`에 정리된 입력을 단순히 `model.forward()` 함수가 처리. 

***#. `trainer`의 자료처리과정***

-코드: `trainer.predict(trainer_input)`

-처리과정: 크게 배치화 $\to$ 데이터콜렉팅 $\to$ 추론 의 과정을 거친다. 

1. `trainer_input`을 배치(batch)로 나눈다.
2.	각 배치(=`single_batch`)를 `data_collator`를 통해 형식을 맞춘다. 
3.	형식이 조정된 데이터를 `model.forward`의 입력으로 전달한다. 

-슈도코드:
```Python
## 이 코드는.. 
trainer.predict(trainer_input)

## 대략 아래의 느낌으로 해석하면 된다.. (동일X. 결과정리, GPU처리 등 세부로직이 더 있음)
batched_data = some_function(trainer_input)
for single_batch in batched_data:
    collated_data = data_collator(single_batch)
    model(**collated_data)
```

:::

:::{.callout-note}
### `trainer.predict()` 의 분해

`trainer.predict()`의 동작은 개념적으로 (1) 배치화 (2) 데이터콜렝팅 (3) 추론의 과정으로 분해할 수 있지만, 실제이러한 과정으로 코드를 정확하게 분리하는건 어렵다. (실제로도 별로 그럴 이유가 없다) 하지만 이해를 위해서 코드조각을 분리할 필요가 있는데 아래의 3개 코드조각은 이러한 분해를 최대한 비슷하게 수동구현한 것이다. 

`1`. 배치화: `trainer_input` $\to$ `batched_data`

```Python
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x
)
_batched_data = batch_maker.get_test_dataloader(trainer_input)
batched_data = list(_batched_data)
```

`2`. 데이터콜렉팅: `single_batch` $\to$ `collated_data`

```Python
#for single_batch in batched_data:
    collated_data = data_collator(single_batch)
```

`3`. 추론: `collated_data` $\to$ `model_out`

```Python
#for single_batch in batched_data:
    #collated_data = data_collator(single_batch)
    model_out = model(**collated_data)
```

:::

## C. FOOD101 -- 복습 

ref: <https://huggingface.co/docs/transformers/tasks/image_classification>

*1. 데이터준비: `"guebin/food101-tiny"` $\to$ `trainer_input`*

In [29]:
food = datasets.load_dataset("guebin/food101-tiny")
image_processor = transformers.AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
normalize = torchvision.transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(size), 
    torchvision.transforms.ToTensor(), 
    normalize
])
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples
trainer_input = food['train'].with_transform(transforms)
trainer_input

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Dataset({
    features: ['image', 'label'],
    num_rows: 10
})

*2. 모델준비: `"google/vit-base-patch16-224-in21k"` $\to$`model`*

In [31]:
labels = food["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
model = transformers.AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


*3. 데이터콜렉터: `DefaultDataCollator()` $\to$ `data_collator`*

In [32]:
data_collator = transformers.DefaultDataCollator()
data_collator

DefaultDataCollator(return_tensors='pt')

---

데이터콜렉터가 올바로 설정되었는지 체크하고, 적당한 `trainer`를 만들어 

```Python
trainer.predict(trainer_input)
```

이 정상동작하는지 확인하라. 

`(풀이1)` -- 실패

In [33]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x
) # 이 과정에서 model이 cuda로 감 
_batched_data = batch_maker.get_test_dataloader(trainer_input) # 이 과정에서 trainer_input이 cuda로 감
batched_data = list(_batched_data)
single_batch = batched_data[0]
model.to("cpu") # 경우에 따라 생략해야할수도있음
model(**data_collator(single_batch))

KeyError: 'image'

`-` 왜 실패했지?? (예전에는 분명히 되었던 것 같은뎅..)

:::{.callout-note}
**<에러메시지의 해석>**

`-` 아래가 동작하지 않음. 

```Python
batched_data = list(_batched_data)
```

`-` 그 이유는 아래가 동작하지 않기 때문임. 

```Python 
next(dataloader_iter)
```

`-` ...(생략)...

`-` 최종적으로는 아래가 동작하지 않기 때문에 생긴 문제였음. (그런데 이건 `.with_transform()`에 있는 코드인데?)

```Python
examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
```

`-` 결국 

```Python
[_transforms(img.convert("RGB")) for img in examples["image"]]
```

를 실행하는 시점에서 `examples["image"]`가 없었다는 의미.
:::

> 눈치: `with_transform`이 지금 실행되는거였어?

`-` 왜 이런일이 생기지? 

`-` 배치화를 하는 코드 

```Python
_batched_data = batch_maker.get_test_dataloader(trainer_input)
```

에서 아래의 column_names: 

- `pixel_values`
- `head_mask`
- `labels`
- `output_attentions`
- `output_hidden_states`
- `interpolate_pos_encoding`
- `return_dict`

를 제외하고는 모두 트레이너(`batch_maker = trainer`)가 강제로 제거하는 로직이 있음.^[왜 이런 로직이 있을까? 이런 로직이 없다면 model의 args를 강제로 외우고 있어야 하니까..]

`-` `image`라는 column_name은 위에 해당되지 않으므로 제거됨. 

`-` 그리고 `image` 칼럼이 제거된 이후에 `with_transform` 이 나중에 실행되면서 (지연실행) 문제가 발생. 

> 이걸 어떻게 알았냐고요? 코드뜯어봤습니다.. $\to$ 숙제

:::{.callout-note}
### 중간정리

`trainer.predict()` 은 (1) 배치화 (2) 데이터콜렉팅 (3) 추론의 과정을 거친다. 그리고 배치화와 데이터콜렉팅 사이에 "싱글배치"를 만드는 과정이 있다. 

- 세부사항1: 그런데 "**배치화**"단계에서 `model.forward()`의 입력으로 사용되지 않는 columns는 지워지는 내부로직이 존재한다. 
- 세부사항2: `trainer_input`에 걸려있는 `.with_transform()`은 "**배치화**"이후 싱글배치가 만들어지는 과정에서 실행된다. 


따라서 `.with_transform()` 에서 특정컬럼의 변화시키는 동작이 약속된 경우, 그 컬럼이 **배치화**의 단계에서 자동제거되어 코드가 돌아가지 않을 수 있는 위험성이 존재한다. 


:::

`(풀이2)` -- `image`를 `return_dict` 로 위장.. // 완전 테크니컬한 풀이

`-` 현재상황: `food['train']`에 `.with_transform(transforms)`을 걸어두고(?) `trainer_input`을 만든상황 

`-` 문제: `trainer.predict()` 내부동작에서 `.with_transform(transform)` 이 실현될때 

In [272]:
transforms??

[0;31mSignature:[0m [0mtransforms[0m[0;34m([0m[0mexamples[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m <no docstring>
[0;31mSource:[0m   
[0;32mdef[0m [0mtransforms[0m[0;34m([0m[0mexamples[0m[0;34m)[0m[0;34m:[0m[0;34m[0m
[0;34m[0m    [0mexamples[0m[0;34m[[0m[0;34m"pixel_values"[0m[0;34m][0m [0;34m=[0m [0;34m[[0m[0m_transforms[0m[0;34m([0m[0mimg[0m[0;34m.[0m[0mconvert[0m[0;34m([0m[0;34m"RGB"[0m[0;34m)[0m[0;34m)[0m [0;32mfor[0m [0mimg[0m [0;32min[0m [0mexamples[0m[0;34m[[0m[0;34m"image"[0m[0;34m][0m[0;34m][0m[0;34m[0m
[0;34m[0m    [0;32mdel[0m [0mexamples[0m[0;34m[[0m[0;34m"image"[0m[0;34m][0m[0;34m[0m
[0;34m[0m    [0;32mreturn[0m [0mexamples[0m[0;34m[0m[0;34m[0m[0m
[0;31mFile:[0m      /tmp/ipykernel_706133/1515420127.py
[0;31mType:[0m      function

이 내용이 실행되어야하는데, `image`는 model의 입력으로 유하하지 않은 키라서 트레이너가 이미 제거한 상태임.

`-` 전략: 제거가 안되게 막아보자..

In [276]:
#model.forward?

In [257]:
trainer_input

Dataset({
    features: ['image', 'label'],
    num_rows: 10
})

In [258]:
trainer_input2 = trainer_input.rename_columns({'image':'return_dict'})
trainer_input2

Dataset({
    features: ['return_dict', 'label'],
    num_rows: 10
})

In [267]:
trainer_input2[0]

{'label': 6,
 'pixel_values': tensor([[[-0.7882, -0.7882, -0.7882,  ..., -0.7725, -0.7647, -0.7647],
          [-0.7882, -0.7961, -0.8039,  ..., -0.7569, -0.7569, -0.7569],
          [-0.8039, -0.7804, -0.7804,  ..., -0.7490, -0.7490, -0.7569],
          ...,
          [-0.2784, -0.2627, -0.2471,  ..., -0.0667, -0.1608, -0.2000],
          [-0.2627, -0.2235, -0.2157,  ..., -0.1843, -0.1451, -0.0980],
          [-0.3020, -0.2549, -0.2706,  ..., -0.1608, -0.1686, -0.1216]],
 
         [[-0.7804, -0.7804, -0.7725,  ..., -0.8275, -0.8275, -0.8275],
          [-0.7804, -0.7882, -0.7882,  ..., -0.8118, -0.8196, -0.8196],
          [-0.7961, -0.7647, -0.7647,  ..., -0.8039, -0.8118, -0.8196],
          ...,
          [-0.3098, -0.3176, -0.3176,  ..., -0.1608, -0.2549, -0.2941],
          [-0.2941, -0.2706, -0.2784,  ..., -0.2706, -0.2235, -0.1843],
          [-0.3333, -0.3020, -0.3255,  ..., -0.2235, -0.2314, -0.1843]],
 
         [[-0.8353, -0.8353, -0.8353,  ..., -0.9059, -0.9059, -0.9137],

In [260]:
def transforms2(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["return_dict"]]
    del examples["return_dict"]
    return examples
trainer_input3 = trainer_input2.with_transform(transforms2)
trainer_input3

Dataset({
    features: ['return_dict', 'label'],
    num_rows: 10
})

In [261]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x
)
_batched_data = batch_maker.get_test_dataloader(trainer_input3)
batched_data = list(_batched_data)
single_batch = batched_data[-1]
#model.to("cpu")
model(**data_collator(single_batch))

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


ImageClassifierOutput(loss=tensor(4.7354, device='cuda:0', grad_fn=<NllLossBackward0>), logits=tensor([[-0.0689,  0.1476, -0.1162, -0.1043, -0.0416,  0.0670, -0.1192, -0.1510,
          0.0512, -0.0218,  0.0342, -0.0052, -0.0202, -0.1095, -0.2432, -0.1588,
         -0.1816, -0.0084,  0.0445,  0.0051, -0.0051, -0.3004,  0.2271,  0.0927,
         -0.0244,  0.0604,  0.0328, -0.2132, -0.0526, -0.0161,  0.1167, -0.0621,
         -0.0960,  0.2350,  0.0693, -0.1266,  0.2053,  0.0429,  0.0229,  0.2440,
          0.0103,  0.2298,  0.0807,  0.0563, -0.0894, -0.0288,  0.0675, -0.2696,
          0.1174,  0.0797, -0.0105,  0.1302, -0.0687, -0.1368, -0.0403, -0.0390,
          0.1230, -0.0256,  0.1477, -0.1459, -0.1190, -0.1551, -0.0197,  0.0003,
          0.1087,  0.1157, -0.0924, -0.1210,  0.0742,  0.1664, -0.0729,  0.0445,
          0.0857,  0.1474,  0.1720, -0.0251,  0.1180, -0.0707, -0.0207, -0.0817,
         -0.0022,  0.0356,  0.0499,  0.1388,  0.0806,  0.0158,  0.0330,  0.0186,
          0.09

- 성공..

In [262]:
trainer = transformers.Trainer(
    model=model,
    data_collator=data_collator,
)
trainer.predict(trainer_input3)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).

***** Running Prediction *****
  Num examples = 10
  Batch size = 8


PredictionOutput(predictions=array([[ 0.08239509,  0.04826868, -0.02438424, ..., -0.0937563 ,
        -0.06260583,  0.00630521],
       [-0.08984111,  0.07344969,  0.00403693, ..., -0.02388449,
        -0.13815996,  0.16475692],
       [ 0.07642026, -0.02146504,  0.05515821, ..., -0.12969677,
        -0.05842251,  0.01547094],
       ...,
       [-0.11216477,  0.03565915,  0.00984988, ..., -0.02980242,
        -0.05086675,  0.17964877],
       [-0.06892063,  0.14759967, -0.11619416, ..., -0.00896731,
        -0.18010172,  0.12083632],
       [-0.2107594 ,  0.15095882, -0.08375154, ..., -0.01836788,
        -0.13705489,  0.0361177 ]], dtype=float32), label_ids=array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6]), metrics={'test_loss': 4.647078514099121, 'test_model_preparation_time': 0.0013, 'test_runtime': 0.0691, 'test_samples_per_second': 144.64, 'test_steps_per_second': 28.928})

`(풀이3)` -- trainer_input 에 예약된 `with_transform`을 지연실행하지 않고 즉시 실행

In [263]:
trainer_input

Dataset({
    features: ['image', 'label'],
    num_rows: 10
})

In [264]:
trainer_input2 = [l for l in trainer_input]
trainer_input2

[{'label': 6,
  'pixel_values': tensor([[[-0.7882, -0.7882, -0.7882,  ..., -0.7725, -0.7647, -0.7647],
           [-0.7882, -0.7961, -0.8039,  ..., -0.7569, -0.7569, -0.7569],
           [-0.8039, -0.7804, -0.7804,  ..., -0.7490, -0.7490, -0.7569],
           ...,
           [-0.2784, -0.2627, -0.2471,  ..., -0.0667, -0.1608, -0.2000],
           [-0.2627, -0.2235, -0.2157,  ..., -0.1843, -0.1451, -0.0980],
           [-0.3020, -0.2549, -0.2706,  ..., -0.1608, -0.1686, -0.1216]],
  
          [[-0.7804, -0.7804, -0.7725,  ..., -0.8275, -0.8275, -0.8275],
           [-0.7804, -0.7882, -0.7882,  ..., -0.8118, -0.8196, -0.8196],
           [-0.7961, -0.7647, -0.7647,  ..., -0.8039, -0.8118, -0.8196],
           ...,
           [-0.3098, -0.3176, -0.3176,  ..., -0.1608, -0.2549, -0.2941],
           [-0.2941, -0.2706, -0.2784,  ..., -0.2706, -0.2235, -0.1843],
           [-0.3333, -0.3020, -0.3255,  ..., -0.2235, -0.2314, -0.1843]],
  
          [[-0.8353, -0.8353, -0.8353,  ..., -0.9059, 

In [265]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x
)
_batched_data = batch_maker.get_test_dataloader(trainer_input2)
batched_data = list(_batched_data)
single_batch = batched_data[0]
#model.to("cpu")
model(**data_collator(single_batch));

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [266]:
trainer = transformers.Trainer(
    model=model,
    data_collator=data_collator,
)
trainer.predict(trainer_input2); 

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).

***** Running Prediction *****
  Num examples = 10
  Batch size = 8


`(풀이4)` -- 트레이너가 가진 "사용하지 않는 column을 제거하는 기능"을 `False` 시킴..

In [24]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x,
    args = transformers.TrainingArguments(
        output_dir= "asdf", # 아무거나 써야함. 
        remove_unused_columns= False # 이 부분이 포인트!!
    )        
)
_batched_data = batch_maker.get_test_dataloader(trainer_input)
batched_data = list(_batched_data)
single_batch = batched_data[0]
#model.to("cpu")
model(**data_collator(single_batch));

In [25]:
trainer = transformers.Trainer(
    model=model,
    data_collator=data_collator,
    args=transformers.TrainingArguments(
        output_dir= 'asdf', # 아무거나 써야함
        remove_unused_columns= False # 이 부분이 포인트!!
    )        
)
trainer.predict(trainer_input);

`#`

`(풀이5)` -- 트레이너가 가진 "사용하지 않는 column을 제거하는 기능"을 `False` 시킬꺼면, `batch_maker`를 고려할 필요도 없이 아래와 같이 바로 `single_batch`를 얻을 수 있음. 

*풀이4: 실제로 trainer가 싱글배치를 얻는 과정과 유사하게 얻는 방법*

In [41]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x,
    args = transformers.TrainingArguments(
        output_dir= "asdf", # 아무거나 써야함. 
        remove_unused_columns= False # 이 부분이 포인트!!
    )        
)
_batched_data = batch_maker.get_test_dataloader(trainer_input)
batched_data = list(_batched_data)
single_batch = batched_data[-1]
#single_batch

> 형식관찰: `single_batch`는 `[Dict, Dict, Dict, .... Dict]` 꼴임을 주목하라.

*풀이5: 형식관찰에 힌트를 얻어 무식하게 얻은 싱글배치*

In [42]:
trainer_input

Dataset({
    features: ['image', 'label'],
    num_rows: 10
})

In [44]:
single_batch = [
    trainer_input[0],
    trainer_input[1],
    trainer_input[2],
    trainer_input[3],
    trainer_input[4],
    trainer_input[5],
    trainer_input[6],
    trainer_input[7],
]
#single_batch

*아무튼 풀이5 스타일로 싱글배치를 얻었다면? 이후의 코드는 동일*

In [47]:
## 풀이4 
### STEP1: 싱글배치로 체크하기 
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x,
    args = transformers.TrainingArguments(
        output_dir= "asdf",
        remove_unused_columns= False
    )        
)
_batched_data = batch_maker.get_test_dataloader(trainer_input)
batched_data = list(_batched_data)
single_batch = batched_data[0]
#model.to("cpu")
model(**data_collator(single_batch))
### STEP2: 트레이너 설계하고 predict하기
trainer = transformers.Trainer(
    model=model,
    data_collator=data_collator,
    args=transformers.TrainingArguments(
        output_dir= 'asdf',
        remove_unused_columns= False
    )        
)
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[-3.11896242e-02, -2.25847423e-01,  1.63939549e-03, ...,
        -2.29127333e-03,  3.87525409e-02,  4.29060608e-02],
       [ 4.27692719e-02, -1.01897612e-01, -9.16560292e-02, ...,
         4.52993475e-02, -8.15727636e-02, -5.56547865e-02],
       [-5.40109351e-04, -1.02955595e-01, -1.03463437e-02, ...,
        -3.05373464e-02,  9.11735594e-02,  8.82753134e-02],
       ...,
       [ 8.04588795e-02,  2.72037834e-02, -1.16188087e-01, ...,
         6.56688288e-02,  1.24141075e-01,  3.59139368e-02],
       [ 5.94812930e-02, -9.47779790e-02, -2.50497796e-02, ...,
         5.41531555e-02,  6.13874942e-03, -5.53418696e-03],
       [ 7.43777454e-02, -1.62318684e-02,  1.30667584e-04, ...,
        -3.19715589e-04, -9.09084827e-02, -1.61014676e-01]], dtype=float32), label_ids=array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6]), metrics={'test_loss': 4.532778739929199, 'test_model_preparation_time': 0.0013, 'test_runtime': 0.0551, 'test_samples_per_second': 181.424, 'test_ste

In [48]:
## 풀이5
### STEP1: 싱글배치로 체크하기 
single_batch =[
    trainer_input[0],
    trainer_input[1],
    trainer_input[2],
    trainer_input[3],
    trainer_input[4],
    trainer_input[5],
    trainer_input[6],
    trainer_input[7],
]
model.to("cpu")
model(**data_collator(single_batch));
### STEP2: 트레이너 설계하고 predict하기
trainer = transformers.Trainer(
    model=model,
    data_collator=data_collator,
    args=transformers.TrainingArguments(
        output_dir= 'asdf', # 아무거나 써야함
        remove_unused_columns= False # 이 부분이 포인트!!
    )        
)
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[-3.11896242e-02, -2.25847423e-01,  1.63939549e-03, ...,
        -2.29127333e-03,  3.87525409e-02,  4.29060608e-02],
       [ 4.27692719e-02, -1.01897612e-01, -9.16560292e-02, ...,
         4.52993475e-02, -8.15727636e-02, -5.56547865e-02],
       [-5.40109351e-04, -1.02955595e-01, -1.03463437e-02, ...,
        -3.05373464e-02,  9.11735594e-02,  8.82753134e-02],
       ...,
       [ 8.04588795e-02,  2.72037834e-02, -1.16188087e-01, ...,
         6.56688288e-02,  1.24141075e-01,  3.59139368e-02],
       [ 5.94812930e-02, -9.47779790e-02, -2.50497796e-02, ...,
         5.41531555e-02,  6.13874942e-03, -5.53418696e-03],
       [ 7.43777454e-02, -1.62318684e-02,  1.30667584e-04, ...,
        -3.19715589e-04, -9.09084827e-02, -1.61014676e-01]], dtype=float32), label_ids=array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6]), metrics={'test_loss': 4.532778739929199, 'test_model_preparation_time': 0.0015, 'test_runtime': 0.1057, 'test_samples_per_second': 94.587, 'test_step

*참고1: 아래의 방식으로는 싱글배치를 얻을 수 없음*

In [50]:
#trainer_input[:8]

이유:

In [51]:
trainer_input[:2] == [trainer_input[0], trainer_input[1]]

False

*참고2: 아래의 방식으로도 싱글배치를 얻을 수 없음 -- 이유? 지연실행때문에..*

In [52]:
#trainer_input.to_list()[:8]

## D. FOOD101 -- DefaultDataCollator 구현

*1. 데이터준비: `"guebin/food101-tiny"` $\to$ `trainer_input`*

In [53]:
food = datasets.load_dataset("guebin/food101-tiny")
image_processor = transformers.AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
normalize = torchvision.transforms.Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = torchvision.transforms.Compose([
    torchvision.transforms.RandomResizedCrop(size), 
    torchvision.transforms.ToTensor(), 
    normalize
])
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples
trainer_input = food['train'].with_transform(transforms)
trainer_input

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Dataset({
    features: ['image', 'label'],
    num_rows: 10
})

*2. 모델준비: `"google/vit-base-patch16-224-in21k"` $\to$`model`*

In [54]:
labels = food["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label
model = transformers.AutoModelForImageClassification.from_pretrained(
    "google/vit-base-patch16-224-in21k",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


*3. 데이터콜렉터: `collate_fn` 직접설계*

In [55]:
# data_collator = transformers.DefaultDataCollator()
# data_collator

In [56]:
def collate_fn(single_batch):
    pass

`DefaultDataCollator()` 와 동일한 역할을 하는 `collate_fn`을 설계하라. 이를 이용하여 적당한 `trainer`를 만들어 

```Python
trainer.predict(trainer_input)
```

이 정상동작하는지 확인하라. 

`(풀이)`

In [57]:
# batch_maker = transformers.Trainer(
#     model = model,
#     data_collator = lambda x: x,
#     args = transformers.TrainingArguments(
#         output_dir= "asdf", # 아무거나 써야함. 
#         remove_unused_columns= False, # 이 부분이 포인트!!
#     )        
# )
# _batched_data = batch_maker.get_test_dataloader(trainer_input)
# batched_data = list(_batched_data)
# single_batch = batched_data[-1]
# single_batch
#---#
single_batch = [trainer_input[0],trainer_input[1]]
single_batch

[{'label': 6,
  'pixel_values': tensor([[[ 0.2000,  0.1529,  0.0745,  ...,  0.1686,  0.1137,  0.0824],
           [ 0.1608,  0.1451,  0.1294,  ...,  0.1686,  0.1451,  0.1373],
           [ 0.1137,  0.1294,  0.1529,  ...,  0.1608,  0.1686,  0.1765],
           ...,
           [-0.0275, -0.0275, -0.0431,  ..., -0.1294, -0.1451, -0.1608],
           [-0.0980, -0.0745, -0.0431,  ..., -0.1608, -0.1608, -0.1686],
           [-0.1373, -0.1137, -0.0667,  ..., -0.1843, -0.1922, -0.2000]],
  
          [[ 0.1529,  0.1137,  0.0353,  ...,  0.1294,  0.0745,  0.0431],
           [ 0.1216,  0.1137,  0.0902,  ...,  0.1294,  0.1059,  0.0980],
           [ 0.0824,  0.0980,  0.1216,  ...,  0.1216,  0.1294,  0.1373],
           ...,
           [-0.0902, -0.0902, -0.1059,  ..., -0.1373, -0.1529, -0.1686],
           [-0.1608, -0.1373, -0.1059,  ..., -0.1686, -0.1686, -0.1765],
           [-0.2000, -0.1765, -0.1294,  ..., -0.1922, -0.2000, -0.2078]],
  
          [[-0.0510, -0.0980, -0.1765,  ..., -0.0431, 

- 이대로 `model(**sigle_batch)`를 실행하면 에러가 나겠죠? 

In [58]:
def collate_fn(data_collator_input):
    out = dict()
    out['pixel_values'] = torch.stack([l['pixel_values'] for l in data_collator_input])
    out['labels'] = torch.tensor([l['label'] for l in data_collator_input])
    return out                                  
collate_fn(single_batch);

In [59]:
collate_fn(single_batch)

{'pixel_values': tensor([[[[ 0.2000,  0.1529,  0.0745,  ...,  0.1686,  0.1137,  0.0824],
           [ 0.1608,  0.1451,  0.1294,  ...,  0.1686,  0.1451,  0.1373],
           [ 0.1137,  0.1294,  0.1529,  ...,  0.1608,  0.1686,  0.1765],
           ...,
           [-0.0275, -0.0275, -0.0431,  ..., -0.1294, -0.1451, -0.1608],
           [-0.0980, -0.0745, -0.0431,  ..., -0.1608, -0.1608, -0.1686],
           [-0.1373, -0.1137, -0.0667,  ..., -0.1843, -0.1922, -0.2000]],
 
          [[ 0.1529,  0.1137,  0.0353,  ...,  0.1294,  0.0745,  0.0431],
           [ 0.1216,  0.1137,  0.0902,  ...,  0.1294,  0.1059,  0.0980],
           [ 0.0824,  0.0980,  0.1216,  ...,  0.1216,  0.1294,  0.1373],
           ...,
           [-0.0902, -0.0902, -0.1059,  ..., -0.1373, -0.1529, -0.1686],
           [-0.1608, -0.1373, -0.1059,  ..., -0.1686, -0.1686, -0.1765],
           [-0.2000, -0.1765, -0.1294,  ..., -0.1922, -0.2000, -0.2078]],
 
          [[-0.0510, -0.0980, -0.1765,  ..., -0.0431, -0.0980, -0.1294

In [60]:
model.to("cpu")
model(**collate_fn(single_batch))

ImageClassifierOutput(loss=tensor(4.6280, grad_fn=<NllLossBackward0>), logits=tensor([[-0.1365,  0.1620, -0.0223, -0.1143, -0.0091,  0.0963,  0.0333,  0.0601,
          0.0912,  0.0556,  0.0271,  0.1042, -0.0219,  0.0306, -0.0393, -0.0080,
         -0.0542, -0.0108, -0.0006,  0.1087, -0.0748,  0.1145, -0.0873,  0.0175,
          0.0028,  0.0402, -0.0207, -0.1050,  0.0393, -0.1119,  0.0858,  0.0700,
          0.2079,  0.1616,  0.0466,  0.0322, -0.0162,  0.0378, -0.0287, -0.0579,
          0.0594, -0.0013, -0.0045,  0.0189,  0.0158, -0.0060,  0.1940, -0.1585,
          0.0083, -0.1220, -0.1615,  0.0422, -0.0152, -0.0166, -0.1047, -0.0262,
          0.0144,  0.1229, -0.0456,  0.0743,  0.0304, -0.2039,  0.0185,  0.1134,
          0.0305,  0.1549, -0.1589,  0.0945,  0.1259,  0.0856, -0.0828, -0.0904,
         -0.0898, -0.0174,  0.0313,  0.0354,  0.0230, -0.1014,  0.0152,  0.0344,
         -0.1159, -0.1117,  0.0662,  0.1179, -0.0779,  0.1350, -0.0319,  0.0505,
         -0.1316, -0.0474, -0.0

In [61]:
trainer = transformers.Trainer(
    model = model,
    data_collator = collate_fn,
    args = transformers.TrainingArguments(
        output_dir= "asdf", # 아무거나 써야함. 
        remove_unused_columns= False, # 이 부분이 포인트!!
        per_device_eval_batch_size= 2
    )        
)
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[ 0.03769832,  0.03013101,  0.020927  , ...,  0.12225489,
        -0.06108126, -0.09649463],
       [ 0.01888916,  0.04456399, -0.03133049, ...,  0.09108149,
        -0.03305664, -0.00110111],
       [ 0.08471692,  0.04713244, -0.07810733, ..., -0.00054852,
        -0.09846766,  0.08669026],
       ...,
       [ 0.03736099, -0.06543357,  0.05543617, ..., -0.06087255,
         0.08601949,  0.05294777],
       [-0.02205896,  0.03141039,  0.00066247, ...,  0.17388818,
        -0.01842239,  0.02367421],
       [-0.00704368,  0.0352019 , -0.14236571, ...,  0.03917494,
        -0.04025203,  0.07081404]], dtype=float32), label_ids=array([6, 6, 6, 6, 6, 6, 6, 6, 6, 6]), metrics={'test_loss': 4.671304225921631, 'test_model_preparation_time': 0.0023, 'test_runtime': 0.0835, 'test_samples_per_second': 119.83, 'test_steps_per_second': 59.915})

---

## E. IMDB -- DataCollatorWithPadding 구현

ref: <https://huggingface.co/docs/transformers/tasks/sequence_classification>

*1. 데이터준비: `"guebin/imdb-tiny"` $\to$ `trainer_input`*

In [62]:
imdb = datasets.load_dataset("guebin/imdb-tiny")
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") 
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)
tokenized_imdb = imdb.map(preprocess_function,batched=True)
trainer_input = tokenized_imdb['train']

*2. 모델준비: `"distilbert/distilbert-base-uncased"` $\to$`model`*

In [63]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


*3. 데이터콜렉터: `collate_fn` 직접설계* 

In [64]:
# data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)
# data_collator

In [65]:
def collate_fn(single_batch):
    pass

---

`DefaultDataCollator()` 와 동일한 역할을 하는 `collate_fn`을 설계하라. 이를 이용하여 적당한 `trainer`를 만들어 

```Python
trainer.predict(trainer_input)
```

이 정상동작하는지 확인하라. 

`(풀이)`

In [66]:
trainer_input

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [67]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator = lambda x: x 
)
_batched_data = batch_maker.get_test_dataloader(trainer_input)
batched_data = list(_batched_data)
single_batch = batched_data[-1]
show(single_batch)

List Overview:
Total items: 2

1. list[0]
   - Type: dict
   - Length: 3
   - Values: {'label': 0, 'input_ids': [101, 2040, 2024, 2122, 1000, 2027, 1000, 1011, 1996, 5889, 1029, 1996, 16587, 1029, 5121, 2481, 1005, 1056, 2022, 1996, 4378, 1011, 2023, 2003, 2426, 1996, 2087, 2250, 1011, 23893, 2098, 5453, 1999, 4598, 1012, 2009, 1005, 1055, 1996, 2785, 1997, 3185, 2008, 3504, 2066, 2009, 2001, 1037, 2843, 1997, 4569, 2000, 5607, 2205, 2172, 4569, 1010, 6343, 2003, 2893, 2151, 5025, 2147, 2589, 1010, 1998, 2008, 2471, 2467, 3084, 2005, 1037, 3185, 2008, 1005, 1055, 2053, 4569, 2000, 3422, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 23168, 2123, 2015, 7877, 2061, 2004, 2000, 8691, ... 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

In [68]:
model_input = dict()
model_input['labels'] = torch.tensor([l['label'] for l in single_batch])
model_input['labels']

tensor([0, 0])

In [69]:
input_ids_list = [torch.tensor(l['input_ids']) for l in single_batch]
model_input['input_ids'] = torch.nn.utils.rnn.pad_sequence(input_ids_list).t()
model_input['input_ids']

tensor([[  101,  2040,  2024,  ..., 22132,  7847,   102],
        [  101,  2023,  2003,  ...,     0,     0,     0]])

In [70]:
attention_mask_list = [torch.tensor(l['attention_mask']) for l in single_batch]
model_input['attention_mask'] = torch.nn.utils.rnn.pad_sequence(attention_mask_list).t()
model_input['attention_mask']

tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])

In [71]:
def collate_fn(single_batch):
    model_input = dict()
    model_input['labels'] = torch.tensor([l['label'] for l in single_batch])
    input_ids_list = [torch.tensor(l['input_ids']) for l in single_batch]
    model_input['input_ids'] = torch.nn.utils.rnn.pad_sequence(input_ids_list).t()
    attention_mask_list = [torch.tensor(l['attention_mask']) for l in single_batch]
    model_input['attention_mask'] = torch.nn.utils.rnn.pad_sequence(attention_mask_list).t()
    return model_input

In [72]:
model.to("cpu")
model(**collate_fn(single_batch))

SequenceClassifierOutput(loss=tensor(0.6734, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0025, -0.0352],
        [ 0.0139, -0.0282]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [73]:
trainer = transformers.Trainer(
    model = model,
    data_collator = collate_fn
)

In [74]:
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[ 0.08966769, -0.06166375],
       [ 0.01500158, -0.06875467],
       [ 0.03916247, -0.0061186 ],
       [ 0.0405627 , -0.05440582],
       [-0.0063449 , -0.03747641],
       [ 0.03792842, -0.04466516],
       [ 0.03690895, -0.08592646],
       [ 0.00534204, -0.05428012],
       [ 0.00248959, -0.03522498],
       [ 0.01390314, -0.02818924]], dtype=float32), label_ids=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), metrics={'test_loss': 0.6564630270004272, 'test_model_preparation_time': 0.0012, 'test_runtime': 0.0366, 'test_samples_per_second': 273.189, 'test_steps_per_second': 54.638})

# 4. 연습 -- `sms_spam`

In [75]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2
)
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
spam = datasets.load_dataset('guebin/spam-tiny')
spam

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 10
    })
})

## A. 방법1: 고정패딩, `collate_fn`

In [76]:
spam

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 10
    })
})

In [77]:
def m_transform_batch(example_batch):
    # example_batch = {'sms':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}
    result = tokenizer(example_batch['sms'],padding=True)
    return result

In [78]:
spam2 = spam.map(m_transform_batch,batched=True,batch_size=8)
spam2.set_format("pt")
spam2['train'][:8]['input_ids']

tensor([[  101,  2175,  2127, 18414, 17583,  2391,  1010,  4689,  1012,  1012,
          2800,  2069,  1999, 11829,  2483,  1050,  2307,  2088,  2474,  1041,
         28305,  1012,  1012,  1012, 25022,  2638,  2045,  2288, 26297, 28194,
          1012,  1012,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  7929,  2474,  2099,  1012,  1012,  1012, 16644, 15536,  2546,
          1057,  2006,  2072,  1012,  1012,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2489,  4443,  1999,  1016,  1037,  1059,  2243,  2135,  4012,
          2361,  2000,  266

- 이때 `input_ids`, `attention_mask`는 매트릭스 O

In [79]:
spam2['train'][:9]['input_ids']

[tensor([  101,  2175,  2127, 18414, 17583,  2391,  1010,  4689,  1012,  1012,
          2800,  2069,  1999, 11829,  2483,  1050,  2307,  2088,  2474,  1041,
         28305,  1012,  1012,  1012, 25022,  2638,  2045,  2288, 26297, 28194,
          1012,  1012,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 tensor([  101,  7929,  2474,  2099,  1012,  1012,  1012, 16644, 15536,  2546,
          1057,  2006,  2072,  1012,  1012,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]),
 tensor([  101,  2489,  4443,  1999,  1016,  1037,  1059,  2243,  2135,  4012,
          2361,  2000,  2

- 이때 `input_ids`, `attention_mask`는 매트릭스 X

In [80]:
spam2['train'][8:]['input_ids'] # 텐서로 묶임..

tensor([[  101,  3453,   999,   999,  2004,  1037, 11126,  2897,  8013,  2017,
          2031,  2042,  3479,  2000,  4374,  2050,  1069, 21057,  2692,  3396,
         10377,   999,  2000,  4366,  2655,  5641,  2692,  2575, 16576, 24096,
         21472,  2487,  1012,  4366,  3642,  1047,  2140, 22022,  2487,  1012,
          9398,  2260,  2847,  2069,  1012,   102],
        [  101,  2018,  2115,  4684,  2340,  2706,  2030,  2062,  1029,  1057,
          1054,  4709,  2000, 10651,  2000,  1996,  6745,  6120,  4684,  2015,
          2007,  4950,  2005,  2489,   999,  2655,  1996,  4684, 10651,  2522,
          2489,  2006,  5511,  8889, 24594, 20842,  2692, 14142,   102,     0,
             0,     0,     0,     0,     0,     0]])

- 이때 `input_ids`, `attention_mask`는 매트릭스 O

In [81]:
trainer_input = spam2['train'].remove_columns('sms').rename_columns({'label':'labels'})
trainer_input

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 10
})

In [82]:
batch_maker = transformers.Trainer(
    model=model,
    data_collator=lambda x: x,
)
batched_data = list(batch_maker.get_test_dataloader(trainer_input))
single_batch = batched_data[-1]
single_batch

[{'labels': tensor(1, device='cuda:0'),
  'input_ids': tensor([  101,  3453,   999,   999,  2004,  1037, 11126,  2897,  8013,  2017,
           2031,  2042,  3479,  2000,  4374,  2050,  1069, 21057,  2692,  3396,
          10377,   999,  2000,  4366,  2655,  5641,  2692,  2575, 16576, 24096,
          21472,  2487,  1012,  4366,  3642,  1047,  2140, 22022,  2487,  1012,
           9398,  2260,  2847,  2069,  1012,   102], device='cuda:0'),
  'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         device='cuda:0')},
 {'labels': tensor(1, device='cuda:0'),
  'input_ids': tensor([  101,  2018,  2115,  4684,  2340,  2706,  2030,  2062,  1029,  1057,
           1054,  4709,  2000, 10651,  2000,  1996,  6745,  6120,  4684,  2015,
           2007,  4950,  2005,  2489,   999,  2655,  1996,  4684, 10651,  2522,
           2489,  2006,  5511,  8889, 24594, 20842,  2692,

In [83]:
def collate_fn(single_batch):
    out = dict()
    out['labels'] = torch.tensor([dct['labels'] for dct in single_batch])
    out['input_ids'] = torch.stack([dct['input_ids'] for dct in single_batch])
    out['attention_mask'] = torch.stack([dct['attention_mask'] for dct in single_batch])
    return out 


In [84]:
collate_fn(single_batch)

{'labels': tensor([1, 1]),
 'input_ids': tensor([[  101,  3453,   999,   999,  2004,  1037, 11126,  2897,  8013,  2017,
           2031,  2042,  3479,  2000,  4374,  2050,  1069, 21057,  2692,  3396,
          10377,   999,  2000,  4366,  2655,  5641,  2692,  2575, 16576, 24096,
          21472,  2487,  1012,  4366,  3642,  1047,  2140, 22022,  2487,  1012,
           9398,  2260,  2847,  2069,  1012,   102],
         [  101,  2018,  2115,  4684,  2340,  2706,  2030,  2062,  1029,  1057,
           1054,  4709,  2000, 10651,  2000,  1996,  6745,  6120,  4684,  2015,
           2007,  4950,  2005,  2489,   999,  2655,  1996,  4684, 10651,  2522,
           2489,  2006,  5511,  8889, 24594, 20842,  2692, 14142,   102,     0,
              0,     0,     0,     0,     0,     0]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1,

In [85]:
trainer = transformers.Trainer(
    model = model,
    data_collator = collate_fn
)

In [86]:
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[-0.04641762, -0.03605646],
       [-0.02506144, -0.01335288],
       [ 0.02076511, -0.02322948],
       [-0.00962518,  0.03754075],
       [-0.03287388, -0.03730439],
       [-0.02798662, -0.02491809],
       [-0.09155686, -0.04009426],
       [-0.0420014 , -0.02552293],
       [-0.0209422 , -0.01544815],
       [-0.01533531,  0.01857282]], dtype=float32), label_ids=array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1]), metrics={'test_loss': 0.6999672651290894, 'test_model_preparation_time': 0.0022, 'test_runtime': 0.0163, 'test_samples_per_second': 613.543, 'test_steps_per_second': 122.709})

## B. 방법2: 고정패딩, DefaultDataCollator

`-` 방법1과 거의 동일 

In [87]:
spam

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 10
    })
})

In [88]:
def m_transform_batch(example_batch):
    # example_batch = {'sms':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}
    result = tokenizer(example_batch['sms'],padding=True)
    return result
spam2 = spam.map(m_transform_batch,batched=True,batch_size=8)
spam2.set_format("pt")
trainer_input = spam2['train'].remove_columns('sms').rename_columns({'label':'labels'})

# def collate_fn(single_batch):
#     out = dict()
#     out['labels'] = torch.tensor([dct['labels'] for dct in single_batch])
#     out['input_ids'] = torch.stack([dct['input_ids'] for dct in single_batch])
#     out['attention_mask'] = torch.stack([dct['attention_mask'] for dct in single_batch])
#     return out 

data_collator = transformers.DefaultDataCollator()
trainer = transformers.Trainer(
    model = model,
    data_collator = data_collator
)
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[-0.04641762, -0.03605646],
       [-0.02506144, -0.01335288],
       [ 0.02076511, -0.02322948],
       [-0.00962518,  0.03754075],
       [-0.03287388, -0.03730439],
       [-0.02798662, -0.02491809],
       [-0.09155686, -0.04009426],
       [-0.0420014 , -0.02552293],
       [-0.0209422 , -0.01544815],
       [-0.01533531,  0.01857282]], dtype=float32), label_ids=array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1]), metrics={'test_loss': 0.6999672651290894, 'test_model_preparation_time': 0.0011, 'test_runtime': 0.0125, 'test_samples_per_second': 800.333, 'test_steps_per_second': 160.067})

## C. 방법3: 동적패딩, `DataCollatorWithPadding`

In [103]:
spam

DatasetDict({
    train: Dataset({
        features: ['sms', 'label'],
        num_rows: 10
    })
})

In [104]:
def w_transform(examples):
    #examples = {'input_ids':[xxx,xxxx,....], 'label':[yyy,yyyy]}
    out = tokenizer(examples['sms'],truncation=True)
    out['labels'] = torch.tensor(examples['label'])
    return out


In [105]:
trainer_input = spam.with_transform(w_transform)['train']
trainer_input

Dataset({
    features: ['sms', 'label'],
    num_rows: 10
})

In [106]:
batch_maker = transformers.Trainer(
    model = model,
    data_collator= lambda x: x,
    args=transformers.TrainingArguments(
        output_dir = "asdf",
        remove_unused_columns=False
    )
)
_batched_data = batch_maker.get_eval_dataloader(trainer_input)
batched_data = list(_batched_data)
single_batch = batched_data[-1]
#single_batch

In [107]:
data_collator = transformers.DataCollatorWithPadding(tokenizer)
data_collator(single_batch)

{'input_ids': tensor([[  101,  3453,   999,   999,  2004,  1037, 11126,  2897,  8013,  2017,
          2031,  2042,  3479,  2000,  4374,  2050,  1069, 21057,  2692,  3396,
         10377,   999,  2000,  4366,  2655,  5641,  2692,  2575, 16576, 24096,
         21472,  2487,  1012,  4366,  3642,  1047,  2140, 22022,  2487,  1012,
          9398,  2260,  2847,  2069,  1012,   102],
        [  101,  2018,  2115,  4684,  2340,  2706,  2030,  2062,  1029,  1057,
          1054,  4709,  2000, 10651,  2000,  1996,  6745,  6120,  4684,  2015,
          2007,  4950,  2005,  2489,   999,  2655,  1996,  4684, 10651,  2522,
          2489,  2006,  5511,  8889, 24594, 20842,  2692, 14142,   102,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [108]:
model.to("cpu")
model(**data_collator(single_batch))

SequenceClassifierOutput(loss=tensor(0.6834, grad_fn=<NllLossBackward0>), logits=tensor([[-0.0209, -0.0154],
        [-0.0153,  0.0186]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [109]:
trainer = transformers.Trainer(
    model = model,
    data_collator = data_collator,
    args=transformers.TrainingArguments(
        output_dir="asdf",
        remove_unused_columns=False
    )
)
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[-0.04641762, -0.03605646],
       [-0.02506144, -0.01335288],
       [ 0.02076511, -0.02322948],
       [-0.00962518,  0.03754075],
       [-0.03287388, -0.03730439],
       [-0.02798662, -0.02491809],
       [-0.09155686, -0.04009426],
       [-0.0420014 , -0.02552293],
       [-0.0209422 , -0.01544815],
       [-0.01533531,  0.01857282]], dtype=float32), label_ids=array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1]), metrics={'test_loss': 0.6999672651290894, 'test_model_preparation_time': 0.0007, 'test_runtime': 0.0093, 'test_samples_per_second': 1072.877, 'test_steps_per_second': 214.575})

## D. 방법4: 동적패딩, 전처리X $(\star)$

In [110]:
trainer_input = spam['train']
trainer_input

Dataset({
    features: ['sms', 'label'],
    num_rows: 10
})

In [111]:
def collate_fn(single_batch):
    # single_batch = [Dict, Dict, Dict,...]
    out = tokenizer([dct['sms'] for dct in single_batch],padding=True,return_tensors="pt",truncation=True)
    out['labels'] = torch.tensor([dct['label'] for dct in single_batch])
    return out 

In [112]:
trainer = transformers.Trainer(
    model=model,
    data_collator=collate_fn,
    args= transformers.TrainingArguments(
        output_dir= 'asdf',
        remove_unused_columns= False
    )
)
trainer.predict(trainer_input)

PredictionOutput(predictions=array([[-0.04641762, -0.03605646],
       [-0.02506144, -0.01335288],
       [ 0.02076511, -0.02322948],
       [-0.00962518,  0.03754075],
       [-0.03287388, -0.03730439],
       [-0.02798662, -0.02491809],
       [-0.09155686, -0.04009426],
       [-0.0420014 , -0.02552293],
       [-0.0209422 , -0.01544815],
       [-0.01533531,  0.01857282]], dtype=float32), label_ids=array([0, 0, 1, 0, 0, 1, 0, 0, 1, 1]), metrics={'test_loss': 0.6999672651290894, 'test_model_preparation_time': 0.0011, 'test_runtime': 0.012, 'test_samples_per_second': 830.013, 'test_steps_per_second': 166.003})