# 2. Imports 

In [1]:
# !pip uninstall mp2024pkg -y
# !pip install git+https://github.com/guebin/mp2024pkg.git

In [1]:
import pandas as pd
import numpy as np
import datasets 
import transformers
import torch
from mp2024pkg import signature, show
from rich import print as rprint

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = transformers.AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2
)
tokenizer = transformers.AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 3. Dataset 형식이해

In [3]:
emotion = datasets.load_dataset('emotion')
d = emotion['train'].select(range(4))

In [4]:
rprint("emotion['train'].select(range(4))")
show(d) 

List Overview:
Total items: 4

1. list[0]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i didnt feel humiliated', 'label': 0}

2. list[1]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'label': 0}

3. list[2]
   - Type: dict
   - Length: 2
   - Values: {'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3}

4. list[3]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'label': 2}


`-` 이때 데이터셋은 아래와 같이 length-$n$ list 구조로 이해하는게 편리하다. 

- dataset = [example, example, ..., example]
- example = {'text': xxx, 'label' = yyy} 


`-` 그런데 dataset은 특이하게도 아래의 문법이 동작했었다. 

In [9]:
d['text']

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing a minute to post i feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']

In [10]:
d['label']

[0, 0, 3, 2]

`-` 이러한 결과를 보면 dataset은 마치 dictionary처럼 느껴진다. 실제로 경우에 따라서 dataset을 dictionary처럼 생각해도된다. 

In [11]:
rprint("d.to_dict()")
show(d.to_dict())

Dictionary Overview:
Total keys: 2
Keys: ['text', 'label']

1. dict['text']
   - Type: list
   - Length: 4
   - Values: ['i didnt feel humiliated', 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'im grabbing a minute to post i feel greedy wrong', 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']
2. dict['label']
   - Type: list
   - Length: 4
   - Values: [0, 0, 3, 2]


In [12]:
rprint("d.to_dict()['text']")
show(d.to_dict()['text'])

List Overview:
Total items: 4

1. list[0]
   - Type: str
   - Length: 23
   - Values: i didnt feel humiliated

2. list[1]
   - Type: str
   - Length: 108
   - Values: i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake

3. list[2]
   - Type: str
   - Length: 48
   - Values: im grabbing a minute to post i feel greedy wrong

4. list[3]
   - Type: str
   - Length: 92
   - Values: i am ever feeling nostalgic about the fireplace i will know that it is still on the property


`-` 이때 데이터셋은 아래와 같이 dictionary 구조로 이해하는게 편리하다. 

- datasets = examples = {'text':[xxx,xxxx,xxxxx, ...], 'label':[yyy,yyyy,yyyyy, ....]} 

`-` 딕셔너리를 데이터프레임과 비슷하게 생각할수도 있었다.

In [13]:
dct = {'a':[1,2,3,4], 'b':[2,3,4,5]}
dct

{'a': [1, 2, 3, 4], 'b': [2, 3, 4, 5]}

In [14]:
pd.DataFrame(dct)

Unnamed: 0,a,b
0,1,2
1,2,3
2,3,4
3,4,5


`-` 이 개념을 이용하면 `d` 역시 데이터프레임과 비슷하게 이해할수도 있다. 

In [15]:
pd.DataFrame(d.to_dict())

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2


In [16]:
d.to_pandas()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2


# 4. 쉬운함수들 

## A. `.select()`

`# 예시1`

In [17]:
d = emotion['train'].select(range(8))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 8
})

In [18]:
show(d.select(range(2)))

List Overview:
Total items: 2

1. list[0]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i didnt feel humiliated', 'label': 0}

2. list[1]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'label': 0}


In [19]:
show(d.select(range(1,3)))

List Overview:
Total items: 2

1. list[0]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'label': 0}

2. list[1]
   - Type: dict
   - Length: 2
   - Values: {'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3}


`#`

## B. `.shuffle()`

`# 예시1`

In [20]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [21]:
show(d.shuffle())

List Overview:
Total items: 4

1. list[0]
   - Type: dict
   - Length: 2
   - Values: {'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3}

2. list[1]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property', 'label': 2}

3. list[2]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake', 'label': 0}

4. list[3]
   - Type: dict
   - Length: 2
   - Values: {'text': 'i didnt feel humiliated', 'label': 0}


`#`

## C. `.select_columns()`

`# 예시1`

In [22]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [23]:
d.select_columns(['text'])

Dataset({
    features: ['text'],
    num_rows: 4
})

In [24]:
d.select_columns(['label'])

Dataset({
    features: ['label'],
    num_rows: 4
})

In [25]:
d.select_columns(['text','label'])

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`#`

## D. `.set_format()`

`# 예시1`

In [26]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [27]:
d.set_format(type='pandas',columns=['label'])

In [28]:
d['text']

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing a minute to post i feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']

In [29]:
d['label']

0    0
1    0
2    3
3    2
Name: label, dtype: int64

`#`

`# 예시2`

In [30]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [31]:
d.set_format(type='pandas',columns=['label','text'])

In [32]:
d['text']

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
3    i am ever feeling nostalgic about the fireplac...
Name: text, dtype: object

In [33]:
d['label']

0    0
1    0
2    3
3    2
Name: label, dtype: int64

`#`

`# 예시3`

In [34]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [35]:
d.set_format(type='pt',columns=['label'])

In [36]:
d['text']

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing a minute to post i feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']

In [37]:
d['label']

tensor([0, 0, 3, 2])

`#`

## E. `.reset_format()`

`# 예시1`

In [38]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [39]:
d.set_format(type='torch',columns=['label'])

In [40]:
d['text']

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing a minute to post i feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property']

In [41]:
d['label']

tensor([0, 0, 3, 2])

In [42]:
d.reset_format()

In [43]:
d['label']

[0, 0, 3, 2]

`#`

# 5. `.map()`

## A. `d.map()`

`# 예제1` -- `.map()`에 대한 이해

아래와 같은 Dataset이 있다고 하자. 

In [44]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.map()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|

`(풀이1)` -- `d.map()`을 사용하지 않은 풀이..

*`d`는 아래와 같은 구조로 이해할 수 있음*

- d = [example_1, example_2, example_3, example_4]
- example_i = {'text': xxx, 'label' = yyy}

*리스트화*

In [45]:
lst = list(d)
lst

[{'text': 'i didnt feel humiliated', 'label': 0},
 {'text': 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'label': 0},
 {'text': 'im grabbing a minute to post i feel greedy wrong', 'label': 3},
 {'text': 'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'label': 2}]

*리스트의 첫 요소에 변환적용*

In [46]:
l = lst[0]
l

{'text': 'i didnt feel humiliated', 'label': 0}

In [47]:
r = tokenizer(l['text'])
r

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

*`l`와 `tokenizer(l['text'])`을 합침*

In [48]:
# 예비학습
{'a':[1,2,3], 'b':[4,5,6]} | {'c':[2,3,4]}

{'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [2, 3, 4]}

In [49]:
show(l | r)

Dictionary Overview:
Total keys: 4
Keys: ['text', 'label', 'input_ids', 'attention_mask']

1. dict['text']
   - Type: str
   - Length: 23
   - Values: i didnt feel humiliated
2. dict['label']
   - Type: int
   - Values: 0
3. dict['input_ids']
   - Type: list
   - Length: 7
   - Values: [101, 1045, 2134, 2102, 2514, 26608, 102]
4. dict['attention_mask']
   - Type: list
   - Length: 7
   - Values: [1, 1, 1, 1, 1, 1, 1]


*반복*

In [50]:
def m_transform(example):
    # example = l = {'text': xxx, 'label':yyy} 
    result = tokenizer(example['text'])
    return result

In [51]:
lst2 = [l | m_transform(l) for l in lst]

In [52]:
d2 = datasets.Dataset.from_list(lst2)
d2

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4
})

In [53]:
d2[0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [54]:
d2[:1]

{'text': ['i didnt feel humiliated'],
 'label': [0],
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

`(풀이2)`

In [55]:
def m_transform(example):
    # example = l = {'text': xxx, 'label':yyy} 
    rsult = tokenizer(example['text'])
    return rsult

In [56]:
d2 = d.map(m_transform)

In [57]:
d2[0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [58]:
d2[:1]

{'text': ['i didnt feel humiliated'],
 'label': [0],
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

`#`

:::{.callout-note}

**`.map()`의 특징**

- 특징1: `.map()`은 입력으로 `example = {'text':xxx, 'label':yyy}` 꼴을 가정한다. 
- 특징2: `.map()`은 변환전 dict와 변환후 dict를 합친다. 

## B. `dd.map()`

`# 예제1` -- `dd`에도 `.map`을 적용할 수 있음

아래와 같은 DatasetDict가 있다고 하자.

In [59]:
dd = datasets.DatasetDict({
    'train':emotion['train'].select(range(4)),
    'test':emotion['test'].select(range(4)),
})
dd

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4
    })
})

`dd.map()`을 이용하여 아래와 같이 변환하라. 


|tr/test|데이터|변환전|변환후|
|:-|:-|:-|:-|
|train|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|train|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|
|test|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|test|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|

`(풀이)`

In [60]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    return result

In [61]:
dd.map(m_transform)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4
    })
})

`#`

## C. `d.map(batch=True)`

`# 예제1` -- `d.map(batch=True)`의 이해 

아래와 같은 Dataset이 있다고 하자. 

In [62]:
d = emotion['train'].select(range(8))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 8
})

`d.map(batch=True)`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|특이사항|
|:-|:-|:-|:-|
|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|변환시 2개의 observation씩 묶어서 패딩|
|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|변환시 2개의 observation씩 묶어서 패딩|

`(풀이1)` -- 실패

In [63]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy}
    result = tokenizer(example['text'],padding=True)
    return result

In [64]:
d2 = d.map(m_transform)
d2

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 1205.26 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 8
})

In [65]:
rprint("d2[:4]['input_ids']")
show(d2[:4]['input_ids'])

List Overview:
Total items: 4

1. list[0]
   - Type: list
   - Length: 7
   - Values: [101, 1045, 2134, 2102, 2514, 26608, 102]

2. list[1]
   - Type: list
   - Length: 23
   - Values: [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]

3. list[2]
   - Type: list
   - Length: 12
   - Values: [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102]

4. list[3]
   - Type: list
   - Length: 22
   - Values: [101, 1045, 2572, 2412, 3110, 16839, 9080, 12863, 2055, 1996, 13788, 1045, 2097, 2113, 2008, 2009, 2003, 2145, 2006, 1996, 3200, 102]


`(풀이2)` -- 성공

In [66]:
# def m_transform(example):
#     # example = {'text': xxx, 'label': yyy}
#     result = tokenizer(example['text'], padding=True)
#     return result

def m_transform_batch(example_batch):
    #example_batch = {'text': [xxx,xxxx], 'label': [yyy,yyyy]}
    result = tokenizer(example_batch['text'], padding=True)
    return result

In [67]:
d2 = d.map(m_transform_batch,batch_size=2,batched=True)
d2

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 1426.51 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 8
})

In [68]:
rprint("d2[:4]['input_ids']")
show(d2[:4]['input_ids'])

List Overview:
Total items: 4

1. list[0]
   - Type: list
   - Length: 23
   - Values: [101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

2. list[1]
   - Type: list
   - Length: 23
   - Values: [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]

3. list[2]
   - Type: list
   - Length: 22
   - Values: [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

4. list[3]
   - Type: list
   - Length: 22
   - Values: [101, 1045, 2572, 2412, 3110, 16839, 9080, 12863, 2055, 1996, 13788, 1045, 2097, 2113, 2008, 2009, 2003, 2145, 2006, 1996, 3200, 102]


`#`

:::{.callout-note}

**`.map(batch=True)`의 특징**

- 특징1: `.map(batch=True)`은 입력으로 `example_batch = {'text':[xxx,xxxx,...], 'label':[yyy,yyyy,...]}` 꼴을 가정한다. 
- 특징2: `example_batch`는 `batch_size` 만큼 데이터가 있다고 생각한다. `


## D. `d.map()` + 칼럼선택

`# 예제1` -- attention_maks 제외

아래와 같은 Dataset이 있다고 하자. 

In [69]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.map()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>|
|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>|

`(풀이1)`

In [70]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    del result['attention_mask'] 
    return result

In [71]:
d2 = d.map(m_transform)
d2

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 761.53 examples/s]


Dataset({
    features: ['text', 'label', 'input_ids'],
    num_rows: 4
})

In [72]:
d2[0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102]}

In [73]:
d2[:1]

{'text': ['i didnt feel humiliated'],
 'label': [0],
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]]}

`(풀이2)`

In [74]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    return result

In [75]:
d2 = d.map(m_transform)
d2 = d2.select_columns(['text','label','input_ids'])
d2

Dataset({
    features: ['text', 'label', 'input_ids'],
    num_rows: 4
})

In [76]:
d2[0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102]}

In [77]:
d2[:1]

{'text': ['i didnt feel humiliated'],
 'label': [0],
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]]}

`#`

`# 예제2` -- text 제외

아래와 같은 Dataset이 있다고 하자. 

In [78]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.map()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|d[:1]|text: [str]<br/>label: [int]|label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|

`(풀이1)`

In [79]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    del example['text'] 
    return result

In [80]:
d2 = d.map(m_transform)
d2

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 4
})

In [81]:
d2[0]

{'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [82]:
d2[:1]

{'label': [0],
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

`(풀이2)`

In [83]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'],padding=True)
    return result

In [84]:
d2 = d.map(m_transform)
d2.select_columns(['text','label','input_ids'])
d2

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4
})

In [85]:
d2[0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [86]:
d2[:1]

{'text': ['i didnt feel humiliated'],
 'label': [0],
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

`#`

:::{.callout-note}

**`.map()`에서 컬럼을 제외하려면?**

- `del`을 이용한 풀이: 제외하고자 하는 column이 `example`에 있을 경우, `result`에 있을 경우 미묘하게 다름. 
- `select`를 이용한 풀이: 제외하고자 하는 column이 `example`에 있든지 `result`에 있든지 상관없음. 


## E. `d.map()` + 타입변환 ($\star$)

`# 예제1` -- `.map()`을 이용한 타입변환은 불가능

아래와 같은 Dataset이 있다고 하자. 

In [87]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.map()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|label: int<br/>input_ids: tensor([int,...,int])<br/>attention_mask: tensor([int,...,int])|
|d[:1]|text: [str]<br/>label: [int]|label:[int]<br/>input_ids: tensor([[int,...,int]])<br/>attention_mask: tensor([[int,...,int]])|

`(풀이1)` -- 실패

In [88]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    del example['text']
    result['input_ids'] = torch.tensor(result['input_ids'])
    result['attention_mask'] = torch.tensor(result['attention_mask'])
    return result

In [89]:
d2 = d.map(m_transform)
d2

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 4
})

In [90]:
d2[0]

{'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [91]:
d2[:1]

{'label': [0],
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

`(풀이2)` -- 실패 

In [92]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'], return_tensors='pt')
    del example['text']
    return result

In [93]:
d2 = d.map(m_transform)
d2

Dataset({
    features: ['label', 'input_ids', 'attention_mask'],
    num_rows: 4
})

In [94]:
d2[0]

{'label': 0,
 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

In [95]:
d2[:1]

{'label': [0],
 'input_ids': [[[101, 1045, 2134, 2102, 2514, 26608, 102]]],
 'attention_mask': [[[1, 1, 1, 1, 1, 1, 1]]]}

> 더 이상해짐.. 차원까지 이상함.. 차원이 문제가 아니고 형태가 아예 안바뀜.. 

`(풀이3)` -- 성공??

In [96]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    return result

In [97]:
d2 = d.map(m_transform)
d2

Dataset({
    features: ['text', 'label', 'input_ids', 'attention_mask'],
    num_rows: 4
})

In [98]:
d2.set_format(type='pt',columns=['input_ids','attention_mask'])

In [99]:
d2[0]

{'input_ids': tensor([  101,  1045,  2134,  2102,  2514, 26608,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1])}

In [100]:
d2[:1]

{'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [101]:
d2[:2] # 희한하게 나오네..

{'input_ids': [tensor([  101,  1045,  2134,  2102,  2514, 26608,   102]),
  tensor([  101,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,  2061,
           9636, 17772,  2074,  2013,  2108,  2105,  2619,  2040, 14977,  1998,
           2003,  8300,   102])],
 'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]}

`#`

# 6. `.with_transform()`

## A. `d.with_transform()`

`# 예제1` 

아래와 같은 Dataset이 있다고 하자. 

In [103]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.with_transform()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|

`(풀이)`

In [104]:
def w_transform(examples):
    result = tokenizer(examples['text'])
    result = result | examples
    return result

In [105]:
d2 = d.with_transform(w_transform)

In [106]:
d2[0]

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'text': 'i didnt feel humiliated',
 'label': 0}

In [107]:
d2[:1]

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 'text': ['i didnt feel humiliated'], 'label': [0]}

`#`

:::{.callout-note}

**`.with_transform()`와 `.map()`의 차이점**

1. `.map()`은 입력으로 example꼴을, `.with_transform()`은 입력으로 examples를 기대한다. 
2. `.map()`은 변환전과 변환후 데이터가 자동으로 합쳐진다. `.with_transform()`은 변환후 데이터만 살아남는다. 
3. `.map()`은 변환이 실제로 이루어진다. `.with_transform()`은 변환이 실제로 이루어지지 않다가 `d[0]`,`d[:1]` 등이 실행하는 순간 이루어진다. 

`# 예제2`

아래와 같은 Dataset이 있다고 하자. 

In [108]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.with_transform()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>|
|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>|

`(풀이)`

In [109]:
def w_transform(examples):
    # examples = {'text': [xxx,xxxx,xxxxx], 'label': [yyy,yyyy,yyyyy]} 
    result = tokenizer(examples['text'])
    result = examples | result 
    del result['attention_mask']
    return result

In [110]:
d2 = d.with_transform(w_transform)

In [111]:
d2[0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102]}

In [112]:
d2[0:1]

{'text': ['i didnt feel humiliated'], 'label': [0], 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]]}

`#`

`# 예제3`

아래와 같은 Dataset이 있다고 하자. 

In [113]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.with_transform()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|d[:1]|text: [str]<br/>label: [int]|label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|

`(풀이)`

In [114]:
def w_transform(examples):
    # examples = {'text': [xxx,xxxx,xxxxx], 'label': [yyy,yyyy,yyyyy]} 
    result = tokenizer(examples['text'])
    result['label'] = examples['label']
    return result

In [115]:
d2 = d.with_transform(w_transform)

In [116]:
d2[0]

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'label': 0}

In [117]:
d2[:1]

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 'label': [0]}

`#`

`# 예제4`

아래와 같은 Dataset이 있다고 하자. 

In [118]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

`d.with_transform()`을 이용하여 아래와 같이 변환하라. 


|데이터|변환전|변환후|
|:-|:-|:-|
|d[0]|text: str<br/>label: int|label: tensor(int)<br/>input_ids: tensor([int,...,int])<br/>attention_mask: tensor([int,...,int])|
|d[:1]|text: [str]<br/>label: [int]|label: tensor([int])<br/>input_ids: tensor([[int,...,int]])<br/>attention_mask: tensor([[int,...,int]])|

`(풀이)` -- 실패

In [119]:
def w_transform(examples):
    # examples = {'text': [xxx,xxxx,xxxxx], 'label': [yyy,yyyy,yyyyy]} 
    result = tokenizer(examples['text'])
    result['label'] = torch.tensor(examples['label'])
    result['input_ids'] = torch.tensor(result['input_ids'])
    result['attention_mask'] = torch.tensor(result['attention_mask'])
    return result

In [120]:
d2 = d.with_transform(w_transform)
d2

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [121]:
d2[0]

{'input_ids': tensor([  101,  1045,  2134,  2102,  2514, 26608,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
 'label': tensor(0)}

In [122]:
d2[:1]

{'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0])}

In [123]:
d2[:2]

ValueError: expected sequence of length 7 at dim 1 (got 23)

*에러나는이유*

In [124]:
d[:2]['text']

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake']

In [125]:
result = tokenizer(d[:2]['text'])
result

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [126]:
torch.tensor(result['input_ids'])

ValueError: expected sequence of length 7 at dim 1 (got 23)

- 패딩.........

`(풀이2)` -- 성공

In [127]:
def w_transform(examples):
    # examples = {'text': [xxx,xxxx,xxxxx], 'label': [yyy,yyyy,yyyyy]} 
    result = tokenizer(examples['text'],padding=True)
    result['label'] = torch.tensor(examples['label'])
    result['input_ids'] = torch.tensor(result['input_ids'])
    result['attention_mask'] = torch.tensor(result['attention_mask'])
    return result

In [128]:
d2 = d.with_transform(w_transform)
d2

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [129]:
d2[0]

{'input_ids': tensor([  101,  1045,  2134,  2102,  2514, 26608,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
 'label': tensor(0)}

In [130]:
d2[:1]

{'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0])}

In [131]:
d2[:2]

{'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,  2061,
          9636, 17772,  2074,  2013,  2108,  2105,  2619,  2040, 14977,  1998,
          2003,  8300,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0, 0])}

`(풀이3)` -- 이것도 성공.. 

In [132]:
def w_transform(examples):
    # examples = {'text': [xxx,xxxx,xxxxx], 'label': [yyy,yyyy,yyyyy]} 
    result = tokenizer(examples['text'],padding=True,return_tensors="pt")
    result['label'] = torch.tensor(examples['label'])
    return result

In [133]:
d2 = d.with_transform(w_transform)
d2

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [134]:
d2[0]

{'input_ids': tensor([  101,  1045,  2134,  2102,  2514, 26608,   102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1]),
 'label': tensor(0)}

In [135]:
d2[:1]

{'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0])}

In [136]:
d2[:2]

{'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [  101,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,  2061,
          9636, 17772,  2074,  2013,  2108,  2105,  2619,  2040, 14977,  1998,
          2003,  8300,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'label': tensor([0, 0])}

`#`

## B. `dd.with_transform()`

`# 예제1`

아래와 같은 DatasetDict가 있다고 하자.

In [137]:
dd = datasets.DatasetDict({
    'train':emotion['train'].select(range(4)),
    'test':emotion['test'].select(range(4)),
})
dd

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 4
    })
})

`dd.map()`을 이용하여 아래와 같이 변환하라. 


|tr/test|데이터|변환전|변환후|
|:-|:-|:-|:-|
|train|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|train|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|
|test|d[0]|text: str<br/>label: int|text: str<br/>label: int<br/>input_ids: [int,...,int]<br/>attention_mask: [int,...,int]|
|test|d[:1]|text: [str]<br/>label: [int]|text: [str]<br/>label: [int]<br/>input_ids: [[int,...,int]]<br/>attention_mask: [[int,...,int]]|

`(풀이)`

In [138]:
def w_transform(examples):
    result = tokenizer(examples['text'])
    result = examples | result
    return result

In [139]:
dd2 = dd.with_transform(w_transform)

In [140]:
dd2['train'][0]

{'text': 'i didnt feel humiliated',
 'label': 0,
 'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [141]:
dd2['train'][:1]

{'text': ['i didnt feel humiliated'], 'label': [0], 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

`#`

## C. `d.reset_format()`

`# 예시1` 

In [142]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [143]:
def w_transform(examples):
    result = tokenizer(examples['text'])
    result = result | examples
    return result

In [144]:
d2 = d.with_transform(w_transform)

In [145]:
d2[0]

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'text': 'i didnt feel humiliated',
 'label': 0}

In [146]:
d2[:1]

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 'text': ['i didnt feel humiliated'], 'label': [0]}

In [147]:
d2.reset_format()

In [148]:
d2[0]

{'text': 'i didnt feel humiliated', 'label': 0}

In [149]:
d2[:1]

{'text': ['i didnt feel humiliated'], 'label': [0]}

`#`

`# 예시1` 

In [150]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [151]:
def w_transform(examples):
    result = tokenizer(examples['text'])
    result = result | examples
    return result

In [152]:
d2 = d.with_transform(w_transform)

In [153]:
d2[0]

{'input_ids': [101, 1045, 2134, 2102, 2514, 26608, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'text': 'i didnt feel humiliated',
 'label': 0}

In [154]:
d2[:1]

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]], 'text': ['i didnt feel humiliated'], 'label': [0]}

In [155]:
d2.set_format(type="pt")

In [156]:
d2[0]

{'text': 'i didnt feel humiliated', 'label': tensor(0)}

In [157]:
d2[:1]

{'text': ['i didnt feel humiliated'], 'label': tensor([0])}

`#`

# 7. 미묘한 차이 

`# 예시1`

In [158]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [159]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    return result

In [160]:
def w_transform(examples):
    # examples = {'text': [xxx,xxxx,xxxxx], 'label': [yyy,yyyy,yyyyy]} 
    result = tokenizer(examples['text'])
    result = examples| result 
    return result

In [161]:
d2 = d.map(m_transform)
d2.set_format(type="pt")
d2[:1]

{'text': ['i didnt feel humiliated'],
 'label': tensor([0]),
 'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [162]:
d3 = d.with_transform(w_transform)
d3[:1]

{'text': ['i didnt feel humiliated'], 'label': [0], 'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}

In [163]:
d3.set_format(type="pt")
d3[:1]

{'text': ['i didnt feel humiliated'], 'label': tensor([0])}

`#`

`# 예시2`

In [164]:
d = emotion['train'].select(range(4))
d

Dataset({
    features: ['text', 'label'],
    num_rows: 4
})

In [165]:
def m_transform(example):
    # example = {'text': xxx, 'label': yyy} 
    result = tokenizer(example['text'])
    return result

In [166]:
def w_transform(examples):
    # examples = {'text': [xxx,xxxx,xxxxx], 'label': [yyy,yyyy,yyyyy]} 
    result = tokenizer(examples['text'],return_tensors="pt")
    result = result | examples
    return result

In [167]:
d2 = d.map(m_transform)
d2.set_format(type="pt")

In [168]:
d2[:1]

{'text': ['i didnt feel humiliated'],
 'label': tensor([0]),
 'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}

In [169]:
d2[:2]

{'text': ['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake'],
 'label': tensor([0, 0]),
 'input_ids': [tensor([  101,  1045,  2134,  2102,  2514, 26608,   102]),
  tensor([  101,  1045,  2064,  2175,  2013,  3110,  2061, 20625,  2000,  2061,
           9636, 17772,  2074,  2013,  2108,  2105,  2619,  2040, 14977,  1998,
           2003,  8300,   102])],
 'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1]),
  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]}

In [170]:
d2 = d.with_transform(w_transform)
d2[:1]

{'input_ids': tensor([[  101,  1045,  2134,  2102,  2514, 26608,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]]), 'text': ['i didnt feel humiliated'], 'label': [0]}

In [171]:
d2[:2]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

`#`