---
### 참고 블로그   

https://neulvo.tistory.com/500  
https://huggingface.co/course/chapter7/3?fw=pt

---

In [77]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [78]:
pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [188]:
pip install accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.12.0-py3-none-any.whl (143 kB)
[K     |████████████████████████████████| 143 kB 4.1 MB/s 
Installing collected packages: accelerate
Successfully installed accelerate-0.12.0


---
# (1) 모델 불러오기 및 MLM 모델 예시
---

In [79]:
# Load the pretrained DistilBERT checkpoint through the masked-language-modeling auto class.
from transformers import AutoModelForMaskedLM

# Checkpoint identifier; the tokenizer cell loads from the same name.
model_checkpoint = 'distilbert-base-uncased'
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [82]:
# Load the tokenizer for the same checkpoint as the model.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [80]:
# Report the model's parameter count, expressed in millions.
distilbert_num_parameters = model.num_parameters() / 1_000_000 # divide by 1,000,000 -> millions
print(f'distilbert 파라미터 수 : {round(distilbert_num_parameters)}M')
# The BERT figure below is a hard-coded constant printed for comparison; it is not computed here.
print(f'bert 파라미터 수: 110M')

distilbert 파라미터 수 : 67M
bert 파라미터 수: 110M


---
### example
---

In [81]:
# 예시 문장
text = "This is a great [MASK]."

In [83]:
import torch

# Encode the example sentence as PyTorch tensors, e.g.
# {'input_ids': tensor([[ 101, 2023, 2003, 1037, 2307,  103, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
inputs = tokenizer(text, return_tensors = 'pt')

# Inference only: disable autograd so the forward pass does not build a gradient graph.
with torch.no_grad():
    # `**inputs` unpacks the encoding into keyword arguments (input_ids, attention_mask).
    token_logits = model(**inputs).logits
# token_logits.shape -> torch.Size([1, 8, 30522]) == (batch_size, sequence_length, vocab_size):
# one score (logit) per vocabulary entry for every input position.

################################## Find the location [MASK] and extract its logits ##################################

# torch.where returns (row_indices, column_indices), e.g. (tensor([0]), tensor([5]));
# element [1] is the position of [MASK] inside the sequence.
mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
# Vocabulary logits at the [MASK] position (these are logits, not embeddings).
mask_token_logits = token_logits[0, mask_token_index, :]

################################## Pick the [MASK] candidates with the highest logits ##################################

# torch.topk returns the k largest entries along the given dimension; `.indices` are their token ids.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
# e.g. [3066, 3112, 6172, 2801, 8658]

# Substitute each candidate token into the original sentence and print it.
for token in top_5_tokens:
  print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> This is a great deal.
>>> This is a great success.
>>> This is a great adventure.
>>> This is a great idea.
>>> This is a great feat.


---
# (2) 학습을 위한 데이터셋 불러오기 
---

In [84]:
from datasets import load_dataset

# Load the "imdb" dataset; the output below shows train / test / unsupervised splits.
imdb_dataset = load_dataset("imdb")
# Bare expression so the cell displays the DatasetDict summary.
imdb_dataset



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [85]:
# Preview three randomly chosen training reviews (fixed seed so the same rows are picked each run).
sample = imdb_dataset["train"].shuffle(seed=42).select(range(3))

# Print the raw text and label of each selected row.
for row in sample:
    print(f"\n'>>> Review: {row['text']}'")
    print(f"'>>> Label: {row['label']}'")




'>>> Review: There is no relation at all between Fortier and Profiler but the fact that both are police series about violent crimes. Profiler looks crispy, Fortier looks classic. Profiler plots are quite simple. Fortier's plot are far more complicated... Fortier looks more like Prime Suspect, if we have to spot similarities... The main character is weak and weirdo, but have "clairvoyance". People like to compare, to judge, to evaluate. How about just enjoying? Funny thing too, people writing Fortier looks American but, on the other hand, arguing they prefer American series (!!!). Maybe it's the language, or the spirit, but I think this series is more English than American. By the way, the actors are really good and funny. The acting is not superficial at all...'
'>>> Label: 1'

'>>> Review: This movie is a great. The plot is very true to the book which is a classic written by Mark Twain. The movie starts of with a scene where Hank sings a song with a bunch of kids called "when you stu

In [86]:
def tokenize_function(example):
  """Tokenize the ``text`` field and, for fast tokenizers, attach word ids.

  Args:
    example: Mapping with a ``'text'`` entry. The notebook calls this through
      ``map(..., batched=True)``, so ``'text'`` is a list of strings there.

  Returns:
    The tokenizer output. When ``tokenizer.is_fast`` is true it also carries a
    ``'word_ids'`` entry with one ``word_ids(i)`` list per encoded sequence.
  """
  encoding = tokenizer(example['text'])
  if not tokenizer.is_fast:
    # word_ids() is only requested when the tokenizer reports is_fast.
    return encoding

  per_sequence_word_ids = []
  for sequence_index in range(len(encoding['input_ids'])):
    per_sequence_word_ids.append(encoding.word_ids(sequence_index))
  encoding['word_ids'] = per_sequence_word_ids
  return encoding

################################################################################################
# word_ids
# A list giving, for every token, the index of the word that token came from.
# Special tokens added by the tokenizer map to None; every other token maps to the index of
# its source word (tokens split from the same word share one index).
# tokenize_function only requests it when tokenizer.is_fast is true.
################################################################################################

# No truncation is requested, so long reviews are kept whole; 'text' and 'label' are dropped
# because only the tokenized columns are used from here on.
tokenized_datasets = imdb_dataset.map(tokenize_function, batched = True, remove_columns = ['text','label'])  
tokenized_datasets

################################################################################################
# Dataset.map
#
# The first argument is the function to apply to the dataset (this is the datasets
# library's map method, not Python's builtin map over an iterable).
# The columns returned by the function become columns of the resulting dataset.
#
# batched = True
# Each call receives one batch made of several examples instead of a single example.
# The batch size can be configured; per the original note the default is 1000.
################################################################################################



  0%|          | 0/25 [00:00<?, ?ba/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (532 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

----
### word_ids 이해를 돕기 위한 코드
----

In [156]:
# Three short sentences of different lengths used to illustrate word_ids.
sentences = ["During the 1980s , life was something else", "An 18th century poet", "this is a great diffi cult difficult"]

# Encode them as one padded batch so every row has the same length.
e = tokenizer.batch_encode_plus(sentences, return_tensors='pt', padding=True)

In [157]:
# For each sentence print its tokens, then the word index of every position in the padded encoding.
print(tokenizer.tokenize(sentences[0]))
print(e.word_ids(0))
print('='*50)
print(tokenizer.tokenize(sentences[1]))
print(e.word_ids(1))
print('='*50)
print(tokenizer.tokenize(sentences[2]))
print(e.word_ids(2)) # "diffi" is split into 'di' + '##ffi', and both tokens map to the same word index 4

# Positions holding tokens the tokenizer added itself (the None entries at both ends, plus the
# trailing None entries of the shorter, padded sentence) have no word index.
# Word indices are numbered per sentence, which is why each row starts again from 0.

['during', 'the', '1980s', ',', 'life', 'was', 'something', 'else']
[None, 0, 1, 2, 3, 4, 5, 6, 7, None]
['an', '18th', 'century', 'poet']
[None, 0, 1, 2, 3, None, None, None, None, None]
['this', 'is', 'a', 'great', 'di', '##ffi', 'cult', 'difficult']
[None, 0, 1, 2, 3, 4, 4, 5, 6, None]


In [87]:
# auto-regressive 나 mlm 에서는 보통 모든 examples를 whole corpus로 합친 다음
# 그것을 equal size의 chunk들로 split해주는 방법을 취한다.

# 각 example들을 개별 tokenize 해줄 경우, truncate되어 정보가 손실될 수 있기 때문
# 그래서 truncation=True 옵션을 제외하고 tokenizer를 해주고 필요 없어진 text, label 컬럼을 제거해주었다.

---
### example
---

In [88]:
tokenizer.model_max_length 
    # >>> 512 
chunk_size = 128

In [89]:
# Slicing the dataset yields a dict with one list per feature, each holding one entry per example.
tokenized_samples = tokenized_datasets['train'][:3]

# Show how many tokens each of the three reviews produced.
for idx , sample in enumerate(tokenized_samples['input_ids']):
  print(f"Review: {idx} length: {len(sample)}")

Revuew: 0 length: 363
Revuew: 1 length: 304
Revuew: 2 length: 133


In [90]:
# Flatten every feature: the per-review lists are concatenated into one long list per key.
concatenated_example = {k : sum(tokenized_samples[k],[]) for k in tokenized_samples.keys()}
                                                              # keys: dict_keys(['input_ids', 'attention_mask', 'word_ids'])

# Total number of tokens across the three reviews.
total_length = len(concatenated_example['input_ids'])
print(f'전체 리뷰 길이: {total_length}')

전체 리뷰 길이: 800


In [91]:
# Cut every concatenated feature into consecutive slices of chunk_size items.
chunks = {k : [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
         for k , t in concatenated_example.items()}

for chunk in chunks['input_ids']:
  print(f"chunk length: {len(chunk)}")

#####################################################################################
# The concatenated text is split into chunks.
# The last chunk can be shorter than the others; it can either be dropped or padded.
# This notebook drops it (see group_texts).
#####################################################################################

chunk length: 128
chunk length: 128
chunk length: 128
chunk length: 128
chunk length: 128
chunk length: 128
chunk length: 32


In [94]:
def group_texts(examples, block_size=None):
  """Concatenate every feature of a batch and re-split it into fixed-size chunks.

  Args:
    examples: Mapping from feature name to a list of per-example lists.
      Must contain ``'input_ids'``; all features are expected to have the
      same total length.
    block_size: Length of each output chunk. When omitted, the notebook-level
      ``chunk_size`` is used, which keeps existing ``map(group_texts, ...)``
      calls working unchanged.

  Returns:
    A dict with the same keys as ``examples`` whose values are lists of
    ``block_size``-long chunks, plus a ``'labels'`` entry holding an
    independent copy of the ``'input_ids'`` chunks. A trailing remainder
    shorter than ``block_size`` is dropped.
  """
  from itertools import chain

  if block_size is None:
    block_size = chunk_size

  ######################## concatenate all texts ########################
  # chain.from_iterable flattens in linear time; sum(lists, []) re-copies the
  # growing accumulator on every addition.
  concatenated_examples = {k : list(chain.from_iterable(examples[k])) for k in examples.keys()}

  ##################### compute length of concatenated texts #####################
  total_length = len(concatenated_examples[list(examples.keys())[0]])

  ##################### drop the last chunk if it is smaller than block_size #####################
  total_length = (total_length // block_size) * block_size

  ##################### split by chunks of block_size #####################
  result = {k : [t[i : i + block_size] for i in range(0, total_length, block_size)]
            for k , t in concatenated_examples.items()}

  ##################### create a new labels column #####################
  # Copy each chunk so later in-place masking of input_ids cannot alter the labels.
  result['labels'] = [list(chunk) for chunk in result['input_ids']]
  return result

In [95]:
im_datasets = tokenized_datasets.map(group_texts, batched = True)
im_datasets

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [96]:
# group_texts copied input_ids into labels, so both columns decode to the same text.
print(tokenizer.decode(im_datasets['train'][1]['input_ids']))
print('='*100)
print(tokenizer.decode(im_datasets['train'][1]['labels']))

as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,
as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot li

----
# (3) DataCollatorForLanguageModeling
----

텍스트의 각 배치에서 일부 토큰을 무작위로 마스킹할 수 있는 특수 데이터 수집기  
언어 모델링에 사용되는 데이터 수집기   
입력이 모두 동일한 길이가 아닌 경우 일괄 처리의 최대 길이까지 입력이 동적으로 채워진다.  

- tokenizer ( PreTrainedTokenizer 또는 PreTrainedTokenizerFast ) — 데이터를 인코딩하는 데 사용되는 토크나이저입니다.

- mlm ( bool, 선택 사항 , 기본값은 True) — 마스크 언어 모델링을 사용할지 여부입니다.   
False로 설정 하면 레이블은 패딩 토큰이 무시된 입력과 동일합니다(-100으로 설정).   
그렇지 않은 경우 레이블은 마스크되지 않은 토큰에 대해 -100이고 마스크된 토큰에 대해 예측할 값입니다.

- mlm_probability ( float, 선택 사항 , 기본값은 0.15) — mlm이 True로 설정된 경우 입력에서 토큰을 (임의로) 마스킹할 확률입니다.

- pad_to_multiple_of ( int, 선택 사항 ) — 설정된 경우 시퀀스를 제공된 값의 배수로 채웁니다.

- return_tensors ( str) — 반환할 Tensor의 유형입니다. 허용되는 값은 "np", "pt" 및 "tf"입니다.

----


In [98]:
from transformers import DataCollatorForLanguageModeling

# Collator that randomly masks tokens in each batch; mlm_probability is the masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer= tokenizer, mlm_probability = 0.15)

---
### example
---

---
파이썬에서 언더스코어(_)를 사용하는 경우

----

- 인터프리터(Interpreter)에서 마지막 값을 저장할 때  

- 값을 무시하고 싶을 때 (흔히 “I don’t care"라고 부른다.)
- 변수나 함수명에 특별한 의미 또는 기능을 부여하고자 할 때
- 국제화(Internationalization, i18n)/지역화(Localization, l10n) 함수로써 사용할 때
- 숫자 리터럴값의 자릿수 구분을 위한 구분자로써 사용할 때

----

In [130]:
# Take the first three chunks of the training split as plain dicts.
samples = [im_datasets["train"][i] for i in range(3)]

# Remove the word_ids entry from each sample dict before collating; the popped value is discarded.
for sample in samples:
    _ = sample.pop("word_ids") 

# Collate (which applies the random masking) and print the decoded, masked inputs.
for chunk in data_collator(samples)['input_ids']:
  print(tokenizer.decode(chunk),'\n')


[CLS] i rented i am curious - yellow from my video store because of all the [MASK] that surrounded it when it was first [MASK] in 1967. [MASK] also [MASK] that at first it was seized by u [MASK] s. customs if it ever tried to enter this country [MASK] therefore being a fan of films [MASK] " [MASK] " i [MASK] had to see this for [MASK]. < br / > < br / > the [MASK] [MASK]ハ around a young swedish drama student named lena [MASK] wants to learn strewn she can about life. in [MASK] she wants to focus [MASK] attentions [MASK] making some sort of documentary on what [MASK] average sw [MASK] thought about certain political issues such 

as the vietnam war and race issues in [MASK] united [MASK]. in between asking politicians and [MASK] denizens of stockholm about their opinions on politics, she has sex with her [MASK] teacher, classmates, and married men. < [MASK] / > [MASK] br / > what kills [MASK] about i am curious - yellow is that [MASK] hem ago, this was considered pornographic. really [M

- 랜덤 마스킹의 한 가지 부작용은, Trainer가 훈련 세트와 테스트 세트에 동일한 데이터 콜레이터를 사용하기 때문에 Trainer를 사용할 때 평가 메트릭이 결정적이지 않다는 것입니다(평가할 때마다 마스킹 위치가 달라집니다). 
- 나중에 우리가 🤗 Accelerate로 미세 조정을 할 때 맞춤 평가 루프의 유연성을 사용하여 임의성을 고정하는 방법을 볼 수 있습니다.

---
# (4) Whole word masking data collator
---

In [176]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    """Collate a batch while masking whole words instead of single tokens.

    For every feature the ``word_ids`` entry is removed, each word is selected
    independently with probability ``wwm_probability``, and all tokens of a
    selected word are replaced by ``tokenizer.mask_token_id``. The feature's
    labels are rewritten so that masked positions keep the original token id
    and every other position is -100.

    Args:
        features: List of dicts, each holding ``input_ids``, ``labels`` and
            ``word_ids`` lists. The dicts are modified in place.

    Returns:
        The result of ``default_data_collator`` applied to the masked features.
    """
    for feature in features:
        word_ids = feature.pop("word_ids")

        ############# Create a map between words and corresponding token indices #############
        # defaultdict(list) gives an empty list for a word index seen for the first time.
        mapping = collections.defaultdict(list)

        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is None:
                # A special token ends the current word. Without this reset, two
                # sequences joined in one chunk could have equal word ids on both
                # sides of the boundary and be merged into one "word".
                current_word = None
                continue
            if word_id != current_word:
                current_word = word_id
                current_word_index += 1
            mapping[current_word_index].append(idx)

        ####################### Randomly mask words #######################
        # One draw per word: 1 (mask) with probability wwm_probability, else 0.
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))

        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        # Store the rewritten labels; otherwise the labels stay a full copy of the
        # input ids and the loss is computed over every token, not only the masked ones.
        feature["labels"] = new_labels

    return default_data_collator(features)

---
### example
---

In [179]:
# Run the whole-word-masking collator on the first two training chunks.
samples = [im_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

# Decode the masked inputs to inspect which words were replaced by [MASK].
for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> [CLS] i rented [MASK] am curious - yellow from my [MASK] [MASK] because [MASK] [MASK] the controversy [MASK] surrounded it when it was first released [MASK] 1967. i also heard that [MASK] first it was seized [MASK] [MASK]. s. customs if [MASK] [MASK] tried to enter this [MASK], therefore [MASK] a fan of films [MASK] [MASK] controversial " i [MASK] had to see this for [MASK]. < br / > < [MASK] / > the [MASK] is centered around a young swedish drama student [MASK] lena who wants to learn [MASK] she can about life. [MASK] particular she [MASK] to [MASK] [MASK] attentions to making [MASK] sort of documentary on what the average [MASK] [MASK] thought about certain political [MASK] such'

'>>> as the vietnam war and race issues in the united states [MASK] [MASK] between asking politicians [MASK] ordinary denizens of stockholm about their opinions on politics, she has [MASK] with [MASK] drama teacher, classmates, and married men [MASK] < [MASK] / > < br / > what kills me about i [MASK] 

---
# (5) dataset split
---

In [180]:
# Work on a subset: 10,000 training chunks and 10% of that (1,000) for evaluation.
train_size = 10_000
test_size = int(0.1 * train_size)

# Both subsets are drawn from the original "train" split; the seed fixes the selection.
downsampled_dataset = im_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

---
# (6) TrainingArguments
---

In [181]:
from transformers import TrainingArguments

batch_size = 64

# Log the training loss about once per epoch: number of training examples // batch size.
logging_steps = len(downsampled_dataset["train"]) // batch_size
# Last path component of the checkpoint name (the whole name when it contains no "/").
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",        # directory where predictions and checkpoints are written

    overwrite_output_dir=True,                        # overwrite the contents of output_dir;
                                                      # needed to repeat training into the same directory

    evaluation_strategy="epoch",                      # evaluation schedule during training:
                                                      # "no": no evaluation during training
                                                      # "steps": evaluate every eval_steps
                                                      # "epoch": evaluate at the end of each epoch

    learning_rate=2e-5,                               # learning rate (for AdamW, per the original note)
    weight_decay=0.01,                                # weight decay; per the original note applied to all layers except bias and LayerNorm weights
    per_device_train_batch_size=batch_size,           # training batch size per device
    per_device_eval_batch_size=batch_size,            # evaluation batch size per device
    push_to_hub=False,
    fp16=True,                                        # use 16-bit (mixed) precision training instead of 32-bit
                                                      # original note: computation runs in fp16 and weights are updated in fp32,
                                                      # which is intended to speed up training

    logging_steps=logging_steps,                      # number of update steps between two logs
)

In [182]:
from transformers import Trainer

# Train and evaluate on the downsampled splits. The token-level random-masking
# `data_collator` is used for both, not the whole-word-masking collator defined earlier.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
)

Using cuda_amp half precision backend


In [183]:
import math

# Baseline before fine-tuning: evaluate once and convert the loss to perplexity.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

#####################################################################################
# A lower perplexity score means a better language model.
# Perplexity measures how well the model's predicted token probabilities match the text;
# a high value means unexpected tokens appeared often.
# Here perplexity is computed as the exponential of the evaluation loss
# (cross-entropy loss, per the original note).
#####################################################################################

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.


>>> Perplexity: 21.94


In [184]:
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 471
The following columns in the training set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.


Epoch,Training Loss,Validation Loss
1,2.7117,2.497595
2,2.5773,2.424281
3,2.5263,2.434775


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=471, training_loss=2.604766742439027, metrics={'train_runtime': 277.4368, 'train_samples_per_second': 108.133, 'train_steps_per_second': 1.698, 'total_flos': 994208670720000.0, 'train_loss': 2.604766742439027, 'epoch': 3.0})

In [185]:
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.


>>> Perplexity: 11.86


- 미세 조정 후 perplexity가 21.94에서 11.86으로 감소하였다

---
# (7) Accelerator를 사용해 Randomness를 Freeze
----

즉, DataCollatorForLanguageModeling의 random masking을 제어하기 위한 것  
전체 Dataset에 대해 한번에 masking 해주는 방법으로 randomness를 제거해주고  
그 후에 eval_dataloader에서 default data collator를 사용해 batch를 모아준다.  

----

In [193]:
def insert_random_mask(batch):
    """Apply ``data_collator`` to a batch once and return the result as new columns.

    Args:
        batch: Mapping from column name to a list of per-example values
            (the batched form in which ``map`` passes rows to this function).

    Returns:
        A dict with one ``"masked_<column>"`` entry per key returned by
        ``data_collator``, each converted to a NumPy array.
    """
    column_names = list(batch)
    # Turn the column-oriented batch into one dict per example.
    features = []
    for row in zip(*batch.values()):
        features.append(dict(zip(column_names, row)))

    collated = data_collator(features)

    ################ Create a new "masked" column for each column in the dataset ################
    masked_columns = {}
    for name, tensor in collated.items():
        masked_columns["masked_" + name] = tensor.numpy()
    return masked_columns

- 위 함수를 데이터셋에 적용하고 마스크되지 않은 열을 삭제하여 마스크된 열로 교체

In [194]:
# Drop word_ids before collating with `data_collator`.
# NOTE(review): presumably the token-level collator cannot batch the None-containing
# word_ids lists — confirm.
# The guard keeps this cell re-runnable: once the column is gone, calling
# remove_columns for it again would raise.
if "word_ids" in downsampled_dataset["test"].column_names:
    downsampled_dataset = downsampled_dataset.remove_columns(["word_ids"])

# Mask the evaluation split once, up front, so every evaluation pass in this
# session sees the same masked inputs. The unmasked source columns are removed.
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
# Rename the "masked_*" columns back to the names the model's forward pass expects.
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [197]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
# Training batches are shuffled and collated with `data_collator`, so masking is
# re-drawn every time a batch is built.
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
# eval_dataset was already masked by insert_random_mask, so its batches are only
# collated with the default collator (no further masking).
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

---
# (8) optimizer, accelerator, scheduler 세팅 
----

In [200]:
# The former `from transformers.utils.dummy_pt_objects import get_scheduler` line was removed:
# it pulled a placeholder from a private module and was shadowed by the real import below.
from torch.optim import AdamW
from accelerate import Accelerator    # library for running the same PyTorch code on any setup (GPU/TPU/CPU)
from transformers import get_scheduler

# NOTE(review): `model` is the object the earlier cells left behind, i.e. it has already
# been trained by `trainer.train()` if that cell was run. Reload it from `model_checkpoint`
# first if this loop is meant to start from the pretrained weights — confirm intent.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Let Accelerate wrap the model, optimizer and dataloaders for the current device setup.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

num_train_epochs = 3
# Measured after prepare(), so it is the length of the prepared dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule with no warmup, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)

---
# 모델 학습
---

In [201]:
from tqdm.auto import tqdm
import torch
import math

# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # ---------------------------- training pass ----------------------------
    model.train()
    for batch in train_dataloader:
        loss = model(**batch).loss
        accelerator.backward(loss)

        # Update the weights, advance the LR schedule, then clear gradients for the next step.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # --------------------------- evaluation pass ---------------------------
    model.eval()
    losses = []
    for batch in eval_dataloader:
        with torch.no_grad():
            batch_loss = model(**batch).loss
        # Repeat the batch loss batch_size times before gathering, so the collected
        # tensor has one entry per (nominal) example.
        losses.append(accelerator.gather(batch_loss.repeat(batch_size)))

    # Keep only as many entries as there are evaluation examples.
    losses = torch.cat(losses)[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        # exp() overflowed: report the perplexity as infinite.
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

  0%|          | 0/471 [00:00<?, ?it/s]

>>> Epoch 0: Perplexity: 11.201391776576259
>>> Epoch 1: Perplexity: 10.894306762041227
>>> Epoch 2: Perplexity: 10.70708208174388


---
# (9) model save
---

In [207]:
########################## Save the model and tokenizer ##########################
# Destination directory; the logged output shows config, weights and tokenizer files written here.
output_dir = '/content'

# Wait for every process to reach this point before saving.
accelerator.wait_for_everyone()
# Save the underlying model (unwrapped from Accelerate's wrapper) via accelerator.save.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
# Only the main process writes the tokenizer files.
if accelerator.is_main_process:
    tokenizer.save_pretrained(output_dir)

Configuration saved in /content/config.json
Model weights saved in /content/pytorch_model.bin
tokenizer config file saved in /content/tokenizer_config.json
Special tokens file saved in /content/special_tokens_map.json
