---
# 설치
---

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


---
# 모델 불러오기
----

In [None]:
# Load the pretrained BERT checkpoint with a masked-language-modeling head.
# NOTE(review): the original comment called this "kobert", but the checkpoint
# actually loaded below is "klue/bert-base".
from transformers import AutoModelForMaskedLM

# `model_checkpoint` is reused by later cells (tokenizer loading, output directory name).
model_checkpoint = "klue/bert-base"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.99b3298ed554f2ad731c27cdb11a6215f39b90bc845ff5ce709bb4e74ba45621
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading weights file https://huggingface.co/klue/bert-base/resolve/main/pytorch_model.bin from cache at /

In [None]:
# Load the tokenizer from the same checkpoint id (`model_checkpoint`) as the model.
# NOTE(review): the original comment said "kobert"; the checkpoint is "klue/bert-base".
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/klue/bert-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/fbd0b2ef898c4653902683fea8cc0dd99bf43f0e082645b913cda3b92429d1bb.99b3298ed554f2ad731c27cdb11a6215f39b90bc845ff5ce709bb4e74ba45621
Model config BertConfig {
  "_name_or_path": "klue/bert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32000
}

loading file https://huggingface.co/klue/bert-base/resolve/main/vocab.txt from cache at /root/.cache/hugg

In [None]:
import torch

# Sanity check of the pretrained masked LM: fill the single [MASK] in one demo sentence.
# `text` and `inputs` are inspected again by the following cells.
text = "그는 어제 [MASK]를 먹었다."

inputs = tokenizer(text, return_tensors='pt')

# Inference only: disable autograd so no computation graph is built for this forward pass.
with torch.no_grad():
    token_logits = model(**inputs).logits

# Sequence position(s) of the [MASK] token; [1] selects the column indices of the matches.
mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]

# torch.topk returns the k largest elements along the given dimension;
# .indices[0] keeps the candidate ids for the first masked position only.
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

# Print the sentence once per candidate, with [MASK] replaced by the decoded token.
for token in top_5_tokens:
    print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}")

>>> 그는 어제 브런치를 먹었다.
>>> 그는 어제 저녁를 먹었다.
>>> 그는 어제 샌드위치를 먹었다.
>>> 그는 어제 샐러드를 먹었다.
>>> 그는 어제 햄버거를 먹었다.


In [None]:
# Display the sub-word pieces the tokenizer produces for the demo sentence `text`.
tokenizer.tokenize(text)

['그', '##는', '어제', '[MASK]', '를', '먹', '##었', '##다', '.']

In [None]:
# Display every field of the encoded demo sentence (key/tensor pairs held by `inputs`).
inputs.items()

dict_items([('input_ids', tensor([[   2,  636, 2259, 5538,    4, 1022, 1059, 2359, 2062,   18,    3]])), ('token_type_ids', tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])), ('attention_mask', tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]))])

---
# 데이터 불러오기 
---

In [None]:
import pandas as pd

# Load the labelled review CSV. The path is on Google Drive, so Drive must already be
# mounted at /content/drive before this cell runs.
data = pd.read_csv('/content/drive/MyDrive/aiffel/AIFFELTON/for_model_labeling_folder/check_jang_0830.csv')

# Drop the stale index column that was written into the CSV.
data = data.drop(columns=['Unnamed: 0'])

# Shuffle with a fixed seed so the 6000-row MLM subset and the remainder are the same
# on every Restart & Run All, then renumber rows 0..n-1.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Positional split: rows from 6000 onward are set aside (`data2`),
# the first 6000 rows (`data`) are used for masked-LM training below.
data2 = data.iloc[6000:]
data = data.iloc[:6000]
data

Unnamed: 0,review,taste,quantity,delivery
0,떡볶이는 너무 달고 어묵 국물은 너무 짜고 순대간은 원래 그렇게 부서지는건가요 ㅠ ...,1,0,0
1,짬짜면이 정말 짬뽕 짜장면이네요ㅋㅋ 아니 맞는 말이긴한데 음식사진이랑 차이가 너무 ...,1,0,0
2,먹을만은했어용용요,1,0,0
3,정말 맛있었어요,1,0,0
4,간장은 맛있어요 새콤달콤한 맛으로 양념을 먹는데 매워서 맛을 못느끼겠네요 ㅠ,1,0,0
...,...,...,...,...
5995,두마리치킨이다보니 닭이 작고 작고 오래튀긴듯한 느낌에 다리살도 질기네요 페리카나 양...,1,0,0
5996,내가 먹은 피자중에 최 악 이였다,1,0,0
5997,내가알던 동대문 엽기 떡볶이가아니였어요,0,0,0
5998,삼겹살 먹고 싶으면 추천합니다,1,0,0


In [None]:
# Persist both parts of the split to Google Drive: the set-aside remainder (`data2`)
# and the subset used for masked-LM training (`data`, written as CSV despite the .txt name).
# NOTE(review): the index is not suppressed, so presumably each file gets the row index as
# an extra unnamed leading column -- confirm downstream readers expect that.
data2.to_csv('/content/drive/MyDrive/aiffel/AIFFELTON/for_model_labeling_folder/MLM 후 나머지.csv')
data.to_csv('/content/drive/MyDrive/aiffel/AIFFELTON/for_model_labeling_folder/MLM_dataset.txt')

In [None]:
import numpy as np

total_data_text = list(data['review'])

# Length of every review. NOTE: len() of a str counts characters, not tokenizer tokens;
# the `num_tokens` name is kept only for compatibility with the rest of the notebook.
num_tokens = np.array([len(review) for review in total_data_text])

# Mean, maximum and standard deviation of the review lengths.
print('문장길이 평균 : ', np.mean(num_tokens))
print('문장길이 최대 : ', np.max(num_tokens))
print('문장길이 표준편차 : ', np.std(num_tokens))

# Candidate maximum length: mean + 2 * standard deviation.
max_tokens = np.mean(num_tokens) + 2 * np.std(num_tokens)
maxlen = int(max_tokens)
print('pad_sequences maxlen : ', maxlen)

# Share of reviews shorter than the candidate maximum. The message labels the value
# with '%', so convert the 0-1 fraction to a percentage before printing
# (the previous output read "0.935%" for a 93.5% coverage).
coverage_pct = 100 * np.sum(num_tokens < max_tokens) / len(num_tokens)
print('전체 문장의 {:.1f}%가 maxlen 설정값 이내에 포함됩니다. '.format(coverage_pct))

문장길이 평균 :  32.912333333333336
문장길이 최대 :  99
문장길이 표준편차 :  22.871524826493072
pad_sequences maxlen :  78
전체 문장의 0.935%가 maxlen 설정값 이내에 포함됩니다. 


In [None]:
# class CustomDataset(Dataset):
  
#   def __init__(self, data, idx):
#     self.data = data
#     self.idx = idx
  
#   def __len__(self):
#     return len(self.data)

#   def __getitem__(self, idx):
#     text = self.data['review'][idx]
    
#     return {'review' : text}

In [None]:
# Collect the review texts as a plain Python list, in row order.
# Taking the column values directly avoids the label-based lookup `data['review'][i]`,
# which only works while the index labels are exactly 0..len(data)-1.
data_list = data['review'].tolist()

In [None]:
# def tokenize_function(example):
#   result = tokenizer(example['review'])
#   if tokenizer.is_fast: # batch_embedding의 결과에서 생성된 것인지 여부 
#     result['word_ids'] = [result.word_ids(i) for i in range(len(result['input_ids']))]
#   return result

In [None]:
# tokenized_datasets = data_list.map(tokenize_function, batched = True)  
# tokenized_datasets

In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers.tokenization_utils import PreTrainedTokenizer

class LineByLineTextDataset(Dataset):
    """Map-style dataset holding one tokenized text per example.

    Every item is a dict ``{"input_ids": <1-D torch.long tensor>}`` with the token
    ids of a single input text. No padding is requested here, so examples may have
    different lengths.
    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", data, block_size: int):
        """Tokenize all texts once, up front.

        Args:
            tokenizer: Callable tokenizer. It is invoked a single time on the whole
                list of texts with ``add_special_tokens=True``, ``truncation=True``
                and ``max_length=block_size``, and must return a mapping that has an
                ``"input_ids"`` entry with one id sequence per text.
            data: Either a single string or an iterable of strings.
            block_size: Maximum sequence length passed to the tokenizer as ``max_length``.
        """
        # A bare string would be encoded as ONE flat id sequence, and the loop below
        # would then emit one scalar "example" per token. Wrap it so it is treated as
        # a single text; materialise any other iterable into a list.
        texts = [data] if isinstance(data, str) else list(data)

        batch_encoding = tokenizer(texts, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = [
            {"input_ids": torch.tensor(ids, dtype=torch.long)}
            for ids in batch_encoding["input_ids"]
        ]

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.examples)

    def __getitem__(self, i):
        """Return the ``{"input_ids": tensor}`` dict stored at position ``i``."""
        return self.examples[i]

In [None]:
# aa = tokenizer(text, add_special_tokens=True)
# bb = aa["input_ids"]
# bb = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in bb]
# bb

[{'input_ids': tensor(2)},
 {'input_ids': tensor(636)},
 {'input_ids': tensor(2259)},
 {'input_ids': tensor(5538)},
 {'input_ids': tensor(4)},
 {'input_ids': tensor(1022)},
 {'input_ids': tensor(1059)},
 {'input_ids': tensor(2359)},
 {'input_ids': tensor(2062)},
 {'input_ids': tensor(18)},
 {'input_ids': tensor(3)}]

In [None]:
# Tokenize every review in `data_list` up front. block_size=512 is forwarded to the
# tokenizer as max_length (with truncation enabled) inside LineByLineTextDataset.
dataset = LineByLineTextDataset(tokenizer = tokenizer,
                                data = data_list,
                                block_size = 512)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for masked-language-model training, built on the same tokenizer;
# mlm_probability=0.15 is the masking probability handed to the collator.
data_collator = DataCollatorForLanguageModeling(tokenizer= tokenizer, mlm_probability = 0.15)

In [None]:
import torch
from transformers import TrainingArguments

batch_size = 64
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    # Output directory where checkpoints and predictions are written.
    output_dir=f"{model_name}-kobert-review",

    # Overwrite the contents of the output directory; needed to re-run training
    # repeatedly into the same directory.
    overwrite_output_dir=True,

    # When to evaluate during training:
    #   "no"    - no evaluation during training
    #   "steps" - evaluate every `eval_steps`
    #   "epoch" - evaluate at the end of each epoch
    evaluation_strategy="epoch",

    learning_rate=2e-5,                        # AdamW learning rate
    weight_decay=0.01,                         # weight decay for all layers except bias / LayerNorm weights
    per_device_train_batch_size=batch_size,    # training batch size per device
    per_device_eval_batch_size=batch_size,     # evaluation batch size per device
    push_to_hub=False,

    # Use fp16 mixed-precision training instead of 32-bit when a CUDA device is
    # present; requesting fp16 without CUDA makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),

    # Track the training loss once per epoch. This replaces
    # `logging_steps = len(dataset) // batch_size`, which was computed from the
    # dataset before the train/test split and therefore exceeded the number of
    # steps in one epoch, leaving the first epoch with "No log".
    logging_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the tokenized examples for evaluation. A fixed random_state makes the
# shuffled split identical on every run, so the reported perplexities are comparable.
train_dataset, test_dataset = train_test_split(
    dataset,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
from transformers import Trainer

# Bundle the model, the training arguments, the train / held-out splits and the
# masked-LM collator into a single Trainer used by the evaluate / train cells below.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
)

Using cuda_amp half precision backend


In [None]:
import math

# Baseline measured before trainer.train() is called: evaluate on the held-out split
# and report perplexity, computed as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 600
  Batch size = 64


>>> Perplexity: 162.59


In [None]:
# NOTE(review): `math` is not used in this cell and is already imported by the
# baseline-perplexity cell; the import here is redundant.
import math

# Run training with the Trainer configured above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 5400
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 255


Epoch,Training Loss,Validation Loss
1,No log,2.32766
2,2.834200,2.277153
3,2.473700,2.205303


***** Running Evaluation *****
  Num examples = 600
  Batch size = 64


***** Running Evaluation *****
  Num examples = 600
  Batch size = 64
***** Running Evaluation *****
  Num examples = 600
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=255, training_loss=2.5905276429419426, metrics={'train_runtime': 113.6859, 'train_samples_per_second': 142.498, 'train_steps_per_second': 2.243, 'total_flos': 476544646643712.0, 'train_loss': 2.5905276429419426, 'epoch': 3.0})

In [None]:
# Re-evaluate on the same held-out split after training and report perplexity
# (exp of eval_loss) for comparison with the pre-training baseline.
# Relies on `math` imported in an earlier cell.
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

***** Running Evaluation *****
  Num examples = 600
  Batch size = 64


>>> Perplexity: 9.29


---
# Save the further-pretrained (TAPT) model and tokenizer
---

In [None]:
# Write the trained model (config + weights) to the TAPT_Model_Save folder on Google Drive.
# Requires Drive to be mounted at /content/drive.
model.save_pretrained('/content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save')

Configuration saved in /content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/config.json
Model weights saved in /content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/pytorch_model.bin


In [None]:
# Write the tokenizer files into the same Drive folder as the model so both can be
# reloaded from one directory; the tuple of written file paths is the cell result.
tokenizer.save_pretrained('/content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save')

tokenizer config file saved in /content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/special_tokens_map.json


('/content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/tokenizer_config.json',
 '/content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/special_tokens_map.json',
 '/content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/vocab.txt',
 '/content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/added_tokens.json',
 '/content/drive/MyDrive/aiffel/AIFFELTON/TAPT_Model_Save/tokenizer.json')