In [1]:
# Mount Google Drive at /content/drive; DATA_PATH (defined below) lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import torch
from tqdm.auto import tqdm
import random
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, GPT2LMHeadModel, PreTrainedTokenizerFast

In [3]:
def reset_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, NumPy and
            PyTorch (CPU and CUDA). It is also exported as ``PYTHONHASHSEED``.

    Returns:
        None. The function only mutates global RNG / backend state.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices rather than only the current one; this is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Pin the autotuner off explicitly so kernel selection cannot vary between
    # runs if something else in the session enabled it.
    torch.backends.cudnn.benchmark = False

# Drive folder the training CSV is read from; model checkpoints are written beneath it.
DATA_PATH = '/content/drive/MyDrive/DACON 경진대회/한솔 도배하자 질의응답/최종 제출/data/'
# Single seed shared by every reset_seeds() call in the notebook.
SEED = 42
reset_seeds(SEED)
# Use the GPU when one is available; the bare expression below displays the choice.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
# Load the training data (DATA_PATH already ends with a path separator).
train = pd.read_csv(f'{DATA_PATH}train_120000_final.csv')

train.shape

(120000, 3)

In [5]:
# Hugging Face checkpoint name; the tokenizer below is loaded from it.
model_name = 'skt/kogpt2-base-v2'

# Special tokens are passed explicitly. Note that BOS and EOS are both '</s>',
# while padding uses the distinct '<pad>' token.
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name,
                                          bos_token='</s>',
                                          eos_token='</s>',
                                          unk_token='<unk>',
                                          pad_token='<pad>',
                                          mask_token='<mask>')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer.json:   0%|          | 0.00/2.83M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [6]:
# Build one training string per row in the form "<q>{question}</s><a>{answer}</s>".
# Fail fast on missing text: concatenating with NaN would otherwise silently
# produce NaN entries instead of strings.
_missing_rows = train[['question', 'answer']].isna().any(axis=1)
if _missing_rows.any():
    raise ValueError(
        f"{int(_missing_rows.sum())} row(s) have a missing question or answer"
    )

# Vectorised string concatenation replaces the row-by-row iterrows() loop,
# which is needlessly slow on a frame of this size.
qna_data = ("<q>" + train['question'] + "</s><a>" + train['answer'] + "</s>").tolist()

qna_df = pd.DataFrame({'qna': qna_data})
qna_df

  0%|          | 0/120000 [00:00<?, ?it/s]

Unnamed: 0,qna
0,<q>면진장치가 뭐야?</s><a>면진장치란 지반에서 오는 진동 에너지를 흡수하여 ...
1,"<q>내진설계의 종류 좀 알려줘</s><a>내진 설계의 종류로 내진구조, 제진구조,..."
2,<q>철골구조의 장점이 뭐야?</s><a>철골구조는 건물의 외벽에는 그다지 하중이 ...
3,"<q>철골철근 콘크리트 구조가 뭐야?</s><a>철근철골콘크리트는 철골과 철근, 그..."
4,<q>철골구조는 어떤 방식이 있어?</s><a>철골구조는 일반철골구조와 경량철골구조...
...,...
119995,"<q>새집증후군이 무엇이며, 외부에 소리가 안나가게 하는 목적으로 차음재만 설치해도..."
119996,"<q>써모사이딩이 무엇이며, 인테리어 디자인에서 조명의 역할은 무엇인가요?</s><..."
119997,"<q>훼손이 무엇이고, 공간 내 높은습도로 인해 도배지에 얼룩이 발생할 수 있어?<..."
119998,"<q>새집증후군의 원인이 무엇이고, 벽지에 반점이 생기는 원인이 뭐야?</s><a>..."


In [7]:
class DobaeDataset(torch.utils.data.Dataset):
    """Map-style dataset over the strings stored in the ``qna`` column of a frame."""

    def __init__(self, df):
        """Copy the ``qna`` column of ``df`` into a plain Python list."""
        self.question = df['qna'].to_list()

    def __len__(self):
        """Return the number of stored samples."""
        sample_count = len(self.question)
        return sample_count

    def __getitem__(self, idx):
        """Return the raw (untokenised) sample at position ``idx``."""
        sample = self.question[idx]
        return sample

In [8]:
def collate_fn(batch):
    """Tokenise a list of raw strings into a single padded batch.

    Args:
        batch: Sequence of strings, as yielded by the dataset.

    Returns:
        A dict with one key, ``'x'``, holding the module-level tokenizer's
        output as PyTorch tensors padded to the longest sample in the batch.

    NOTE(review): no truncation is requested here, so over-long samples are
    passed through at full length — confirm inputs fit the model's context.
    """
    encoded = tokenizer(batch, padding=True, return_tensors='pt')
    return dict(x=encoded)

In [9]:
# Sanity check: collate a tiny batch of two samples and display the result.
dt = DobaeDataset(qna_df)
dl = torch.utils.data.DataLoader(dt, batch_size=2, collate_fn=collate_fn)
batch = next(iter(dl))
batch

{'x': {'input_ids': tensor([[ 9724,   455,   405,  7532,  8265, 37765, 46651,  7991,   406,     1,
           9724,   439,   405,  7532,  8265, 20725,  7374,  9027,  7599,  9023,
          14472, 15898, 14820, 33220, 36928, 10764,  9166, 11818, 28037, 10090,
          15898, 34062, 20725, 21154,     1],
         [ 9724,   455,   405,  7071,  8265,  7793, 10346, 10036, 11732,  9666,
           8244,     1,  9724,   439,   405,  7071,  8265,  9160, 10346, 32032,
           9094,  8265, 12791,   387,  9037,  8265, 12791,   387,  9411,  8265,
          27211, 32987,     1,     3,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [10]:
def train_loop(dataloader, model, loss_fn, optimizer, scheduler, device):
    """Run one epoch of next-token training and return the mean per-batch loss.

    Args:
        dataloader: Sized iterable of batches; each batch is a mapping whose
            ``"x"`` entry supports ``.to(device)`` and contains ``"input_ids"``.
        model: Invoked as ``model(**x)``; the result must expose ``.logits``.
        loss_fn: Callable applied to the (logits, target ids) of the
            non-padding positions.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        device: Device the batch is moved to before the forward pass.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(dataloader):
        inputs = batch["x"].to(device)
        logits = model(**inputs).logits

        # The logits at position t are scored against the token at t + 1, so
        # drop the last logit step and the first input token, then flatten.
        vocab_size = logits.shape[-1]
        shifted_logits = logits[:, :-1, :].reshape(-1, vocab_size)
        next_tokens = inputs["input_ids"][:, 1:].flatten()

        # Positions whose target is the padding token do not contribute to the loss.
        keep = next_tokens != tokenizer.pad_token_id
        loss = loss_fn(shifted_logits[keep], next_tokens[keep])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()

    return running_loss / len(dataloader)

In [11]:
# Training hyperparameters.
batch_size = 6
# lr = 3e-5  (alternative learning rate, currently disabled)
lr = 2e-5
# Shared loss object passed into train_loop.
loss_fn = torch.nn.CrossEntropyLoss()
# Number of epochs the training cell iterates over and sizes its LR schedule for.
epochs = 10

In [12]:
from transformers import get_cosine_schedule_with_warmup
# Re-seed right before model creation so the run does not depend on how many
# random draws earlier cells consumed.
reset_seeds(SEED)

model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

train_dt = DobaeDataset(qna_df)
train_dl = torch.utils.data.DataLoader(train_dt, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# The scheduler is stepped once per batch inside train_loop, so size it from
# the number of batches the loader actually yields. len(train_dl) rounds up
# when the dataset size is not a multiple of batch_size, whereas
# dataset_size // batch_size would undercount by one step per epoch.
dataset_size = len(train_dt)
num_training_steps_per_epoch = len(train_dl)
num_training_steps = num_training_steps_per_epoch * epochs
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

for i in tqdm(range(epochs)):
    train_loss = train_loop(train_dl, model, loss_fn, optimizer, scheduler, device)
    print(f'{i+1}번째 epoch: {train_loss:.5f}')

    # Save a checkpoint after every epoch; the directory name records the
    # sample count, the epoch number and the epoch's training loss.
    model.save_pretrained(f'{DATA_PATH}model/kogpt2_{qna_df.shape[0]}_{i+1}epoch_{train_loss:.5f}loss')
    # NOTE(review): training stops after the 4th epoch although the cosine
    # schedule above spans `epochs` epochs, so the learning rate never reaches
    # the end of its decay — confirm this is intentional.
    if i == 3:
        break

pytorch_model.bin:   0%|          | 0.00/513M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

1번째 epoch: 0.51348


  0%|          | 0/20000 [00:00<?, ?it/s]

2번째 epoch: 0.27966


  0%|          | 0/20000 [00:00<?, ?it/s]

3번째 epoch: 0.21946


  0%|          | 0/20000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
from google.colab import runtime

In [None]:
# Release the Colab runtime once the cells above have finished
# (presumably to stop consuming compute — confirm before relying on it).
runtime.unassign()