In [10]:
# 필요한 패키지를 설치
!pip install kogpt2_transformers
!pip install wandb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
# util
from kogpt2_transformers import get_kogpt2_tokenizer
import json

def json_load(data_path = '/opt/ml/data/요약대회/train_summary.json'):
    data = []
    with open(data_path, 'r') as json_file:
            json_list = json.load(json_file)
            
    return json_list


def token_num(data_path = '/opt/ml/data/요약대회/train_summary.json'):
    data = []
    with open(data_path, 'r') as json_file:
            json_list = json.load(json_file)

    gpt_tok = get_kogpt2_tokenizer()
    gpt_tok_num = 0
    count = 0

    for json_str in json_list:
            tmp_str = json_str['summary']
            gpt_tok_num = max(gpt_tok_num, len(gpt_tok.encode(tmp_str, max_length=512, truncation=True)))

    print('max gpt token len:', gpt_tok_num)

# if __name__ == '__main__':
#     token_num()

In [12]:
# model
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data import Dataset
from kogpt2_transformers import get_kogpt2_model


class AbstractiveKoGPT2(nn.Module):
    def __init__(self):
        super(AbstractiveKoGPT2, self).__init__()
        self.kogpt2 = get_kogpt2_model()

    def generate(self,
                 input_ids,
                 do_sample=True,
                 max_length= 60,
                 top_p=0.92,
                 top_k=50,
                 temperature= 0.6,
                 no_repeat_ngram_size =None,
                 num_return_sequences=3,
                 early_stopping=False,
                 ):
        return self.kogpt2.generate(input_ids,
                                     do_sample=do_sample,
                                     max_length=max_length,
                                     top_p = top_p, # 글의 표현 범위 조절
                                     top_k=top_k, # 글의 표현 범위 조절
                                     temperature=temperature, # 글의 창의성 조절
                                     no_repeat_ngram_size= no_repeat_ngram_size,
                                     num_return_sequences=num_return_sequences,
                                     early_stopping = early_stopping,
                                     eos_token_id = 1,
                                     pad_token_id= 3
                                    )

    def forward(self, input, labels = None):
        if labels is not None:
            outputs = self.kogpt2(input, labels=labels)
        else:
            outputs = self.kogpt2(input)

        return outputs

In [13]:
# dataset 
class AbstrativeDataset(Dataset):
    def __init__(self,
                 n_ctx = 1024, 
                 articles_max_length = 810,
                 summary_max_length = 210,
                 device = 'cpu'
                 ):
        self.data =[]
        self.device = device
        self.tokenizer = get_kogpt2_tokenizer()

        bos_token_id = [self.tokenizer.bos_token_id] # <s>, 0
        eos_token_id = [self.tokenizer.eos_token_id] # </s>, 1
        pad_token_id = [self.tokenizer.pad_token_id] # <pad>, 3

        json_datas = json_load()

        for dict_data in tqdm(json_datas):
            articles = dict_data['original']
            abstractive_summary = dict_data['summary']

#             tmp_str =''
#             for article in articles:
#                 tmp_str += article

            # encode
            # truncate, if string exceed max length
            enc_tmp_str = self.tokenizer.encode(articles, truncation= True, max_length=articles_max_length)
            enc_abstractive_summary = self.tokenizer.encode(abstractive_summary, truncation= True, max_length=summary_max_length)
            
            # <s> 요약할 문장 </s> 요약된 문장 </s>
            index_of_words = bos_token_id + enc_tmp_str+ eos_token_id + enc_abstractive_summary + eos_token_id
            pad_token_len = n_ctx - len(index_of_words)
            index_of_words += pad_token_id * pad_token_len
            print(f'max: {max(index_of_words)}, min: {min(index_of_words)}')
            
            # RuntimeError: CUDA error: device-side assert triggered
            if max(index_of_words)<50000:
                self.data.append(torch.tensor(index_of_words).to(device))

    def __len__(self):
        return len(self.data)

    def __getitem__(self,index):
        item = self.data[index]
        return item

# if __name__ == "__main__":
#     dataset = AbstrativeDataset()
#     print(dataset)

In [None]:
# train
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display
from torch.utils.data import dataloader
import random
import wandb

def set_seed(seed = 42):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    
if __name__ == '__main__':
    
    set_seed()
    wandb.init(project= 'bakbak', entity= 'quarter100', name= f'KG')
    checkpoint_path ="/opt/ml/data/요약대회/checkpoint"
    save_ckpt_path = f"{checkpoint_path}/kogpt2-abstractive.pth"

    n_epoch = 5         # Num of Epoch
    batch_size = 4      # 배치 사이즈
    device = "cuda" if torch.cuda.is_available() else "cpu"
    #device = 'cpu'
    print('device: ',device)
    save_step = 100 # 학습 저장 주기
    learning_rate = 5e-5  # Learning Rate

    dataset= AbstrativeDataset(device=device)
    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)
    model = AbstractiveKoGPT2()
    model.to(device)

    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=3)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    total_losses = []
    losses =[]

    if os.path.isfile(save_ckpt_path):
        checkpoint = torch.load(save_ckpt_path, map_location=device)
        pre_epoch = checkpoint['epoch']
        pre_loss = checkpoint['loss']
        total_losses = checkpoint['losses']

        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        print(f"load pretrain from: {save_ckpt_path}, epoch={pre_epoch}, loss={pre_loss}")
        

    for epoch in range(n_epoch):
        count = 0
        with tqdm(total=len(train_loader), desc=f"Train({epoch})") as pbar:
            for i, data in enumerate(train_loader):
                optimizer.zero_grad()
#                 for d in data:
#                     print(max(d),min(d))
                outputs = model(data, labels=data)
                _, logits = outputs[:2]

                # Shift so that tokens < n predict n
                shift_logits = logits[..., :-1, :].contiguous()
                shift_labels = data[..., 1:].contiguous()

                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
                loss.backward()
                optimizer.step()

                losses.append(loss.item())

                if (count > 0 and count % save_step == 0) or (len(data) < batch_size):
                    torch.save({
                        'epoch': epoch,
                        'train_no': count,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss,
                        'losses': losses
                    }, save_ckpt_path)
                count += 1
                pbar.update(1)
                pbar.set_postfix_str(f"Loss: {loss.item():.3f} ({np.mean(losses):.3f})")
                if i%50==0:
                    wandb.log({'loss':np.mean(losses)})

        total_losses.append(np.mean(losses))

    # data
    data = {
        "loss": total_losses
    }
    df = pd.DataFrame(data)
    display(df)

    # graph
    plt.figure(figsize=[12, 4])
    plt.plot(losses, label="loss")
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.show()

In [14]:
class EvalAbstrativeDataset(Dataset):
    def __init__(self,
                 device,
                 tokenizer,
                 n_ctx = 1024,
                 data_path='/opt/ml/data/요약대회/test_summary.json',
                 articles_max_length = 810,
                 summary_max_length = 210,
                 ):
        self.data =[]
        self.tokenizer = tokenizer
        print(device)
        
        bos_token_id = [self.tokenizer.bos_token_id] # <s>
        eos_token_id = [self.tokenizer.eos_token_id] # </s>

        json_datas = json_load(data_path=data_path)
        
        for dict_data in tqdm(json_datas):
            #id = dict_data['id']
            articles = dict_data['original']

            tmp_str =''
            for article in articles:
                tmp_str += article

            # encode
            # truncate, if string exceed max length
            enc_tmp_str = self.tokenizer.encode(tmp_str, truncation= True, max_length=articles_max_length)

            # <s> 요약할 문장 </s> 요약된 문장 </s>
            index_of_words = bos_token_id + enc_tmp_str+ eos_token_id

            self.data.append({
                #'id':id,
                'input':torch.tensor([index_of_words]).to(device)
            })

    def __len__(self):
        return len(self.data)

    def __getitem__(self,index):
        item = self.data[index]
        return item

In [17]:
# test

tokenizer = get_kogpt2_tokenizer()
ckpt_path = '/opt/ml/data/요약대회/checkpoint/kogpt2-abstractive.pth'
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

eval_datas = EvalAbstrativeDataset(tokenizer=tokenizer,device=device,data_path='/opt/ml/data/요약대회/test_summary.json')

checkpoint = torch.load(ckpt_path,map_location=torch.device(device))
model = AbstractiveKoGPT2()
model.to(device)
model.load_state_dict(checkpoint['model_state_dict'])

model.eval()

my_summaries=[]
# for data in tqdm(eval_datas[:3337]):
#     input_ids = data['input']
#     sample_output = model.generate(input_ids=input_ids,max_length=1024)
#     summary = tokenizer.decode(sample_output[0].tolist()[len(input_ids[0]):-1])
#     summary = summary.replace('</s>','').replace('<pad>','')
#     my_summaries.append(summary)

for data in tqdm(eval_datas[3337:]):
    input_ids = data['input']
    max_toc = max(input_ids[0])
    min_toc = min(input_ids[0])
    if max_toc<50000:
        sample_output = model.generate(input_ids=input_ids,max_length=1024)
        summary = tokenizer.decode(sample_output[0].tolist()[len(input_ids[0]):-1])
        summary = summary.replace('</s>','').replace('<pad>','')
        my_summaries.append(summary)
    else:
        summary = tokenizer.decode(input_ids[0])
        summary = summary.replace('<s>','').replace('</s>','').replace('<pad>','')
        my_summaries.append(summary)
    

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


cuda
cuda


100%|██████████| 4640/4640 [00:07<00:00, 646.64it/s]
100%|██████████| 1303/1303 [32:39<00:00,  1.50s/it]


In [18]:
# 배열 list작성
with open('my_summaries_kg_base.json', 'w', encoding="UTF-8") as file:
    json.dump(my_summaries, file, indent='\t', ensure_ascii=False)

In [23]:
with open('my_summaries_kg_base.json', 'r', encoding="UTF-8") as file:
    data = json.load(file)

In [25]:
len(data)

4640

In [26]:
# 제출 형식 담긴 파일
with open('test_summary.json', 'r', encoding="UTF-8") as file:
    test_file = json.load(file)

In [32]:
for test,summary in zip(test_file,data):
    test['summary'] = summary

In [36]:
# 제출용 파일
with open('submit_kg_base.json', 'w', encoding="UTF-8") as file:
    json.dump(test_file, file, indent='\t', ensure_ascii=False)

In [35]:
len(summary)

238