## 라이브러리 로딩

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from adamp import AdamP
from transformers.optimization import get_cosine_schedule_with_warmup
import random

##GPU 사용 시
device = torch.device("cuda:0")

In [4]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

In [5]:
torch.__version__

'1.9.0+cu111'

In [6]:
# 학습용 데이터셋 불러오기
import pandas as pd
# 판다스로 훈련셋과 테스트셋 데이터 로드
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

print(train.shape)
print(test.shape)

(45654, 3)
(9131, 2)


In [7]:
train

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4
...,...,...,...
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,45652,답변하는 배기동 국립중앙박물관장,2


In [8]:
train = train.iloc[:,1:]
test = test.iloc[:,1:]

In [9]:
from konlpy.tag import Okt

# 형태소 분석기(Okt) 불러오기 
okt=Okt() 

# 조사, 어미, 구두점 제거
def func(text):
    clean = []
    for word in okt.pos(text, stem=True): #어간 추출
        if word[1] not in ['Josa', 'Eomi', 'Punctuation']: #조사, 어미, 구두점 제외 
            clean.append(word[0])
    
    
    return " ".join(clean) 

train['title'] = train['title'].apply(lambda x : func(x))
test['title'] = test['title'].apply(lambda x : func(x))

In [12]:
bertmodel, vocab = get_pytorch_kobert_model()

# 기본 Bert tokenizer 사용
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model
using cached model
using cached model


## 커스텀 데이터셋

In [13]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair) 

#         self.sentences = [transform([i[sent_idx]]) for i in dataset.to_numpy()]
        self.token_ids = []
        self.valid_length =[]
        self.segment_ids = []
        for i in dataset.to_numpy():
            out = transform([i[sent_idx]])
            self.token_ids.append(out[0])
            self.valid_length.append(out[1])
            self.segment_ids.append(out[2])
        self.labels = [np.int32(i[label_idx]) for i in dataset.to_numpy()]
        

    def __getitem__(self, i):
        token_ids = self.token_ids[i]
        valid_length = self.valid_length[i]
        segment_ids = self.segment_ids[i]
        return {
            'token_ids' : torch.tensor(token_ids, dtype=torch.long),
            'valid_length' : torch.tensor(valid_length, dtype=torch.long),
            'segment_ids' : torch.tensor(segment_ids, dtype=torch.long),
            'label' : torch.tensor(self.labels[i], dtype=torch.long)
        }

    def __len__(self):
        return (len(self.labels))

In [14]:
# Setting parameters
max_len = 32 # 해당 길이를 초과하는 단어에 대해선 bert가 학습하지 않음
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

## 커스텀 모델 정의


In [15]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 7, # softmax 사용 <- binary일 경우는 2
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        bert_output = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        pooler = bert_output['pooler_output']
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)
      
#model = BERTClassifier(bertmodel, dr_rate=0.7).cuda()

In [18]:

# 학습 평가 지표인 accuracy 계산 -> 얼마나 타겟값을 많이 맞추었는가
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
  

## k-fold 학습

In [20]:
best_models = [] # 폴드별로 가장 validation acc가 높은 모델 저장

from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# dirty_mnist_answer에서 train_idx와 val_idx를 생성
best_models = [] # 폴드별로 가장 validation acc가 높은 모델 저장
for fold_index, (trn_idx, val_idx) in enumerate(kfold.split(train['title'],train['topic_idx'])):
    
    # cuda cache 초기화
    torch.cuda.empty_cache()
    train_data = train.loc[trn_idx]
    valid_data = train.loc[val_idx]

    data_train = BERTDataset(train_data, 0, 1, tok, max_len, True, False)
    data_test = BERTDataset(valid_data, 0, 1, tok, max_len, True, False)

    # pytorch용 DataLoader 사용
    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)
    bertmodel, _ = get_pytorch_kobert_model()
    model = BERTClassifier(bertmodel, dr_rate=0.7).to(device)
    
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # 옵티마이저 선언
    optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss() # softmax용 Loss Function 정하기 <- binary classification도 해당 loss function 사용 가능
   
    #scheduler
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    scaler = torch.cuda.amp.GradScaler()
    valid_acc_max = 0
    # 모델 학습 시작
    model.train()
    
    for e in range(5):
        
        train_acc = 0.0
        test_acc = 0.0
        tqdm_dataset = tqdm(enumerate(train_dataloader))
        for batch_id, items in tqdm_dataset:
            optimizer.zero_grad()
            token_ids = items['token_ids'].to(device)
            segment_ids = items['segment_ids'].to(device)
            valid_length= items['valid_length']
            label = items['label'].to(device)
            with torch.cuda.amp.autocast():
                out = model(token_ids, valid_length, segment_ids)
                loss = loss_fn(out, label)
            #loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # gradient clipping
            #optimizer.step()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.data.cpu().numpy(), train_acc / (batch_id+1)))
        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))
        
        
        model.eval() # 평가 모드로 변경
        tqdm_dataset = tqdm(enumerate(test_dataloader))
        for batch_id, items in tqdm_dataset:
            token_ids = items['token_ids'].to(device)
            segment_ids = items['segment_ids'].to(device)
            valid_length= items['valid_length']
            label = items['label'].to(device)
            with torch.no_grad():
                out = model(token_ids, valid_length, segment_ids)
                test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))
        
                # 모델 저장
        if valid_acc_max < test_acc:
            valid_acc_max = test_acc
            best_model = model


    # 폴드별로 가장 좋은 모델 저장
    best_models.append(best_model)
    

using cached model
using cached model


0it [00:00, ?it/s]



epoch 1 batch id 1 loss 1.8629302978515625 train acc 0.265625
epoch 1 batch id 201 loss 0.6996049880981445 train acc 0.49300373134328357
epoch 1 batch id 401 loss 0.7996517419815063 train acc 0.6878117206982544
epoch 1 train acc 0.7144826680672269


0it [00:00, ?it/s]

epoch 1 test acc 0.8682011554621849


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.3858485221862793 train acc 0.890625
epoch 2 batch id 201 loss 0.45624542236328125 train acc 0.8577425373134329
epoch 2 batch id 401 loss 0.5289002656936646 train acc 0.8947552992518704
epoch 2 train acc 0.8972630718954249


0it [00:00, ?it/s]

epoch 2 test acc 0.8720273109243697


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.20861142873764038 train acc 0.90625
epoch 3 batch id 201 loss 0.36656737327575684 train acc 0.9071828358208955
epoch 3 batch id 401 loss 0.3397447466850281 train acc 0.934265897755611
epoch 3 train acc 0.9378939075630253


0it [00:00, ?it/s]

epoch 3 test acc 0.8733403361344537


0it [00:00, ?it/s]

epoch 4 batch id 1 loss 0.1121591329574585 train acc 0.984375
epoch 4 batch id 201 loss 0.21358102560043335 train acc 0.9469060945273632
epoch 4 batch id 401 loss 0.21704596281051636 train acc 0.9622428304239401
epoch 4 train acc 0.9654674369747899


0it [00:00, ?it/s]

epoch 4 test acc 0.8773923319327731


0it [00:00, ?it/s]

epoch 5 batch id 1 loss 0.06625470519065857 train acc 0.984375
epoch 5 batch id 201 loss 0.1298101544380188 train acc 0.9698383084577115
epoch 5 batch id 401 loss 0.08744862675666809 train acc 0.9796991895261845
epoch 5 train acc 0.9816504726890757


0it [00:00, ?it/s]

epoch 5 test acc 0.8791465336134454
using cached model
using cached model


0it [00:00, ?it/s]

epoch 1 batch id 1 loss 1.9587860107421875 train acc 0.1875
epoch 1 batch id 201 loss 0.6254019737243652 train acc 0.5329601990049752
epoch 1 batch id 401 loss 0.4601551294326782 train acc 0.706553927680798
epoch 1 train acc 0.7280797735760971


0it [00:00, ?it/s]

epoch 1 test acc 0.8653965336134454


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.3254120349884033 train acc 0.921875
epoch 2 batch id 201 loss 0.42282068729400635 train acc 0.8579757462686567
epoch 2 batch id 401 loss 0.3464636206626892 train acc 0.8931187655860349
epoch 2 train acc 0.895844275210084


0it [00:00, ?it/s]

epoch 2 test acc 0.8847452731092438


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.19351744651794434 train acc 0.953125
epoch 3 batch id 201 loss 0.21963518857955933 train acc 0.910214552238806
epoch 3 batch id 401 loss 0.22846311330795288 train acc 0.9343827930174564
epoch 3 train acc 0.9375656512605042


0it [00:00, ?it/s]

epoch 3 test acc 0.8808902310924369


0it [00:00, ?it/s]

epoch 4 batch id 1 loss 0.11253446340560913 train acc 0.984375
epoch 4 batch id 201 loss 0.09374964237213135 train acc 0.9520366915422885
epoch 4 batch id 401 loss 0.053571879863739014 train acc 0.9660614089775561
epoch 4 train acc 0.9686515231092437


0it [00:00, ?it/s]

epoch 4 test acc 0.8814154411764705


0it [00:00, ?it/s]

epoch 5 batch id 1 loss 0.09038379788398743 train acc 0.984375
epoch 5 batch id 201 loss 0.032305121421813965 train acc 0.9762126865671642
epoch 5 batch id 401 loss 0.04237523674964905 train acc 0.9831281172069826
epoch 5 train acc 0.984375


0it [00:00, ?it/s]

epoch 5 test acc 0.883844537815126
using cached model
using cached model


0it [00:00, ?it/s]

epoch 1 batch id 1 loss 2.1174163818359375 train acc 0.0625
epoch 1 batch id 201 loss 0.6084966659545898 train acc 0.5509950248756219
epoch 1 batch id 401 loss 0.96811842918396 train acc 0.7176979426433915
epoch 1 train acc 0.7392733134920635


0it [00:00, ?it/s]

epoch 1 test acc 0.8693172268907563


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.38549482822418213 train acc 0.90625
epoch 2 batch id 201 loss 0.4073885679244995 train acc 0.8570429104477612
epoch 2 batch id 401 loss 0.5986412167549133 train acc 0.8935084164588528
epoch 2 train acc 0.8960084033613446


0it [00:00, ?it/s]

epoch 2 test acc 0.8749448529411764


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.18147116899490356 train acc 0.953125
epoch 3 batch id 201 loss 0.2666902542114258 train acc 0.9085820895522388
epoch 3 batch id 401 loss 0.24250313639640808 train acc 0.9353569201995012
epoch 3 train acc 0.9383534663865546


0it [00:00, ?it/s]

epoch 3 test acc 0.8750288865546219


0it [00:00, ?it/s]

epoch 4 batch id 1 loss 0.035235315561294556 train acc 1.0
epoch 4 batch id 201 loss 0.18514370918273926 train acc 0.9497046019900498
epoch 4 batch id 401 loss 0.0731431245803833 train acc 0.965126246882793
epoch 4 train acc 0.9680278361344538


0it [00:00, ?it/s]

epoch 4 test acc 0.8750577731092436


0it [00:00, ?it/s]

epoch 5 batch id 1 loss 0.02078711986541748 train acc 1.0
epoch 5 batch id 201 loss 0.12366640567779541 train acc 0.9736473880597015
epoch 5 batch id 401 loss 0.056561797857284546 train acc 0.9818812344139651
epoch 5 train acc 0.983390231092437


0it [00:00, ?it/s]

epoch 5 test acc 0.8790519957983193


In [21]:
len(best_models)

3

## inference

In [22]:
import time
import datetime
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [23]:
preds = [[0,0,0,0,0,0,0]]*9131
#시작 시간 설정
t0 = time.time()


for idx,Best_Model in enumerate(best_models):
    print('#'*10,idx+1,": 번째 모델 예측 진행",'#'*10)
    
    model = Best_Model
    model.eval()
    pred = []
    for step in range(len(test)):
        if step % 1000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step,len(test), elapsed))
        test_sentence = test.title[step]
        test_label = 0


        unseen_test = pd.DataFrame([[test_sentence, test_label]], columns = [['title', 'topic_idx']])
        #unseen_values = unseen_test.values
        test_set = BERTDataset(unseen_test, 0, 1, tok, max_len, True, False)
        test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

        for batch_id, items in enumerate(test_input):
            token_ids = items['token_ids'].to(device)
            segment_ids = items['segment_ids'].to(device)
            valid_length= items['valid_length']
            #label = items['label'].to(device)
            out = model(token_ids, valid_length, segment_ids)
            preds[step] =  [x+y for x,y in zip(preds[step],out.cpu().tolist()[0])]
    #preds.append(pred)

    

########## 1 : 번째 모델 예측 진행 ##########
  Batch 1,000  of  9,131.    Elapsed: 0:01:25.
  Batch 2,000  of  9,131.    Elapsed: 0:02:56.
  Batch 3,000  of  9,131.    Elapsed: 0:04:23.
  Batch 4,000  of  9,131.    Elapsed: 0:04:37.
  Batch 5,000  of  9,131.    Elapsed: 0:04:47.
  Batch 6,000  of  9,131.    Elapsed: 0:04:57.
  Batch 7,000  of  9,131.    Elapsed: 0:05:08.
  Batch 8,000  of  9,131.    Elapsed: 0:05:18.
  Batch 9,000  of  9,131.    Elapsed: 0:05:28.
########## 2 : 번째 모델 예측 진행 ##########
  Batch 1,000  of  9,131.    Elapsed: 0:05:40.
  Batch 2,000  of  9,131.    Elapsed: 0:05:50.
  Batch 3,000  of  9,131.    Elapsed: 0:06:00.
  Batch 4,000  of  9,131.    Elapsed: 0:06:11.
  Batch 5,000  of  9,131.    Elapsed: 0:06:21.
  Batch 6,000  of  9,131.    Elapsed: 0:06:31.
  Batch 7,000  of  9,131.    Elapsed: 0:06:42.
  Batch 8,000  of  9,131.    Elapsed: 0:06:52.
  Batch 9,000  of  9,131.    Elapsed: 0:07:02.
########## 3 : 번째 모델 예측 진행 ##########
  Batch 1,000  of  9,131.    Elapsed: 0:

In [24]:
df = pd.DataFrame(preds).T
#df= df.mean(axis=1)

In [25]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9121,9122,9123,9124,9125,9126,9127,9128,9129,9130
0,10.667165,-3.866979,4.488541,11.860032,-3.718804,-2.755945,-4.444994,-3.75389,0.614055,-4.164204,...,16.427015,-3.262003,-4.897614,-2.985478,-6.104986,-4.418587,-5.863899,-5.362651,-1.555378,-0.616131
1,-2.521968,-6.094561,-3.29417,-4.449213,-4.559633,-6.880569,-3.786137,-5.41899,-3.187023,7.983979,...,-1.08739,-3.76072,-0.886129,-5.732688,5.409344,-5.515036,-5.305467,-5.347836,-1.777936,-2.531429
2,6.286655,2.031129,14.032648,7.899144,-0.005563,7.502763,-4.334594,2.862215,0.082223,-4.05586,...,2.513294,17.852494,18.372711,1.687749,16.000155,4.139222,16.646433,12.45272,16.883722,14.179807
3,3.363702,18.977965,-3.174287,-3.796049,19.356687,15.504914,-3.385758,18.690104,-4.388184,-5.492827,...,-4.047382,0.16745,-1.350195,19.29446,-5.450295,18.915262,-3.887768,12.494433,-0.161193,-5.725516
4,-4.91949,-3.888075,-7.066977,-4.937459,-3.478719,-4.091285,0.48604,-5.366935,16.613049,13.783195,...,-5.423872,-5.405681,-4.487484,-3.280332,-5.767435,-4.49591,0.788115,-2.974014,-6.081923,-6.096978
5,-6.348941,-3.846668,-7.934662,-5.334093,-2.971946,-4.157031,20.223135,-3.670496,-5.752652,-4.497931,...,-5.376342,-4.479762,-6.989762,-3.768748,-5.790782,-4.000992,-5.828168,-6.073047,-5.866058,-6.074246
6,-7.730973,-2.11552,1.842198,-2.378694,-3.727029,-4.712382,-1.446084,-2.423363,-3.561395,-3.139828,...,-4.517046,-0.809342,0.469748,-4.331096,0.697267,-3.80029,4.088277,-5.138861,-2.363933,6.038428


In [26]:
pred = np.argmax(preds,1)

In [27]:
sub = pd.read_csv('sample_submission.csv')
sub['topic_idx'] = pred
#sub.to_csv('3fold.csv',index = False)