In [None]:
import os
# Restrict this process to the GPU with physical index 2.
# NOTE(review): presumably placed ahead of the torch import so that it takes effect
# before CUDA is initialised — confirm if cells are reordered.
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from adamp import AdamP
from transformers.optimization import get_cosine_schedule_with_warmup

## Use the first visible GPU when CUDA is available; otherwise fall back to the CPU
## so the notebook still runs (slowly) on a machine without a usable GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
torch.__version__

'1.9.0+cu111'

In [None]:
# Load the training and test datasets
import pandas as pd
# Read both CSV files into pandas DataFrames
train, test = (pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv'))

# Show the (rows, columns) shape of each frame
for frame in (train, test):
    print(frame.shape)

(45654, 3)
(9131, 2)


In [None]:
# Keep only the text and label columns (this drops the leading index column read from
# the CSV). Selecting by name makes the cell idempotent: the positional `iloc[:, 1:]`
# removed one more column every time the cell was re-run. The column order
# (title, topic_idx) matters because BERTDataset later addresses them by position 0 and 1.
train = train[['title', 'topic_idx']]

In [None]:
from konlpy.tag import Okt

In [None]:
# Instantiate the Okt morphological analyzer (shared by the preprocessing function)
okt=Okt() 

In [None]:
# Remove particles, verb endings and punctuation
def func(text):
    """Normalise a sentence with the Okt analyzer.

    Each token is stemmed (``stem=True``) and tokens tagged as 'Josa' (particle),
    'Eomi' (ending) or 'Punctuation' are dropped.

    Args:
        text: the raw sentence to normalise.

    Returns:
        The remaining token surface forms joined by single spaces.
    """
    excluded_tags = ('Josa', 'Eomi', 'Punctuation')
    return " ".join(
        morph for morph, tag in okt.pos(text, stem=True)  # stem=True: reduce to stems
        if tag not in excluded_tags
    )

train['title'] = train['title'].apply(func)
# The test titles must go through the same normalisation as the training titles;
# otherwise the models are evaluated on a text distribution they never saw in training.
test['title'] = test['title'].apply(func)

In [None]:
# Load the KoBERT model together with its vocabulary (the helper returns both as a pair)
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


In [None]:
# Use the default BERT tokenizer
tokenizer = get_tokenizer()
# Wrap it in gluonnlp's BERTSPTokenizer using the KoBERT vocabulary, without lower-casing
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


In [None]:
class BERTDataset(Dataset):
    """Torch dataset that pre-encodes every sentence of a DataFrame for BERT.

    Args:
        dataset: DataFrame whose rows hold a sentence and a label.
        sent_idx: positional index of the sentence column.
        label_idx: positional index of the label column.
        bert_tokenizer: tokenizer handed to ``nlp.data.BERTSentenceTransform``.
        max_len: forwarded as ``max_seq_length``.
        pad: forwarded as ``pad``.
        pair: forwarded as ``pair``.

    Each item is a dict of ``torch.long`` tensors with the keys
    'token_ids', 'valid_length', 'segment_ids' and 'label'.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_input = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        rows = dataset.to_numpy()
        # The transform output is stored component-wise: [0], [1] and [2] go to
        # token_ids, valid_length and segment_ids respectively.
        encoded = [to_bert_input([row[sent_idx]]) for row in rows]
        self.token_ids = [enc[0] for enc in encoded]
        self.valid_length = [enc[1] for enc in encoded]
        self.segment_ids = [enc[2] for enc in encoded]
        self.labels = [np.int32(row[label_idx]) for row in rows]

    def __getitem__(self, i):
        def as_long(value):
            return torch.tensor(value, dtype=torch.long)

        return {
            'token_ids': as_long(self.token_ids[i]),
            'valid_length': as_long(self.valid_length[i]),
            'segment_ids': as_long(self.segment_ids[i]),
            'label': as_long(self.labels[i]),
        }

    def __len__(self):
        return len(self.labels)

In [None]:
# Setting parameters
max_len = 64 # maximum sequence length passed to the BERT input transform; per the original note, BERT does not learn from tokens beyond this length
batch_size = 64  # samples per DataLoader batch
warmup_ratio = 0.1  # fraction of the total scheduler steps used for warmup
num_epochs = 5  # epoch count used when sizing the learning-rate schedule
max_grad_norm = 1  # gradient-clipping threshold
log_interval = 200  # print training progress every this many batches
learning_rate = 5e-5  # optimizer learning rate

In [None]:
train.head()

Unnamed: 0,title,topic_idx
0,인천 → 핀란드 항공기 결항 휴가 철 여행객 분통,4
1,실리콘밸리 넘어서다 구글 15조원 들이다 美 전역 거점 화,4
2,이란 외무 긴장 완화 해결 책 미국 경제 전쟁 멈추다 것,4
3,NYT 클린턴 측근 韓 기업 특수 관계 조명 공과 사 맞다 물리다 종합,4
4,시진핑 트럼프 중미 무역 협상 조속 타결 희망,4


In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` and expected to return a mapping that contains
            ``'pooler_output'``.
        hidden_size: width of the pooled output fed to the linear layer.
        num_classes: number of output logits (use 2 for binary classification).
        dr_rate: dropout probability applied to the pooled output; ``None`` or 0
            disables dropout.
        params: accepted for interface compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 7,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``: 1 for the first
        ``valid_length[i]`` positions of row ``i`` and 0 elsewhere.

        ``valid_length`` may be a tensor on any device or a plain sequence of ints.
        """
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).view(-1, 1)
        # Broadcasting compares every position index with its row's valid length,
        # replacing one indexing operation per row with a single tensor op.
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Compute class logits for a batch of encoded sentences."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        bert_output = self.bert(input_ids = token_ids,
                                token_type_ids = segment_ids.long(),
                                attention_mask = attention_mask.float().to(token_ids.device))
        pooler = bert_output['pooler_output']
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left unassigned in that case and forward() raised.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)
      
# Build the classifier around the loaded KoBERT encoder (dropout 0.7) and move it to `device`
model = BERTClassifier(bertmodel, dr_rate=0.7).to(device)

In [None]:
# Split the parameters into two groups: biases and LayerNorm weights are exempt
# from weight decay, everything else is decayed with 0.01.
no_decay = ['bias', 'LayerNorm.weight']
named_params = list(model.named_parameters())
decay_params = [p for n, p in named_params if not any(nd in n for nd in no_decay)]
no_decay_params = [p for n, p in named_params if any(nd in n for nd in no_decay)]
optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': 0.01},
    {'params': no_decay_params, 'weight_decay': 0.0},
]

# Declare the optimizer and the loss
optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)
# Cross-entropy over the class logits (also usable for binary classification)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# data_train = BERTDataset(train, 0, 1, tok, max_len, True, False)
# data_test = BERTDataset(valid, 0, 1, tok, max_len, True, False)

# # pytorch용 DataLoader 사용
# train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
# test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)


In [None]:

# Accuracy metric: the share of samples whose predicted class equals the target
def calc_accuracy(X,Y):
    """Return the fraction of rows in ``X`` whose arg-max class matches ``Y``.

    Args:
        X: 2-D tensor of per-class scores, one row per sample.
        Y: 1-D tensor of target class indices.
    """
    _, predicted = torch.max(X, 1)
    n_correct = (predicted == Y).sum().data.cpu().numpy()
    n_samples = predicted.size()[0]
    return n_correct / n_samples
  

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation: each fold preserves the topic_idx class balance.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Epochs trained per fold. The learning-rate schedule below is sized from this same
# value so that warmup and cosine decay span exactly the steps that are executed.
epochs_per_fold = 3

best_models = []  # per fold: the model restored to its best-validation-accuracy weights
for fold_index, (trn_idx, val_idx) in enumerate(kfold.split(train['title'], train['topic_idx']), 1):

    # Release cached GPU memory left over from the previous fold
    torch.cuda.empty_cache()
    # kfold.split yields positional indices, so rows are selected by position
    train_data = train.iloc[trn_idx]
    valid_data = train.iloc[val_idx]

    data_train = BERTDataset(train_data, 0, 1, tok, max_len, True, False)
    data_test = BERTDataset(valid_data, 0, 1, tok, max_len, True, False)

    # PyTorch DataLoaders; training batches are reshuffled every epoch
    train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5, shuffle=True)
    test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)

    # Every fold starts from a freshly loaded pretrained encoder
    bertmodel, _ = get_pytorch_kobert_model()
    model = BERTClassifier(bertmodel, dr_rate=0.7).to(device)

    # Biases and LayerNorm weights are exempt from weight decay
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # Optimizer and loss (cross-entropy over the class logits)
    optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    # Cosine schedule with warmup over the steps this fold actually runs
    t_total = len(train_dataloader) * epochs_per_fold
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)
    # Loss scaling keeps fp16 gradients produced under autocast from underflowing
    scaler = torch.cuda.amp.GradScaler()

    valid_acc_max = -1.0  # below any reachable accuracy, so the first epoch is always captured
    best_state = None

    for e in range(epochs_per_fold):
        # Re-enable dropout every epoch: the validation pass below switches to eval mode
        model.train()
        train_acc = 0.0
        test_acc = 0.0
        for batch_id, items in enumerate(tqdm(train_dataloader)):
            optimizer.zero_grad()
            token_ids = items['token_ids'].to(device)
            segment_ids = items['segment_ids'].to(device)
            valid_length= items['valid_length']
            label = items['label'].to(device)
            with torch.cuda.amp.autocast():
                out = model(token_ids, valid_length, segment_ids)
                loss = loss_fn(out, label)
            scaler.scale(loss).backward()
            # Unscale first so clipping is applied to the true gradient magnitudes
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            if batch_id % log_interval == 0:
                print("epoch {} batch id {} loss {} train acc {}".format(e+1, batch_id+1, loss.item(), train_acc / (batch_id+1)))
        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

        # Validation pass: dropout off, no gradient bookkeeping
        model.eval()
        with torch.no_grad():
            for batch_id, items in enumerate(tqdm(test_dataloader)):
                token_ids = items['token_ids'].to(device)
                segment_ids = items['segment_ids'].to(device)
                valid_length= items['valid_length']
                label = items['label'].to(device)
                out = model(token_ids, valid_length, segment_ids)
                test_acc += calc_accuracy(out, label)
        print("epoch {} test acc {}".format(e+1, test_acc / (batch_id+1)))

        # Snapshot the weights when validation accuracy improves. A copy is required:
        # keeping a reference to `model` would silently follow later epochs.
        if valid_acc_max < test_acc:
            valid_acc_max = test_acc
            best_state = {name: tensor.detach().cpu().clone()
                          for name, tensor in model.state_dict().items()}

    # Restore the best epoch's weights and keep that model for this fold
    if best_state is not None:
        model.load_state_dict(best_state)
    best_model = model
    best_models.append(best_model)

using cached model
using cached model


0it [00:00, ?it/s]

epoch 1 batch id 1 loss 2.0061492919921875 train acc 0.1875
epoch 1 batch id 201 loss 0.5755143165588379 train acc 0.5356809701492538
epoch 1 batch id 401 loss 0.05438694357872009 train acc 0.6998519326683291
epoch 1 train acc 0.7538036339754816


0it [00:00, ?it/s]

epoch 1 test acc 0.8706852740282973


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.5779359340667725 train acc 0.84375
epoch 2 batch id 201 loss 0.31804943084716797 train acc 0.857431592039801
epoch 2 batch id 401 loss 0.01813340187072754 train acc 0.8801044264339152
epoch 2 train acc 0.892595227670753


0it [00:00, ?it/s]

epoch 2 test acc 0.8806869003089933


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.25641071796417236 train acc 0.90625
epoch 3 batch id 201 loss 0.19398295879364014 train acc 0.9064054726368159
epoch 3 batch id 401 loss 0.00566469132900238 train acc 0.9252649625935162
epoch 3 train acc 0.9353522227426384


0it [00:00, ?it/s]

epoch 3 test acc 0.882214079525126
using cached model
using cached model


0it [00:00, ?it/s]

epoch 1 batch id 1 loss 1.98712158203125 train acc 0.140625
epoch 1 batch id 201 loss 0.6489734649658203 train acc 0.5366138059701493
epoch 1 batch id 401 loss 0.08132278919219971 train acc 0.6982543640897756
epoch 1 train acc 0.7532977131104143


0it [00:00, ?it/s]

epoch 1 test acc 0.8698670515530981


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.39500248432159424 train acc 0.90625
epoch 2 batch id 201 loss 0.5004416704177856 train acc 0.8585976368159204
epoch 2 batch id 401 loss 0.0609494149684906 train acc 0.880143391521197
epoch 2 train acc 0.8941142630228486


0it [00:00, ?it/s]

epoch 2 test acc 0.8641852333712798


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.3072355091571808 train acc 0.90625
epoch 3 batch id 201 loss 0.37536120414733887 train acc 0.9055503731343284
epoch 3 batch id 401 loss 0.01277652382850647 train acc 0.9233167082294265
epoch 3 train acc 0.9349417586445649


0it [00:00, ?it/s]

epoch 3 test acc 0.8759859326719791
using cached model
using cached model


0it [00:00, ?it/s]

epoch 1 batch id 1 loss 1.9434280395507812 train acc 0.171875
epoch 1 batch id 201 loss 0.4452838897705078 train acc 0.5318718905472637
epoch 1 batch id 401 loss 0.24816960096359253 train acc 0.694942331670823
epoch 1 train acc 0.750506557243514


0it [00:00, ?it/s]

epoch 1 test acc 0.8773530248820947


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.2268916368484497 train acc 0.9375
epoch 2 batch id 201 loss 0.15333521366119385 train acc 0.8575093283582089
epoch 2 batch id 401 loss 0.009874433279037476 train acc 0.8802992518703242
epoch 2 train acc 0.8943331772084878


0it [00:00, ?it/s]

epoch 2 test acc 0.8827070458611156


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.18833497166633606 train acc 0.9375
epoch 3 batch id 201 loss 0.1568985879421234 train acc 0.9078047263681592
epoch 3 batch id 401 loss 0.005997747182846069 train acc 0.9271352867830424
epoch 3 train acc 0.9368979859894921


0it [00:00, ?it/s]

epoch 3 test acc 0.8826511424621889
using cached model
using cached model


0it [00:00, ?it/s]

epoch 1 batch id 1 loss 2.0920639038085938 train acc 0.15625
epoch 1 batch id 201 loss 0.5848350524902344 train acc 0.5513059701492538
epoch 1 batch id 401 loss 0.18772101402282715 train acc 0.7067877182044888
epoch 1 train acc 0.759358581436077


0it [00:00, ?it/s]

epoch 1 test acc 0.8783364164904862


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.2806488275527954 train acc 0.9375
epoch 2 batch id 201 loss 0.31155669689178467 train acc 0.8559546019900498
epoch 2 batch id 401 loss 0.012970715761184692 train acc 0.8820137157107232
epoch 2 train acc 0.8955505691768827


0it [00:00, ?it/s]

epoch 2 test acc 0.8745121157911856


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.2784668803215027 train acc 0.953125
epoch 3 batch id 201 loss 0.16624504327774048 train acc 0.9090485074626866
epoch 3 batch id 401 loss 0.004858300089836121 train acc 0.9270963216957606
epoch 3 train acc 0.937732914511465


0it [00:00, ?it/s]

epoch 3 test acc 0.8718338347698813
using cached model
using cached model


0it [00:00, ?it/s]

epoch 1 batch id 1 loss 1.9492645263671875 train acc 0.171875
epoch 1 batch id 201 loss 0.5073766708374023 train acc 0.4986007462686567
epoch 1 batch id 401 loss 0.18247419595718384 train acc 0.6780704488778054
epoch 1 train acc 0.7392483282916733


0it [00:00, ?it/s]

epoch 1 test acc 0.8760354229104229


0it [00:00, ?it/s]

epoch 2 batch id 1 loss 0.2128760814666748 train acc 0.9375
epoch 2 batch id 201 loss 0.3520752191543579 train acc 0.8566542288557214
epoch 2 batch id 401 loss 0.035223305225372314 train acc 0.8799096009975063
epoch 2 train acc 0.8940057315714058


0it [00:00, ?it/s]

epoch 2 test acc 0.8714462620712621


0it [00:00, ?it/s]

epoch 3 batch id 1 loss 0.1029348373413086 train acc 0.9375
epoch 3 batch id 201 loss 0.20599442720413208 train acc 0.9063277363184079
epoch 3 batch id 401 loss 0.04179313778877258 train acc 0.9261221945137157
epoch 3 train acc 0.9375273642732049


0it [00:00, ?it/s]

epoch 3 test acc 0.8791468947718948


In [None]:
len(best_models)

5

In [None]:
import time
import datetime
# Helper for displaying elapsed time
def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: duration in seconds; rounded to the nearest whole second.

    Returns:
        ``str(datetime.timedelta)`` of the rounded duration.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
preds = []
# Start time for the elapsed-time reports
t0 = time.time()

# Encode the test titles once and reuse them for every fold model. BERTDataset
# requires a label column, so a dummy label 0 is supplied.
unseen_test = pd.DataFrame({'title': test['title'].to_numpy(), 'topic_idx': 0})
test_set = BERTDataset(unseen_test, 0, 1, tok, max_len, True, False)
test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

for idx,Best_Model in enumerate(best_models):
    print('#'*10,idx+1,": 번째 모델 예측 진행",'#'*10)

    model = Best_Model
    model.eval()
    pred = []
    next_report = 1000  # report roughly every 1000 predicted sentences
    # Inference only: skip autograd bookkeeping to save memory and time
    with torch.no_grad():
        for items in test_input:
            token_ids = items['token_ids'].to(device)
            segment_ids = items['segment_ids'].to(device)
            valid_length= items['valid_length']
            out = model(token_ids, valid_length, segment_ids)
            # One predicted class index per row of the batch
            pred.extend(out.argmax(dim=1).cpu().tolist())
            if next_report <= len(pred) < len(test):
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(len(pred),len(test), elapsed))
                next_report = (len(pred) // 1000 + 1) * 1000
    preds.append(pred)

    

In [None]:
# One row per test sentence, one column per fold model
df = pd.DataFrame(preds).transpose()

In [None]:
# Majority vote across the models: take the first column of the row-wise mode
df = df.mode(axis=1).iloc[:, 0]

In [None]:
# Load the submission template whose 'topic_idx' column is filled in below
sub = pd.read_csv('sample_submission.csv')

In [None]:
# Store the voted labels in the submission frame, then convert each value to a plain int
sub['topic_idx'] = df
sub['topic_idx'] = sub['topic_idx'].map(int)
#sub.to_csv('hyup_전처리 x drop_out:0.7.csv',index=False)

In [None]:
import time
import datetime
# Predict the test set with the current `model`.
# Note: the submission cells use `df`, not the `pred` list produced here.

# Start time for the elapsed-time reports
t0 = time.time()

# Switch to evaluation mode (disables dropout)
model.eval()

# Encode all test titles once; BERTDataset requires a label column, so a dummy 0 is used.
unseen_test = pd.DataFrame({'title': test['title'].to_numpy(), 'topic_idx': 0})
test_set = BERTDataset(unseen_test, 0, 1, tok, max_len, True, False)
test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=False)

pred = []
next_report = 500  # report roughly every 500 predicted sentences
# Inference only: skip autograd bookkeeping to save memory and time
with torch.no_grad():
    for items in test_input:
        token_ids = items['token_ids'].to(device)
        segment_ids = items['segment_ids'].to(device)
        valid_length= items['valid_length']
        out = model(token_ids, valid_length, segment_ids)
        # One predicted class index per row of the batch
        pred.extend(out.argmax(dim=1).cpu().tolist())
        if next_report <= len(pred) < len(test):
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(len(pred),len(test), elapsed))
            next_report = (len(pred) // 500 + 1) * 500

  Batch   500  of  9,131.    Elapsed: 0:00:05.
  Batch 1,000  of  9,131.    Elapsed: 0:00:10.
  Batch 1,500  of  9,131.    Elapsed: 0:00:15.
  Batch 2,000  of  9,131.    Elapsed: 0:00:20.
  Batch 2,500  of  9,131.    Elapsed: 0:00:25.
  Batch 3,000  of  9,131.    Elapsed: 0:00:29.
  Batch 3,500  of  9,131.    Elapsed: 0:00:34.
  Batch 4,000  of  9,131.    Elapsed: 0:00:39.
  Batch 4,500  of  9,131.    Elapsed: 0:00:44.
  Batch 5,000  of  9,131.    Elapsed: 0:00:49.
  Batch 5,500  of  9,131.    Elapsed: 0:00:54.
  Batch 6,000  of  9,131.    Elapsed: 0:00:59.
  Batch 6,500  of  9,131.    Elapsed: 0:01:04.
  Batch 7,000  of  9,131.    Elapsed: 0:01:09.
  Batch 7,500  of  9,131.    Elapsed: 0:01:14.
  Batch 8,000  of  9,131.    Elapsed: 0:01:18.
  Batch 8,500  of  9,131.    Elapsed: 0:01:23.
  Batch 9,000  of  9,131.    Elapsed: 0:01:28.


In [None]:
df

0       0.0
1       3.0
2       2.0
3       0.0
4       3.0
       ... 
9126    3.0
9127    2.0
9128    3.0
9129    2.0
9130    2.0
Name: 0, Length: 9131, dtype: float64

In [None]:
# Reload the submission template before filling in the final predictions
sub = pd.read_csv('sample_submission.csv')

In [None]:
# Store the voted labels, convert each value to a plain int, and write the submission file
sub['topic_idx'] = df
sub['topic_idx'] = sub['topic_idx'].map(int)
sub.to_csv('hyup_전처리 x drop_out:0.7.csv', index=False)