## 라이브러리 로딩

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from adamp import AdamP
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW

##GPU 사용 시
device = torch.device("cuda:1")

In [5]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [6]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  
    torch.backends.cudnn.deterministic = True  
    torch.backends.cudnn.benchmark = True  
seed_everything()

In [7]:
# 판다스로 훈련셋과 테스트셋 데이터 로드
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

print(train.shape)
print(test.shape)


(45654, 3)
(9131, 2)


In [8]:
train = train.iloc[:,1:]
test = test.iloc[:,1:]

## 텍스트 데이터 전처리

In [9]:
from konlpy.tag import Okt
# 형태소 분석기(Okt) 불러오기 
okt=Okt() 

# 조사, 어미, 구두점 제거
def func(text):
    clean = []
    for word in okt.pos(text, stem=True): #어간 추출
        if word[1] not in ['Josa', 'Eomi', 'Punctuation']: #조사, 어미, 구두점 제외 
            clean.append(word[0])
    
    
    return " ".join(clean) 

train['title'] = train['title'].apply(lambda x : func(x))
test['title'] = test['title'].apply(lambda x : func(x))

## 모델링

In [10]:
from transformers import AutoTokenizer ,AutoModelForSequenceClassification

#model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tok = AutoTokenizer.from_pretrained("klue/roberta-large")
model = AutoModelForSequenceClassification.from_pretrained('klue/roberta-large',num_labels=7).cuda()

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

In [11]:
class koelectradataset(Dataset):  
    def __init__(self, dataset,max_len,bert_tokenizer):
        
        self.tokenizer = bert_tokenizer
        self.dataset = dataset
  
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, :2].values
        text = row[0]
        y = row[1]

        inputs = self.tokenizer(
            text, 
            return_tensors='pt',
            truncation=True,
            max_length=max_len,
            pad_to_max_length=True,
            add_special_tokens=True
            )
    
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y

In [12]:

# 학습 평가 지표인 accuracy 계산 -> 얼마나 타겟값을 많이 맞추었는가
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
  

In [None]:
best_models = [] # 폴드별로 가장 validation acc가 높은 모델 저장

from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

# dirty_mnist_answer에서 train_idx와 val_idx를 생성
best_models = [] # 폴드별로 가장 validation acc가 높은 모델 저장
for fold_index, (trn_idx, val_idx) in enumerate(kfold.split(train['title'],train['topic_idx']),1):
    
    # cuda cache 초기화
    torch.cuda.empty_cache()
    train_data = train.loc[trn_idx]
    valid_data = train.loc[val_idx]
    
    max_len = 32
    batch_size = 64 
    num_epochs = 10
    warmup_ratio = 0.1
    num_epochs = 5
    max_grad_norm = 1
    log_interval = 200
    learning_rate = 5e-5
    
    train_dataset = koelectradataset(train_data,max_len,tok)
    test_dataset = koelectradataset(valid_data,max_len,tok)

    model = AutoModelForSequenceClassification.from_pretrained('klue/roberta-large',num_labels=7).cuda()




    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # 옵티마이저 선언
    optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

    losses = []
    accuracies = []
    model.train()
    valid_acc_max = 0
    for i in range(num_epochs):
        print('#'*30,i+1,'epoch start','#'*30)
        total_loss = 0.0
        correct = 0
        total = 0
        batches = 0

        model.train()

        for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
            optimizer.zero_grad()
            y_batch = y_batch.cuda()
            y_pred = model(input_ids_batch.cuda(), attention_mask=attention_masks_batch.cuda())[0]
            loss = F.cross_entropy(y_pred, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            predicted = torch.argmax(y_pred, 1)
            correct += (predicted == y_batch).sum()
            total += len(y_batch)

            batches += 1
            if batches % 100 == 0:
                print("Batch Loss:", total_loss, "Accuracy:", correct.cpu().numpy() / total)

        losses.append(total_loss)
        accuracies.append(correct.cpu().numpy() / total)
        print("Train Loss:", sum(losses) / len(losses), "Train Accuracy:", sum(accuracies) / len(accuracies))

        model.eval()

        test_correct = 0
        test_total = 0

        for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
            y_batch = y_batch.cuda()
            y_pred = model(input_ids_batch.cuda(), attention_mask=attention_masks_batch.cuda())[0]
            predicted = torch.argmax(y_pred, 1)
            test_correct += (predicted == y_batch).sum()
            test_total += len(y_batch)
        test_acc = test_correct.cpu().numpy() / test_total
        print("valid  Accuracy:",test_acc )
        print()
        print()
                # 모델 저장
        if valid_acc_max < test_acc:
            valid_acc_max = test_acc
            best_model = model


    # 폴드별로 가장 좋은 모델 저장
    best_models.append(best_model)



Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

############################## 1 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 64.02468879520893 Accuracy: 0.7959375
Batch Loss: 108.77728089690208 Accuracy: 0.828828125
Batch Loss: 149.7535834312439 Accuracy: 0.8419270833333333
Batch Loss: 189.14376704394817 Accuracy: 0.848515625
Train Loss: 219.60538905858994 Train Accuracy: 0.8508673938756736


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.8655539492706006


############################## 2 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 30.866082824766636 Accuracy: 0.89953125
Batch Loss: 61.843560107052326 Accuracy: 0.899140625
Batch Loss: 92.41992459446192 Accuracy: 0.8990104166666667
Batch Loss: 123.21880756318569 Accuracy: 0.8984765625
Train Loss: 183.7776500955224 Train Accuracy: 0.8740636088842161


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.8740964647128401


############################## 3 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 19.56812298670411 Accuracy: 0.93671875
Batch Loss: 42.760465770959854 Accuracy: 0.9303125
Batch Loss: 68.19061571359634 Accuracy: 0.9236979166666667
Batch Loss: 90.77722203731537 Accuracy: 0.923359375
Train Loss: 159.0399935543537 Train Accuracy: 0.8899220221667324


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.8685766855040085


############################## 4 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 20.84789026528597 Accuracy: 0.93375
Batch Loss: 46.123411886394024 Accuracy: 0.927890625
Batch Loss: 70.66186838597059 Accuracy: 0.924375
Batch Loss: 95.43429301679134 Accuracy: 0.9237109375
Train Loss: 147.71613958850503 Train Accuracy: 0.8983851360231305


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.8636483112104087


############################## 5 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 16.849375542253256 Accuracy: 0.9459375
Batch Loss: 34.13037687726319 Accuracy: 0.94640625
Batch Loss: 52.39595231972635 Accuracy: 0.9459895833333334
Batch Loss: 68.91681771166623 Accuracy: 0.9462890625
Train Loss: 137.95186520554125 Train Accuracy: 0.9059534761466684


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.8372979366539625




Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

############################## 1 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 196.15365481376648 Accuracy: 0.1603125
Batch Loss: 391.5454224348068 Accuracy: 0.161953125
Batch Loss: 586.2408883571625 Accuracy: 0.16302083333333334
Batch Loss: 781.2093926668167 Accuracy: 0.162265625
Train Loss: 929.4725106954575 Train Accuracy: 0.15912077802602181


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.1612564068865817


############################## 2 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 194.75771057605743 Accuracy: 0.1653125
Batch Loss: 389.025763630867 Accuracy: 0.166875
Batch Loss: 583.8905665874481 Accuracy: 0.16270833333333334
Batch Loss: 778.3510016202927 Accuracy: 0.1609375
Train Loss: 927.7241632938385 Train Accuracy: 0.16002431331318176


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.1671047443816533


############################## 3 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 194.25445699691772 Accuracy: 0.17015625
Batch Loss: 388.54089307785034 Accuracy: 0.16078125
Batch Loss: 582.9396605491638 Accuracy: 0.15786458333333334
Batch Loss: 777.335245013237 Accuracy: 0.1600390625
Train Loss: 926.8247445027033 Train Accuracy: 0.16000788539886976


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.1671047443816533


############################## 4 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 194.39227545261383 Accuracy: 0.158125
Batch Loss: 389.0204463005066 Accuracy: 0.155
Batch Loss: 583.2901132106781 Accuracy: 0.1553125
Batch Loss: 777.4728757143021 Accuracy: 0.158046875
Train Loss: 926.3190554082394 Train Accuracy: 0.16000788539886976


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.1671047443816533


############################## 5 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 194.18914270401 Accuracy: 0.160625
Batch Loss: 388.53126060962677 Accuracy: 0.158359375
Batch Loss: 582.598895072937 Accuracy: 0.16010416666666666
Batch Loss: 776.790965795517 Accuracy: 0.162265625
Train Loss: 925.9321939945221 Train Accuracy: 0.16001445656459456


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.15185963990011828




Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifi

############################## 1 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 61.675462767481804 Accuracy: 0.803125
Batch Loss: 106.65061494708061 Accuracy: 0.830625
Batch Loss: 147.9585588723421 Accuracy: 0.8406770833333334
Batch Loss: 188.59805412590504 Accuracy: 0.8475390625
Train Loss: 217.57528260350227 Train Accuracy: 0.8517545012485215


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.8664739124720725


############################## 2 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

Batch Loss: 30.314347498118877 Accuracy: 0.9021875
Batch Loss: 60.11511919647455 Accuracy: 0.900546875
Batch Loss: 225.60364639014006 Accuracy: 0.7050520833333334
Batch Loss: 420.5855315849185 Accuracy: 0.5671875
Train Loss: 393.5549966804683 Train Accuracy: 0.6754501248521487


  0%|          | 0/238 [00:00<?, ?it/s]

valid  Accuracy: 0.15185963990011828


############################## 3 epoch start ##############################


  0%|          | 0/476 [00:00<?, ?it/s]

In [None]:
for idx,model in enumerate(best_models):
    torch.save(model.state_dict(), 'model/koelectra_'+str(idx))

## 추론

In [None]:
from transformers import ElectraForSequenceClassification, ElectraTokenizer

#model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tok = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels = 7).cuda()

In [None]:
best_model = []
model = model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels = 7).cuda()
for i in range(3):
    model.load_state_dict(torch.load('model/koelectra_'+str(i)))
    best_model.append(model)
    

In [13]:
len(best_models)

5

In [14]:
max_len = 32
batch_size = 64
num_epochs = 10
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

test['pred'] = 0
test_dataset = koelectradataset(test,max_len,tok)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [15]:
# 평가모드로 변경
preds = []
for idx,Best_Model in enumerate(best_models):
    print('#'*10,idx+1,": 번째 모델 예측 진행",'#'*10)
    model = Best_Model
    model.eval()
    
    pred = []
    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.cuda()
        y_pred = model(input_ids_batch.cuda(), attention_mask=attention_masks_batch.cuda())[0]
        #predicted = torch.argmax(y_pred, 1)
        #test_correct += (predicted == y_batch).sum()
        #test_total += len(y_batch)
        pred.extend(y_pred.cpu().detach().numpy().tolist())
    preds.append(pred)

########## 1 : 번째 모델 예측 진행 ##########


  0%|          | 0/9131 [00:00<?, ?it/s]

########## 2 : 번째 모델 예측 진행 ##########


  0%|          | 0/9131 [00:00<?, ?it/s]

########## 3 : 번째 모델 예측 진행 ##########


  0%|          | 0/9131 [00:00<?, ?it/s]

########## 4 : 번째 모델 예측 진행 ##########


  0%|          | 0/9131 [00:00<?, ?it/s]

########## 5 : 번째 모델 예측 진행 ##########


  0%|          | 0/9131 [00:00<?, ?it/s]

In [16]:
new_pred = []
for i in range(len(preds[0])):
    a = np.array(preds[0][i])
    b = np.array(preds[1][i])
    c = np.array(preds[2][i])
    d = np.array(preds[3][i])
    e = np.array(preds[4][i])

    
    new_pred.append(a+b+c+d+e)
    
len(new_pred)

9131

In [17]:
pred = np.argmax(new_pred,1)

In [92]:
pred

array([0, 3, 2, ..., 2, 2, 2])

In [18]:
sub = pd.read_csv('sample_submission.csv')
sub.topic_idx = pred