In [179]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"

In [180]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from adamp import AdamP
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW

##GPU 사용 시
device = torch.device("cuda:0")

In [195]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [196]:
torch.__version__

'1.9.0+cu111'

In [197]:
# 학습용 데이터셋 불러오기
import pandas as pd
# 판다스로 훈련셋과 테스트셋 데이터 로드
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

print(train.shape)
print(test.shape)

(45654, 3)
(9131, 2)


In [198]:
train = train.iloc[:,1:]
test = test.iloc[:,1:]

In [192]:
from konlpy.tag import Okt

In [193]:
# 형태소 분석기(Okt) 불러오기 
okt=Okt() 

In [223]:
# 조사, 어미, 구두점 제거
def func(text):
    clean = []
    for word in okt.pos(text, stem=True): #어간 추출
        if word[1] not in ['Josa', 'Eomi', 'Punctuation']: #조사, 어미, 구두점 제외 
            clean.append(word[0])
    
    
    return " ".join(clean) 

#train['title'] = train['title'].apply(lambda x : func(x))
test['title'] = test['title'].apply(lambda x : func(x))

In [None]:
#bertmodel, vocab = get_pytorch_kobert_model()

In [199]:
# Train / Test set 분리
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train, test_size=0.2, random_state=42)
print("train shape is:", len(train))
print("valid shape is:", len(valid))

train shape is: 36523
valid shape is: 9131


In [200]:
from transformers import ElectraModel, ElectraTokenizer

#model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tok = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")

In [203]:
class koelectradataset(Dataset):  
    def __init__(self, dataset,max_len,bert_tokenizer):
        
        self.tokenizer = bert_tokenizer
        self.dataset = dataset
  
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, :2].values
        text = row[0]
        y = row[1]

        inputs = self.tokenizer(
            text, 
            return_tensors='pt',
            truncation=True,
            max_length=max_len,
            pad_to_max_length=True,
            add_special_tokens=True
            )
    
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y

In [204]:
max_len = 64
train_dataset = koelectradataset(train,max_len,tok)
test_dataset = koelectradataset(valid,max_len,tok)

In [205]:
model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-v3-discriminator",num_labels  =7).to(device)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [206]:
epochs = 5
batch_size = 128
learning_rate = 1e-5

In [209]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# 옵티마이저 선언
optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)

In [210]:

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [211]:
losses = []
accuracies = []
model.train()
for i in range(epochs):
    print('#'*30,i+1,'epoch start','#'*30)
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predicted = torch.argmax(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct.cpu().numpy() / total)

    losses.append(total_loss)
    accuracies.append(correct.cpu().numpy() / total)
    print("Train Loss:", sum(losses) / len(losses), "Train Accuracy:", sum(accuracies) / len(accuracies))
    
    model.eval()

    test_correct = 0
    test_total = 0

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        predicted = torch.argmax(y_pred, 1)
        test_correct += (predicted == y_batch).sum()
        test_total += len(y_batch)

    print("valid  Accuracy:", test_correct.cpu().numpy() / test_total)
    print()
    print()
    

############################## 1 epoch start ##############################


  0%|          | 0/286 [00:00<?, ?it/s]



Batch Loss: 177.07592499256134 Accuracy: 0.36375
Batch Loss: 284.4376451969147 Accuracy: 0.5587109375
Train Loss: 339.235979616642 Train Accuracy: 0.6453741477972784


  0%|          | 0/72 [00:00<?, ?it/s]

valid  Accuracy: 0.8679224619428321


############################## 2 epoch start ##############################


  0%|          | 0/286 [00:00<?, ?it/s]

Batch Loss: 49.173006534576416 Accuracy: 0.869921875
Batch Loss: 93.21230611205101 Accuracy: 0.8713671875
Train Loss: 233.59542471170425 Train Accuracy: 0.7595898474933603


  0%|          | 0/72 [00:00<?, ?it/s]

valid  Accuracy: 0.8774504435439711


############################## 3 epoch start ##############################


  0%|          | 0/286 [00:00<?, ?it/s]

Batch Loss: 37.18137355148792 Accuracy: 0.886953125
Batch Loss: 73.28861501812935 Accuracy: 0.888984375
Train Loss: 192.48561105132103 Train Accuracy: 0.8004088747729741


  0%|          | 0/72 [00:00<?, ?it/s]

valid  Accuracy: 0.8636512977768043


############################## 4 epoch start ##############################


  0%|          | 0/286 [00:00<?, ?it/s]

Batch Loss: 37.954682528972626 Accuracy: 0.881640625
Batch Loss: 74.79807116091251 Accuracy: 0.8821484375
Train Loss: 171.0097978375852 Train Accuracy: 0.8206267283629494


  0%|          | 0/72 [00:00<?, ?it/s]

valid  Accuracy: 0.882488226919286


############################## 5 epoch start ##############################


  0%|          | 0/286 [00:00<?, ?it/s]

Batch Loss: 32.332585245370865 Accuracy: 0.897578125
Batch Loss: 66.58066669106483 Accuracy: 0.89359375
Train Loss: 155.78937041461467 Train Accuracy: 0.8352380691618979


  0%|          | 0/72 [00:00<?, ?it/s]

valid  Accuracy: 0.8836929142481655




In [224]:
test['pred'] = 0
test_dataset = koelectradataset(test,max_len,tok)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [168]:
test

Unnamed: 0,title,pred
0,유튜브 내달 2일까지 크리에이터 지원 공간 운영,0
1,어버이날 맑다가 흐려져…남부지방 옅은 황사,0
2,내년부터 국가RD 평가 때 논문건수는 반영 않는다,0
3,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것,0
4,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간,0
...,...,...
9126,인천 오후 3시35분 대설주의보…눈 3.1cm 쌓여,0
9127,노래방에서 지인 성추행 외교부 사무관 불구속 입건종합,0
9128,40년 전 부마항쟁 부산 시위 사진 2점 최초 공개,0
9129,게시판 아리랑TV 아프리카개발은행 총회 개회식 생중계,0


In [225]:
# 평가모드로 변경
model.eval()

pred = []
for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    predicted = torch.argmax(y_pred, 1)
    #test_correct += (predicted == y_batch).sum()
    #test_total += len(y_batch)
    pred.extend(predicted.cpu().numpy())

  0%|          | 0/143 [00:00<?, ?it/s]



In [226]:
len(pred)

9131

In [None]:
# Setting parameters
max_len = 64 # 해당 길이를 초과하는 단어에 대해선 bert가 학습하지 않음
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

In [None]:
train.head()

In [71]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes = 7, # softmax 사용 <- binary일 경우는 2
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
        self.dr_rate = dr_rate
                 
        self.classifier = nn.Linear(hidden_size , num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        
        bert_output = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        pooler = bert_output['pooler_output']
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)
      
model = BERTClassifier(model, dr_rate=0.7).to(device)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# 옵티마이저 선언
optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)
loss_fn = nn.CrossEntropyLoss() # softmax용 Loss Function 정하기 <- binary classification도 해당 loss function 사용 가능

In [None]:
# data_train = BERTDataset(train, 0, 1, tok, max_len, True, False)
# data_test = BERTDataset(valid, 0, 1, tok, max_len, True, False)

# # pytorch용 DataLoader 사용
# train_dataloader = torch.utils.data.DataLoader(data_train, batch_size=batch_size, num_workers=5)
# test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=5)


In [None]:

# 학습 평가 지표인 accuracy 계산 -> 얼마나 타겟값을 많이 맞추었는가
def calc_accuracy(X,Y):
    max_vals, max_indices = torch.max(X, 1)
    train_acc = (max_indices == Y).sum().data.cpu().numpy()/max_indices.size()[0]
    return train_acc
  

In [None]:
len(best_models)

In [None]:
import time
import datetime
# 시간 표시 함수
def format_time(elapsed):

    # 반올림
    elapsed_rounded = int(round((elapsed)))
    
    # hh:mm:ss으로 형태 변경
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
preds = []
#시작 시간 설정
t0 = time.time()


for idx,Best_Model in enumerate(best_models):
    print('#'*10,idx+1,": 번째 모델 예측 진행",'#'*10)
    
    model = Best_Model
    model.eval()
    pred = []
    for step in range(len(test)):
        if step % 1000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step,len(test), elapsed))
        test_sentence = test.title[step]
        test_label = 0


        unseen_test = pd.DataFrame([[test_sentence, test_label]], columns = [['title', 'topic_idx']])
        #unseen_values = unseen_test.values
        test_set = BERTDataset(unseen_test, 0, 1, tok, max_len, True, False)
        test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

        for batch_id, items in enumerate(test_input):
            token_ids = items['token_ids'].to(device)
            segment_ids = items['segment_ids'].to(device)
            valid_length= items['valid_length']
            #label = items['label'].to(device)
            out = model(token_ids, valid_length, segment_ids)
            pred.append(int(torch.argmax(out).cpu().numpy()))
    preds.append(pred)

    

In [None]:
df = pd.DataFrame(preds).T

In [None]:
df= df.mode(axis=1)[0]

In [None]:
sub = pd.read_csv('sample_submission.csv')

In [None]:
sub['topic_idx'] = df
sub['topic_idx'] = sub['topic_idx'].apply(lambda x : int(x))
#sub.to_csv('hyup_전처리 x drop_out:0.7.csv',index=False)

In [None]:
import time
import datetime
# 테스트 데이터 예측

#시작 시간 설정
t0 = time.time()

# 평가모드로 변경
model.eval()

pred = []
for step in range(len(test)):
    if step % 500 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step,len(test), elapsed))
    test_sentence = test.title[step]
    test_label = 0


    unseen_test = pd.DataFrame([[test_sentence, test_label]], columns = [['title', 'topic_idx']])
    #unseen_values = unseen_test.values
    test_set = BERTDataset(unseen_test, 0, 1, tok, max_len, True, False)
    test_input = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    for batch_id, items in enumerate(test_input):
        token_ids = items['token_ids'].to(device)
        segment_ids = items['segment_ids'].to(device)
        valid_length= items['valid_length']
        #label = items['label'].to(device)
        out = model(token_ids, valid_length, segment_ids)
        pred.append(int(torch.argmax(out).cpu().numpy()))

In [None]:
pred[0]

In [None]:
df

In [227]:
sub = pd.read_csv('sample_submission.csv')

In [228]:
sub['topic_idx'] = pred
#sub['topic_idx'] = sub['topic_idx'].apply(lambda x : int(x))
#sub.to_csv('hyup_전처리 x drop_out:0.7.csv',index=False)

In [229]:
sol =  pd.read_csv('solution_sample.csv')
#sol2 = pd.read_csv('solution.csv')

In [230]:
sub2 = pred[0:len(sol)]

In [231]:
sol

Unnamed: 0,index,topic_idx
0,45654,3
1,45655,3
2,45656,2
3,45657,2
4,45658,3
...,...,...
4560,50214,1
4561,50215,2
4562,50216,4
4563,50217,5


In [232]:
from sklearn.metrics import accuracy_score
accuracy_score(sol.topic_idx, sub2)

0.8414019715224534