## 라이브러리 로딩

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from adamp import AdamP
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW

##GPU 사용 시
device = torch.device("cuda:1")

In [6]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import random
import time
import datetime

In [7]:
torch.__version__

'1.9.0+cu111'

In [8]:
# 학습용 데이터셋 불러오기
import pandas as pd
# 판다스로 훈련셋과 테스트셋 데이터 로드
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

print(train.shape)
print(test.shape)

(45654, 3)
(9131, 2)


In [9]:
train = train.iloc[:,1:]
test = test.iloc[:,1:]

## 전처리

In [11]:
from konlpy.tag import Okt

# 형태소 분석기(Okt) 불러오기 
okt=Okt() 

# 조사, 어미, 구두점 제거
def func(text):
    clean = []
    for word in okt.pos(text, stem=True): #어간 추출
        if word[1] not in ['Josa', 'Eomi', 'Punctuation']: #조사, 어미, 구두점 제외 
            clean.append(word[0])
    
    
    return " ".join(clean) 

train['title'] = train['title'].apply(lambda x : func(x))
test['title'] = test['title'].apply(lambda x : func(x))

In [14]:
# Train / Test set 분리
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train, test_size=0.2, random_state=42)
print("train shape is:", len(train))
print("valid shape is:", len(valid))

train shape is: 36523
valid shape is: 9131


In [15]:
from transformers import AutoTokenizer ,AutoModelForSequenceClassification

#model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tok = AutoTokenizer.from_pretrained("klue/roberta-large")
model = AutoModelForSequenceClassification.from_pretrained('klue/roberta-large',num_labels=7).cuda()

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'cl

## 커스텀 데이터셋

In [16]:
class robertaadataset(Dataset):  
    def __init__(self, dataset,max_len,bert_tokenizer):
        
        self.tokenizer = bert_tokenizer
        self.dataset = dataset
  
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, :2].values
        text = row[0]
        y = row[1]

        inputs = self.tokenizer(
            text, 
            return_tensors='pt',
            truncation=True,
            max_length=max_len,
            pad_to_max_length=True,
            add_special_tokens=True
            )
    
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y

In [17]:
batch_size = 64
max_len = 32
num_epochs = 10
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

In [18]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# 옵티마이저 선언
optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)

In [19]:
train_dataset = robertaadataset(train,max_len,tok)
test_dataset = robertaadataset(valid,max_len,tok)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [20]:
losses = []
accuracies = []
model.train()
for i in range(3):
    print('#'*30,i+1,'epoch start','#'*30)
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.cuda()
        y_pred = model(input_ids_batch.cuda(), attention_mask=attention_masks_batch.cuda())[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predicted = torch.argmax(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct.cpu().numpy() / total)

    losses.append(total_loss)
    accuracies.append(correct.cpu().numpy() / total)
    print("Train Loss:", sum(losses) / len(losses), "Train Accuracy:", sum(accuracies) / len(accuracies))
    
    model.eval()

    test_correct = 0
    test_total = 0

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.cuda()
        y_pred = model(input_ids_batch.cuda(), attention_mask=attention_masks_batch.cuda())[0]
        predicted = torch.argmax(y_pred, 1)
        test_correct += (predicted == y_batch).sum()
        test_total += len(y_batch)

    print("valid  Accuracy:", test_correct.cpu().numpy() / test_total)
    print()
    print()
 
    

############################## 1 epoch start ##############################


  0%|          | 0/571 [00:00<?, ?it/s]

Batch Loss: 63.34794284403324 Accuracy: 0.7875
Batch Loss: 107.88995760679245 Accuracy: 0.8225
Batch Loss: 150.9573018103838 Accuracy: 0.8351041666666666
Batch Loss: 191.8354904204607 Accuracy: 0.8417578125
Batch Loss: 231.4056955575943 Accuracy: 0.84578125
Train Loss: 259.1287747323513 Train Accuracy: 0.8485885606330258


  0%|          | 0/143 [00:00<?, ?it/s]

valid  Accuracy: 0.8813930566203044


############################## 2 epoch start ##############################


  0%|          | 0/571 [00:00<?, ?it/s]

Batch Loss: 30.928764760494232 Accuracy: 0.89640625
Batch Loss: 60.75272077322006 Accuracy: 0.8971875
Batch Loss: 90.18083705753088 Accuracy: 0.8990625
Batch Loss: 122.78986214101315 Accuracy: 0.8965234375
Batch Loss: 154.28778317570686 Accuracy: 0.89596875
Train Loss: 218.07364953681827 Train Accuracy: 0.8722996467979082


  0%|          | 0/143 [00:00<?, ?it/s]

valid  Accuracy: 0.878874164932647


############################## 3 epoch start ##############################


  0%|          | 0/571 [00:00<?, ?it/s]

Batch Loss: 22.464916042983532 Accuracy: 0.92640625
Batch Loss: 45.52330815792084 Accuracy: 0.92453125
Batch Loss: 70.19599305838346 Accuracy: 0.92171875
Batch Loss: 94.41127549856901 Accuracy: 0.920546875
Batch Loss: 120.42761645466089 Accuracy: 0.9185625
Train Loss: 192.87014197309813 Train Accuracy: 0.8869753306135859


  0%|          | 0/143 [00:00<?, ?it/s]

valid  Accuracy: 0.8746030007666192




In [21]:
test['pred'] = 0
test_dataset = koelectradataset(test,max_len,tok)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [22]:
test

Unnamed: 0,title,pred
0,유튜브 내달 2일 까지 크리에이터 지원 공간 운영,0
1,어버이날 맑다 흐려지다 남부 지방 옅다 황사,0
2,내년 국가 RD 평가 때 논문 건수 반영 않다,0
3,김명자 신임 과총 회장 원로 젊다 과학자 지혜 모으다 것,0
4,회색 인간 작가 김 동식 양심 고백 등 새 소설 집 2 권 추다 간,0
...,...,...
9126,인천 오후 3시 35분 대설주의보 눈 3.1 cm 쌓이다,0
9127,노래방 지인 성추행 외교부 사무관 불구속 입건 종합,0
9128,40년 전 부마항쟁 부산 시위 사진 2 점 최초 공개,0
9129,게시판 아리랑 TV 아프리카 개발 은행 총회 개회 식 생중계,0


In [23]:
# 평가모드로 변경
model.eval()

pred = []
for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.cuda()
    y_pred = model(input_ids_batch.cuda(), attention_mask=attention_masks_batch.cuda())[0]
    predicted = torch.argmax(y_pred, 1)
    #test_correct += (predicted == y_batch).sum()
    #test_total += len(y_batch)
    pred.extend(predicted.cpu().numpy())

  0%|          | 0/143 [00:00<?, ?it/s]

In [24]:
len(pred)

9131

In [30]:
sub = pd.read_csv('sample_submission.csv')

In [None]:
sub['topic_idx'] = df
sub['topic_idx'] = sub['topic_idx'].apply(lambda x : int(x))
#sub.to_csv('hyup_전처리 x drop_out:0.7.csv',index=False)