In [19]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm

from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

from transformers import AdamW
from adamp import AdamP
from transformers.optimization import get_cosine_schedule_with_warmup
from transformers import AutoTokenizer, ElectraForSequenceClassification, AdamW

##GPU 사용 시
device = torch.device("cuda:0")

In [20]:
import tensorflow as tf
import torch

from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification

import pandas as pd
import numpy as np
import random
import time
import datetime

In [21]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
  


In [22]:
torch.__version__

'1.9.0+cu111'

In [23]:
# 학습용 데이터셋 불러오기
import pandas as pd
# 판다스로 훈련셋과 테스트셋 데이터 로드
train = pd.read_csv('train_data.csv')
test = pd.read_csv('test_data.csv')

print(train.shape)
print(test.shape)

(45654, 3)
(9131, 2)


In [24]:
train = train.iloc[:,1:]
test = test.iloc[:,1:]

In [25]:
from konlpy.tag import Okt

In [8]:
# 형태소 분석기(Okt) 불러오기 
okt=Okt() 

In [9]:
# 조사, 어미, 구두점 제거
def func(text):
    clean = []
    for word in okt.pos(text, stem=True): #어간 추출
        if word[1] not in ['Josa', 'Eomi', 'Punctuation']: #조사, 어미, 구두점 제외 
            clean.append(word[0])
    
    
    return " ".join(clean) 

train['title'] = train['title'].apply(lambda x : func(x))
test['title'] = test['title'].apply(lambda x : func(x))

In [None]:
#bertmodel, vocab = get_pytorch_kobert_model()

In [26]:
# Train / Test set 분리
from sklearn.model_selection import train_test_split
train, valid = train_test_split(train, test_size=0.2, random_state=42)
print("train shape is:", len(train))
print("valid shape is:", len(valid))

train shape is: 36523
valid shape is: 9131


In [31]:
from transformers import ElectraModel, ElectraTokenizer

#model = ElectraModel.from_pretrained("monologg/koelectra-base-v3-discriminator")
tok = AutoTokenizer.from_pretrained("xlm-roberta-large")



In [32]:
class koelectradataset(Dataset):  
    def __init__(self, dataset,max_len,bert_tokenizer):
        
        self.tokenizer = bert_tokenizer
        self.dataset = dataset
  
    def __len__(self):
        return len(self.dataset)
  
    def __getitem__(self, idx):
        row = self.dataset.iloc[idx, :2].values
        text = row[0]
        y = row[1]

        inputs = self.tokenizer(
            text, 
            return_tensors='pt',
            truncation=True,
            max_length=max_len,
            pad_to_max_length=True,
            add_special_tokens=True
            )
    
        input_ids = inputs['input_ids'][0]
        attention_mask = inputs['attention_mask'][0]

        return input_ids, attention_mask, y

In [33]:
max_len = 64
train_dataset = koelectradataset(train,max_len,tok)
test_dataset = koelectradataset(valid,max_len,tok)

In [34]:
from transformers import AutoConfig, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-large',num_labels=7).to(device)
#model = AutoModelForMaskedLM.from_pretrained("xlm-roberta-large")

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.out

In [35]:
epochs = 10
batch_size = 64
learning_rate = 1e-5

In [36]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

# 옵티마이저 선언
optimizer = AdamP(optimizer_grouped_parameters, lr=learning_rate)

In [37]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [None]:
losses = []
accuracies = []
model.train()
for i in range(epochs):
    print('#'*30,i+1,'epoch start','#'*30)
    total_loss = 0.0
    correct = 0
    total = 0
    batches = 0

    model.train()

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(train_loader):
        optimizer.zero_grad()
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        loss = F.cross_entropy(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        predicted = torch.argmax(y_pred, 1)
        correct += (predicted == y_batch).sum()
        total += len(y_batch)

        batches += 1
        if batches % 100 == 0:
            print("Batch Loss:", total_loss, "Accuracy:", correct.cpu().numpy() / total)

    losses.append(total_loss)
    accuracies.append(correct.cpu().numpy() / total)
    print("Train Loss:", sum(losses) / len(losses), "Train Accuracy:", sum(accuracies) / len(accuracies))
    
    model.eval()

    test_correct = 0
    test_total = 0

    for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
        y_batch = y_batch.to(device)
        y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
        predicted = torch.argmax(y_pred, 1)
        test_correct += (predicted == y_batch).sum()
        test_total += len(y_batch)

    print("valid  Accuracy:", test_correct.cpu().numpy() / test_total)
    print()
    print()
    

############################## 1 epoch start ##############################


  0%|          | 0/571 [00:00<?, ?it/s]



Batch Loss: 129.9117430150509 Accuracy: 0.51890625
Batch Loss: 178.63578420877457 Accuracy: 0.679296875


In [45]:
test['pred'] = 0
test_dataset = koelectradataset(test,max_len,tok)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [46]:
test

Unnamed: 0,title,pred
0,유튜브 내달 2일까지 크리에이터 지원 공간 운영,0
1,어버이날 맑다가 흐려져…남부지방 옅은 황사,0
2,내년부터 국가RD 평가 때 논문건수는 반영 않는다,0
3,김명자 신임 과총 회장 원로와 젊은 과학자 지혜 모을 것,0
4,회색인간 작가 김동식 양심고백 등 새 소설집 2권 출간,0
...,...,...
9126,인천 오후 3시35분 대설주의보…눈 3.1cm 쌓여,0
9127,노래방에서 지인 성추행 외교부 사무관 불구속 입건종합,0
9128,40년 전 부마항쟁 부산 시위 사진 2점 최초 공개,0
9129,게시판 아리랑TV 아프리카개발은행 총회 개회식 생중계,0


In [47]:
# 평가모드로 변경
model.eval()

pred = []
for input_ids_batch, attention_masks_batch, y_batch in tqdm(test_loader):
    y_batch = y_batch.to(device)
    y_pred = model(input_ids_batch.to(device), attention_mask=attention_masks_batch.to(device))[0]
    predicted = torch.argmax(y_pred, 1)
    #test_correct += (predicted == y_batch).sum()
    #test_total += len(y_batch)
    pred.extend(predicted.cpu().numpy())

  0%|          | 0/143 [00:00<?, ?it/s]



In [48]:
len(pred)

9131

In [None]:
sub = pd.read_csv('sample_submission.csv')
sub['topic_idx'] = df
sub['topic_idx'] = sub['topic_idx'].apply(lambda x : int(x))
sub.to_csv('hyup_전처리 x drop_out:0.7.csv',index=False)