In [1]:
# Debugging aid for the following PyTorch failure:
#   RuntimeError: CUDA error: device-side assert triggered
#   CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
#   For debugging consider passing CUDA_LAUNCH_BLOCKING=1
#   Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
import os

# Set both CUDA-related environment variables in a single call:
#   CUDA_LAUNCH_BLOCKING=1  -> the setting the error message above recommends
#   CUDA_VISIBLE_DEVICES=0  -> only device index 0 is exposed to this process
os.environ.update({
    "CUDA_LAUNCH_BLOCKING": "1",
    "CUDA_VISIBLE_DEVICES": "0",
})


In [4]:
!pip3 install adamp



In [5]:
import pandas as pd
import numpy as np
import os

import transformers
from transformers import AutoTokenizer, AdamW, RobertaForSequenceClassification
from transformers import get_linear_schedule_with_warmup

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from tqdm.notebook import tqdm, tqdm_notebook

import random
import torch.backends.cudnn as cudnn

from sklearn.model_selection import StratifiedKFold

from adamp import AdamP

In [6]:
# Locations of the training and test CSV files.
train_data = '/aiffel/aiffel/dktc/add_data/result.csv'
test_data = '/aiffel/aiffel/dktc/add_data/test.csv'

# Read both files into DataFrames (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (train_data, test_data))

In [7]:
# Encode the five Korean class names in train['class'] as integer ids 0-4.
# The position of each condition in `list1` pairs it with the id at the same
# position in `choicelist1`.
list1 = [(train['class']== "갈취 대화"), 
         (train['class']== "기타 괴롭힘 대화"), 
         (train['class']== "일반 대화"),
         (train['class']== "직장 내 괴롭힘 대화"),
         (train['class']== "협박 대화")]
choicelist1 = [0,1,2,3,4]

# np.select() returns `default` for rows that match none of the conditions.
# With the implicit default of 0, an unexpected or misspelled label (or a second
# run of this cell, when the column already holds integers) would silently be
# turned into class 0. Use a sentinel instead and fail loudly.
encoded_labels = np.select(list1, choicelist1, default=-1)
unknown_rows = encoded_labels == -1
if unknown_rows.any():
    raise ValueError(
        "Unrecognised values in train['class']: "
        f"{sorted(train.loc[unknown_rows, 'class'].astype(str).unique())}"
    )
train['class'] = encoded_labels

# Keep only the columns used downstream; the Dataset classes read them by
# position (column 0 = text, column 1 = label).
train=train[['conversation','class']]
test=test[['conversation']]

In [8]:
class TRAINDataset(Dataset):
    """Dataset over a labelled DataFrame that yields tokenized text plus its label.

    Column 0 of the frame is read as the text and column 1 as the label. Access
    is positional, so the column names themselves are not used here.
    """

    def __init__(self, data, max_length=100):
        """Store the frame and load the klue/roberta-large tokenizer.

        Args:
            data: DataFrame whose first two columns are (text, label).
            max_length: number of tokens every sample is truncated/padded to.
                Defaults to 100, the value that was previously hard-coded.
        """
        self.dataset = data
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

        # Shows the frame this dataset wraps (pandas abbreviates long frames).
        print(self.dataset)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask, label) for the row at position `idx`."""
        row = self.dataset.iloc[idx, 0:2].values
        text, y = row[0], row[1]

        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`:
        # together with `truncation=True` every sample comes back with exactly
        # `max_length` tokens, so the default collate function can stack them.
        inputs = self.tokenizer(
            text,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            add_special_tokens=True,
            max_length=self.max_length,
        )

        # Explicit int64 so the dtype does not depend on the platform's default
        # NumPy integer width.
        input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

        return input_ids, attention_mask, y

In [9]:
class TESTDataset(Dataset):
    """Dataset over an unlabelled DataFrame that yields tokenized text only.

    Column 0 of the frame is read as the text (positional access).
    """

    def __init__(self, data, max_length=100):
        """Store the frame and load the klue/roberta-large tokenizer.

        Args:
            data: DataFrame whose first column holds the text.
            max_length: number of tokens every sample is truncated/padded to.
                Defaults to 100, the value that was previously hard-coded.
        """
        self.dataset = data
        self.max_length = max_length
        self.tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

        # Shows the frame this dataset wraps (pandas abbreviates long frames).
        print(self.dataset)

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return (input_ids, attention_mask) for the row at position `idx`."""
        text = self.dataset.iloc[idx, 0]

        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`:
        # together with `truncation=True` every sample comes back with exactly
        # `max_length` tokens, so the default collate function can stack them.
        inputs = self.tokenizer(
            text,
            truncation=True,
            return_token_type_ids=False,
            padding="max_length",
            add_special_tokens=True,
            max_length=self.max_length,
        )

        # Explicit int64 so the dtype does not depend on the platform's default
        # NumPy integer width.
        input_ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

        return input_ids, attention_mask

In [10]:
def calc_accuracy(X,Y):
    """Return the fraction of rows of `X` whose arg-max along dim 1 equals `Y`.

    Args:
        X: 2-D tensor of per-class scores; the prediction for each row is the
            index of its maximum along dim 1.
        Y: tensor of target class indices, one per row of `X`.

    Returns:
        Number of matching rows divided by the number of rows, as a NumPy float.
    """
    predicted = torch.max(X, dim=1).indices
    n_correct = predicted.eq(Y).sum().detach().cpu().numpy()
    n_samples = predicted.shape[0]
    return n_correct / n_samples

In [11]:
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# notebook's `.to(device)` calls do not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
# Hyperparameters
batch_size = 8  # samples per batch for every DataLoader in this notebook
epochs = 5      # passes over the training split for each fold

In [13]:
# Model training and validation
def training(train_dataset,val_dataset, fold):
    """Fine-tune klue/roberta-large on one fold and checkpoint its best epoch.

    Args:
        train_dataset: DataFrame with (text, label) columns used for optimisation.
        val_dataset: DataFrame with (text, label) columns scored after every epoch.
        fold: fold index; used only to build the checkpoint file name.

    Returns:
        The highest validation accuracy reached (mean of the per-batch
        accuracies of that epoch). The model from that epoch is the one left
        on disk at '/aiffel/aiffel/dktc/add_data/model<fold>.pt'.
    """
    # Below any real accuracy, so the first epoch always writes a checkpoint.
    best_acc = -1.0

    model = RobertaForSequenceClassification.from_pretrained("klue/roberta-large", num_labels=5).to(device)

    dataset_train = TRAINDataset(train_dataset)
    dataset_val = TRAINDataset(val_dataset)

    train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

    optimizer = AdamP(model.parameters(), lr=1e-5, betas=(0.9, 0.999), weight_decay=1e-2)

    # Scheduler: stepped once per batch, so it spans every optimisation step of
    # the run; no warm-up steps are used.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

    for e in range(epochs):
        train_acc = 0.0
        valid_acc = 0.0

        model.train()
        for batch_id, (token_ids, attention_masks, label) in tqdm(enumerate(train_loader), total=len(train_loader)):
            optimizer.zero_grad()
            token_ids = token_ids.to(device)
            attention_masks = attention_masks.to(device)
            label = label.to(device)
            out = model(token_ids, attention_masks)[0]
            loss = F.cross_entropy(out, label)
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_acc += calc_accuracy(out, label)

        print("epoch {} train acc {}".format(e+1, train_acc / (batch_id+1)))

        model.eval()
        # No gradients are needed for scoring; without no_grad() every forward
        # pass would build an autograd graph and hold on to GPU memory.
        with torch.no_grad():
            for batch_id, (token_ids, attention_masks, label) in tqdm(enumerate(valid_loader), total=len(valid_loader)):
                token_ids = token_ids.to(device)
                attention_masks = attention_masks.to(device)
                label = label.to(device)
                out = model(token_ids, attention_masks)[0]
                valid_acc += calc_accuracy(out, label)
        valid_acc = valid_acc / (batch_id+1)
        print("epoch {} valid acc {}".format(e+1, valid_acc))

        # Overwrite the fold's checkpoint only when this epoch beats the best so
        # far, so a worse later epoch cannot replace a better earlier one.
        if valid_acc > best_acc:
            best_acc = valid_acc
            torch.save(model, '/aiffel/aiffel/dktc/add_data/model'+str(fold)+'.pt')

    return best_acc

In [14]:
# Cross-validation
def main():
    """Fix the random seeds, build 5 stratified folds of `train`, and train one model per fold."""
    seed = 2021  # fixed seed value for reproducibility

    # cuDNN flags
    torch.backends.cudnn.benchmark = False  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore

    # Seed every random number source used in the notebook.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore

    # StratifiedKFold: a KFold variant for label sets with an imbalanced
    # class distribution.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
    texts, labels = train.iloc[:, 0:1], train.iloc[:, 1]
    kfold = [
        (train.iloc[train_idx, :], train.iloc[val_idx, :])
        for train_idx, val_idx in splitter.split(texts, labels)
    ]

    for fold, fold_frames in enumerate(kfold):
        train_datasets, valid_datasets = fold_frames
        print(f'fold{fold} 학습중...')
        training(train_dataset=train_datasets,val_dataset=valid_datasets,fold=fold)

In [None]:
main() # start training

fold0 학습중...


Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifi

                                           conversation  class
0     지금 너 스스로를 죽여달라고 애원하는 것인가?\n 아닙니다. 죄송합니다.\n 죽을 ...      4
1     길동경찰서입니다.\n9시 40분 마트에 폭발물을 설치할거다.\n네?\n똑바로 들어 ...      4
2     너 되게 귀여운거 알지? 나보다 작은 남자는 첨봤어.\n그만해. 니들 놀리는거 재미...      1
5     나 이틀뒤에 가나다 음식점 예약좀 해줘. 저녁7시로.\n가나다 음식점이요.?\n응....      3
6     35번 손님 아이스커피 두잔나왔습니다\n아이스커피? \n네 맛있게드세요\n저기요 아...      1
...                                                 ...    ...
4945  한달살기로 가보고 싶은 지역이 있어?\n 우리나라 중에서 말하는거야?\n 국내, 해...      2
4946  그런 스트레스는 키키 재밌는 드라마나 영화나 예능이 최고지!\n 난 요즘 갯마을 차...      2
4947  나 저번에 소개팅했던 남자한테 연락이 왔어\n 그래? 뭐라고 왔니?\n 궁금하다 저...      2
4948  우리나라 자국산 전투기 만드는거 알아?\n 전투기를 왜 만드는거야?\n 나라를 지키...      2
4949  그래도 출퇴근이라도 편하면 좋지 키키\n 뭐... 그게 나쁜 건 아니지...\n 회...      2

[3960 rows x 2 columns]
                                           conversation  class
3     어이 거기\n예??\n너 말이야 너. 이리 오라고\n무슨 일.\n너 옷 좋아보인다?...      0
4     저기요 혹시 날이 너무 뜨겁잖아요? 저희 회사에서 이 선크림 파는데 한 번 손등에 ...      0
14    김사원\n대리님 왜 그러세요?\n이거 오늘까

  0%|          | 0/495 [00:00<?, ?it/s]



epoch 1 train acc 0.8409090909090909


  0%|          | 0/124 [00:00<?, ?it/s]

epoch 1 valid acc 0.8971774193548387


  0%|          | 0/495 [00:00<?, ?it/s]

In [None]:
# Prediction
def inference(model, dataset_test):
    """Run `model` over `dataset_test` and collect softmax probabilities.

    Args:
        model: model called as `model(token_ids, attention_masks)`; element 0
            of its output is passed to softmax over dim 1.
        dataset_test: DataFrame handed to `TESTDataset`.

    Returns:
        A list with one NumPy array of softmax outputs per test sample, in
        dataset order (the loader does not shuffle).
    """
    loader = DataLoader(TESTDataset(dataset_test), batch_size=batch_size, shuffle=False)
    output_pred = []

    model.eval()
    with torch.no_grad():
        for token_ids, attention_masks in tqdm(loader, total=len(loader)):
            ids_on_device = token_ids.long().to(device)
            masks_on_device = attention_masks.long().to(device)
            scores = model(ids_on_device, masks_on_device)[0]
            probabilities = torch.nn.functional.softmax(scores, dim=1)
            # extend() appends the batch row by row.
            output_pred.extend(probabilities.detach().cpu().numpy())

    return output_pred

In [None]:
# Class name -> integer id. The ids follow the same order as the encoding that
# was applied to train['class'] before training (conditions `list1` paired with
# `choicelist1` = [0, 1, 2, 3, 4]), so id k here is the class the model was
# trained to emit as k.
label_dict = {
    class_name: class_id
    for class_id, class_name in enumerate((
        "갈취 대화",
        "기타 괴롭힘 대화",
        "일반 대화",
        "직장 내 괴롭힘 대화",
        "협박 대화",
    ))
}

In [None]:
# Produce the final predictions
def inference_main(n_folds=5):
    """Average the fold models' class probabilities on `test` and label every row.

    Loads '/aiffel/aiffel/dktc/add_data/model<i>.pt' for i in range(n_folds),
    runs `inference` with each model, averages the probabilities, and writes
    the name of the most probable class into `submission["label"]`.

    Args:
        n_folds: number of fold checkpoints to ensemble. Defaults to 5, the
            number of folds the notebook trains.

    Returns:
        The module-level `submission` DataFrame, with column "label" filled in.
    """
    global submission

    # One accumulator column per class. The width must equal the number of
    # classes the models output; any other width makes `res +=` fail to
    # broadcast.
    n_classes = len(label_dict)
    res = np.zeros((len(test), n_classes))

    for i in range(n_folds):
        print(f'fold{i} 모델 추론중...')
        # load my model
        # NOTE(review): this unpickles a complete model object, which can
        # execute arbitrary code — only load checkpoints written by this
        # notebook. `weights_only=False` is required for that on PyTorch
        # versions where weights-only loading is the default.
        model = torch.load('/aiffel/aiffel/dktc/add_data/model'+str(i)+'.pt',
                           map_location=device,
                           weights_only=False)

        pred_answer = inference(model, test)

        res += np.array(pred_answer) / n_folds

    # Decode once, after all folds have been accumulated.
    ans = np.argmax(res, axis=-1)
    id_to_name = {class_id: class_name for class_name, class_id in label_dict.items()}
    out = [id_to_name[class_id] for class_id in ans]

    # No visible cell defines `submission`; create it when it does not exist
    # yet, otherwise update the existing frame in place.
    if "submission" not in globals():
        submission = pd.DataFrame(index=test.index)
    submission["label"] = out
    return submission

In [None]:
# Run the ensemble inference over the saved fold checkpoints.
inference_main()

In [None]:
# Sanity check: tokenize a sentence pair and display the resulting encoding
# as the cell output.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")
tokenizer(
    text="힛걸 진심 최고로 멋지다.",
    text_pair="힛걸 진심 최고다 그 어떤 히어로보다 멋지다",
)