In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
import gluonnlp as nlp
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
# GPU 확인
n_devices = torch.cuda.device_count()
print(n_devices)

for i in range(n_devices):
    print(torch.cuda.get_device_name(i))

1
NVIDIA GeForce RTX 3090


In [3]:
device = torch.device("cuda")
bertmodel, vocab = get_pytorch_kobert_model()

using cached model
using cached model


### Model Compile

In [4]:
class BERTDataset(Dataset):
    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        transform = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        self.sentences = [transform(i[sent_idx]) for i in dataset.values]
        self.labels = [np.int32(i[label_idx]) for i in dataset.values]

    def __getitem__(self, i):
        return (self.sentences[i] + (self.labels[i], ))

    def __len__(self):
        return (len(self.labels))

In [5]:
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask.float().to(token_ids.device))
        if self.dr_rate:
            out = self.dropout(pooler)
        return self.classifier(out)

In [6]:
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)
model = BERTClassifier(bertmodel, dr_rate=0.5).to('cuda')
model.load_state_dict(torch.load('test01.pt'))

using cached model


<All keys matched successfully>

### Data Load & Fitting

In [7]:
import glob

In [8]:
max_len = 64
batch_size = 64

In [18]:
file_list = glob.glob('./test_data/in/*.csv')


for file in file_list:
  out_path = './test_data/out'
  out_file_name = file.split('\\')[-1].split('.')[0] + '_pred.xlsx'
  test_data = pd.read_csv(file)
  data_test = BERTDataset(test_data, 4, 0, tok, max_len, True, False)
  test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)

  predict_indices = []
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm(test_dataloader)):
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)
      valid_length = valid_length
      label = label.long().to(device)
      out = model(token_ids, valid_length, segment_ids)
      _, max_indices = torch.max(out, 1)
      predict_indices += max_indices

  test_data['pred_label'] = predict_indices
  test_data['pred_label'] = test_data['pred_label'].astype('int')

  # out_path + out_file_name = './out/23123132_pred.xlsx'
  test_data.to_csv(out_path + out_file_name, index=False)

TypeError: 'float' object is not subscriptable