In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model
import gluonnlp as nlp
from tqdm import tqdm
import numpy as np
import pandas as pd

In [5]:
# Run on a CUDA GPU when PyTorch can see one; otherwise fall back to the CPU,
# which was the previous hard-coded choice, so CPU-only machines behave as before.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fetch the KoBERT encoder together with its vocabulary (the helper reports
# which cached files it reuses on stdout).
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /Users/minsung/Library/CloudStorage/OneDrive-Personal/data_study/opinion_mining_through_online_comments/2_model_training/.cache/kobert_v1.zip
using cached model. /Users/minsung/Library/CloudStorage/OneDrive-Personal/data_study/opinion_mining_through_online_comments/2_model_training/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [6]:
class BERTDataset(Dataset):
    """Map-style dataset pairing BERT-transformed sentences with integer labels.

    Every record of ``dataset`` is read twice: once to transform the text found
    at ``sent_idx`` and once to convert the value found at ``label_idx`` to
    ``np.int32``. All items are materialized eagerly in the constructor.

    Args:
        dataset: Iterable of indexable records.
        sent_idx: Position of the sentence inside each record.
        label_idx: Position of the label inside each record.
        bert_tokenizer: Tokenizer handed to ``BERTSentenceTransform``.
        max_len: Forwarded as ``max_seq_length``.
        pad: Forwarded as ``pad``.
        pair: Forwarded as ``pair``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform the sentence of every record.
        encoded = []
        for record in dataset:
            encoded.append(to_features([record[sent_idx]]))
        self.sentences = encoded

        # Second pass: collect the labels as 32-bit integers.
        targets = []
        for record in dataset:
            targets.append(np.int32(record[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed features of item ``i`` with its label appended."""
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)
    
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder.

    The encoder's pooled output is optionally passed through dropout and then
    through one linear layer that produces ``num_classes`` logits.

    Args:
        bert: Encoder module. It is called with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a two-item result whose second item is the pooled output.
        hidden_size: Feature width of the pooled output.
        num_classes: Number of logits produced per example.
        dr_rate: Dropout probability for the pooled output. ``None`` or ``0``
            disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions.

        Args:
            token_ids: Tensor whose shape, dtype and device seed the mask.
            valid_length: Iterable with one length per row of ``token_ids``.

        Returns:
            A float tensor shaped like ``token_ids``: 1.0 where a position is
            below the row's valid length, 0.0 elsewhere.
        """
        attention_mask = torch.zeros_like(token_ids)
        for row, length in enumerate(valid_length):
            attention_mask[row][:length] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return classification logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row count of real (non-padding) tokens.
            segment_ids: Segment ids; cast to ``long`` before being passed as
                ``token_type_ids``.

        Returns:
            The output of the linear classifier applied to the (optionally
            dropped-out) pooled encoder output.
        """
        # The mask comes back as float and, being derived from `token_ids` via
        # zeros_like, already lives on the same device.
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids=token_ids,
                              token_type_ids=segment_ids.long(),
                              attention_mask=attention_mask)

        # Start from the pooled output so `out` is always bound; previously it
        # was only assigned inside the dropout branch, which made the default
        # dr_rate=None fail with UnboundLocalError.
        out = pooler
        if self.dr_rate:
            out = self.dropout(out)
        return self.classifier(out)


In [8]:
max_len = 64     # forwarded to the sentence transform as max_seq_length
batch_size = 64  # examples per DataLoader batch

# The same file is loaded twice: as a DataFrame (used later to print each row
# next to its prediction) and as a TSVDataset (fed to the model).
test_data = pd.read_csv("ratings_test.txt", sep='\t')

# Keep fields 1 and 2 of every line and discard the first line of the file
# (presumably the header row -- confirm against the data file).
dataset_test = nlp.data.TSVDataset("ratings_test.txt", field_indices=[1, 2], num_discard_samples=1)

tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# NOTE(review): no fine-tuned weights are loaded anywhere in this notebook, so
# the linear head keeps its random initialization -- confirm whether a
# checkpoint should be restored here before trusting the predictions.
model = BERTClassifier(bertmodel, dr_rate=0.5).to(device)
# Switch to evaluation mode so Dropout(p=0.5) is disabled while predicting;
# in the default training mode the outputs would be randomly perturbed.
model.eval()

# Within the two kept fields, index 0 is the sentence and index 1 the label.
data_test = BERTDataset(dataset_test, sent_idx=0, label_idx=1, bert_tokenizer=tok,
                        max_len=max_len, pad=True, pair=False)
test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)

predict_indices = []

using cached model. /Users/minsung/Library/CloudStorage/OneDrive-Personal/data_study/opinion_mining_through_online_comments/2_model_training/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


In [9]:
# Predict a class index for every test example.
# eval() disables dropout and no_grad() skips autograd bookkeeping; neither is
# needed for inference and both would otherwise cost time and memory.
model.eval()
batch_predictions = []
with torch.no_grad():
    for token_ids, valid_length, segment_ids, _label in tqdm(test_dataloader):
        token_ids = token_ids.long().to(device)
        segment_ids = segment_ids.long().to(device)
        out = model(token_ids, valid_length, segment_ids)
        _, max_indices = torch.max(out, 1)
        # Collect every batch; assigning `predict_indices = max_indices` kept
        # only the final batch and made the report below fail with IndexError.
        batch_predictions.append(max_indices.cpu())

if batch_predictions:
    predict_indices = torch.cat(batch_predictions)
else:
    # An empty loader yields no batches, and torch.cat rejects an empty list.
    predict_indices = torch.empty(0, dtype=torch.long)

# The DataFrame and the model input were parsed from the same file by two
# different readers; refuse to pair rows with predictions if they disagree.
if len(predict_indices) != len(test_data):
    raise ValueError(
        f"got {len(predict_indices)} predictions for {len(test_data)} rows in test_data")

# Pair rows and predictions by position, not by index label. The f-string
# also tolerates a missing document (parsed as NaN by pandas), on which
# `row.document + " "` would raise TypeError.
for position, row in enumerate(test_data.itertuples(index=False)):
    print(f"{row.document} {row.label} predict: {int(predict_indices[position])}")

  5%|▌         | 43/782 [12:52<3:45:00, 18.27s/it]