In [2]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification, BertModel
from transformers import Trainer, TrainingArguments
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from kobert_tokenizer import KoBERTTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_metric

In [24]:
# Tokenizer and sequence-classification model are loaded from the same
# checkpoint so the vocabulary matches the weights.
mrpc_checkpoint = "bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(mrpc_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(mrpc_checkpoint)

In [25]:
# Sentence pair that will be encoded together and scored by the model:
# a longer original sentence and a shorter candidate restatement of it.
input_sentence = "She angered me with her inappropriate comments, rumor-spreading, and disrespectfulness at the formal dinner table"
target_sequence = "She made me angry when she was rude at dinner"

In [26]:
# Encode the two sentences as one paired input and return PyTorch tensors
# (return_tensors='pt') so the result can be passed straight to the model.
tokens = tokenizer(input_sentence, target_sequence, return_tensors='pt')

In [27]:
# Inspect the encoding: input_ids, token_type_ids and attention_mask.
tokens

{'input_ids': tensor([[  101,  1153, 22296,  1143,  1114,  1123, 17073,  7640,   117, 24206,
           118,  9243,   117,  1105,  4267,  1116,  4894, 26426, 21047,  1120,
          1103,  4698,  4014,  1952,   102,  1153,  1189,  1143,  4259,  1165,
          1131,  1108, 14708,  1120,  4014,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [28]:
# Forward pass on the encoded pair; displays the whole output object
# (no labels are passed, so loss is None and only logits are populated).
model(**tokens)

SequenceClassifierOutput(loss=None, logits=tensor([[0.1998, 1.0848]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [31]:
# Run the encoded pair through the model and keep only the raw class scores.
outputs = model(**tokens)
logits = outputs.logits
logits

tensor([[0.1998, 1.0848]], grad_fn=<AddmmBackward0>)

In [35]:
# Normalise the logits into class probabilities along the last axis.
probs = torch.softmax(logits, dim=-1)
# Plain Python list of probabilities for the single example in the batch.
result = probs.tolist()[0]
# Display the torch.softmax result next to the functional-API equivalent.
probs, F.softmax(logits, dim=-1)

(tensor([[0.2921, 0.7079]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.2921, 0.7079]], grad_fn=<SoftmaxBackward0>))

In [36]:
# Take every second file of the movie_reviews corpus and collect, for each
# one, its raw text and its first listed category.
fileids = movie_reviews.fileids()
reviews = []
categories = []
for fileid in fileids[::2]:
    reviews.append(movie_reviews.raw(fileid))
    categories.append(movie_reviews.categories(fileid)[0])

In [37]:
# Number of sampled reviews (every second file of the corpus).
len(reviews)

1000

In [38]:
# Map the string categories onto integer class ids.
label_dict = {'neg': 0, 'pos': 1}
encoded_labels = [label_dict[category] for category in categories]
y = np.array(encoded_labels)
# Peek at the first ten labels.
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [39]:
# Hold out 30% of the reviews for evaluation; the fixed random_state keeps
# the split reproducible across runs.
split = train_test_split(reviews, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split
len(X_train), len(X_test)

(700, 300)

In [40]:
# Swap in a sentiment checkpoint; tokenizer and model come from the same name.
sst2_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(sst2_checkpoint)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(sst2_checkpoint).to(device)

In [41]:
# Inference batch configuration.
batch_size = 10
# Accumulator for the predicted class ids, filled by the inference loop.
y_pred = []
# Ceiling division: a trailing partial batch still counts as a batch, so no
# test example is dropped when len(y_test) is not a multiple of batch_size.
num_batch = (len(y_test) + batch_size - 1) // batch_size

In [42]:
# Batched inference over the whole test set.
# y_pred is rebuilt here so that re-running this cell does not append a second
# copy of the predictions to the existing list.
y_pred = []
model.eval()  # make sure dropout is disabled during inference
# No gradients are needed for prediction; skipping the autograd graph saves
# memory and time on every batch.
with torch.no_grad():
    # Step through X_test directly so a final, shorter batch is still scored.
    for start in range(0, len(X_test), batch_size):
        batch_texts = X_test[start:start + batch_size]
        inputs = tokenizer(batch_texts, truncation=True, padding=True, return_tensors='pt')
        inputs = inputs.to(device)
        logits = model(**inputs).logits
        # argmax over the logits picks the same class as argmax over softmax,
        # because softmax preserves the ordering of the scores.
        y_pred += logits.argmax(dim=-1).cpu().tolist()

torch.cuda.empty_cache()

In [46]:
# Elementwise comparison of true labels and predictions (True = correct).
y_test == np.array(y_pred)

array([False,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False, False,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True, False,
        True,  True, False, False,  True, False,  True, False,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True,

In [44]:
# Accuracy: fraction of test labels matched by the predictions.
y_pred_arr = np.asarray(y_pred)
# Fail loudly if the prediction list and the labels got out of sync (for
# example when predictions were accumulated twice or a batch was dropped).
assert y_pred_arr.shape == y_test.shape, (
    f"expected {y_test.shape[0]} predictions, got {y_pred_arr.shape[0]}"
)
score = (y_test == y_pred_arr).mean()
score

0.8066666666666666