In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertTokenizerFast, BertForSequenceClassification, BertModel
from transformers import Trainer, TrainingArguments
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from kobert_tokenizer import KoBERTTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import load_metric
import evaluate

In [2]:
# Load the tokenizer and the sequence-classification model from the same
# MRPC (paraphrase detection) checkpoint so their vocabularies match.
mrpc_checkpoint = "bert-base-cased-finetuned-mrpc"
tokenizer = AutoTokenizer.from_pretrained(mrpc_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(mrpc_checkpoint)

In [3]:
# Sentence pair for the paraphrase check: a long original and a shorter rewording.
input_sentence, target_sequence = (
    "She angered me with her inappropriate comments, rumor-spreading, and disrespectfulness at the formal dinner table",
    "She made me angry when she was rude at dinner",
)

In [4]:
# Encode both sentences as ONE sequence pair (not as a batch of two) and
# return PyTorch tensors.
tokens = tokenizer(
    text=input_sentence,
    text_pair=target_sequence,
    return_tensors='pt',
)

In [5]:
# Display the encoding: input_ids, token_type_ids and attention_mask.
tokens

{'input_ids': tensor([[  101,  1153, 22296,  1143,  1114,  1123, 17073,  7640,   117, 24206,
           118,  9243,   117,  1105,  4267,  1116,  4894, 26426, 21047,  1120,
          1103,  4698,  4014,  1952,   102,  1153,  1189,  1143,  4259,  1165,
          1131,  1108, 14708,  1120,  4014,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [6]:
# Run a forward pass on the encoded pair and display the raw model output.
model(**tokens)

SequenceClassifierOutput(loss=None, logits=tensor([[0.1998, 1.0848]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [7]:
# Keep only the classification scores (logits) from the forward pass.
outputs = model(**tokens)
logits = outputs.logits
logits

tensor([[0.1998, 1.0848]], grad_fn=<AddmmBackward0>)

In [8]:
# Turn the logits into class probabilities; torch.softmax and F.softmax are
# shown side by side to demonstrate that they agree.
probabilities = torch.softmax(logits, dim=-1)
result = probabilities[0].tolist()
probabilities, F.softmax(logits, dim=-1)

(tensor([[0.2921, 0.7079]], grad_fn=<SoftmaxBackward0>),
 tensor([[0.2921, 0.7079]], grad_fn=<SoftmaxBackward0>))

In [9]:
# Use every second file of the NLTK movie_reviews corpus: read its raw text
# and take the first category listed for it as the label.
fileids = movie_reviews.fileids()
sampled_fileids = fileids[::2]
reviews = [movie_reviews.raw(file_id) for file_id in sampled_fileids]
categories = [movie_reviews.categories(file_id)[0] for file_id in sampled_fileids]

In [10]:
# Number of reviews kept after taking every second file.
len(reviews)

1000

In [11]:
# Map the string categories to integer class ids (neg -> 0, pos -> 1).
label_dict = {'neg': 0, 'pos': 1}
y = np.array(list(map(label_dict.__getitem__, categories)))
y[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
# Hold out 30% of the reviews as the test set; the fixed random_state makes
# the split repeatable.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    reviews,
    y,
    test_size=0.3,
    random_state=0,
)
len(X_train_val), len(X_test)

(700, 300)

In [13]:
# Switch to a ready-made SST-2 sentiment classifier and move it to the GPU
# when one is available (CPU otherwise).
sst2_checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(sst2_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(sst2_checkpoint).to(device)

In [14]:
# Batched-inference settings: predictions are accumulated in y_pred, and
# num_batch counts only the FULL batches of batch_size test reviews.
y_pred = list()
batch_size = 10
num_batch = len(y_test) // batch_size

In [15]:
# Classify the held-out reviews with the pretrained sentiment model, one batch
# at a time.
#
# y_pred is reset here so the cell is idempotent: re-running it no longer
# appends a second copy of the predictions to the existing list.
y_pred = []
model.eval()  # make sure dropout is disabled for inference
with torch.no_grad():  # inference only: do not build an autograd graph per batch
    # Step through the data by batch_size instead of counting full batches, so
    # a final partial batch is scored too when len(X_test) is not a multiple
    # of batch_size.
    for start in range(0, len(X_test), batch_size):
        inputs = tokenizer(X_test[start:start + batch_size], truncation=True, padding=True, return_tensors='pt')
        inputs = inputs.to(device)
        logits = model(**inputs).logits
        pred = torch.softmax(logits, dim=-1)
        # Most probable class per review, moved back to the CPU as plain ints.
        results = pred.cpu().numpy().argmax(axis=1)
        y_pred += results.tolist()

torch.cuda.empty_cache()

In [16]:
# Elementwise check of the true labels against the predicted class ids.
y_test == np.array(y_pred)

array([False,  True,  True, False, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True, False,
        True, False,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False,  True,  True, False,  True,  True,  True,  True,
        True, False,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True, False, False,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True, False,  True,  True,  True, False,  True,  True, False,
        True,  True, False, False,  True, False,  True, False,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True,  True,

In [17]:
# Accuracy: number of matching predictions divided by the number of test labels.
matches = y_test == np.array(y_pred)
score = matches.sum() / len(y_test)
score

0.8066666666666666

In [18]:
# Two unrelated sentences of different lengths, used to contrast batch
# encoding with sentence-pair encoding.
sentence1, sentence2 = (
    "What a beautiful day!",
    "Nvidia Titan XP has 12GB of VRAM",
)

In [19]:
# Rebind `tokenizer` to the tokenizer of the bert-base-uncased checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [20]:
# A LIST of sentences is encoded as a batch: one row per sentence, with the
# shorter row padded to the length of the longer one.
sentence_batch = [sentence1, sentence2]
tokens = tokenizer(sentence_batch, padding=True, return_tensors='pt')
tokens

{'input_ids': tensor([[  101,  2054,  1037,  3376,  2154,   999,   102,     0,     0,     0,
             0,     0,     0],
        [  101,  1050, 17258,  2401, 16537, 26726,  2038,  2260, 18259,  1997,
         27830,  3286,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
# Two positional sentences are encoded as a single sequence PAIR: one row,
# with token_type_ids distinguishing the first sentence from the second.
tokens2 = tokenizer(
    text=sentence1,
    text_pair=sentence2,
    padding=True,
    return_tensors='pt',
)
tokens2

{'input_ids': tensor([[  101,  2054,  1037,  3376,  2154,   999,   102,  1050, 17258,  2401,
         16537, 26726,  2038,  2260, 18259,  1997, 27830,  3286,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [22]:
# Carve a validation set (20%) out of the train+val portion; the test set
# from the earlier split is left untouched.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.2,
    random_state=0,
)
len(X_train), len(X_val), len(X_test)

(560, 140, 300)

In [23]:
# Start from the generic bert-base-uncased checkpoint with a sequence
# classification head, and move the model to the GPU when one is available.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_checkpoint)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained(bert_checkpoint).to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [24]:
# Tokenize every split with identical settings: truncate over-long reviews,
# pad within the split, and return PyTorch tensors.
encode_options = dict(truncation=True, padding=True, return_tensors='pt')
train_input = tokenizer(X_train, **encode_options)
val_input = tokenizer(X_val, **encode_options)
test_input = tokenizer(X_test, **encode_options)

In [25]:
# Shape of the encoded training ids: (number of reviews, padded sequence length).
train_input['input_ids'].shape

torch.Size([560, 512])

In [26]:
class OurDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        inputs: Mapping from feature name (e.g. ``input_ids``) to a container
            indexable by example position -- a tensor, array or list.
        labels: Indexable sequence of integer labels, one per example.
    """

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        # torch.as_tensor accepts tensors, arrays and lists alike and, unlike
        # torch.tensor(existing_tensor), does not emit the "copy construct
        # from a tensor" UserWarning on every item. clone().detach() keeps
        # the returned item independent of the stored encodings.
        item = {key: torch.as_tensor(val[idx]).clone().detach() for key, val in self.inputs.items()}
        # Labels are built directly as int64 (long) tensors.
        item['labels'] = torch.as_tensor(self.labels[idx], dtype=torch.long).clone().detach()
        return item

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

In [54]:
# Wrap each split's encodings and labels in an OurDataset.
train_dataset, val_dataset, test_dataset = (
    OurDataset(encodings, labels)
    for encodings, labels in (
        (train_input, y_train),
        (val_input, y_val),
        (test_input, y_test),
    )
)

In [28]:
metric = evaluate.load('accuracy')


def compute_metrics(eval_pred):
    """Compute accuracy from a ``(logits, labels)`` pair.

    The predicted class for each example is the index of its largest logit
    along the last axis; the result is whatever ``metric.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [29]:
# Training configuration: 5 epochs, batches of 16 for both training and
# evaluation, checkpoints written under ./results.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=compute_metrics,
)

In [30]:
# Fine-tune the model on train_dataset.
# NOTE(review): the recorded run reports train_runtime of about 8135 s, so
# re-running this cell is expensive.
trainer.train()

***** Running training *****
  Num examples = 560
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 175
  Number of trainable parameters = 109483778


  0%|          | 0/175 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.inputs.items()}


Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 8134.7469, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.022, 'train_loss': 0.2736242893763951, 'epoch': 5.0}


TrainOutput(global_step=175, training_loss=0.2736242893763951, metrics={'train_runtime': 8134.7469, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.022, 'train_loss': 0.2736242893763951, 'epoch': 5.0})

In [31]:
# Evaluate the fine-tuned model on the validation split; the dataset is passed
# explicitly because the Trainer was created without an eval_dataset.
trainer.evaluate(eval_dataset=val_dataset)

***** Running Evaluation *****
  Num examples = 140
  Batch size = 16
  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.inputs.items()}


  0%|          | 0/9 [00:00<?, ?it/s]

{'eval_loss': 0.6298284530639648,
 'eval_accuracy': 0.8571428571428571,
 'eval_runtime': 130.928,
 'eval_samples_per_second': 1.069,
 'eval_steps_per_second': 0.069,
 'epoch': 5.0}

In [55]:
# Run the fine-tuned model on the test split; the result exposes the raw
# per-class scores as .predictions and the true labels as .label_ids.
# NOTE(review): this rebinds y_pred, which earlier held a plain list of
# predicted class ids from the zero-shot loop.
y_pred = trainer.predict(test_dataset=test_dataset)
y_pred

***** Running Prediction *****
  Num examples = 300
  Batch size = 16
  item = {key: torch.tensor(val[idx]).clone().detach() for key, val in self.inputs.items()}


  0%|          | 0/19 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-2.2036018 ,  3.0777826 ],
       [-2.258605  ,  3.0359993 ],
       [ 1.723358  , -1.2343254 ],
       [-0.61543936,  1.2520275 ],
       [-1.7122723 ,  2.5018108 ],
       [-2.209709  ,  3.0260465 ],
       [ 3.03685   , -3.1605844 ],
       [ 3.0342586 , -2.9141135 ],
       [ 3.3118036 , -3.2655845 ],
       [-2.286319  ,  3.093957  ],
       [ 0.0825831 ,  0.48061898],
       [-2.226719  ,  3.1229498 ],
       [-2.2765694 ,  3.1148481 ],
       [ 3.4807668 , -3.4574647 ],
       [-2.2608256 ,  3.054328  ],
       [ 3.5191972 , -3.5098321 ],
       [-2.2009978 ,  2.978777  ],
       [ 3.4565313 , -3.3855968 ],
       [ 3.3440309 , -3.4467342 ],
       [ 0.6027546 , -0.05014998],
       [ 3.418641  , -3.4008975 ],
       [ 2.8446012 , -2.877937  ],
       [ 3.4141092 , -3.4840949 ],
       [ 3.3552303 , -3.4452705 ],
       [-2.070689  ,  2.924636  ],
       [-2.242586  ,  3.0965939 ],
       [-2.1123946 ,  2.9173203 ],
       [ 3.4261394 , -3.33

In [56]:
# True labels carried along with the predictions, and their shape.
y_pred.label_ids, y_pred.label_ids.shape

(array([1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
        0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
        0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
        0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
        0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1,
        0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0,
        1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
        1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
        1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 

In [60]:
# Test accuracy of the fine-tuned model: take the highest-scoring class per
# review and count how often it matches the true label.
predicted_labels = y_pred.predictions.argmax(axis=-1)
score = (y_test == predicted_labels).sum() / len(y_test)
score

0.84

In [59]:
# Predicted class id per test review (index of the largest score in each row).
y_pred.predictions.argmax(axis=-1)

array([1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0], d