In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
sent = pipeline('sentiment-analysis')
result = sent('what a beautiful day!')[0]
result['label'], result['score']

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


('POSITIVE', 0.9998812675476074)

In [4]:
clf = pipeline('text-classification')
result = clf('what a beautiful day!')
result

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998812675476074}]

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

In [6]:
input_sentence = "She angered me with her inappropriate comments, rumor-spreading, and disrespectfulness at the formal dinner table"
target_sequence = "She made me angry when she was rude at dinner"

In [8]:
tokens = tokenizer(input_sentence, target_sequence, return_tensors='pt')
tokens

{'input_ids': tensor([[  101,  1153, 22296,  1143,  1114,  1123, 17073,  7640,   117, 24206,
           118,  9243,   117,  1105,  4267,  1116,  4894, 26426, 21047,  1120,
          1103,  4698,  4014,  1952,   102,  1153,  1189,  1143,  4259,  1165,
          1131,  1108, 14708,  1120,  4014,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [9]:
logits = model(**tokens).logits
logits

tensor([[0.1998, 1.0848]], grad_fn=<AddmmBackward0>)

In [12]:
results = torch.softmax(logits, dim=1).tolist()[0]
results

[0.29214248061180115, 0.7078575491905212]

In [13]:
for i, label in enumerate(['no', 'yes']):
    print(f"{label}: {int(round(results[i] * 100))}%")

no: 29%
yes: 71%


In [14]:
fileids = movie_reviews.fileids()
reviews = [movie_reviews.raw(fileid) for fileid in fileids]
categories = [movie_reviews.categories(fileid)[0] for fileid in fileids]

In [15]:
label_dict = {'pos':1, 'neg':0}
y = np.array([label_dict[c] for c in categories])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(reviews, y, test_size=0.2, random_state=7)
len(X_train), len(X_test)

(1600, 400)

In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [18]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
model = model.to(device)

In [19]:
batch_size = 10
y_pred = []
num_batch = len(y_test) // batch_size

In [20]:
for i in range(num_batch):
    inputs = tokenizer(X_test[i*batch_size:(i+1)*batch_size], truncation=True, padding=True, return_tensors='pt')
    inputs = inputs.to(device)
    logits = model(**inputs).logits
    pred = F.softmax(logits, dim=-1)
    results = pred.cpu().detach().numpy().argmax(axis=1)
    y_pred += results.tolist()
torch.cuda.empty_cache()

In [21]:
score = sum(y_test == np.array(y_pred)) / len(y_test)
score

0.8425