In [1]:
import nltk
from nltk.corpus import movie_reviews

from sklearn.model_selection import train_test_split

import numpy as np

import torch
import torch.nn.functional as F

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [3]:
clf = pipeline('sentiment-analysis', framework='pt')
result = clf('what a beautiful day!')[0]
print(clf.device, result['label'], result['score'])

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


cuda:0 POSITIVE 0.9998812675476074


In [4]:
text_gen = pipeline('text-generation', framework='pt')
result = text_gen('Alice was beginning to get very tired of sitting by her sister on the bank, ')
print(result[0]['generated_text'])

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Alice was beginning to get very tired of sitting by her sister on the bank,  she started to feel like she was doing something wrong.  Then there was the time of day... the sun was out and the wind was blowing.  (She was still in the sunshade for lunch!)  When she finally got her morning light out she started to feel like she was on a roll.  She was really tired and tired, and started to feel like she was going to die.  She was dying as soon as she was exposed to the sun.  (She was dying as soon as she was exposed to the sun!)  She was tired, hungry and tired.  She was hungry and hungry.  She was dying as soon as she was exposed to the sun.
We all know there is a lot of stress in our lives and it makes us really sick to think so much about it.  It can be scary and it can be very hard to understand.  It can be a really tough time for you.  You really need to get over it and get over the thing.  If you have not, then you're just going to become a victim of someone who cares too much and i

In [5]:
model_name = 'bert-base-cased-finetuned-mrpc'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

In [6]:
input_sentence = "She angered me with her inappropriate comments, rumor-spreading, and disrespectfulness at the formal dinner table"
target_sequence = "She made me angry when she was rude at dinner"
target_sequence_2 = "The boy quickly ran across the finish line, seizing yet another victory"

In [7]:
tokens = tokenizer(input_sentence, target_sequence, return_tensors='pt').to(device)
logits = model(**tokens).logits
results = torch.softmax(logits, dim=1).tolist()[0]
for i, label in enumerate(['no', 'yes']):
    print(f'{label}: {int(round(results[i] * 100))}%')

print(tokens)
print(logits)
print(results)

no: 29%
yes: 71%
{'input_ids': tensor([[  101,  1153, 22296,  1143,  1114,  1123, 17073,  7640,   117, 24206,
           118,  9243,   117,  1105,  4267,  1116,  4894, 26426, 21047,  1120,
          1103,  4698,  4014,  1952,   102,  1153,  1189,  1143,  4259,  1165,
          1131,  1108, 14708,  1120,  4014,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[0.1998, 1.0848]], device='cuda:0', grad_fn=<AddmmBackward0>)
[0.2921431362628937, 0.7078568935394287]


In [8]:
tokens = tokenizer(input_sentence, target_sequence_2, return_tensors='pt').to(device)
logits = model(**tokens).logits
results = torch.softmax(logits, dim=1).tolist()[0]
for i, label in enumerate(['no', 'yes']):
    print(f'{label}: {int(round(results[i] * 100))}%')

print(tokens)
print(logits)
print(results)

no: 95%
yes: 5%
{'input_ids': tensor([[  101,  1153, 22296,  1143,  1114,  1123, 17073,  7640,   117, 24206,
           118,  9243,   117,  1105,  4267,  1116,  4894, 26426, 21047,  1120,
          1103,  4698,  4014,  1952,   102,  1109,  2298,  1976,  1868,  1506,
          1103,  3146,  1413,   117, 14516,  4404,  1870,  1330,  2681,   102]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[ 0.6605, -2.2799]], device='cuda:0', grad_fn=<AddmmBackward0>)
[0.9498079419136047, 0.05019203573465347]


In [9]:
fileids = movie_reviews.fileids()
reviews = [movie_reviews.raw(fileid) for fileid in fileids]
categories = [movie_reviews.categories(fileid)[0] for fileid in fileids]
label_dict = {'pos':1, 'neg':0}
y = np.array([label_dict[c] for c in categories])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(reviews, y, test_size=0.2, random_state=7)

In [11]:
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)

In [14]:
batch_size = 20
y_pred = []

num_batch = len(y_test) // batch_size

for i in range(num_batch):
    inputs = tokenizer(X_test[i*batch_size:(i+1)*batch_size], truncation=True, padding=True, return_tensors='pt').to(device)
    logits = model(**inputs).logits

    pred = torch.softmax(logits, dim=-1)
    results = pred.cpu().detach().numpy().argmax(axis=1)

    y_pred.extend(results.tolist())

torch.cuda.empty_cache()

In [15]:
score = sum(y_test == np.array(y_pred)) / len(y_test)
print(score)

0.8425
