In [4]:
!pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
!pip install transformers accelerate


Looking in indexes: https://download.pytorch.org/whl/nightly/cpu
[31mERROR: Could not find a version that satisfies the requirement torch (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch[0m[31m
Collecting accelerate
  Downloading accelerate-1.10.0-py3-none-any.whl.metadata (19 kB)
INFO: pip is looking at multiple versions of accelerate to determine which version is compatible with other requirements. This could take a while.
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
  Downloading accelerate-1.8.0-py3-none-any.whl.metadata (19 kB)
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
  Downloading accelerate-1.5.2-py3-none-any.whl.metadata (19 kB)
  Downloading accelerate-1.5.1-py3-none-any.whl.metadata (19 kB)
INFO: pip is still looking at multiple versions of accelerate to determine

## Sentiment Analysis

In [24]:
from transformers import pipeline

In [25]:
sentiment_classifier = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


In [26]:
sentiment_classifier("I cannot stand the color pink. It gets me very upset") 

[{'label': 'NEGATIVE', 'score': 0.9962688684463501}]

In [27]:
sentiment_classifier("I love roses but hate flowers")

[{'label': 'POSITIVE', 'score': 0.9840420484542847}]

In [28]:
## NER (will specify model)

In [29]:
ner = pipeline("ner", model = "dslim/bert-base-NER")

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [30]:
ner("Dave Smith debates are great especially the one with Charlie Chaplin")

[{'entity': 'B-PER',
  'score': 0.99964964,
  'index': 1,
  'word': 'Dave',
  'start': 0,
  'end': 4},
 {'entity': 'I-PER',
  'score': 0.9991929,
  'index': 2,
  'word': 'Smith',
  'start': 5,
  'end': 10},
 {'entity': 'B-PER',
  'score': 0.99962246,
  'index': 10,
  'word': 'Charlie',
  'start': 53,
  'end': 60},
 {'entity': 'I-PER',
  'score': 0.99951124,
  'index': 11,
  'word': 'Chaplin',
  'start': 61,
  'end': 68}]

In [31]:
zeroshot_classifier = pipeline("zero-shot-classification", model = "facebook/bart-large-mnli")

Device set to use cpu


In [32]:
sequence_to_classify = "one day I will see the world"
candidate_labels = ['travel', 'dance', 'eat', 'explore']

In [33]:
zeroshot_classifier(sequence_to_classify, candidate_labels)

{'sequence': 'one day I will see the world',
 'labels': ['travel', 'explore', 'eat', 'dance'],
 'scores': [0.6575334668159485,
  0.3326416015625,
  0.005616443231701851,
  0.004208477679640055]}

## Pre-Trained Tokenizers

In [34]:
from transformers import AutoTokenizer

In [35]:
model = "bert-base-uncased"

In [36]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [37]:
sentence = "I am so excited to swim in the red sea"


In [38]:
input_ids = tokenizer(sentence)
print(input_ids)

{'input_ids': [101, 1045, 2572, 2061, 7568, 2000, 9880, 1999, 1996, 2417, 2712, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [39]:
# tokenizer breaks sentence into tokens
# the steps below is a step by step of what AutoTokenizer.from_pretrained() does

tokens = tokenizer.tokenize(sentence)
print(tokens)

['i', 'am', 'so', 'excited', 'to', 'swim', 'in', 'the', 'red', 'sea']


In [40]:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(token_ids)

[1045, 2572, 2061, 7568, 2000, 9880, 1999, 1996, 2417, 2712]


In [41]:
decoded_ids = tokenizer.decode(token_ids)
print(decoded_ids)

i am so excited to swim in the red sea


In [42]:
tokenizer.decode(101)

'[CLS]'

In [43]:
tokenizer.decode(102)

'[SEP]'

## Huggingface and Pytorch/Tensorflow

In [59]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model & tokenizer
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

# Encode input
sentence = "I love working with Python!"
input_ids_pt = tokenizer(sentence, return_tensors='pt')

# Predict
with torch.no_grad():
    logits = model(**input_ids_pt).logits

predicted_class_id = logits.argmax().item()
label = model.config.id2label[predicted_class_id]
print(f"Predicted label: {label}")


Predicted label: POSITIVE
