In [None]:
!pip install transformers[sentencepiece]



# Pipeline

In [None]:
from transformers import pipeline

In [None]:
# Pin the checkpoint and revision explicitly instead of relying on the task
# default, so a re-run always loads the same weights and the
# "No model was supplied" warning is not emitted.
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



## Inference

In [None]:
# Score one sentence; the pipeline returns a list with a label/score dict.
single_sentence = "I am very excited to learn transformers and its applications"
classifier(single_sentence)

[{'label': 'POSITIVE', 'score': 0.9996590614318848}]

## What if I have multiple sentences?

In [None]:
# A list of strings is scored in one call; one result dict per sentence.
sentences = [
    "I hate to go to the Office",
    "I love work from home culture",
]
classifier(sentences)

[{'label': 'NEGATIVE', 'score': 0.9875333309173584},
 {'label': 'POSITIVE', 'score': 0.9996424913406372}]

# Zero-shot classification

In [None]:
# Rebind `classifier` to a zero-shot pipeline backed by an NLI checkpoint.
zero_shot_checkpoint = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli'
classifier = pipeline('zero-shot-classification', model=zero_shot_checkpoint)

config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [None]:
# The candidate labels are supplied at call time; the result ranks them by score.
topic_labels = ['Education', 'Sports', 'Business']
classifier(
    "This is about Deep Learning and Machine Learning",
    candidate_labels=topic_labels,
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'sequence': 'This is about Deep Learning and Machine Learning',
 'labels': ['Business', 'Education', 'Sports'],
 'scores': [0.41602611541748047, 0.36070844531059265, 0.2232653945684433]}

# Text Generation

In [None]:
# Build a text-generation pipeline on distilgpt2. The variable keeps the name
# `classifier` because the next cell calls it under that name.
generation_checkpoint = 'distilgpt2'
classifier = pipeline('text-generation', model=generation_checkpoint)

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [None]:
# Generation may sample tokens stochastically, so seed the RNGs first to make
# the continuation reproducible under Restart & Run All.
from transformers import set_seed

set_seed(42)
classifier("In this Generative AI course, we will teach you how to")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'In this Generative AI course, we will teach you how to build a computer at a certain level. After this course, we will bring you a real world AI that works at different levels. This gives you the ability to build even better AI for'}]

# Named Entity Recognition (NER)

In [None]:
# Pin the checkpoint and revision explicitly instead of relying on the task
# default, so a re-run always loads the same weights and the
# "No model was supplied" warning is not emitted.
classifier = pipeline(
    'ner',
    model='dbmdz/bert-large-cased-finetuned-conll03-english',
    revision='f2482bf',
)

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]



In [None]:
# Tag entities in each sentence; the result holds one list of token-level
# entity dicts per input sentence (empty when nothing is recognized).
sentences = [
    "My HCL's office is in Noida",
    "My DOB is 4th Jan 1997",
]
classifier(sentences)

[[{'entity': 'I-ORG',
   'score': 0.9940374,
   'index': 2,
   'word': 'HC',
   'start': 3,
   'end': 5},
  {'entity': 'I-ORG',
   'score': 0.9827528,
   'index': 3,
   'word': '##L',
   'start': 5,
   'end': 6},
  {'entity': 'I-LOC',
   'score': 0.99741554,
   'index': 9,
   'word': 'No',
   'start': 22,
   'end': 24},
  {'entity': 'I-LOC',
   'score': 0.9977781,
   'index': 10,
   'word': '##ida',
   'start': 24,
   'end': 27}],
 []]

# Unwrap the pipeline

## 1. Preprocessing the text

In [None]:
# Let's understand the tokenization task
from transformers import AutoTokenizer

# The checkpoint identifier is stored as `model_checkpoint` rather than `model`,
# so the string is not confused with the model object that a later cell binds
# to the name `model`.
model_checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [None]:
# Tokenize a small batch to see what the tokenizer produces.
raw_input = [
    "I am very excited for this transformer lecture",
    "The instructor is really bad",
]

# padding=True pads the shorter sentence so both rows have equal length;
# return_tensors="pt" yields PyTorch tensors that can be fed to the model.
inputs = tokenizer(
    raw_input,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Show the tokenizer output: token ids plus the attention mask, where the
# padded positions of the shorter sentence are marked with 0.
print(inputs)

{'input_ids': tensor([[  101,  1045,  2572,  2200,  7568,  2005,  2023, 10938,  2121,  8835,
           102],
        [  101,  1996,  9450,  2003,  2428,  2919,   102,     0,     0,     0,
             0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]])}


## 2. Feeding the input to the model

In [None]:
# Load the checkpoint through the generic AutoModel class; the forward pass
# in the next cell then returns hidden states rather than class scores.
from transformers import AutoModel
model_checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModel.from_pretrained(model_checkpoint)

In [None]:
# Forward pass: unpacking `inputs` passes its entries (input_ids,
# attention_mask) to the model as keyword arguments.
outputs = model(**inputs)
# The printed object exposes `last_hidden_state`.
print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[ 5.0421e-01, -4.1946e-02,  7.5756e-01,  ...,  4.9913e-01,
           8.9441e-01, -5.7237e-01],
         [ 1.0543e+00,  2.2573e-01,  4.5794e-01,  ...,  2.5707e-01,
           9.2708e-01, -8.0059e-02],
         [ 7.2442e-01,  1.9034e-01,  6.0776e-01,  ...,  2.9204e-01,
           6.9594e-01, -2.1830e-01],
         ...,
         [ 2.5140e-01, -1.5345e-01,  7.7423e-01,  ...,  3.9547e-01,
           5.2717e-01, -3.5928e-01],
         [ 4.2898e-01, -3.1991e-02,  8.5150e-01,  ...,  6.3016e-01,
           7.7462e-01, -1.7858e-01],
         [ 1.3801e+00,  1.1302e-01,  5.0812e-01,  ...,  7.3679e-01,
           2.2382e-01, -6.4101e-01]],

        [[-8.3076e-01,  6.2542e-01,  8.8852e-02,  ..., -4.3956e-02,
          -9.4242e-01, -4.4228e-01],
         [-8.8570e-01,  6.5761e-01,  3.3794e-02,  ..., -1.5362e-01,
          -9.8196e-01, -3.7572e-01],
         [-5.6697e-01,  6.1117e-01,  6.8196e-02,  ..., -3.1362e-01,
          -9.9811e-01, -2.3049e-01],
     

In [None]:
# Reload the same checkpoint through AutoModelForSequenceClassification so the
# forward pass exposes classification logits. This rebinds `model`.
from transformers import AutoModelForSequenceClassification
model_checkpoint = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [None]:
# Run the classification model on the same tokenized batch and show the raw,
# unnormalized scores: one row per sentence, one column per class.
outputs = model(**inputs)
print(outputs.logits)

tensor([[-3.6561,  3.8631],
        [ 4.7657, -3.8259]], grad_fn=<AddmmBackward0>)


In [None]:
import torch

# Turn the logits into probabilities. `dim=-1` normalizes across the class
# axis of each row explicitly; omitting `dim` relies on a deprecated implicit
# choice and emits a warning.
torch.nn.functional.softmax(outputs.logits, dim=-1)

  torch.nn.functional.softmax(outputs.logits)


tensor([[5.4229e-04, 9.9946e-01],
        [9.9981e-01, 1.8563e-04]], grad_fn=<SoftmaxBackward0>)

In [None]:
# Mapping from class index to label name, used to interpret the columns of
# the logits / probabilities above.
model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}