In [None]:
from transformers import pipeline

# Pin the checkpoint explicitly: calling pipeline() with only a task name makes
# transformers fall back to a library-chosen default (and warn about it), so
# the result could change between library versions.
print(pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')('we love you'))

In [None]:
### sentiment-analysis
# Name the checkpoint instead of relying on the task's implicit default model
# (transformers warns when no model is supplied). This is the same checkpoint
# that the Tokenizer / Model section of this notebook loads explicitly.
classifier = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
# Score a single review with the sentiment pipeline and display the result.
res = classifier(
    "The course was really great, the mentor explained every concept in detail"
)
res

In [None]:
### text-generation
# Build a text-generation pipeline backed by the distilgpt2 checkpoint.
generator = pipeline(
    task='text-generation',
    model='distilgpt2',
)


In [None]:
# The prompt deliberately has no trailing space: GPT-2 style tokenizers attach
# the space to the *following* word, so a prompt ending in " " leaves a dangling
# whitespace token and tends to produce degenerate continuations.
res = generator(
    "once upon a time there was a king",
    max_length=100,           # upper bound on prompt + generated tokens
    num_return_sequences=2,
    do_sample=True,           # several distinct sequences require sampling, not greedy decoding
)

res

In [None]:
"""{'generated_text': 'once upon a time there was a king ㅠㅠㅠㅠㅠㅠㅠㅠㅠ㙠ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ�'},
 {'generated_text': "once upon a time there was a king \u0bfe\u0bfe\u0b81\u0bfe\u0bfe.\n‹ I have not even heard much of the history of what is happening in India. How shall we ever be informed of such an event, if any?\nI don't know, how far away we can be before the world's eye is set, but it will be on us. What does this mean? I think that it is because in a nation governed"}]"""

In [None]:
### zero-shot-classification
# Name the checkpoint rather than relying on the task's implicit default model
# (transformers warns when no model is supplied); facebook/bart-large-mnli is
# the NLI model the library uses for this task by default.
# NOTE: this rebinds `classifier`, which earlier cells used for sentiment analysis.
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')

In [None]:
# Rank the candidate labels for one sentence; no label-specific training is involved.
res = classifier(
    "this is course about python list comprehension",
    candidate_labels=[
        'Education',
        'politics',
        'Engineering',
    ],
)

In [None]:
# Display the value most recently assigned to `res` (the classifier call in the previous cell).
res

### Tokenizer / Model

In [1]:
from transformers import  pipeline
from transformers import  AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the tokenizer and the model from the same named checkpoint, then hand
# both objects to the pipeline instead of letting it resolve them itself.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)



In [8]:
# Probe the sentiment model with a negated phrase and display its verdict.
res = classifier(
    'The restaurant is not terrible'
)
res

[{'label': 'POSITIVE', 'score': 0.9930605888366699}]

In [10]:
# Sample sentence to tokenize (the adjacent literals are joined into one string).
sq = (
    "Tokenizers split input text into smaller units called tokens. "
    "These tokens can be words, subwords, or even characters."
)
# Calling the tokenizer yields the token ids together with an attention mask.
toks = tokenizer(sq)
toks

{'input_ids': [101, 19204, 17629, 2015, 3975, 7953, 3793, 2046, 3760, 3197, 2170, 19204, 2015, 1012, 2122, 19204, 2015, 2064, 2022, 2616, 1010, 4942, 22104, 1010, 2030, 2130, 3494, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
# Map the ids back to text to see what the tokenizer actually produced.
tokenizer.decode(token_ids=toks['input_ids'])

'[CLS] tokenizers split input text into smaller units called tokens. these tokens can be words, subwords, or even characters. [SEP]'

#### PyTorch

In [12]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [13]:
# Checkpoint shared by the tokenizer and the classification model.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)


In [14]:
# Wrap the already-loaded model and tokenizer in a sentiment pipeline.
classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

In [17]:
# Example sentences of different lengths, reused by the cells below.
x_train = [
    "Tokenizers split input text into smaller units called tokens. These tokens can be words, subwords, or even characters.",
    "The goal is to convert raw text into a format that machine learning models can process. Since models typically work with numerical data, tokenizers play a crucial role in this conversion.",
    "Incredible Chinese, Japanese, and Sushi dishes. The ambiance is modern and chic.",
    "Can get crowded during peak hours.",
    "Elegant Indian cuisine with a colonial touch.",
]

In [18]:
#### Normal flow
# Pass the whole list to the pipeline in one call and display what it returns.
classifier(x_train)

[{'label': 'NEGATIVE', 'score': 0.9945858716964722},
 {'label': 'NEGATIVE', 'score': 0.6151780486106873},
 {'label': 'POSITIVE', 'score': 0.9997819066047668},
 {'label': 'NEGATIVE', 'score': 0.9816345572471619},
 {'label': 'POSITIVE', 'score': 0.9997895359992981}]

In [19]:
#### with batches

# Tokenize every sentence in one call: pad to a common length, truncate
# anything longer than 512 tokens, and return PyTorch tensors.
batch = tokenizer(
    x_train,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
batch

{'input_ids': tensor([[  101, 19204, 17629,  2015,  3975,  7953,  3793,  2046,  3760,  3197,
          2170, 19204,  2015,  1012,  2122, 19204,  2015,  2064,  2022,  2616,
          1010,  4942, 22104,  1010,  2030,  2130,  3494,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1996,  3125,  2003,  2000, 10463,  6315,  3793,  2046,  1037,
          4289,  2008,  3698,  4083,  4275,  2064,  2832,  1012,  2144,  4275,
          4050,  2147,  2007, 15973,  2951,  1010, 19204, 17629,  2015,  2377,
          1037, 10232,  2535,  1999,  2023,  7584,  1012,   102],
        [  101,  9788,  2822,  1010,  2887,  1010,  1998, 10514,  6182, 10447,
          1012,  1996,  2572, 15599,  3401,  2003,  2715,  1998,  9610,  2278,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2064,  2131, 10789,  2076,  4672,  2847,  1012,   102,     0

In [23]:
#### inference in PyTorch

# Gradients are not needed for prediction, so run the forward pass under no_grad.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs)

    # Turn the raw logits into per-class probabilities along the class axis.
    predictions = outputs.logits.softmax(dim=1)
    print(predictions)

    # Index of the most probable class for every input sentence.
    labels = predictions.argmax(dim=1)
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.8426, -2.3707],
        [ 0.1965, -0.2726],
        [-4.0689,  4.3616],
        [ 2.1869, -1.7919],
        [-4.0949,  4.3708]]), hidden_states=None, attentions=None)
tensor([[9.9459e-01, 5.4140e-03],
        [6.1518e-01, 3.8482e-01],
        [2.1806e-04, 9.9978e-01],
        [9.8163e-01, 1.8365e-02],
        [2.1053e-04, 9.9979e-01]])
tensor([0, 0, 1, 0, 1])


#### Save / Load Tokenizer & Model

In [24]:
### saving
# Both artifacts go into the same directory so one path can restore them.
# The model is saved last on purpose: the cell's final expression decides
# what the notebook displays.
save_dir = './tokenzr'
tokenizer.save_pretrained(save_directory=save_dir)
model.save_pretrained(save_directory=save_dir)

In [25]:
### loading
# Restore both artifacts from `save_dir`; note that `model` is rebound to the
# reloaded copy while the reloaded tokenizer gets the separate name `tok`.
model = AutoModelForSequenceClassification.from_pretrained(save_dir)
tok = AutoTokenizer.from_pretrained(save_dir)


#### FineTune