In [None]:
from transformers import pipeline
print(pipeline('sentiment-analysis')('we love you'))

In [None]:
### sentiment-analysis
# Build a reusable sentiment-analysis pipeline; no `model=` argument is passed here.
classifier = pipeline('sentiment-analysis')

In [None]:
res = classifier("The course was really great, the mentor explained every concept in detail")
res

In [None]:
### text-generation
# Build a text-generation pipeline backed by the `distilgpt2` checkpoint.
generator = pipeline('text-generation', model='distilgpt2')


In [None]:
# Generate two continuations of the prompt, each capped at 100 tokens in total
# (prompt included).
# NOTE: the prompt deliberately has no trailing space. GPT-2-style tokenizers
# attach the space to the *following* word, so a prompt that ends in a bare
# space leaves a dangling whitespace token and tends to produce junk output.
res = generator(
    "once upon a time there was a king",
    max_length=100,
    num_return_sequences=2,
)

res

In [None]:
"""{'generated_text': 'once upon a time there was a king ㅠㅠㅠㅠㅠㅠㅠㅠㅠ㙠ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ�'},
 {'generated_text': "once upon a time there was a king \u0bfe\u0bfe\u0b81\u0bfe\u0bfe.\n‹ I have not even heard much of the history of what is happening in India. How shall we ever be informed of such an event, if any?\nI don't know, how far away we can be before the world's eye is set, but it will be on us. What does this mean? I think that it is because in a nation governed"}]"""

In [None]:
### zero-shot-classification
# NOTE: this rebinds `classifier`, which an earlier cell bound to the
# sentiment-analysis pipeline; re-running that earlier usage after this cell
# would hit the zero-shot pipeline instead.
classifier = pipeline('zero-shot-classification')

In [None]:
# Classify one sentence against labels supplied at call time via `candidate_labels`.
res = classifier(
    "this is course about python list comprehension",
    candidate_labels = ['Education', 'politics', 'Engineering']
)

In [None]:
res

### Tokenizer / Model

In [1]:
from transformers import  pipeline
from transformers import  AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the checkpoint's tokenizer and classification model explicitly, then
# hand both to the pipeline instead of letting it resolve them by task name.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)



In [8]:
# A negated negative phrase ("not terrible"); the result shows how the model labels it.
res = classifier('The restaurant is not terrible')
res

[{'label': 'POSITIVE', 'score': 0.9930605888366699}]

In [10]:
# Tokenize a single sentence; the returned mapping carries `input_ids` and
# `attention_mask` (see the output below).
sq = "Tokenizers split input text into smaller units called tokens. These tokens can be words, subwords, or even characters."
toks = tokenizer(sq)
toks

{'input_ids': [101, 19204, 17629, 2015, 3975, 7953, 3793, 2046, 3760, 3197, 2170, 19204, 2015, 1012, 2122, 19204, 2015, 2064, 2022, 2616, 1010, 4942, 22104, 1010, 2030, 2130, 3494, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
tokenizer.decode(toks['input_ids'])

'[CLS] tokenizers split input text into smaller units called tokens. these tokens can be words, subwords, or even characters. [SEP]'

#### PyTorch

In [12]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [13]:
# Load the SST-2 fine-tuned DistilBERT classifier and its matching tokenizer
# for the PyTorch walkthrough that follows.
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [14]:
# Wrap the already-loaded model and tokenizer in a pipeline for comparison with
# the manual inference steps further down.
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)

In [17]:
# Five sample sentences of differing lengths, used both with the pipeline and
# with the manual tokenizer/model calls below.
x_train = [
    (
        "Tokenizers split input text into smaller units called tokens. "
        "These tokens can be words, subwords, or even characters."
    ),
    (
        "The goal is to convert raw text into a format that machine learning models can process. "
        "Since models typically work with numerical data, tokenizers play a crucial role in this conversion."
    ),
    "Incredible Chinese, Japanese, and Sushi dishes. The ambiance is modern and chic.",
    "Can get crowded during peak hours.",
    "Elegant Indian cuisine with a colonial touch.",
]

In [18]:
#### Normal flow
# Pass the whole list to the pipeline; it returns one label/score dict per sentence.
classifier(x_train)

[{'label': 'NEGATIVE', 'score': 0.9945858716964722},
 {'label': 'NEGATIVE', 'score': 0.6151780486106873},
 {'label': 'POSITIVE', 'score': 0.9997819066047668},
 {'label': 'NEGATIVE', 'score': 0.9816345572471619},
 {'label': 'POSITIVE', 'score': 0.9997895359992981}]

In [19]:
#### with batches

# Tokenize all sentences in one call: pad to the longest sentence in the list,
# cut anything beyond 512 tokens, and return PyTorch tensors.
batch = tokenizer(
    x_train,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
batch

{'input_ids': tensor([[  101, 19204, 17629,  2015,  3975,  7953,  3793,  2046,  3760,  3197,
          2170, 19204,  2015,  1012,  2122, 19204,  2015,  2064,  2022,  2616,
          1010,  4942, 22104,  1010,  2030,  2130,  3494,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1996,  3125,  2003,  2000, 10463,  6315,  3793,  2046,  1037,
          4289,  2008,  3698,  4083,  4275,  2064,  2832,  1012,  2144,  4275,
          4050,  2147,  2007, 15973,  2951,  1010, 19204, 17629,  2015,  2377,
          1037, 10232,  2535,  1999,  2023,  7584,  1012,   102],
        [  101,  9788,  2822,  1010,  2887,  1010,  1998, 10514,  6182, 10447,
          1012,  1996,  2572, 15599,  3401,  2003,  2715,  1998,  9610,  2278,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2064,  2131, 10789,  2076,  4672,  2847,  1012,   102,     0

In [23]:
#### inference in PyTorch

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs)

    # Convert the raw logits into per-class probabilities along the class axis.
    predictions = outputs.logits.softmax(dim=1)
    print(predictions)

    # Index of the highest-probability class for each sentence.
    labels = predictions.argmax(dim=1)
    print(labels)

SequenceClassifierOutput(loss=None, logits=tensor([[ 2.8426, -2.3707],
        [ 0.1965, -0.2726],
        [-4.0689,  4.3616],
        [ 2.1869, -1.7919],
        [-4.0949,  4.3708]]), hidden_states=None, attentions=None)
tensor([[9.9459e-01, 5.4140e-03],
        [6.1518e-01, 3.8482e-01],
        [2.1806e-04, 9.9978e-01],
        [9.8163e-01, 1.8365e-02],
        [2.1053e-04, 9.9979e-01]])
tensor([0, 0, 1, 0, 1])


#### Save / Load Tokenizer & Model

In [24]:
### saving
# Both the tokenizer and the model are written into the same directory.
save_dir = './tokenzr'  # target directory shared by the two save calls below
tokenizer.save_pretrained(save_directory=save_dir)
model.save_pretrained(save_dir)

In [25]:
### loading
# Restore both artifacts from `save_dir`. The tokenizer is bound to a new name
# (`tok`), whereas `model` is rebound to the reloaded weights.
tok = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSequenceClassification.from_pretrained(save_dir)


### **FineTune**

##### *Prepare dataset*

In [1]:
from datasets import load_dataset, load_from_disk
import os 
# Download the Yelp review dataset only when there is no local copy yet, and
# cache it on disk; otherwise reuse the saved copy.
if not os.path.exists('./yelp_review_full'):
    dataset = load_dataset("yelp_review_full")
    dataset.save_to_disk("./yelp_review_full")
else:
    print("loading from disk : ./yelp_review_full")
    dataset = load_from_disk('./yelp_review_full')

# Peek at a single training record.
dataset["train"][100]

  from .autonotebook import tqdm as notebook_tqdm


loading from disk : ./yelp_review_full


{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. 

In [2]:
type(dataset["train"]), type(dataset["train"][0:10])

(datasets.arrow_dataset.Dataset, dict)

In [3]:
dataset["train"][1:10]

{'label': [1, 3, 3, 0, 4, 4, 0, 1, 2],
 'text': ["Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients with medical needs, why isn't anyone answering the phone?  It's incomprehensible and not work the aggravation.  It's with regret that I feel that I have to give Dr. Goldberg 2 stars.",
  "Been going to Dr. Goldberg for over 10 years. I think I was one of his 1st patients when he started at MHMG. He's been great over the years and is really all about the big picture. It is because of him, not my now former gyn Dr. Markoff, that I found out I have fibroids. He explores all options with 

##### *Tokenization*
A tokenizer is needed to process the text, together with a padding and truncation strategy to handle variable sequence lengths.

In [4]:
from transformers import AutoTokenizer

`padding='max_length':`
The tokenizer pads the tokenized text with zeros (i.e., [PAD] tokens) so that all sequences have the same length.
If the sentence is shorter than max_length after the [CLS] and [SEP] tokens are added, padding is appended until the specified max_length is reached.
For example, if you set max_length=10, the tokenized text might look like: [101, 2026, 2171, 2003, 11754, 102, 0, 0, 0, 0], where 101 represents the [CLS] token and 102 represents the [SEP] token.



`truncation=True:`
When truncation=True, longer sentences are truncated to exactly max_length.
This ensures that all input sequences have consistent lengths, which is crucial for tasks like classification.

In [5]:

# Tokenizer matching the `google-bert/bert-base-cased` checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the notebook's tokenizer.

    Padding uses the ``"max_length"`` strategy and truncation is enabled.

    Args:
        examples: Mapping with a ``"text"`` key (presumably a batch of review
            strings when used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Tokenize every split in one pass with `Dataset.map`, and cache the result on
# disk so later runs can skip the (slow) mapping step.

if not os.path.exists('./tokenized_datasets'):
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk("./tokenized_datasets")
else:
    print("loading from disk : ./tokenized_datasets")
    tokenized_datasets = load_from_disk('./tokenized_datasets')



Map: 100%|██████████| 50000/50000 [00:19<00:00, 2591.46 examples/s]
Saving the dataset (5/5 shards): 100%|██████████| 650000/650000 [00:02<00:00, 282291.91 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 50000/50000 [00:00<00:00, 304826.58 examples/s]
Map: 100%|██████████| 650000/650000 [04:05<00:00, 2645.93 examples/s]


In [6]:
### create a smaller subset of the full dataset to fine-tune on to reduce the time it takes:

# Shuffle each split with a fixed seed, then keep its first 1000 rows.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

### *Train with PyTorch Trainer*

##### fine-tuning for sequence classification task

In [7]:
from transformers import AutoModelForSequenceClassification


### Start by loading your model and specify the number of expected labels
### There are 5 labels

# Use the local copy of the checkpoint when it exists; otherwise load it by
# name and save it to that directory for later runs.
if os.path.exists('./google-bert_bert-base-cased'):
    model = AutoModelForSequenceClassification.from_pretrained('./google-bert_bert-base-cased', num_labels=5)
else:
    model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
    model.save_pretrained('./google-bert_bert-base-cased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#####  *Training hyperparameters*

Next, we create a `TrainingArguments` object, which contains all the hyperparameters.
Here we use the default training hyperparameters.


In [8]:
### Specify where to save the checkpoints from your training:

from transformers import TrainingArguments

# Only `output_dir` is set here; no other hyperparameter is passed.
# NOTE: `training_args` is assigned again in a later cell of this notebook.
training_args = TrainingArguments(output_dir="./test_trainer")

#### *Evaluate*<br>
`Trainer` does not automatically evaluate model performance during training. We need to pass `Trainer` a function to compute and report metrics.<br>
The `Evaluate` library provides a simple accuracy function you can load with the `evaluate.load()` function

In [10]:
import numpy as np
import evaluate

# Accuracy metric object; its `compute` method is called from `compute_metrics`.
metric = evaluate.load("accuracy")

Call compute on metric to calculate the accuracy of your predictions. Before passing your predictions to compute, you need to convert the logits to predictions (remember all 🤗 Transformers models return logits)

In [11]:
def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``metric``.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        ``logits`` compared against ``labels``.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the position of the largest logit on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

To monitor the evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter (named `eval_strategy` in recent 🤗 Transformers releases) in your training arguments to report the evaluation metric at the end of each epoch:

In [12]:
import inspect

from transformers import TrainingArguments, Trainer

# Evaluate at the end of every epoch. The keyword that controls this was
# renamed from `evaluation_strategy` to `eval_strategy` in newer 🤗 Transformers
# releases, and the old spelling is rejected once it has been removed. Pick
# whichever name the installed version accepts so the cell runs on both.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)
training_args = TrainingArguments(output_dir="test_trainer", **{_eval_strategy_kwarg: "epoch"})

#### *Trainer*
Create a `Trainer` object with the model, training arguments, training and test datasets, and evaluation function

In [13]:
# Wire together the model, the training arguments, the 1000-example train/eval
# subsets and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [18]:
# Then fine-tune the model by calling train()
trainer.train()

In [15]:
# Persist the fine-tuned model to a directory separate from the trainer's `output_dir`.
trainer.save_model('./bert_base_cased_finetuned')

In [None]:
"""
{'eval_loss': 1.0237584114074707, 'eval_accuracy': 0.578, 'eval_runtime': 19.2998, 'eval_samples_per_second': 51.814, 'eval_steps_per_second': 6.477, 'epoch': 3.0}
{'train_runtime': 211.9045, 'train_samples_per_second': 14.157, 'train_steps_per_second': 1.77, 'train_loss': 1.033949951171875, 'epoch': 3.0}

TrainOutput(global_step=375, training_loss=1.033949951171875, metrics={'train_runtime': 211.9045, 'train_samples_per_second': 14.157, 'train_steps_per_second': 1.77, 'total_flos': 789354427392000.0, 'train_loss': 1.033949951171875, 'epoch': 3.0})"""