In [None]:
# Set environment variables, this must be done before importing transformers.
from dotenv import load_dotenv
load_dotenv(override=True)

import os
import torch

# Report where models will be loaded from. Both variables are read defensively so
# that this cell also runs when the .env file does not define them.
offline_setting = os.environ.get("TRANSFORMERS_OFFLINE", "0").strip()
try:
    # Numeric values keep their original meaning: any non-zero integer means offline.
    offline = bool(int(offline_setting))
except ValueError:
    # Non-numeric spellings (e.g. "true", "yes") or an empty value would crash int().
    offline = offline_setting.upper() in {"ON", "YES", "TRUE"}

# HF_HOME is optional; use a descriptive placeholder instead of raising KeyError.
hf_home = os.environ.get("HF_HOME", "the default Hugging Face cache directory")

if offline:
    print("Using cached models from", hf_home)
else:
    print("Loading model from huggingface hub and saving to", hf_home)

In [None]:
import transformers

# Why Huggingface?
- Open source.
- A vast repository of pre-trained models across various domains.
- Compatible with TensorFlow, PyTorch and JAX.
- A community, not just a toolkit.
- Supports research and engineering.
- Fine-tuning capabilities.

https://huggingface.co/

Note: Huggingface can be a bit verbose with various warnings. You will see some of them when running this tutorial, but you do not need to worry about them for now.
Most of them are quite harmless and are mainly there to help you improve and optimize your Huggingface usage.
If you'd like to disable them, you can run the cell below.

In [None]:
# Optional: disable warnings.
# import warnings
# warnings.filterwarnings("ignore")

# The simplest approach to using huggingface/transformers for inference: the pipeline class

## Sentiment analysis with DistilBERT

In [None]:
# Build a sentiment-analysis pipeline. No model is named here, so the choice of
# checkpoint is left to the library.
pipeline = transformers.pipeline("sentiment-analysis")

# Classify a small batch of sentences with a single call.
inputs = [
    "What a lovely day today!",
    "It is freezing outside.",
]
results = pipeline(inputs)
print("Results:", results)

### **Exercise 1**: Try some sentences on your own.

In [None]:
# **********************************************************
# Replace the empty strings with your own sentences.
YOUR_SENTENCES_HERE = ["", ""]
# **********************************************************

# `pipeline` is not created in this cell; it is the object left by an earlier cell.
results = pipeline(YOUR_SENTENCES_HERE)

print("Results:", results)

## Text generation with GPT-2

In [None]:
# Build a text-generation pipeline. No model is named here, so the choice of
# checkpoint is left to the library.
pipeline = transformers.pipeline("text-generation")

input_text = "The capital of France is"

# The result is indexable; its first entry holds the text under "generated_text".
output = pipeline(
    input_text,
    max_length=50,
    truncation=True,
)
generated_text = output[0]["generated_text"]
print("Generated text:", generated_text)

### **Exercise 2**: Can you think of a way to prompt GPT-2 that would make it more likely to continue the text with Paris?

For possible arguments, see the `pipeline()` call in the previous cell (or the full list of arguments of the `__call__` function of the `TextGenerationPipeline` class [here](https://huggingface.co/docs/transformers/en/main_classes/pipelines#transformers.TextGenerationPipeline.__call__)).  
Note that this model is not instruction-tuned, so it won't always be possible to get it to do what you want.

In [None]:
# **********************************************************
# Replace both `...` placeholders before running this cell:
#   - YOUR_PROMPT_HERE is concatenated with a string below, so it must be a str.
#   - YOUR_ARGUMENTS_HERE is unpacked with `**` below, so it must be a dict of
#     keyword arguments for the pipeline call.
YOUR_PROMPT_HERE = ...
YOUR_ARGUMENTS_HERE = ...
# **********************************************************

pipeline = transformers.pipeline("text-generation")

# Your prompt is placed in front of the fixed sentence start.
input_text = YOUR_PROMPT_HERE + "The capital of France is"

output = pipeline(input_text, **YOUR_ARGUMENTS_HERE)
generated_text = output[0]["generated_text"]
print("Generated text:", generated_text)

# Decompose the pipeline

## Behind the scenes overview
What happens behind the scenes when `pipeline(inputs)` is called:

![sentiment analysis pipeline](img/sentiment_analysis_pipeline.svg)

The **tokenizer** splits the input into words, subwords, or symbols (like punctuation) that are called *tokens*. It then maps each token to an integer ID, and adds additional inputs that may be useful to the model, such as beginning-of-sequence and end-of-sequence tokens. The token sequence is passed through the model, after which task-specific post-processing is applied.

All preprocessing needs to be done in exactly the same way as when the model was pretrained. Conveniently, this information can be accessed from the pipeline instance:

In [None]:
# Collect the tokenizer details of the current pipeline, then print one per line.
tokenizer_facts = {
    "Tokenizer Name": pipeline.tokenizer.name_or_path,
    "Vocabulary Size": pipeline.tokenizer.vocab_size,
    "Max Model Input Sizes": pipeline.tokenizer.model_max_length,
    "Special Tokens": pipeline.tokenizer.special_tokens_map,
}
for label, value in tokenizer_facts.items():
    print(f"{label}:", value)

In [None]:
# Print the model architecture.
# The bare expression on the last line is what the notebook shows as the cell output.
pipeline.model

In [None]:
# Print the model config.
# The bare expression on the last line is what the notebook shows as the cell output.
pipeline.model.config

## Tokenization

Tokenizers prepare text data for processing by Transformer models. 

**Tokenizers' function**:

1. Text -> tokens: the tokenizer splits the input into words, subwords, or symbols (like punctuation) that become tokens.

2. Tokens -> IDs: each token is mapped to a unique integer ID.
3. Special tokens are added: 
    - BERT models use [CLS] at the beginning of the input for classification tasks and [SEP] to separate different segments in the input. 
        - In model pre-training, certain words in the input are replaced with the [MASK] token. The model then learns to predict the original value of these masked tokens, which helps in learning context and word relationships.
    - When the tokenizer encounters a word that is not in its vocabulary, it replaces it with the [UNK] (unknown) token. This is a way to handle out-of-vocabulary words.
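    - GPT-style models use a beginning-of-sequence token ([BOS]) to mark the start and an end-of-sequence token ([EOS]) to mark the end of a text sequence (GPT-2 uses the single token `<|endoftext|>` for both).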
4. Handling Fixed Sequence Lengths: Transformer models require inputs of a fixed length across a batch. Tokenizers pad shorter inputs with [PAD] tokens and truncate longer ones to meet the model's length requirements.

5. Attention Mask: The tokenizer generates an attention mask to differentiate real tokens from padding tokens ([PAD]) such that the model will pay attention only to the relevant parts of the input.


For multilingual models, tokenizers also ensure consistent tokenization across different languages, maintaining a balanced and shared vocabulary.



There are three types of tokenizers: **Word-based, Subword-based, and Character-based**.



### Most state-of-the-art models use subword-based tokenizers:

- BERT (Bidirectional Encoder Representations from Transformers): Uses the WordPiece tokenizer.

- GPT-2 and GPT-3 (Generative Pre-trained Transformer): Utilize a variant of Byte Pair Encoding (BPE).

- T5 (Text-To-Text Transfer Transformer): Employs the SentencePiece tokenizer, which is versatile and can be used across different languages and scripts.

In [None]:
from transformers import BertTokenizer

# Load the BERT tokenizer and pick a sentence to split
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
text = "Hello, how many GPUs do you need?"

# Step 1: text -> tokens; step 2: tokens -> token IDs
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show both representations
print(tokens)
print(token_ids)

In [None]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer and pick a sentence to split
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
text = "Hello, how many GPUs do you need?"

# Step 1: text -> tokens; step 2: tokens -> token IDs
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show both representations
print(tokens)
print(token_ids)

In [None]:
from transformers import T5Tokenizer

# Load the T5 tokenizer and pick a sentence to split
tokenizer = T5Tokenizer.from_pretrained("t5-base")
text = "Hello, how many GPUs do you need?"

# Step 1: text -> tokens; step 2: tokens -> token IDs
tokens = tokenizer.tokenize(text, add_special_tokens=True)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Show both representations
print(tokens)
print(token_ids)

Each of these tokenizers marks which tokens begin a new word. In the token sequences printed above, a word start is indicated by a leading `▁` (T5), a leading `Ġ` (GPT-2), or by the absence of a `##` prefix (BERT).

**NOTE: A pretrained model only performs properly when its input is tokenized with the same rules that were used to tokenize its training data.**

### Tokenizer Classes in Huggingface:
- PreTrainedTokenizer: base class for all tokenizers. It provides common methods and attributes that are shared across various tokenizer types. It's not typically used directly for loading specific model tokenizers.
- Specifically designed tokenizer, for example: BertTokenizer for the BERT model. It inherits from PreTrainedTokenizer.


In [None]:
from transformers import PreTrainedTokenizer

# PreTrainedTokenizer is only the shared base class, so using it directly is
# expected to fail. The error is caught and printed so the demonstration stays
# visible without aborting a "Restart Kernel -> Run All".
try:
    tokenizer = PreTrainedTokenizer.from_pretrained("bert-base-uncased")
    encoded_input = tokenizer("Hello, Huggingface!")
except Exception as error:
    # Which call fails, and with which exception type, may depend on the installed
    # transformers version, so report whatever was raised.
    print(f"{type(error).__name__}: {error}")


In [None]:
from transformers import BertTokenizer

# NOTE(review): padding/truncation/max_length are supplied at load time here; the
# notebook text states that tokenize() does not recognise them - confirm intent.
tokenizer = BertTokenizer.from_pretrained(
    "bert-base-uncased",
    padding=True,
    truncation=True,
    max_length=20,
)

# Sentence to split into tokens
text = "What is the capital of Finland?"

# Token-level view of the text
tokens = tokenizer.tokenize(text)
print(tokens)

Hyperparameters in tokenizer:

- padding: padding strategy
- truncation: truncation strategy
- max_length: maximum number of tokens per sequence, used when padding and truncating
- ...

**NOTE:** Call the tokenizer directly when preparing data for model input (like training or inference). The tokenize() method is useful for a token-level inspection or manipulation of the text.

Hyperparameters like `padding`, `truncation`, and `max_length` are not recognized by the tokenize() method.

In [None]:
text = [
    "Hello, Huggingface! Tell me about all your tokenizer types.",
    "Hello, world!",
]

# Calling the tokenizer object itself runs its __call__ method
encoded_input = tokenizer(
    text,
    padding=True,
    truncation=True,
    max_length=20,
)

# Print every (field name, values) pair of the encoding
for item in encoded_input.items():
    print(item)

## Model
### Huggingface Model Classes:
https://huggingface.co/docs/transformers/model_doc/auto
- **Base model**:

A base model, also referred to as a pretrained model, is a language model that has been trained on a large, generic dataset. The primary purpose of a base model is to capture a wide range of language features and semantics, such as grammar, context, and basic associations. A base model provides a robust foundation of language understanding which can be adapted for specific tasks.

Base models in Huggingface are often named after the architecture they use, like bert-base-uncased, gpt2-medium, t5-base, etc.
- **Fine-tuned model:**

A fine-tuned model is a model that has undergone additional training (fine-tuning) on a smaller, task-specific dataset. This can include tasks like sentiment analysis, question answering, or domain-specific language understanding.

Fine-tuned models usually have additional descriptors in their names indicating the specific task or dataset they are fine-tuned for. For instance, **"bert-base-uncased-finetuned-squad"** is a BERT model fine-tuned on the SQuAD dataset for question answering, whereas **"bert-base-uncased"** is a base model.

More information can usually be found in the README or model description in the model repo.  
Inspecting the model's configuration or architecture can also give hints.

### Choosing which model to use
[https://huggingface.co/models](https://huggingface.co/models)

You may want to consider the following when choosing which model to use:
* Task Type
* Specific language (especially non-English languages)
* Model Size and Performance
* Fine-Tuning and Customization
* Community and Support
* Documentation and Examples
* Ethical Considerations
* Licensing and Cost


## Set up the tokenizer, load the model and perform inference, step by step.

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize the tokenizer for GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Prepare input text
input_text = "The capital of France is"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate attention mask
attention_mask = tokenizer(input_text, return_tensors="pt").attention_mask

# Use the end-of-sequence token ID as the padding token ID (assigned unconditionally)
model.config.pad_token_id = model.config.eos_token_id

# Generate output. The attention mask computed above is passed explicitly, matching
# the AutoModelForCausalLM example in this notebook.
outputs = model.generate(input_ids, attention_mask=attention_mask, max_length=50)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated text:", generated_text)

**Do I need to look for the specific tokenizer and model classes for my tasks every time?**

In many cases, no. The architecture you want to use can be guessed from the name or the path of the pretrained model. Huggingface provides **AutoClasses** to help you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary.


In [None]:
# NOTE: AutoModel would give us the base model without a task head, so the
# slightly more specific AutoModelForCausalLM class is used instead.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Tokenizer and model are both resolved from the "gpt2" checkpoint name
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Encode the prompt once; the encoding carries both the IDs and the attention mask
input_text = "The capital of France is"
encoding = tokenizer(input_text, return_tensors="pt")
input_ids = encoding.input_ids
attention_mask = encoding.attention_mask

# Use the end-of-sequence token ID as the padding token ID
model.config.pad_token_id = model.config.eos_token_id

# Generate a continuation and turn the first returned sequence back into text
outputs = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=50,
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated text:", generated_text)

# Key outputs from a language model
- Logits: The raw, unnormalized scores for each vocabulary token at each _next_ position in the output sequence. By default, the model's forward pass returns the logits.
- Hidden States: Representations from each layer of the model. These are the activations of the model's neurons at each layer. Set `output_hidden_states=True` in the configuration or when calling the model to obtain Hidden States.
- Attentions: Attention weights from each layer of the model. These weights show how much each token in a sequence attends to every other token at each layer. Set `output_attentions=True` in the configuration or when calling the model to obtain Attentions.

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and the base model from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained("bert-base-cased")

# Encode the prompt as a tensor of token IDs
input_text = "The capital of France is"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Run a forward pass and show the last hidden state
outputs = model(input_ids)
print(outputs.last_hidden_state)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Initialize the tokenizer for GPT-2
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Load the pre-trained GPT-2 model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Prepare input text
input_text = "The capital of France is"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate attention mask
attention_mask = tokenizer(input_text, return_tensors="pt").attention_mask

# Use the end-of-sequence token ID as the padding token ID
model.config.pad_token_id = model.config.eos_token_id

# Run a single forward pass (no text is generated here). The attention mask computed
# above is passed explicitly, and the extra outputs are requested via keyword flags.
outputs = model(
    input_ids,
    attention_mask=attention_mask,
    output_hidden_states=True,
    output_attentions=True,
)

print("logits:", outputs.logits)
print("Attentions:", outputs.attentions)

## **Exercise 3**: What are the dimensions of the output logits? What does each of them correspond to?

In [None]:
# **********************************************************
# YOUR CODE HERE
# **********************************************************

## **Exercise 4**: With what probability will the model continue its generation with "Paris"?

Hint: find out which token IDs the word "Paris" consists of, then look up their logits in the model output.  
To convert these into a normalized probability distribution per output step, we need to apply a softmax activation over the logits for the same output step.

In [None]:
# **********************************************************
# YOUR CODE HERE
# **********************************************************

# Configurations

## Model configuration
Hyperparameters to change a model's architecture.

In [None]:
from transformers import GPT2Model,GPT2Config

# Default configuration: no `config` argument is passed, so the model is loaded
# with whatever configuration belongs to the "gpt2" checkpoint.
model = GPT2Model.from_pretrained("gpt2")
# The bare expression below is what the notebook shows as the cell output.
model

In [None]:
# Show the configuration object attached to the model loaded in the previous cell.
model.config

In [None]:
# Create a custom configuration that overrides the number of layers and attention heads.
# GPT2Config is imported in an earlier cell of this section.
config = GPT2Config(
    n_layer=6,
    n_head=8
)
# Load model with custom configuration
# NOTE(review): the "gpt2" checkpoint was presumably trained with a different
# layer/head layout, so loading may warn about unused weights - confirm.
model = GPT2Model.from_pretrained("gpt2", config=config)
# The bare expression below is what the notebook shows as the cell output.
model

## Generating/Inference configuration

**Different decoding strategies**:

https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb

**Generation parameters**: 

https://huggingface.co/docs/transformers/v4.35.2/en/main_classes/text_generation#transformers.GenerationConfig


In [None]:
import torch
import transformers

model = "gpt2"

# Call the factory as `transformers.pipeline`, like the earlier cells do, so that the
# factory function is not shadowed by the pipeline instance it returns.
# `trust_remote_code=True` was dropped: it permits running code shipped inside a model
# repository and should only be enabled for checkpoints that are known to require it.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float32,
)

# Sample three continuations. The pad token ID is taken from the pipeline's own
# tokenizer, so this cell does not depend on a `tokenizer` left over from earlier cells.
sequences = pipeline(
    'I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n',
    do_sample=True,
    top_k=20,
    pad_token_id=pipeline.tokenizer.eos_token_id,
    temperature=1.0,
    max_length=50,
    num_return_sequences=3,
)
for seq in sequences:
    # Single quotes inside the f-string keep it valid on Python versions before 3.12.
    print(f"Result: {seq['generated_text']}\n")

# Next steps

You now know how to call language models using the `transformers` library, the steps that are taken under the hood, and have seen how models can be configured.

We will now fine-tune an LLM on a new dataset. For the next notebook (`notebook2/llama3_gpu_recommended.ipynb`), you will need a GPU.