# Let's Load and Run an LLM

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Load the causal language model; its tokenizer is loaded separately in a later cell.

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",  # place the model on the GPU
    torch_dtype="auto",  # presumably picks the dtype stored with the checkpoint — confirm in the transformers docs
    trust_remote_code=True  # allows code shipped in the model repo to run; only enable for trusted sources
)

In [None]:
# Load the tokenizer published under the same model ID as the model loaded above.
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

In [None]:
prompt = "What is tokenization: <|assistant|>"

# Tokenize the prompt into a PyTorch tensor of token IDs and move it to the
# device the model lives on (rather than hard-coding "cuda" a second time),
# so the inputs keep following the model if its placement ever changes.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)


In [None]:
# Show the tensor of token IDs that the tokenizer produced for the prompt.
print(input_ids)

In [None]:
# Decode every ID on its own to show the piece of text behind each token.
for token_id in input_ids[0]:
    print(tokenizer.decode(token_id))

# Comparing Tokenization Across Different Pre-trained Models

In [None]:
def load_tokenizer(model_id):
    """Load and return the pre-trained tokenizer for ``model_id``.

    Args:
        model_id: Model identifier passed straight to
            ``AutoTokenizer.from_pretrained``.

    Returns:
        The tokenizer object created by ``AutoTokenizer.from_pretrained``.
    """
    return AutoTokenizer.from_pretrained(model_id)

def tokenize_sentence(tokenizer, input_sentence):
    """Return the token strings a tokenizer produces for a sentence.

    Args:
        tokenizer: Callable tokenizer whose result exposes ``input_ids`` and
            which provides ``convert_ids_to_tokens``.
        input_sentence: Text to tokenize.

    Returns:
        The result of ``tokenizer.convert_ids_to_tokens`` applied to the
        sentence's ``input_ids``.
    """
    # Encode first, then map the IDs back to their token strings so the
    # output shows exactly how the sentence was split.
    encoding = tokenizer(input_sentence)
    return tokenizer.convert_ids_to_tokens(encoding.input_ids)

## BERT base uncased
Tokenization Method: WordPiece

In [None]:
# Tokenize the sample sentence with the bert-base-uncased tokenizer and print the tokens.
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
model_id = "bert-base-uncased"
tokenizer = load_tokenizer(model_id)
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
print(tokenized_sentence)

## BERT base cased
Tokenization Method: WordPiece

In [None]:
# Tokenize the sample sentence with the bert-base-cased tokenizer and print the tokens.
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
model_id = "bert-base-cased"
tokenizer = load_tokenizer(model_id)
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
print(tokenized_sentence)

## GPT-2
Tokenization Method: Byte-level Byte Pair Encoding (BPE)

In [None]:
# Tokenize the sample sentence with the gpt2 tokenizer and print the tokens.
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
model_id = "gpt2"
tokenizer = load_tokenizer(model_id)
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
print(tokenized_sentence)

## Flan-T5
Tokenization Method: SentencePiece

In [None]:
# Tokenize the sample sentence with the google/flan-t5-small tokenizer and print the tokens.
input_sentence = "I love Large Language Models. This is a tutorial on tokenization."
model_id = "google/flan-t5-small"
tokenizer = load_tokenizer(model_id)
tokenized_sentence = tokenize_sentence(tokenizer, input_sentence)
print(tokenized_sentence)