<a href="https://colab.research.google.com/github/pinzger/handsonllms/blob/main/Tokens%20and%20Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Tokens and Embeddings

Example code covers:
   * Using the tokenizer "microsoft/Phi-3-mini-4k-instruct".
   * Comparison of the output of existing tokenizers.
   * Getting the embeddings from an existing LLM.
   * Getting the embeddings with "glove-wiki-gigaword-50".
   * Training a Word2Vec model for recommending songs.
   
Examples adapted from Chapter 2 of [Hands-On Large Language Models](https://www.amazon.com/Hands-Large-Language-Models-Understanding/dp/1098150961).

---

💡 **NOTE**: To use a GPU in Google Colab, go to
**Runtime > Change runtime type > Hardware accelerator > GPU > GPU type > T4**.

---

If you are viewing this notebook on Google Colab (or any other cloud vendor), you might need to **run** the following code block to install the dependencies for this chapter (uncomment `%%capture` to hide the installer output):

In [None]:
# %%capture
!pip install gensim>=4.3.2 scikit-learn>=1.5.0 accelerate>=0.31.0

# Using a tokenizer

Note that we load the model and tokenizer separately and keep them as such so that we can explore them separately.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal language model and its tokenizer from the same checkpoint.
# They are kept as two separate objects so each can be explored on its own.
checkpoint = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map="cuda", torch_dtype="auto", trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
prompt = "Write an email apologizing to Sarah for the tragic gardening mishap. Explain how it happened.<|assistant|>"

# Tokenize the prompt; return_tensors="pt" returns PyTorch tensors.
# The tensors are moved to the device the model lives on (instead of a
# hard-coded "cuda") so the cell keeps working if the model is loaded elsewhere.
input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
print(input_ids)


In [None]:
# Round trip: decode the first (only) sequence of token ids back into text
print(tokenizer.decode(input_ids["input_ids"][0]))

In [None]:
# Reuses `input_ids` (the tokenized prompt) from the tokenization cell above.

# Generate the text. The attention mask produced by the tokenizer is passed
# together with the token ids so that generate() does not have to guess it.
output_ids = model.generate(
    input_ids=input_ids["input_ids"],
    attention_mask=input_ids["attention_mask"],
    max_new_tokens=100,
    num_beams=5,
    early_stopping=True,
)

# Print the output
print(tokenizer.decode(output_ids[0]))

In [None]:
# Show the raw token ids of the first generated sequence (before decoding)
print(output_ids[0])

In [None]:
# Decode a handful of token ids one at a time, including one pair of ids
# ([3323, 622]) decoded together, and print each result on its own line.
for ids in (14350, 3323, 622, [3323, 622], 29901):
    print(tokenizer.decode(ids))

# Comparing Trained LLM Tokenizers


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# RGB triplets ("R;G;B") used as background colors, cycled token by token
colors_list = [
    '102;194;165',
    '252;141;98',
    '141;160;203',
    '231;138;195',
    '166;216;84',
    '255;217;47',
]


def show_tokens(sentence, tokenizer_name):
    """Print the tokens of `sentence`, each on its own colored background.

    Args:
        sentence: Text to tokenize.
        tokenizer_name: Name passed to `AutoTokenizer.from_pretrained` to
            load the tokenizer.

    The tokens are written to stdout separated by spaces, using ANSI escape
    sequences for the colors; nothing is returned.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    reset = '\x1b[0m'
    for position, token_id in enumerate(tokenizer(sentence).input_ids):
        # Black text on a 24-bit background color picked round-robin
        background = colors_list[position % len(colors_list)]
        start = f'\x1b[0;30;48;2;{background}m'
        print(start + tokenizer.decode(token_id) + reset, end=' ')

In [None]:
# Sample text used to compare tokenizers: capitalization, an emoji and a
# Chinese character, Python keywords/operators, runs of whitespace, numbers.
# The emoji/CJK line was mojibake (UTF-8 bytes decoded as cp1252) and is
# restored here so the tokenizers are tested on the intended characters.
text = """
English and CAPITALIZATION
🎵 鸟
show_tokens False None elif == >= else: two tabs:"    " Three tabs: "       "
12.0*50=600
"""

In [None]:
# Tokenize the sample text with the "bert-base-uncased" tokenizer
show_tokens(text, "bert-base-uncased")

In [None]:
# Same text with the "bert-base-cased" tokenizer, for comparison with the uncased variant
show_tokens(text, "bert-base-cased")

In [None]:
# Same text with the "gpt2" tokenizer
show_tokens(text, "gpt2")

In [None]:
# Same text with the "google/flan-t5-small" tokenizer
show_tokens(text, "google/flan-t5-small")

In [None]:
# The official tokenizer is `tiktoken`, but this is the same tokenizer hosted on the HF platform
show_tokens(text, "Xenova/gpt-4")

In [None]:
# You need to request access on the Hugging Face Hub before being able to use this tokenizer
show_tokens(text, "bigcode/starcoder2-15b")

In [None]:
# Same text with the "facebook/galactica-1.3b" tokenizer
show_tokens(text, "facebook/galactica-1.3b")

In [None]:
# Same text with the tokenizer of the model loaded at the top of the notebook
show_tokens(text, "microsoft/Phi-3-mini-4k-instruct")

## Getting the embedding of a word from the trained LLM
Requires the model and the tokenizer of "microsoft/Phi-3-mini-4k-instruct" loaded in the section "Using a tokenizer" above.


In [None]:
import torch

# Tokenize the given word
prompt = "germany"
# Keep the inputs on the same device as the model instead of hard-coding "cuda"
input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)
print("Tokens: ", input_ids)

# Print every token of the encoding instead of assuming there are exactly two
for token_id in input_ids["input_ids"][0]:
    print(tokenizer.decode(token_id))

# Run a forward pass and ask for the hidden states of all layers.
# Gradient calculation is disabled because it is not needed for inference.
with torch.no_grad():
    outputs = model(**input_ids, output_hidden_states=True)

# Last hidden state
last_hidden_state = outputs.hidden_states[-1]  # Shape: [batch_size, seq_len, hidden_dim]

# Vectors of all tokens of the first (only) sequence: [seq_len, hidden_dim]
embedding = last_hidden_state[0, :]

print(embedding)

# Different approach
# Extract embeddings for the first token (excluding special tokens)
# token_embeddings = last_hidden_state[:, 1:-1, :]  # Exclude [CLS] and [SEP] if present
# print(token_embeddings)

# Example: Get the embedding for the first word
#word_embedding = token_embeddings[0, 0, :]  # Shape: [hidden_dim]



# Using pre-trained Word Embeddings


In [None]:
import gensim.downloader as api

# Download embeddings (66MB, glove, trained on wikipedia, vector size: 50)
# Other options include "word2vec-google-news-300"
# More options at https://github.com/RaRe-Technologies/gensim-data
# NOTE: this rebinds `model`, which until here referred to the Phi-3 LLM
# loaded at the top of the notebook. Re-running any of the LLM cells after
# this point requires reloading that model first.
model = api.load("glove-wiki-gigaword-50")

In [None]:
# Look up and print the embedding vector stored for the word "king"
print(model['king'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the vector of a country and that of its capital.
# cosine_similarity expects 2-D inputs, hence the vectors wrapped in lists.
print(cosine_similarity([model['germany']], [model['berlin']]))
print(cosine_similarity([model['france']], [model['paris']]))

# Doing some arithmetic with the vectors:
# berlin - germany + france, then look up the 4 entries closest to the result
query = model['berlin'] - model['germany'] + model['france']
model.most_similar([query], topn=4)

In [None]:
# Doing the same using the positive and negative arguments:
# positive words contribute positively towards the similarity, negative words negatively
result = model.most_similar(positive=['france', 'berlin'], negative=['germany'], topn=4)
print(result)

In [None]:
# The 11 entries closest to the vector of 'paris'. The query is passed as a raw
# vector, so presumably 'paris' itself is among the results (verify in the output).
model.most_similar([model['paris']], topn=11)

In [None]:
# Note, takes quite some time to download
# NOTE(review): `modelw2v` is not used by any later cell visible in this notebook
modelw2v = api.load("word2vec-google-news-300")

# Recommending songs by embeddings
This approach is based on the assumption that songs that appear together in a playlist are similar. Each playlist is treated like a sentence, and each song in it like a word.

In [None]:
import pandas as pd
from urllib import request

# Get the playlist dataset file. The `with` block closes the HTTP response
# once its content has been read.
with request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/train.txt') as data:
    playlist_text = data.read().decode("utf-8")

# Parse the playlist dataset file. Skip the first two lines as
# they only contain metadata
lines = playlist_text.split('\n')[2:]

# Remove playlists with only one song; each playlist becomes a list of song ids (strings)
playlists = [s.rstrip().split() for s in lines if len(s.split()) > 1]

# Load song metadata: one tab-separated "id, title, artist" record per line
with request.urlopen('https://storage.googleapis.com/maps-premium/dataset/yes_complete/song_hash.txt') as songs_response:
    songs_file = songs_response.read().decode("utf-8").split('\n')
songs = [s.rstrip().split('\t') for s in songs_file]
songs_df = pd.DataFrame(data=songs, columns = ['id', 'title', 'artist'])
songs_df = songs_df.set_index('id')

In [None]:
# Show the first two playlists; each one is a list of song ids (as strings)
print( 'Playlist #1:\n ', playlists[0], '\n')
print( 'Playlist #2:\n ', playlists[1])

In [None]:
from gensim.models import Word2Vec

# Train our Word2Vec model
# each playlist is treated as a sentence (list of words), each song id as a word
# vector_size = dim. of the resulting feature vectors
# window = max. distance between current and predicted word within a sentence
# negative = use n noise words for negative sampling
# min_count = ignore all words with lower frequency than n
# workers = use n threads for training
# NOTE: this rebinds `model` once more (previously the GloVe vectors).
# NOTE(review): no seed is set, so the learned vectors, and therefore the
# recommendations below, can differ from run to run.
model = Word2Vec(
    playlists, vector_size=32, window=20, negative=20, min_count=1, workers=4
)

In [None]:
song_id = 2172

# Ask the model for songs similar to song #2172.
# Song ids are stored as strings in the model's vocabulary, hence str(song_id).
model.wv.most_similar(positive=str(song_id))

In [None]:
# Show the metadata of the query song. This uses `song_id` from the cell above
# instead of repeating the literal 2172, so both cells stay in sync.
# The id is used as a row position here, as in the rest of the notebook.
print(songs_df.iloc[song_id])

In [None]:
import numpy as np

def print_recommendations(song_id, topn=5):
    """Return the metadata rows of the songs most similar to `song_id`.

    Args:
        song_id: Id of the query song (int or str). It is converted to the
            string key under which the Word2Vec model stores the song.
        topn: Number of similar songs to return (default 5).

    Returns:
        The rows of `songs_df` for the recommended songs, most similar first.
    """
    neighbours = model.wv.most_similar(positive=str(song_id), topn=topn)
    # most_similar yields (song id as str, similarity) pairs. The ids are cast
    # to int because .iloc is positional and rejects string indexers.
    similar_songs = [int(similar_id) for similar_id, _score in neighbours]
    return songs_df.iloc[similar_songs]


In [None]:
# Show recommendations for song 2172 from Metallica (heavy metal);
# the returned DataFrame is displayed as the cell output
print_recommendations(2172)

In [None]:
# Print the metadata of song 842 from 2Pac (rap), then show its recommendations
print(songs_df.iloc[842])
print_recommendations(842)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Computing similarities using cosine similarity, always against song 842.
# cosine_similarity expects 2-D inputs, hence the vectors wrapped in lists.
anchor = model.wv['842']

# similar songs
for other_id in ('412', '5828'):
    print(cosine_similarity([anchor], [model.wv[other_id]]))

# different songs
print(cosine_similarity([anchor], [model.wv['2070']]))

# model.wv['842']