## Getting the embedding of a word from the trained LLM
Requires the tokenizer "microsoft/Phi-3-mini-4k-instruct".


---



In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cpu",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.94M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

In [5]:
import torch

# Toeknize the given word
prompt = "germany"
input_ids = tokenizer(prompt, return_tensors="pt").to("cpu")
print("Tokens: ", input_ids)
print(tokenizer.decode(input_ids["input_ids"][0][0]))
print(tokenizer.decode(input_ids["input_ids"][0][1]))

Tokens:  {'input_ids': tensor([[22593,  1384]]), 'attention_mask': tensor([[1, 1]])}
germ
any


In [6]:
# Getting the embedding of the input_ids
# disables gradient calculation because it is not needed for inference
with torch.no_grad():
  outputs = model(**input_ids, output_hidden_states=True)
print(outputs)

CausalLMOutputWithPast(loss=None, logits=tensor([[[12.6250, 10.8125, 11.9375,  ...,  9.1875,  9.1875,  9.1875],
         [31.5000, 30.2500, 33.5000,  ..., 27.2500, 27.2500, 27.2500]]]), past_key_values=((tensor([[[[-2.6367e-01,  5.2002e-02, -4.9561e-02,  ...,  3.7842e-02,
           -2.3926e-01, -1.0156e-01],
          [ 7.7148e-02,  1.2109e-01, -1.3086e-01,  ..., -2.0020e-01,
            4.2383e-01,  2.5391e-01]],

         [[ 1.2500e-01, -2.0215e-01, -1.7773e-01,  ...,  3.7109e-02,
            8.0566e-02,  1.7773e-01],
          [-1.5039e-01,  1.4453e-01,  3.5156e-02,  ..., -4.7607e-02,
           -1.1963e-01,  1.6724e-02]],

         [[-7.3438e-01, -3.3984e-01,  6.5625e-01,  ...,  1.6211e-01,
           -7.9102e-02,  3.7305e-01],
          [-1.0645e-01, -5.4688e-01,  2.9102e-01,  ...,  7.8583e-04,
            1.6113e-01,  5.2734e-02]],

         ...,

         [[-9.2773e-02,  1.7480e-01, -8.5449e-02,  ...,  2.7588e-02,
            3.2422e-01,  5.6396e-02],
          [-4.2578e-01, -3

#Logits
The raw output of the model for each position in the sequence, shaped [batch_size, sequence_length, vocab_size].

Each row (for each token) is a vocabulary-sized vector of unnormalized probabilities. Typically, you’d pass these logits through softmax to get probabilities for predicting the next word/token.

#past_key_values
A long tuple of tensors that stores the key and value states for each attention layer.

#hidden_states
Another tuple of tensors that represent the intermediate hidden layer outputs from each layer of the transformer



In [14]:
# print(outputs.hidden_states)
# Last hidden state
last_hidden_state = outputs.hidden_states[-1]  # Shape: [batch_size, seq_len, hidden_dim]

# Extract the embedding for the word (2 tokens)
embedding = last_hidden_state[0, :]

print(embedding)


tensor([[ 0.3340,  0.6289,  0.6055,  ...,  2.3125,  0.9922,  0.4453],
        [-0.4609,  1.2969,  0.7500,  ...,  0.0248,  0.0149, -0.2871]],
       dtype=torch.bfloat16)


# Using pre-trained Word Embeddings


In [21]:
!pip uninstall -y numpy scikit-learn gensim accelerate
!pip install numpy>=1.25.2 scikit-learn>=1.5.0 gensim>=4.3.2 accelerate>=0.31.0

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: scikit-learn 1.6.1
Uninstalling scikit-learn-1.6.1:
  Successfully uninstalled scikit-learn-1.6.1
Found existing installation: gensim 4.3.3
Uninstalling gensim-4.3.3:
  Successfully uninstalled gensim-4.3.3
Found existing installation: accelerate 1.5.2
Uninstalling accelerate-1.5.2:
  Successfully uninstalled accelerate-1.5.2


In [1]:
import gensim.downloader as api

# Download embeddings (66MB, glove, trained on wikipedia, vector size: 50)
# Other options include "word2vec-google-news-300"
# More options at https://github.com/RaRe-Technologies/gensim-data
model = api.load("glove-wiki-gigaword-50")
from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity([model['germany']], [model['berlin']]))
print(cosine_similarity([model['france']], [model['paris']]))

# Doing some arithmentic with the vectors
query = model['berlin'] - model['germany'] + model['france']
model.most_similar([query], topn=4)

[[0.7985422]]
[[0.802533]]


[('paris', 0.9114121794700623),
 ('france', 0.807920515537262),
 ('prohertrib', 0.7868965864181519),
 ('french', 0.7733355760574341)]

In [2]:
# Doing the same with using positive and negative arguments
# positive words contribute positively towards the similarity, negative words negatively
result = model.most_similar(positive=['france', 'berlin'], negative=['germany'], topn=4)
print(result)

[('paris', 0.9168643355369568), ('prohertrib', 0.7949979901313782), ('brussels', 0.7606425285339355), ('french', 0.7593673467636108)]


In [3]:
model.most_similar([model['paris']], topn=11)

[('paris', 1.0),
 ('prohertrib', 0.8611263036727905),
 ('france', 0.8025329113006592),
 ('brussels', 0.7796469926834106),
 ('amsterdam', 0.7769756317138672),
 ('french', 0.773611843585968),
 ('vienna', 0.7394115328788757),
 ('london', 0.7294350266456604),
 ('berlin', 0.7261149287223816),
 ('rome', 0.7099411487579346),
 ('strasbourg', 0.7078796029090881)]