Get word embeddings for each of the 180 concepts from the Pereira dataset,
using 4 LLMs:


1.   Llama-3.1-8B-Instruct (text-only)
2.   Llama-3.2-11B-Vision-Instruct (text-image)
3.   Phi-3.5-mini-instruct (text-only)
4.   Phi-3.5-vision-instruct (text-image)



In [1]:
# Mount Google Drive at /content/drive. Later cells read the stimulus list from,
# and write the embeddings CSV to, /content/drive/MyDrive/CS375Final/.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# define word data: one concept per line in a plain-text file on Drive
words_path = "/content/drive/MyDrive/CS375Final/stimuli_180concepts.txt"
# Explicit encoding so the read does not depend on the runtime's locale.
# Blank lines (e.g. a trailing newline at the end of the file) are skipped
# instead of turning into empty-string "words".
with open(words_path, "r", encoding="utf-8") as f:
  words = [line.strip() for line in f if line.strip()]

# The rest of the notebook assumes exactly 180 concepts.
assert len(words) == 180, f"expected 180 concepts, found {len(words)}"
print(words[:10])

['ability', 'accomplished', 'angry', 'apartment', 'applause', 'argument', 'argumentatively', 'art', 'attitude', 'bag']


In [3]:
# dependencies for LLMs
!pip install torch transformers accelerate pandas

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [4]:
# huggingface access
!pip install huggingface_hub
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
The token `general_read` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `general_read

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Short model name -> Hugging Face checkpoint id.
# Insertion order is the order in which the models are processed later on:
# the two Phi checkpoints first, then the two Llama checkpoints.
models = dict(
    [
        ("phi-3.5", "microsoft/Phi-3.5-mini-instruct"),
        ("phi-3.5-vision", "microsoft/Phi-3.5-vision-instruct"),
        ("llama-3.1", "meta-llama/Meta-Llama-3.1-8B-Instruct"),
        ("llama-3.2-vision", "meta-llama/Llama-3.2-11B-Vision-Instruct"),
    ]
)

In [6]:
import numpy as np

# function to get embeddings
def get_embedding(model, tokenizer, word):
    """Return one embedding vector for ``word`` from the model's last layer.

    The word is tokenized on its own (no surrounding context), passed through
    the model once, and the last layer's hidden states are averaged over the
    token axis. Every position the tokenizer produces is included in the
    average, including any special tokens it adds by default.

    Args:
        model: Callable model that accepts the tokenizer's outputs as keyword
            arguments plus ``output_hidden_states=True`` and returns an object
            with a ``hidden_states`` sequence (e.g. a Hugging Face causal LM).
        tokenizer: Callable that maps ``word`` to a mapping of PyTorch tensors
            when called with ``return_tensors="pt"``.
        word: The text to embed.

    Returns:
        A NumPy array holding the mean-pooled last-layer hidden state. For a
        single word this is 1-D with length equal to the hidden size. The
        array keeps the model's dtype, except that bfloat16 (which NumPy
        cannot represent) is converted to float32.
    """
    # Run on whatever device the model lives on. Objects that do not expose
    # ``.device`` fall back to CPU, which was the previously hard-coded target.
    device = getattr(model, "device", torch.device("cpu"))

    inputs = tokenizer(word, return_tensors="pt")  # Tokenize the input
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]  # Last layer hidden state
        # Assumes hidden states are laid out as (batch, tokens, hidden):
        # average over tokens, then drop only the batch axis so a hidden
        # size of 1 is not squeezed away as well.
        pooled = last_hidden_state.mean(dim=1).squeeze(0)
        if pooled.dtype == torch.bfloat16:
            # Tensor.numpy() does not support bfloat16.
            pooled = pooled.float()
        # .cpu() is a no-op for CPU tensors and required for any other device.
        word_embedding = pooled.cpu().numpy()

    return word_embedding

In [7]:
# Result store, indexed as embeddings[word][model_name].
# Each word gets its own (initially empty) per-model dict.
embeddings = dict((word, {}) for word in words)

In [9]:
import gc

for model_name, model_id in models.items():
  print(f"Processing for model {model_name}...")

  # Load models and tokenizers one at a time to keep memory usage down.
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  model_object = AutoModelForCausalLM.from_pretrained(
      model_id,
      # NOTE: allows checkpoints that ship their own modeling code to load;
      # this executes code downloaded from the model repository.
      trust_remote_code=True,
      torch_dtype=torch.float16,
      _attn_implementation="eager",  # disable FlashAttention2
  ).to("cpu")

  # get and save embeddings
  for word in words:
    embeddings[word][model_name] = get_embedding(model_object, tokenizer, word)

  # Sanity check: print this model's embedding size, taken from the last word
  # explicitly rather than from the leaked inner-loop variable.
  if words:
    print(len(embeddings[words[-1]][model_name]))

  # delete and clear cache for memory constraints
  del model_object, tokenizer
  gc.collect()


Processing for model phi-3.5...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

3072
Processing for model phi-3.5-vision...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]



3072
Processing for model llama-3.1...


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

4096
Processing for model llama-3.2-vision...


tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/5.07k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/89.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

4096


In [10]:
# convert embeddings to a DataFrame
import pandas as pd

# One record per word: the word itself plus, for every model, that word's
# vector flattened into a single comma-separated string.
embedding_data = []
for word in words:
    per_model = embeddings[word]
    row = {"Word": word}
    row.update(
        (model_name, ",".join(str(value) for value in per_model[model_name]))
        for model_name in models
    )
    embedding_data.append(row)

df = pd.DataFrame(embedding_data)

# Save CSV to Google Drive
csv_path = "/content/drive/MyDrive/CS375Final/word_embeddings.csv"
df.to_csv(csv_path, index=False)

print(f"Saved embeddings to {csv_path}")


Saved embeddings to /content/drive/MyDrive/CS375Final/word_embeddings.csv
