In [None]:
!pip install transformers huggingface_hub
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install bitsandbytes

In [13]:
import torch
import torch
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from google.colab import userdata

In [9]:
# Read the Spotify tracks CSV from the Hugging Face Hub and discard every row
# that contains at least one missing value.
DATASET_URL = "hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv"
df = pd.read_csv(DATASET_URL).dropna()

In [10]:
# Build a small, genre-balanced sample of tracks with a text description per track.
N_TOP_GENRES = 5        # number of most frequent genres to keep
TRACKS_PER_GENRE = 40   # tracks drawn from each kept genre
SAMPLE_SEED = 42        # fixed seed so a fresh "Restart & Run All" draws the same tracks

# Only keep the columns we need.
df = df[["album_name", "track_name", "artists", "popularity", "track_genre"]]

# Only keep the N_TOP_GENRES most frequent genres.
top_genres = df["track_genre"].value_counts().head(N_TOP_GENRES).index
df = df[df["track_genre"].isin(top_genres)]

unique_genres = df["track_genre"].unique()
num_genres = len(unique_genres)
# A bare expression in the middle of a cell is not displayed, so report explicitly.
print(f"Keeping {num_genres} genres: {list(unique_genres)}")

# Randomly sample TRACKS_PER_GENRE tracks from each genre.
# groupby().sample() does this directly (no per-group lambda) and accepts a seed.
# It raises ValueError if a genre has fewer rows than requested.
df = (
    df.groupby("track_genre")
    .sample(n=TRACKS_PER_GENRE, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)

# One descriptive string per track, in the format
# "track name: {track_name} | album name: {album_name} | artists: {artists}".
df["track_info"] = [
    f"track name: {track} | album name: {album} | artists: {artists}"
    for track, album, artists in zip(df["track_name"], df["album_name"], df["artists"])
]

df.head()

Unnamed: 0,album_name,track_name,artists,popularity,track_genre,track_info
0,Bookmarks,What If,Five For Fighting,55,acoustic,track name: What If | album name: Bookmarks | ...
1,Crustfall,I Wanna See It Burn,Days N Daze;Juicy Karkass,29,acoustic,track name: I Wanna See It Burn | album name: ...
2,Indiana,Beautiful Disaster,Jon McLaughlin,52,acoustic,track name: Beautiful Disaster | album name: I...
3,Nightshade,You're the Sea,Andrew Belle,56,acoustic,track name: You're the Sea | album name: Night...
4,Heaven (Acoustic),Heaven - Acoustic,Grace George,54,acoustic,track name: Heaven - Acoustic | album name: He...


In [14]:
# Read the token stored under the name HF_TOKEN via google.colab's `userdata`
# and use it to log in to the Hugging Face Hub.
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load the LLaMA 3 model and tokenizer, run one sentence through the model and
# pull out the final-token embedding from the first, middle and last layers.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# An 8B-parameter model in the default float32 needs roughly 32 GB for the weights
# alone. When a GPU is present, load in float16 and let accelerate place the layers
# (device_map="auto" can spill to CPU if the GPU is too small). Without a GPU the
# original float32 / CPU loading path is kept unchanged.
use_gpu = torch.cuda.is_available()
model = AutoModel.from_pretrained(
    model_id,
    output_hidden_states=True,  # make the forward pass return every layer's hidden state
    torch_dtype=torch.float16 if use_gpu else torch.float32,
    device_map="auto" if use_gpu else None,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Define the input sentence
sentence = "Tell me about Paris."

# Tokenize the input sentence and put the tensors on the device the model expects
inputs = tokenizer(sentence, return_tensors="pt").to(model.device)

# Perform a forward pass through the model (no gradients needed for feature extraction)
with torch.no_grad():
    outputs = model(**inputs)

# Extract all hidden states
hidden_states = outputs.hidden_states  # This is a tuple of hidden states from all layers

# Extract embeddings from the first, middle, and last layers
first_layer_embeddings = hidden_states[1]  # First hidden layer (index 1, as index 0 is the input embeddings)
middle_layer_index = len(hidden_states) // 2
middle_layer_embeddings = hidden_states[middle_layer_index]  # Middle hidden layer
last_layer_embeddings = hidden_states[-1]  # Last hidden layer

# For each layer, extract the embedding of the final token in the sequence.
# Position -1 is the real last token here because a single sentence is encoded
# without padding; a padded batch would need the attention mask to locate it.
final_token_first_layer = first_layer_embeddings[:, -1, :]
final_token_middle_layer = middle_layer_embeddings[:, -1, :]
final_token_last_layer = last_layer_embeddings[:, -1, :]

# Print out the shape of the extracted embeddings
print("First layer embedding shape:", final_token_first_layer.shape)
print("Middle layer embedding shape:", final_token_middle_layer.shape)
print("Last layer embedding shape:", final_token_last_layer.shape)
