In [None]:
!pip install transformers huggingface_hub
!pip install git+https://github.com/huggingface/transformers.git
!pip install git+https://github.com/huggingface/accelerate.git
!pip install bitsandbytes

In [13]:
import torch
import torch
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import pandas as pd
from huggingface_hub import login
from google.colab import userdata

In [9]:
df = pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")
df.dropna(inplace=True)

In [10]:
# only keep the columns we need - album_name, track_name, artists, popularity, track_genre
df = df[["album_name", "track_name", "artists", "popularity", "track_genre"]]

# only keep the top 10 genres
top_genres = df["track_genre"].value_counts().head(5).index
df = df[df["track_genre"].isin(top_genres)]

unique_genres = df["track_genre"].unique()
num_genres = len(unique_genres)
unique_genres, num_genres

# randomly sample 40 tracks from each genre
df = df.groupby("track_genre").apply(lambda x: x.sample(40)).reset_index(drop=True)


# create a new column which contains the concatenation of the album_name, track_name and artists in format "track name: {track_name} | album name: {album_name} | artists: {artists}"
df["track_info"] = df.apply(lambda x: f"track name: {x['track_name']} | album name: {x['album_name']} | artists: {x['artists']}", axis=1)

df.head()

Unnamed: 0,album_name,track_name,artists,popularity,track_genre,track_info
0,Bookmarks,What If,Five For Fighting,55,acoustic,track name: What If | album name: Bookmarks | ...
1,Crustfall,I Wanna See It Burn,Days N Daze;Juicy Karkass,29,acoustic,track name: I Wanna See It Burn | album name: ...
2,Indiana,Beautiful Disaster,Jon McLaughlin,52,acoustic,track name: Beautiful Disaster | album name: I...
3,Nightshade,You're the Sea,Andrew Belle,56,acoustic,track name: You're the Sea | album name: Night...
4,Heaven (Acoustic),Heaven - Acoustic,Grace George,54,acoustic,track name: Heaven - Acoustic | album name: He...


In [14]:
login(userdata.get('HF_TOKEN'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Load the LLaMA 3 model and tokenizer
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModel.from_pretrained(model_id, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_id)


In [None]:
# for every track_info, get the embeddings and store then in a new column called first_layer_embedding, middle_layer_embedding, last_layer_embedding
def get_embeddings(track_info):
    inputs = tokenizer(track_info, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.hidden_states
    first_layer_embeddings = hidden_states[1]
    middle_layer_index = len(hidden_states) // 2
    middle_layer_embeddings = hidden_states[middle_layer_index]
    last_layer_embeddings = hidden_states[-1]
    final_token_first_layer = first_layer_embeddings[:, -1, :]
    final_token_middle_layer = middle_layer_embeddings[:, -1, :]
    final_token_last_layer = last_layer_embeddings[:, -1, :]
    return final_token_first_layer, final_token_middle_layer, final_token_last_layer

df[["first_layer_embedding", "middle_layer_embedding", "last_layer_embedding"]] = df["track_info"].apply(get_embeddings).apply(pd.Series)

In [None]:
# save the dataframe to a pickle file
df.to_pickle("spotify_tracks_embeddings.pkl")

In [2]:
import pandas as pd

In [3]:
# load spotify_tracks_embeddings.pkl
df = pd.read_pickle("spotify_tracks_embeddings.pkl")
df.head()

Unnamed: 0,album_name,track_name,artists,popularity,track_genre,track_info,first_layer_embedding,middle_layer_embedding,last_layer_embedding
0,Darkness Within,Darkness Within,Michael Logen,33,acoustic,track name: Darkness Within | album name: Dark...,"[[tensor(0.0074, dtype=torch.float16), tensor(...","[[tensor(-0.1501, dtype=torch.float16), tensor...","[[tensor(-2.2715, dtype=torch.float16), tensor..."
1,ショッピング,アジアの純真,Yosui Inoue;Tamio Okuda,32,acoustic,track name: アジアの純真 | album name: ショッピング | arti...,"[[tensor(-6.2943e-05, dtype=torch.float16), te...","[[tensor(-0.1089, dtype=torch.float16), tensor...","[[tensor(-0.1013, dtype=torch.float16), tensor..."
2,Arcade,Arcade,Andrew Foy;Renee Foy,37,acoustic,track name: Arcade | album name: Arcade | arti...,"[[tensor(-0.0021, dtype=torch.float16), tensor...","[[tensor(-0.0660, dtype=torch.float16), tensor...","[[tensor(-1.2715, dtype=torch.float16), tensor..."
3,Here With Me (Acoustic),Here With Me - Acoustic,Daniel Robinson,55,acoustic,track name: Here With Me - Acoustic | album na...,"[[tensor(0.0126, dtype=torch.float16), tensor(...","[[tensor(-0.1462, dtype=torch.float16), tensor...","[[tensor(-1.6318, dtype=torch.float16), tensor..."
4,Violent Femmes,Good Feeling,Violent Femmes,47,acoustic,track name: Good Feeling | album name: Violent...,"[[tensor(-0.0027, dtype=torch.float16), tensor...","[[tensor(0.1115, dtype=torch.float16), tensor(...","[[tensor(-0.7964, dtype=torch.float16), tensor..."


In [6]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# define the features and target
features = ["first_layer_embedding", "middle_layer_embedding", "last_layer_embedding"]
target = "popularity"

# define the models
models = {
    "first_layer_embedding": LinearRegression(),
    "middle_layer_embedding": LinearRegression(),
    "last_layer_embedding": LinearRegression(),
}

# train the models
for layer, model in models.items():
    # Extract and reshape the embeddings
    X = pd.DataFrame(df[layer].apply(lambda x: x.reshape(-1)).tolist())
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print(f"{layer} MSE: {mse}")

first_layer_embedding MSE: 6327.303394520869
middle_layer_embedding MSE: 435.88717599995124
last_layer_embedding MSE: 328.398661831934


In [8]:
# train classifier to predict genre from embeddings, do this for all 3 layers

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd

# define the features and target
features = ["first_layer_embedding", "middle_layer_embedding", "last_layer_embedding"]
target = "track_genre"

# define the models
models = {
    "first_layer_embedding": LogisticRegression(),
    "middle_layer_embedding": LogisticRegression(),
    "last_layer_embedding": LogisticRegression(),
}

# train the models
for layer, model in models.items():
    # Extract and reshape the embeddings
    X = pd.DataFrame(df[layer].apply(lambda x: x.reshape(-1)).tolist())
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{layer} Accuracy: {acc}")

first_layer_embedding Accuracy: 0.3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


middle_layer_embedding Accuracy: 0.775
last_layer_embedding Accuracy: 0.875


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
