<a href="https://colab.research.google.com/github/maanvic123/cs372-project/blob/main/final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Notebook Configuration**
1.   Install dependencies and libraries.
2.   Load input files and dataset.

In [None]:
# install dependencies and libraries

!pip install -q openai faiss-cpu numpy joblib requests tqdm h5py

In [None]:
# get API keys
# Credentials are looked up by name through google.colab.userdata, so no secret value
# is written into the notebook itself.

from google.colab import userdata

# used further down to construct the OpenAI client for the text embeddings
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
# NOTE(review): the two Spotify credentials are not referenced in the cells visible here —
# presumably they are used by a later part of the notebook; confirm.
SPOTIFY_CLIENT_ID = userdata.get('SPOTIFY_CLIENT_ID')
SPOTIFY_CLIENT_SECRET = userdata.get('SPOTIFY_CLIENT_SECRET')

In [None]:
# mount google drive
# The raw dataset CSV is read from (and description_texts.npy is written to) a folder under
# /content/drive, so Drive has to be mounted before the data-loading cell runs.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# clone and pull code files from git repo
# First run: clone the repository into REPO_DIR.
# Later runs: change into the existing checkout and pull the latest commits.
# NOTE(review): the working directory only changes in the `else` branch, so it differs between
# the first and subsequent runs. The cells below use absolute paths, so this looks harmless —
# confirm nothing depends on the current working directory.

import os
from pathlib import Path

REPO_DIR = Path("/content/project")
REPO_URL = "https://github.com/maanvic123/cs372-project.git"

if not REPO_DIR.exists():
  # no checkout yet -> fresh clone ({...} interpolates the Python variables into the shell command)
  !git clone {REPO_URL} {REPO_DIR}
else:
  # checkout already present -> update it in place
  %cd {REPO_DIR}
  !git pull

/content/project
Already up to date.


In [None]:
# load data and preprocessing files from git repo and google drive
from pathlib import Path
import os, shutil, joblib
import numpy as np
import pandas as pd

# ---- locations ----
# repository checkout inside the Colab VM
COLAB_DIR = Path('/content/project')

# Drive folder that holds the raw dataset
DRIVE_DIR = Path("/content/drive/MyDrive/DUKE | 2022 - 2026/2526 -- senior/CS 372 - Intro to Applied Machine Learning/372 final project")
raw_data_csv = DRIVE_DIR / "raw_spotify_data.csv"

# preprocessing outputs stored inside the repository checkout
ARTIFACTS_DIR = Path("/content/project/data/processed")
features_path = ARTIFACTS_DIR / "features.npy"
track_ids_path = ARTIFACTS_DIR / "track_ids.npy"
scaler_path = ARTIFACTS_DIR / "scaler.pkl"

# ---- raw dataset ----
df_raw = pd.read_csv(raw_data_csv)
print("raw dataset loaded")

# ---- preprocessing artifacts ----
X = np.load(features_path)
# allow_pickle=True permits an object-dtype array on disk; the ids are normalised to str either way
track_ids = np.load(track_ids_path, allow_pickle=True).astype(str)
scaler = joblib.load(str(scaler_path))
print("preprocessing artifacts loaded")

# ---- quick look at what was loaded ----
print("X shape:", X.shape)
print("num track IDs:", len(track_ids))

raw dataset loaded
preprocessing artifacts loaded
X shape: (1157421, 14)
num track IDs: 1157421


# **Create Textual Embeddings of Songs**
1.   Convert numerical audio features to words based on ranges.
2.   Add "description" column to concatenate all text audio features.
3. Use OpenAI "text-embedding-3-small" model to create textual embeddings of songs.



In [None]:
# build description column using audio features - map numerical values to textual descriptions

from math import isnan

# valence (0.0-1.0) - measure of track's happiness or positiveness
def valence_word(v):
  """Translate a valence score into a mood word.

  Args:
    v: valence on the 0.0-1.0 scale (anything float() accepts), or None/NaN when missing.

  Returns:
    One of "very sad", "sad", "neutral", "happy", "very happy".
    Missing values (None or NaN) map to "neutral".
  """
  if v is None: return "neutral"
  v = float(v)
  # pandas stores missing numbers as NaN, and every `<` comparison with NaN is False;
  # without this guard a missing valence would fall through to "very happy"
  if isnan(v): return "neutral"
  if v < 0.25: return "very sad"
  if v < 0.45: return "sad"
  if v < 0.55: return "neutral"
  if v < 0.75: return "happy"
  return "very happy"

# energy (0.0-1.0) - track's perceptual intensity, loudness, and activity
def energy_word(e):
  """Translate an energy score into an intensity word.

  Args:
    e: energy on the 0.0-1.0 scale (anything float() accepts), or None/NaN when missing.

  Returns:
    One of "calm", "relaxed", "energetic", "very energetic".
    Missing values (None or NaN) map to "moderate energy".
  """
  if e is None: return "moderate energy"
  e = float(e)
  # NaN compares False against every threshold, so it must be caught explicitly;
  # otherwise a missing energy would be labelled "very energetic"
  if isnan(e): return "moderate energy"
  if e < 0.25: return "calm"
  if e < 0.5: return "relaxed"
  if e < 0.75: return "energetic"
  return "very energetic"

# danceability (0.0-1.0) - how suitable a track is for dancing, based on tempo, beat strength, stability, and regularity
def danceability_word(d):
  """Translate a danceability score into a descriptive phrase.

  Args:
    d: danceability on the 0.0-1.0 scale (anything float() accepts), or None/NaN when missing.

  Returns:
    One of "not very danceable", "danceable", "highly danceable".
    Missing values (None or NaN) map to "unknown danceability".
  """
  if d is None: return "unknown danceability"
  d = float(d)
  # NaN compares False against every threshold, so it must be caught explicitly;
  # otherwise a missing score would be labelled "highly danceable"
  if isnan(d): return "unknown danceability"
  if d < 0.35: return "not very danceable"
  if d < 0.7: return "danceable"
  return "highly danceable"

# acoustic (0.0-1.0) - how likely a track is to be acoustic (natural, organic sounds like guitars, piano) versus electronic
def acousticness_word(a):
  """Translate an acousticness score into a production-style phrase.

  Args:
    a: acousticness on the 0.0-1.0 scale (anything float() accepts), or None/NaN when missing.

  Returns:
    "acoustic" (> 0.7), "mostly acoustic" (> 0.4) or "electronic or produced".
    Missing values (None or NaN) map to "mixed".
  """
  if a is None: return "mixed"
  a = float(a)
  # NaN compares False against every threshold, so it must be caught explicitly;
  # otherwise a missing score would be labelled "electronic or produced"
  if isnan(a): return "mixed"
  if a > 0.7: return "acoustic"
  if a > 0.4: return "mostly acoustic"
  return "electronic or produced"

# tempo (BPM) - speed or pace of the track
def tempo_word(t):
  """Translate a tempo in BPM into a pace phrase.

  Args:
    t: tempo in beats per minute (anything float() accepts), or None/NaN when missing.

  Returns:
    "slow tempo" (< 80), "mid tempo" (< 120) or "fast tempo".
    Missing values (None or NaN) map to "moderate tempo".
  """
  if t is None: return "moderate tempo"
  t = float(t)
  # NaN compares False against every threshold, so it must be caught explicitly;
  # otherwise a missing tempo would be labelled "fast tempo"
  if isnan(t): return "moderate tempo"
  if t < 80: return "slow tempo"
  if t < 120: return "mid tempo"
  return "fast tempo"


# derive one text column per audio feature: (numeric source column, new text column, converter)
for source_col, target_col, to_word in (
    ("valence", "valence_word", valence_word),
    ("energy", "energy_word", energy_word),
    ("danceability", "danceability_word", danceability_word),
    ("acousticness", "acousticness_word", acousticness_word),
    ("tempo", "tempo_word", tempo_word),
):
  # apply the converter value by value and store the result on the raw frame
  df_raw[target_col] = df_raw[source_col].apply(to_word)


# put words together to create textual description for song
def write_description(track):
  """Build a one-sentence natural-language description of a track.

  Args:
    track: mapping-like row exposing ``.get`` (e.g. a pandas Series or a dict) with the keys
      "track_name", "artist_name", "genre", "valence_word", "energy_word",
      "danceability_word", "acousticness_word" and "tempo_word".

  Returns:
    str of the form
    "<title> by <artist> is a track from the <genre> genre that feels <word>, <word>, ...".
    Fields that are missing (absent key, None or float NaN) are left out of the sentence
    instead of being rendered as the literal text "nan" / "None".
  """
  def clean(key):
    # str(None) / str(float("nan")) would otherwise leak "None" / "nan" into the embedded text
    value = track.get(key, "")
    if value is None or (isinstance(value, float) and value != value):
      return ""
    return str(value).strip()

  # textual metadata
  title = clean("track_name")
  artist = clean("artist_name")
  genre = clean("genre")

  # opening clause, degrading gracefully when title and/or artist is missing
  if title and artist:
    lead = f"{title} by {artist} is a track"
  elif title:
    lead = f"{title} is a track"
  elif artist:
    lead = f"A track by {artist}"
  else:
    lead = "A track"

  desc_parts = [lead]
  if genre != "":
    desc_parts.append(f"from the {genre} genre")

  # audio-feature words (locals are named *_text so they do not shadow the *_word converter functions)
  feeling_texts = [
    clean(key)
    for key in ("valence_word", "energy_word", "danceability_word", "acousticness_word", "tempo_word")
  ]
  feeling_texts = [text for text in feeling_texts if text]
  if feeling_texts:
    desc_parts.append("that feels")
    desc_parts.append(", ".join(feeling_texts))

  return " ".join(desc_parts)


# build the description sentence for every row and keep it as a new column
df_raw["description"] = df_raw.apply(write_description, axis=1)

# look-up table: track id (as str) -> description text
description_map = dict(
  zip(
    df_raw["track_id"].astype(str),
    df_raw["description"].astype(str),
  )
)

# order the descriptions like track_ids so that entry i belongs to track_ids[i];
# an id without a description falls back to the id itself
description_texts = [
  description_map.get(tid, str(tid))
  for tid in track_ids.tolist()
]

# persist the ordered descriptions to Drive as an object array
np.save(
  DRIVE_DIR / "description_texts.npy",
  np.array(description_texts, dtype=object),
)

print(f"computed and wrote {len(description_texts)} descriptions for songs in dataset.")
print("examples:", description_texts[:3])

computed and wrote 1157421 descriptions for songs in dataset.
examples: ["I Won't Give Up by Jason Mraz is a track from the acoustic genre that feels very sad, relaxed, danceable, mostly acoustic, fast tempo", '93 Million Miles by Jason Mraz is a track from the acoustic genre that feels netural, relaxed, danceable, mostly acoustic, fast tempo', 'Do Not Let Me Go by Joshua Hyslop is a track from the acoustic genre that feels very sad, calm, danceable, electronic or produced, fast tempo']


In [None]:
# create textual embeddings of songs with OpenAI text-embedding-3-small model

import openai, time, math
from google.colab import files
from tqdm import tqdm
import numpy as np
import h5py

# init openAI client
client = openai.OpenAI(api_key=OPENAI_API_KEY)

# set openAI textual embedding model and hyperparameters
EMBEDDING_MODEL = "text-embedding-3-small"
# HDF5 file that receives one embedding row per song
OUT_EMBEDDING_PATH = COLAB_DIR / 'song_text_embeddings.h5'
# number of descriptions sent per API request; also used as the row count of one HDF5 chunk
BATCH_SIZE = 512
# width of each embedding row in the HDF5 dataset (must equal the model's output dimension)
EMBEDDING_DIM = 1536

# load description_texts file
# (written by the description cell; saved with dtype=object, hence allow_pickle=True)
description_texts = np.load(Path(DRIVE_DIR / "description_texts.npy"), allow_pickle=True)


# function to get textual embeddings for 1 batch - batch_texts: sequence of str -> list of len(batch_texts) embeddings (one list of floats each)
def get_openai_embedding(batch_texts):
  """Embed one batch of texts with the OpenAI embeddings endpoint.

  Args:
    batch_texts: sequence of strings (list, tuple or a numpy array slice).

  Returns:
    list[list[float]]: one embedding per input text, in the same order as the input.
    An empty input yields an empty list without calling the API.
  """
  # hand the client a plain list of built-in str, whatever container/scalar type came in
  texts = [str(text) for text in batch_texts]
  if not texts:
    return []

  res = client.embeddings.create(model=EMBEDDING_MODEL, input=texts)

  # every result carries the index of the input it belongs to; sorting on it ties
  # output row i to input i explicitly instead of relying on response order
  ordered = sorted(res.data, key=lambda item: item.index)
  return [item.embedding for item in ordered]


# write embeddings to h5 file if not already computed
# The embeddings are first written to a temporary ".partial" file that is renamed to
# OUT_EMBEDDING_PATH only after every batch succeeded. Without this, a failure mid-loop
# would leave a partly filled file behind, and the exists() check would skip the computation
# on the next run and silently treat the remaining all-zero rows as real embeddings.
if not OUT_EMBEDDING_PATH.exists():
  n = len(description_texts)
  partial_path = OUT_EMBEDDING_PATH.with_name(OUT_EMBEDDING_PATH.name + ".partial")

  # mode "w" truncates any stale partial file left by an earlier failed run
  with h5py.File(str(partial_path), "w") as f:
    # create dataset: one float32 row per song, stored in gzip-compressed chunks of one batch each
    dset = f.create_dataset(
      "embeddings",
      shape=(n, EMBEDDING_DIM),
      dtype="float32",
      compression="gzip",
      chunks=(min(BATCH_SIZE, n), EMBEDDING_DIM))

    # embedding loop
    for i in tqdm(range(0, n, BATCH_SIZE), desc="Embedding"):
      # define and get embedding for batch (plain list of texts rather than a numpy slice)
      batch = list(description_texts[i : i + BATCH_SIZE])
      batch_embeddings = get_openai_embedding(batch)
      batch_arr = np.array(batch_embeddings, dtype="float32")

      # fail loudly if the API returned the wrong number of rows or an unexpected dimension,
      # rather than writing misaligned data
      if batch_arr.shape != (len(batch), EMBEDDING_DIM):
        raise ValueError(
          f"unexpected embedding shape {batch_arr.shape} for batch starting at {i}; "
          f"expected {(len(batch), EMBEDDING_DIM)}")

      # write batch embedding to dataset
      dset[i : i + batch_arr.shape[0], :] = batch_arr

  # every batch was written: publish the finished file under its final name
  partial_path.replace(OUT_EMBEDDING_PATH)

# all embeddings computed
print("finished all embeddings!")

# print shape of embeddings (1157421 song embeddings, 1536 embedding dimension)
with h5py.File(OUT_EMBEDDING_PATH, 'r') as f:
  if 'embeddings' in f:
    dataset = f['embeddings']
    print(f"shape of dataset of song embeddings: {dataset.shape}")

    # verify embeddings
    print(f"example embedding: {dataset[0]}")

# download embeddings file to local computer
files.download(OUT_EMBEDDING_PATH)
print("embeddings downloaded to computer")

finished all embeddings!
shape of dataset of song embeddings: (1157421, 1536)
example embedding: [-0.0170249   0.04692689 -0.02582091 ... -0.01914884  0.03994346
 -0.01601297]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

embeddings downloaded to computer


# **Similarity Search (FAISS)**
*Find vectors (tracks) with highest similarity to query vector (user-inputted mood embedding) from 1M+ vectors (tracks) using FAISS (Facebook AI Similarity Search).*

1. After L2-normalizing song numerical vectors, cosine similarity is the same as dot product: cosine(x,y) = dot(x,y). FAISS supports fast, efficient dot product computation.
2. FAISS organizes songs into groups (clusters) based on vector similarity; each cluster is represented by its center point, called a "centroid".
3. Given user-inputted mood embedding, FAISS searches within the most relevant groups for specific song vectors to output as recommendations.

In [None]:
# compute L2 normalizations of song embeddings in blocks of vectors

import h5py
import numpy as np
from pathlib import Path
import cupy as cp

EMBEDDINGS_PATH = COLAB_DIR / 'song_text_embeddings.h5'

# block size - load 10k vectors at a time for normalization
BLOCK_SIZE = 10000

# The file is opened in "r+" mode and each block is overwritten in place with its normalized
# version. Re-running the cell is harmless: normalizing an already unit-length vector leaves it unchanged.
with h5py.File(str(EMBEDDINGS_PATH), "r+") as f:
  # get embeddings dataset from h5py file
  dset = f["embeddings"]
  N, D = dset.shape

  # normalize vectors in blocks of BLOCK_SIZE
  for i in range(0, N, BLOCK_SIZE):
    # load block of vectors from dataset and move to GPU
    # (named block_gpu rather than X so the feature matrix X from the loading cell is not overwritten)
    j = min(N, i + BLOCK_SIZE)
    block_gpu = cp.asarray(dset[i:j].astype(np.float32))

    # calculate L2 norm for each vector in block
    norms = cp.linalg.norm(block_gpu, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero rows are left as they are instead of dividing by zero

    # move normalized block back to CPU and write to dataset
    dset[i:j] = cp.asnumpy(block_gpu / norms)

  f.flush()

print("embeddings L2-normalized successfully")

# verify normalizations (all should be 1.0)
with h5py.File(str(EMBEDDINGS_PATH), "r") as f:
  first_norm = np.linalg.norm(f["embeddings"][0])
# the norm is computed outside the f-string: reusing double quotes inside a double-quoted
# f-string is a SyntaxError on Python versions before 3.12
print(f"norm of first embedding after normalization: {first_norm:.2f}")

# download normalized embeddings to computer
files.download(EMBEDDINGS_PATH)
print("normalized embeddings downloaded to computer")

embeddings L2-normalized successfully
norm of first embedding after normalization: 1.00


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

normalized embeddings downloaded to computer


In [None]:
# train FAISS clustering model to efficiently find similar songs
# 1. randomly sample some normalized embedding vectors to learn embedding space (identify centroids, how to divide embedding space)
# 2. build FAISS index (IndexIVFFlat) for approx nearest neighbor (ANN) search: for each vector, find and store in appropriate centroid list

import faiss
import numpy as np
from pathlib import Path
import h5py

# set file paths
INDEX_PATH = COLAB_DIR / "faiss_ivf_flat.index"

# FAISS clustering model training parameters
nlist = 8192              # number of coarse clusters (one centroid per cluster)
nprobe = 16               # how many clusters to search at query time
TRAINING_SAMPLE = 100000  # training sample size (100k)
BATCH_SIZE = 10000        # add vectors in 10k batches
# NOTE(review): 100k training vectors for 8192 clusters is roughly 12 per centroid. FAISS's
# k-means guidance asks for noticeably more points per centroid and may print a warning about
# the training set size — consider a larger TRAINING_SAMPLE or a smaller nlist.


with h5py.File(str(EMBEDDINGS_PATH), "r") as f:
  # load normalized embeddings
  dset = f["embeddings"]
  N, D = dset.shape

  # init inner product quantizer (to learn how to divide embedding space) & IndexIVFFlat index (inverted file index with full vectors in each cluster list)
  quantizer = faiss.IndexFlatIP(D)
  index = faiss.IndexIVFFlat(quantizer, D, nlist, faiss.METRIC_INNER_PRODUCT)   # inner product == cosine similarity for unit-length vectors

  # randomly sample training vectors to train index (fixed seed -> same sample on every run)
  random_seed = np.random.RandomState(seed=2025)
  selection = random_seed.choice(N, size=min(TRAINING_SAMPLE, N), replace=False)
  sample = np.empty((selection.shape[0], D), dtype=np.float32)
  for i, pos in enumerate(selection):
    sample[i] = dset[pos]
  print("training sample shape:", sample.shape)

  # train index with sample data - runs k-means to learn nlist centroids with quantizer
  index.train(sample)
  if not index.is_trained:
    raise RuntimeError("FAISS index reports it is not trained; refusing to add vectors")

  # add all vectors to index in batches
  for i in range(0, N, BATCH_SIZE):            # for each vector, FAISS finds nearest centroid and puts in inverted list
    j = min(N, i + BATCH_SIZE)
    vectors = dset[i:j].astype(np.float32)
    index.add(vectors)

# the HDF5 file is no longer needed from here on, so it is closed before saving / downloading

# make sure every embedding made it into the index before it is persisted;
# index position i is later used to look up track i, so a short index would misalign results
if index.ntotal != N:
  raise RuntimeError(f"FAISS index holds {index.ntotal} vectors but the dataset has {N}")

# set query time search parameters (set before writing so it is saved with the index)
index.nprobe = nprobe

# save trained FAISS index
faiss.write_index(index, str(INDEX_PATH))
print("FAISS index trained and saved to", INDEX_PATH)

# download FAISS index to computer
files.download(INDEX_PATH)
print("FAISS index downloaded to computer")

training sample shape: (100000, 1536)
FAISS index trained and saved to /content/project/faiss_ivf_flat.index


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

FAISS index downloaded to computer
