# **DATASET LOADING**

In [27]:
import kagglehub

# Fetch the Spotify 1921-2020 tracks dataset through kagglehub and keep the
# location it reports.
path = kagglehub.dataset_download(
    "yamaerenay/spotify-dataset-1921-2020-160k-tracks"
)
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/spotify-dataset-1921-2020-160k-tracks


In [28]:
import pandas as pd
import numpy as np

In [29]:
import os

# Read the track table from the directory reported by kagglehub (`path`)
# instead of a hard-coded Kaggle-only location, so the cell also works when
# the dataset was downloaded somewhere else.
df = pd.read_csv(os.path.join(path, "data.csv"))

# Column names, non-null counts and dtypes of the raw table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170653 entries, 0 to 170652
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   valence           170653 non-null  float64
 1   year              170653 non-null  int64  
 2   acousticness      170653 non-null  float64
 3   artists           170653 non-null  object 
 4   danceability      170653 non-null  float64
 5   duration_ms       170653 non-null  int64  
 6   energy            170653 non-null  float64
 7   explicit          170653 non-null  int64  
 8   id                170653 non-null  object 
 9   instrumentalness  170653 non-null  float64
 10  key               170653 non-null  int64  
 11  liveness          170653 non-null  float64
 12  loudness          170653 non-null  float64
 13  mode              170653 non-null  int64  
 14  name              170653 non-null  object 
 15  popularity        170653 non-null  int64  
 16  release_date      17

# **DATA PREPROCESSING**

In [1]:
# Numeric audio-feature columns used as the model's feature space.
feature_cols = [
    "valence",
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "speechiness",
    "tempo",
]

In [2]:
# Keep the first row for every track id, then renumber the rows from 0.
df = (
    df
    .drop_duplicates(subset=["id"])
    .reset_index(drop=True)
)

NameError: name 'df' is not defined

In [None]:
# Discard tracks with a missing value in any feature column, then renumber
# the remaining rows from 0.
df = (
    df
    .dropna(subset=feature_cols)
    .reset_index(drop=True)
)

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the audio-feature columns; the fitted scaler is kept so the same
# transform can be reused later.
scaler = StandardScaler()
X_songs = scaler.fit_transform(df.loc[:, feature_cols])

# Identifying columns for each track, copied so later edits do not touch `df`.
songs_df = df.loc[:, ["id", "name", "artists", "year"]].copy()

print("Shape of features:", X_songs.shape)
print("Shape of song catalog:", songs_df.shape)

print(songs_df.head(3))

In [None]:
# Rebuild the catalogue with extra descriptive columns alongside the
# identifying ones; this replaces the earlier, narrower `songs_df`.
songs_df = df.loc[
    :,
    [
        "id", "name", "artists", "year", "popularity",
        "duration_ms", "explicit", "key", "mode",
    ],
].copy()

print(songs_df.head(3))
print("Catalog shape after enrichment:", songs_df.shape)
songs_df.info()


In [None]:
!pip install -q sentence-transformers

from sentence_transformers import SentenceTransformer
import numpy as np

text_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

def get_text_embedding(text: str):
    """Encode ``text`` with the notebook-level ``text_model``.

    The encoder is called with ``normalize_embeddings=True`` and its result is
    returned unchanged.
    """
    return text_model.encode(text, normalize_embeddings=True)

# Smoke test: embed a sample prompt and inspect the resulting vector.
user_text = "calm and reflective"
e_text = get_text_embedding(text=user_text)

print("Embedding shape:", e_text.shape)
print("First 5 dims:", e_text[0:5])

In [None]:
!pip install -q transformers ftfy regex tqdm
!pip install -q git+https://github.com/openai/CLIP.git

import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

def get_image_embedding(image_path: str):
    """Return an L2-normalised CLIP embedding of the image at ``image_path``.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        A flattened NumPy array holding the normalised image embedding.
    """
    # Open the file in a context manager so its handle is released as soon as
    # the pixels have been turned into a tensor (the original never closed it).
    with Image.open(image_path) as img:
        image = preprocess(img).unsqueeze(0).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        emb = clip_model.encode_image(image)

    emb = emb / emb.norm(dim=-1, keepdim=True)  # normalize
    return emb.cpu().numpy().flatten()

# from google.colab import files
# uploaded = files.upload()
# e_img = get_image_embedding(list(uploaded.keys())[0])


In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from tqdm import tqdm


# One "<name> by <artists>" string per track, built from the same `df` rows
# that produced X_songs.
texts = (df["name"] + " by " + df["artists"]).tolist()

# Encode the whole catalogue in a single batched call rather than one
# encode() call per track; the encoder batches internally and shows its own
# progress bar. Normalisation matches get_text_embedding.
text_embeddings = text_model.encode(
    texts,
    normalize_embeddings=True,
    show_progress_bar=True,
)

print("Shape of text embeddings:", text_embeddings.shape)
print("Shape of audio features:", X_songs.shape)

In [None]:
# Hold out 20% of the (text embedding, audio feature) pairs for evaluation;
# the fixed random_state keeps the split the same between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    text_embeddings,
    X_songs,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Ridge regression from text embeddings to the scaled audio features.
# fit() returns the estimator itself, so construction and fitting can chain.
ridge = Ridge(alpha=1.0).fit(X_train, Y_train)

print("Training complete.")


In [None]:
from sklearn.metrics import mean_squared_error

# Score the fitted model on the held-out split.
Y_pred = ridge.predict(X_test)
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)
print("Test MSE:", mse)

In [None]:

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def recommend_from_text(user_text, top_k=10):
    """Recommend catalogue songs whose audio features best match a text prompt.

    The prompt is embedded with ``get_text_embedding``, mapped by the fitted
    ``ridge`` model to a predicted audio-feature vector, and compared against
    every row of ``X_songs`` by cosine similarity.

    Args:
        user_text: Free-text description to search with.
        top_k: Maximum number of songs to return; ``0`` gives an empty result.

    Returns:
        A copy of the best-matching ``songs_df`` rows, most similar first, with
        an added ``similarity`` column and a fresh 0-based index.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    # A negative top_k would be treated as "all but the last |top_k|" by the
    # slice below and silently return almost the whole catalogue.
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # The model expects a 2-D input: one row per query.
    e_text = get_text_embedding(user_text).reshape(1, -1)

    vibe_vector = ridge.predict(e_text)  # one predicted feature row

    sims = cosine_similarity(vibe_vector, X_songs)[0]  # flatten to 1-D

    # Indices of the highest similarities, best first.
    top_idx = np.argsort(sims)[::-1][:top_k]

    results = songs_df.iloc[top_idx].copy()
    results["similarity"] = sims[top_idx]

    return results.reset_index(drop=True)

# Demo: the five closest catalogue songs for a sample prompt.
user_query = "calm and reflective"
recommendations = recommend_from_text(user_text=user_query, top_k=5)
print(recommendations)