# Imports

In [1]:
# Mount Google Drive into the Colab runtime so that files under
# /content/drive become readable by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Project root on the mounted Drive. Both paths end with a slash because file
# paths are built from them by plain string concatenation.
project_dir = '/content/drive/MyDrive/Colab Notebooks/project/Music/skeleton/'
data_dir = f"{project_dir}data/"

In [8]:
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# === Config ===
# Input file name; it is appended to data_dir when the CSV is loaded.
SONGS_CSV = 'songs.csv'
# Output artifacts. These are relative paths, so they are written to the
# current working directory rather than to data_dir.
# NOTE(review): on Colab the working directory is not on Drive — confirm that
# saving outside data_dir is intended.
SCALER_PATH = 'scaler.joblib'  # fitted StandardScaler (joblib dump)
L_PATH = 'L.npy'  # PCA component matrix (pca.components_)
FEATURES_PATH = 'projected_features.npy'  # PCA-projected feature matrix
INDEX_PATH = 'song_index.joblib'  # fitted NearestNeighbors model (joblib dump)
# Requested number of PCA dimensions; the PCA step caps it at the number of
# selected feature columns.
N_COMPONENTS = 12  # adjust based on how many audio features you have

# === Step 1: Load songs.csv ===
# The CSV lives in the data directory; the whole table is kept in memory as df.
df = pd.read_csv(f"{data_dir}{SONGS_CSV}")
print(f"✅ Loaded {len(df)} songs.")

# === Step 2: Select feature columns ===
# Identifier / label columns are never used as features. Every remaining
# column whose dtype is exactly float64 or int64 becomes a feature.
ignore_cols = ['song_id', 'artist', 'song', 'title', 'genre', 'emotion']
feature_cols = [
    name
    for name, dtype in df.dtypes.items()
    if name not in ignore_cols and dtype in (np.float64, np.int64)
]

# Missing values are replaced by 0 before the matrix is handed to the scaler.
X_raw = df[feature_cols].fillna(0).to_numpy()
print(f"🧠 Using {len(feature_cols)} feature columns.")

# === Step 3: Standardize ===
# Fit a StandardScaler on the raw feature matrix and transform it in one go,
# then persist the fitted scaler so the same scaling can be re-applied later.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)
joblib.dump(scaler, SCALER_PATH)
print(f"💾 Saved scaler to {SCALER_PATH}")

# === Step 4: PCA projection ===
# PCA cannot produce more components than there are feature columns, so the
# requested N_COMPONENTS is capped at the width of X_scaled.
pca = PCA(n_components=min(N_COMPONENTS, X_scaled.shape[1]))
X_proj = pca.fit_transform(X_scaled)
# Persist the component matrix and the projected songs.
# NOTE(review): only pca.components_ is saved, not pca.mean_ — confirm that
# downstream code projecting new songs does not need the PCA mean.
np.save(L_PATH, pca.components_)
np.save(FEATURES_PATH, X_proj)
print(f"💾 Saved PCA matrix to {L_PATH} and projected features to {FEATURES_PATH}")

# === Step 5: Build NearestNeighbors model ===
# Euclidean k-NN over the PCA-projected songs. n_neighbors=10 is only the
# default; callers may pass a different value to kneighbors().
nn = NearestNeighbors(n_neighbors=10, metric='euclidean')
nn.fit(X_proj)
# Persist the fitted model so it can be reloaded with joblib.load().
joblib.dump(nn, INDEX_PATH)
print(f"🔍 Saved k-NN index to {INDEX_PATH}")

# === Optional: define a query function ===
def find_similar_songs(artist_name, song_name, top_n=5):
    """Return the songs closest to a given song in the PCA-projected space.

    The song is looked up in ``df`` by a case-insensitive exact match on the
    ``artist`` and ``song`` columns. If several rows match, the first one is
    used as the query.

    Args:
        artist_name: Artist to look up (case-insensitive).
        song_name: Song title to look up (case-insensitive).
        top_n: Number of neighbours to return. Capped at the number of songs
            in ``X_proj``.

    Returns:
        A DataFrame with ``artist``, ``song``, ``distance`` and all feature
        columns, ordered by increasing Euclidean distance. The query song is
        itself part of the index, so it normally appears first with distance
        0. An empty DataFrame is returned (and a message printed) when the
        song is not found.
    """
    artist_match = df['artist'].str.lower() == artist_name.lower()
    song_match = df['song'].str.lower() == song_name.lower()

    # X_proj is a plain ndarray aligned with df's row order, so the query row
    # must be located by *position*; index labels only coincide with
    # positions when df has a default RangeIndex.
    is_match = (artist_match & song_match).to_numpy(dtype=bool, na_value=False)
    match_positions = np.flatnonzero(is_match)

    if match_positions.size == 0:
        print("❌ Song not found.")
        return pd.DataFrame()

    query_vector = X_proj[match_positions[0]].reshape(1, -1)
    # kneighbors() raises when asked for more neighbours than fitted samples.
    n_neighbors = min(top_n, X_proj.shape[0])
    distances, indices = nn.kneighbors(query_vector, n_neighbors=n_neighbors)

    similar = df.iloc[indices[0]].copy()
    similar['distance'] = distances[0]
    return similar[['artist', 'song', 'distance'] + feature_cols]

# === Example ===
# Look up a song (matching ignores case) and print its nearest neighbours.
print(find_similar_songs("coldplay", "yellow"))


✅ Loaded 236969 songs.
🧠 Using 18 feature columns.
💾 Saved scaler to scaler.joblib
💾 Saved PCA matrix to L.npy and projected features to projected_features.npy
🔍 Saved k-NN index to song_index.joblib
                    artist               song  distance  variance  \
3276              Coldplay             Yellow  0.000000  0.243056   
72077   Tyler, The Creator             YELLOW  1.513626  0.833514   
204041            Coldplay             Yellow  1.513626  0.833514   
13996           Nickelback              Truck  1.617298  0.001515   
45581           Nickelback  How You Remind Me  1.651055 -0.046867   

        release date  tempo  loudness  popularity  energy  danceability  ...  \
3276            2000    173     -7.23          92      66            43  ...   
72077           2000    173     -7.23          90      66            43  ...   
204041          2000    173     -7.23          90      66            43  ...   
13996           2001    172     -5.04          85      76        