In [None]:
#Program to load and preprocess Spotify audio features dataset

import pandas as pd
import numpy as np
import sklearn as sk


# Read the raw Spotify audio-features export. Header names are trimmed
# because stray whitespace would break the column lookups below.
df = pd.read_csv('../data/raw/spotify_audio_features.csv')
df = df.rename(columns=str.strip)

# Audio features used to compare songs (this order defines the column
# order of the feature matrix).
features = [
    'key',
    'tempo',
    'acousticness',
    'valence',
    'instrumentalness',
    'mode',
    'energy',
    'danceability',
    'loudness',
]

# Feature matrix: select the chosen columns and replace each missing
# value with the mean of its own column.
X = df[features].pipe(lambda frame: frame.fillna(frame.mean()))

# Track name / artist kept in a separate frame, row-aligned with X,
# purely for displaying results.
track_info = df.loc[:, ['track_name', 'artist_name']].copy()



In [20]:
# Normalising chosen features

from sklearn.preprocessing import StandardScaler

# Learn per-column mean / standard deviation from X, then apply the
# standardisation to the same data to get the scaled feature matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

print("Features normalized!")
print(f"Shape: {X_scaled.shape}")


Features normalized!
Shape: (130663, 9)


In [21]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

def find_similar_songs(song_index, n_recommendations=10):
    """
    Find the n songs most similar to the song at song_index.

    Similarity is the cosine similarity between the target row and every
    row of the module-level ``X_scaled`` matrix. Track names and artists
    are looked up by position in the module-level ``track_info`` frame.

    Parameters:
    - song_index: Positional index of the song (row of X_scaled / track_info).
      Negative values count from the end of the dataset.
    - n_recommendations: How many similar songs to return. Values below 0
      are treated as 0; fewer rows come back if the dataset is too small.

    Returns:
    - DataFrame with columns 'track_name', 'artist_name' and
      'similarity_score', ordered from most to least similar. The row at
      song_index itself is never included. Other rows describing the same
      recording (e.g. a re-release under a slightly different title) are
      separate entries and can still appear.

    Raises:
    - IndexError: if song_index is outside the range of X_scaled.
    """
    n_songs = X_scaled.shape[0]

    # Normalise negative indices so the explicit self-exclusion below
    # compares against the same positional value that argsort produces.
    if song_index < 0:
        song_index += n_songs
    if not 0 <= song_index < n_songs:
        raise IndexError(
            f"song_index out of range for dataset with {n_songs} songs"
        )

    # Get the target song's features as a (1, n_features) row
    target_song = X_scaled[song_index].reshape(1, -1)

    # Calculate similarity with ALL songs in dataset.
    # Cosine similarity ranges from -1 to 1: 1 = same direction,
    # 0 = unrelated, -1 = opposite (standardised features can be negative).
    similarities = cosine_similarity(target_song, X_scaled)[0]

    # Rank every song from most to least similar, then drop the query song
    # by index. Simply skipping the first position is not safe: an exact
    # duplicate row (or floating-point ties) can sort ahead of the query
    # song, which would leave the query in its own recommendations.
    ranked = similarities.argsort()[::-1]
    ranked = ranked[ranked != song_index]
    similar_indices = ranked[:max(n_recommendations, 0)]

    # Create results dataframe
    results = pd.DataFrame({
        'track_name': track_info.iloc[similar_indices]['track_name'].values,
        'artist_name': track_info.iloc[similar_indices]['artist_name'].values,
        'similarity_score': similarities[similar_indices]
    })

    return results

# Quick manual check of find_similar_songs, using the first row of the
# dataset (position 0) as the query song.
test_song_idx = 0
# Show the query track's name and artist so the recommendations below can be
# compared against it.
print(f"\nOriginal song: {track_info.iloc[test_song_idx]['track_name']} by {track_info.iloc[test_song_idx]['artist_name']}")
print("\nSimilar songs:")
# Print the 5 closest matches (track name, artist, similarity score).
print(find_similar_songs(test_song_idx, n_recommendations=5))


Original song: Big Bank feat. 2 Chainz, Big Sean, Nicki Minaj by YG

Similar songs:
                                         track_name  artist_name  \
0  BIG BANK (feat. 2 Chainz, Big Sean, Nicki Minaj)           YG   
1                                   Tokyo Snow Trip  Iggy Azalea   
2                                      Rapper Actor  Swank Davis   
3                                        Subjective        80vii   
4                                               4 U  Kuzu Mellow   

   similarity_score  
0          0.999952  
1          0.981181  
2          0.977490  
3          0.975043  
4          0.974014  
