In [40]:
# Import packages
import os
from io import BytesIO

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Read the Spotify tracks CSV into `df`; the trailing expression renders the
# first five rows as the cell output.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index written into the CSV -- confirm before relying on it.
df = pd.read_csv(filepath_or_buffer="./spotify_tracks_50.csv")
df.head(n=5)

Unnamed: 0,track_uri,album_name,album_uri,artist_name,artist_uri,duration_ms,pos,track_name,inside_playlists
0,spotify:track:4pLwZjInHj3SimIyN9SnOz,Dangerous Woman,spotify:album:4lVR2fg3DAUQpGVJ6DciHW,Ariana Grande,spotify:artist:66CXWjxzNUsdJxJ2JdwvnR,226160,0,Side To Side,[584466 290613 339107 ... 932309 757216 867665]
1,spotify:track:37f4ITSlgPX81ad2EvmVQr,Wildfire,spotify:album:0mFDIOqypzHp6Xd0el1hoT,Rachel Platten,spotify:artist:3QLIkT4rD2FMusaqmkepbq,204013,1,Fight Song,[584466 290613 540271 ... 535890 217688 26462]
2,spotify:track:6i0V12jOa3mr6uu4WYhUBr,Heathens,spotify:album:3J8W9AOjQhnBLCX33m3atT,Twenty One Pilots,spotify:artist:3YQKmKGau1PzlVlkL1iodx,195920,2,Heathens,[584466 290613 339107 ... 349806 168239 932309]
3,spotify:track:2DpCdPMg1BADE4HDnxt3Rd,"Sit Still, Look Pretty",spotify:album:2cE2eOy7alOZHpuelJEV8Q,Daya,spotify:artist:6Dd3NScHWwnW6obMFbl1BH,202226,3,"Sit Still, Look Pretty",[584466 414677 809633 ... 6555 349244 395752]
4,spotify:track:6Knv6wdA0luoMUuuoYi2i1,MY HOUSE,spotify:album:5lkNnHVlnCCCV304t89wOH,Flo Rida,spotify:artist:0jnsk9HBra6NMjO2oANoPY,192190,4,My House,[584466 290613 968716 ... 257996 349976 6463]


In [35]:
def get_preview_clip(track_name, timeout=10):
    """Download the Deezer preview clip for the first search hit of a track name.

    Parameters
    ----------
    track_name : str
        Track title to search for. It is sent as the ``q`` query parameter and
        percent-encoded by ``requests``, so titles containing characters such
        as ``&``, ``#``, ``+`` or ``?`` are searched verbatim instead of
        corrupting the URL.
    timeout : float, optional
        Seconds to wait on each HTTP request before giving up (default 10).

    Returns
    -------
    io.BytesIO or None
        In-memory buffer with the preview audio bytes, or ``None`` when the
        search request fails, yields no result, the result has no preview URL,
        or the preview download fails. A one-line message naming the track is
        printed for every failure; no exception is raised for network errors.
    """
    # Deezer API endpoint for searching track by name
    search_url = "https://api.deezer.com/search"

    try:
        response = requests.get(
            search_url,
            params={"q": track_name, "limit": 1},
            timeout=timeout,
        )
    except requests.RequestException:
        # Connection errors / timeouts are reported like any other API failure
        # so one bad request does not abort a loop over many tracks.
        print(f"Error with Deezer API request for track {track_name}")
        return None

    if response.status_code != 200:
        print(f"Error with Deezer API request for track {track_name}")
        return None

    try:
        track_data = response.json()
    except ValueError:
        # Body was not valid JSON.
        print(f"Error with Deezer API request for track {track_name}")
        return None

    # A payload without a 'data' key (e.g. an error object) is treated the
    # same as an empty result list rather than raising KeyError.
    results = track_data.get('data') if isinstance(track_data, dict) else None
    if not results:
        print(f"No data found for track {track_name}")
        return None

    preview_url = results[0].get('preview', None)
    if not preview_url:
        print(f"No preview available for track {track_name}")
        return None

    # Fetch the audio content
    try:
        audio_response = requests.get(preview_url, timeout=timeout)
    except requests.RequestException:
        print(f"Error downloading preview for track {track_name}")
        return None

    if audio_response.status_code != 200:
        print(f"Error downloading preview for track {track_name}")
        return None

    return BytesIO(audio_response.content)

import tempfile
import soundfile as sf

def extract_audio_features(audio_data):
    """Compute time-averaged harmonic, MFCC and mel-spectrogram features.

    Parameters
    ----------
    audio_data : file-like object
        Binary stream positioned at the start of the encoded audio (for
        example the ``BytesIO`` returned by ``get_preview_clip``). It is read
        to exhaustion.

    Returns
    -------
    tuple of numpy.ndarray
        ``(harmonic_content, mfccs, spectrogram)``; each is a 1-D vector
        obtained by averaging the corresponding librosa feature matrix over
        its time-frame axis:

        * ``harmonic_content`` -- mean CQT chroma of the harmonic component,
        * ``mfccs`` -- mean of 13 MFCCs,
        * ``spectrogram`` -- mean mel-spectrogram band values.

    Notes
    -----
    The audio is written to a temporary ``.mp3`` file so librosa can load it
    by path. The file is always removed before returning, including when
    decoding fails, so repeated calls do not accumulate files on disk.
    """
    # delete=False lets the file be closed and then reopened by name; cleanup
    # is therefore done explicitly in the `finally` below.
    tmpfile = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    try:
        with tmpfile:
            tmpfile.write(audio_data.read())

        # sr=None keeps the file's own sampling rate instead of resampling.
        y, sr = librosa.load(tmpfile.name, sr=None)
    finally:
        os.remove(tmpfile.name)

    # Extract Harmonic Content
    harmonic, _ = librosa.effects.hpss(y)
    harmonic_content = np.mean(librosa.feature.chroma_cqt(y=harmonic, sr=sr), axis=1)

    # Extract Frequency Distribution (MFCCs)
    mfccs = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).T, axis=0)

    # Extract Spectrogram Data
    spectrogram = np.mean(librosa.feature.melspectrogram(y=y, sr=sr).T, axis=0)

    return harmonic_content, mfccs, spectrogram

In [36]:
# Initialize lists to store features
harmonic_list = []
mfccs_list = []
spectrogram_list = []
# Positional row numbers (into df) of the tracks that actually yielded
# features. Tracks without a preview are skipped, so without this record the
# i-th feature row could not be mapped back to its track.
feature_rows = []

# Loop through all tracks in the dataframe
for row_pos, track_name in enumerate(df['track_name']):
    audio_data = get_preview_clip(track_name)
    if audio_data is None:
        # get_preview_clip already printed why this track has no clip.
        continue

    harmonic, mfccs, spectrogram = extract_audio_features(audio_data)
    harmonic_list.append(harmonic)
    mfccs_list.append(mfccs)
    spectrogram_list.append(spectrogram)
    feature_rows.append(row_pos)

# Convert lists to numpy arrays
harmonic_features = np.array(harmonic_list)
mfccs_features = np.array(mfccs_list)
spectrogram_features = np.array(spectrogram_list)

# Rows of df in the same order as the feature arrays: row i of each
# *_features array describes tracks_with_features.iloc[i].
tracks_with_features = df.iloc[feature_rows]
assert len(tracks_with_features) == len(harmonic_features)

