In [23]:
# basic libraries 
import os
import pandas as pd
import numpy as np

# Spotify API calls 
import requests
import spotipy
from spotipy.oauth2 import SpotifyOAuth

# Cosine similarity 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import pandas as pd

### Approach 1: choose based on cosine similarity

In [24]:
# The list of Spotify IDs
def get_seed_songs(df):
    seed_songs_list = list(df['id'].sample(5, random_state=42))
    print(f"seed_songs_list: {seed_songs_list}")
    return seed_songs_list

In [3]:
# get each song's release year 
def get_years_from_spotify_ids(df, seed_songs_list):
    years_list = []
    for spotify_id in seed_songs_list:
        filtered_row = df[df['id'] == spotify_id]
        matching_year = filtered_row.iloc[0]['year']
        years_list.append(matching_year)
    print(f"years_list: {years_list}")
    return years_list

In [10]:
# get a sub_df that contains specific years from seed songs
def get_sub_df_dict(df, years_list):
    sub_df_dict = {}
    for year in years_list:
        sub_df = df[df['year'] == year]
        sub_df_dict[year] = sub_df
        print(f"len(sub_df[year]): {len(sub_df[year])}")
    return sub_df_dict

In [11]:
# find top 10 songs per seed song, drop the top 10 songs of sub_df if two 
# or more seed songs are from the same year
def find_similar_songs(df, sub_df_dict, song_id):
    # get year_specific_df based on song_id
    year = df.loc[df['id'] == song_id, 'year'].values[0]
    sub_df = sub_df_dict[year]
    sub_df.reset_index(drop=True, inplace=True)
    
    # filter numerical, key and id columns 
    numerical_columns = ['danceability', 'energy', 'loudness', 'mode', 'speechiness', 'acousticness',
                            'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
    data = sub_df[['id'] + numerical_columns + ['key']]  
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(data[numerical_columns])

    # Combine normalized data, one-hot encoded 'key' column, and spotify_id
    encoded_key = pd.get_dummies(data['key'], prefix='key')
    combined_data = pd.concat([pd.DataFrame(normalized_data, columns=numerical_columns), encoded_key], axis=1)
    data_with_encoded_key = pd.concat([data[['id']], combined_data], axis=1)
    data_with_encoded_key.reset_index(drop=True, inplace=True)

    # Calculate cosine similarity matrix after dropping the song_to_compare
    song_to_compare = data_with_encoded_key[data_with_encoded_key['id'] == song_id]
    df_without_seed = data_with_encoded_key.drop(song_to_compare.index)
    song_to_compare = song_to_compare.drop('id', axis = 1)
    df_without_seed = df_without_seed.drop('id', axis = 1)

    # Getting the top 10 most similar songs
    cosine_sim = cosine_similarity(song_to_compare, df_without_seed)
    similarity_scores = list(enumerate(cosine_sim[0]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)[1:11]
    top_10_indices = [i[0] for i in similarity_scores]
    top_10_songs = sub_df.iloc[top_10_indices]
    
    # update sub_df_dict 
    sub_df = sub_df[~sub_df['id'].isin(top_10_songs['id'])]
    sub_df_dict[year] = sub_df

    # Set the index of top_10_songs to the original index of sub_df_2014
    top_10_songs.reset_index(drop=True, inplace=True)
    top_10_songs_id = list(top_10_songs['id'])
    return sub_df_dict, top_10_songs_id


In [12]:
# using spotify API to get the preview url 
def get_preview_url(spotify_id):
    sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
        client_id="0c8dbb1cfb78479d9613930fbc9ad872",
        client_secret="260a210533424c4592cb3d24d1da7b85",
        redirect_uri="https://www.audiocontentanalysis.org/",
        scope="user-top-read",
    ))
    track = sp.track(spotify_id)
    url = track['preview_url']
    return spotify_id, url

# download the audio in the correct directory 
def get_preview_audio(spotify_id, url, seed_song, seed_song_id = None):
    response = requests.get(url)
    if response.status_code == 200:
        folder_name = 'seed_songs' if seed_song else str(seed_song_id)
        os.makedirs(folder_name, exist_ok=True)
        
        with open(f'{folder_name}/{spotify_id}.mp3', 'wb') as f:
            f.write(response.content)
    else:
        print(f"{spotify_id} preview is not available")
        return spotify_id

def get_preview_audio_from_list(spotify_id_list, seed_song, seed_song_id=None):
    no_preview_spotify_id_list = []
    
    for spotify_id in spotify_id_list:
        spotify_id, url = get_preview_url(spotify_id)
        result = get_preview_audio(spotify_id, url, seed_song, seed_song_id if not seed_song else None)
        if not result:
            no_preview_spotify_id_list.append(spotify_id)
    
    print(f"no_preview_spotify_id_list: {no_preview_spotify_id_list}")

In [13]:
def get_recommendation_pool(df):
    # get seed songs randomly 
    seed_songs_list = get_seed_songs(df)
    
    # get years based on seed songs
    years_list = get_years_from_spotify_ids(df, seed_songs_list)
    
    # set sub_df for each seed song's year 
    sub_df_dict = get_sub_df_dict(df, years_list)
    
    # get audio for each seed song in the seed_songs folder
    get_preview_audio_from_list(seed_songs_list, True)
    
    # get audio and ID of top 10 songs per seed song 
    for seed_song_id in seed_songs_list:
        print(seed_song_id) # ADD THIS LINE 
        sub_df_dict, top_10_songs_id = find_similar_songs(df, sub_df_dict, seed_song_id)
        get_preview_audio_from_list(seed_songs_list, False, seed_song_id)
        print(f"Seed Song ID: {seed_song_id}")
        print(top_10_songs_id)
        print()

In [8]:
def main_model():
    df = pd.read_csv('tracks_features.csv')
    get_recommendation_pool(df) # it should creates a seed_song folder and the corresponding top_10 per seed song

In [14]:
main_model()

seed_songs_list: ['1aGS6nf2xgv3Xzdob4eOO3', '0fJfoqHIIiET2EcgjOfntG', '0V2R2LC8dR7S0REieXRaGt', '4VUHYLocWOJ2GfvP78AmSs', '4m8a1AtmCnoeRzSYoQ0oX0']
years_list: [2006, 2008, 1991, 2013, 2013]


SpotifyOauthError: error: invalid_client, error_description: Invalid client

### Approach 2: Randomly select 50 songs

In [18]:
# using spotify API to get the preview url 
def get_preview_url(spotify_id):
    sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
        client_id="0c8dbb1cfb78479d9613930fbc9ad872",
        client_secret="260a210533424c4592cb3d24d1da7b85",
        redirect_uri="https://www.audiocontentanalysis.org/",
        scope="user-top-read",
    ))
    track = sp.track(spotify_id)
    url = track['preview_url']
    return spotify_id, url

# download the audio in the correct directory 
def get_preview_random_audio(spotify_id, url):
    response = requests.get(url)
    if response.status_code == 200:
        folder_name = 'random_songs'
        os.makedirs(folder_name, exist_ok=True)
        
        with open(f'{folder_name}/{spotify_id}.mp3', 'wb') as f:
            f.write(response.content)
    else:
        print(f"{spotify_id} preview is not available")
        return spotify_id

def get_preview_audio_from_random_list(spotify_id_list):
    no_preview_spotify_id_list = []
    
    for spotify_id in spotify_id_list:
        spotify_id, url = get_preview_url(spotify_id)
        result = get_preview_random_audio(spotify_id, url)
        if not result:
            no_preview_spotify_id_list.append(spotify_id)
    
    print(f"no_preview_spotify_id_list: {no_preview_spotify_id_list}")

In [21]:
def main_random():
    df = pd.read_csv('tracks_features.csv')
    random_ids = list(df['id'].sample(n=50, random_state=np.random.RandomState()))
    get_preview_audio_from_random_list(random_ids)

In [22]:
main_random()

SpotifyOauthError: error: invalid_client, error_description: Invalid client

In [None]:
'''
Archive just in case
def get_top_n_years(df, n):
    years_list = df['year']
    x = Counter(years_list)
    print(x.most_common(n))
    # [(2020, 69726), (2019, 67276), (2006, 56945), (2007, 56287), (2018, 56167)]
    return [key for key, _ in x.most_common(n)]

top_n_years = get_top_n_years(df, 5)
print(top_n_years) # [2020, 2019, 2006, 2007, 2018]

def get_sub_df(top_n_years):
    sub_df_list = []
    for year in top_n_years:
        sub_df = df[df['year'] == year]
        sub_df_list.append(sub_df)
    return sub_df_list

sub_df_list = get_sub_df(top_n_years)

def get_seed_song(top_n_years):
    seed_song_list = []
    for year in top_n_years:
        sub_df = df[df['year'] == year]
        random_index = np.random.randint(0, len(sub_df))
        seed_song = df.iloc[random_index]
        seed_song_list.append(seed_song)
    return seed_song_list

seed_songs = get_seed_song(top_n_years)
len(seed_songs)

sub_df_2014 = sub_df_dict[2014]

spotify_id = '2lbASgTSoDO7MTuLAXlTW0'
url = 'https://p.scdn.co/mp3-preview/bb2c5bcede6c752f54dfa07e20e75aceb042c1a6?cid=1fcc7aab9680453bb16ddbdba77674ea'
response = requests.get(url)
if response.status_code == 200:
    with open(f'{spotify_id}.mp3', 'wb') as f:
        f.write(response.content)'''