In [23]:
import os
from dotenv import load_dotenv # for python-dotenv method

import math
import random
import pandas as pd
from collections import Counter
from tqdm import tqdm


from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


In [24]:
# Load environment variables from the .env file
load_dotenv()

# Access the client_id and client_secret from environment variables
client_id = os.getenv("CLIENT_ID")
client_secret = os.getenv("CLIENT_SECRET")

# Authentication
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager, requests_timeout=10)

In [25]:
# Increase timeout and add retry logic
session = sp._session
retry = Retry(
    total=5,  # Total number of retries
    backoff_factor=0.3,  # Wait time between retries
    status_forcelist=[500, 502, 503, 504],  # Retry on these HTTP status codes
    raise_on_status=False
)
adapter = HTTPAdapter(max_retries=retry)  # Increase timeout to 10 seconds
session.mount("https://", adapter)


In [26]:
# Set random seed
random.seed(42)

In [27]:
def test_genre_string(genre, debug=False):
    """
    Input:
        genre: str, genre string to test
        debug: bool, print debug messages
    Output:
        bool: True if the genre is valid, False if not
        
    Tests if a genre string is valid by searching for tracks with that genre. If no tracks are found, the genre is invalid.
    """
    results = sp.search(q=f'genre:{genre}', type='track', limit=1)
    number_of_tracks = len(results['tracks']['items'])
    if number_of_tracks == 0:
        print(f"No tracks found for genre: {genre}")
        return False
    else:
        if debug:
            print(f"Found {number_of_tracks} tracks for genre: {genre}")
        return True

In [28]:
def get_genres_of_interest(genres_dict, genre_record_limit, pagination_limit=45, debug=False):
    """
    Fetches track IDs for specified genres from Spotify, ensuring a balanced representation of each genre.
    Args:
        genres_dict (dict): A dictionary where keys are super genres and values are lists of sub-genres.
        genre_record_limit (int): The maximum number of tracks to fetch per super genre.
        pagination_limit (int, optional): The number of tracks to fetch per API call. Defaults to 45.
        debug (bool, optional): If True, prints debug information. Defaults to False.
    Returns:
        tuple: A tuple containing two lists:
            - track_ids (list): A list of track IDs fetched from Spotify.
            - track_genre (list): A list of super genres corresponding to each track ID.
    Raises:
        AssertionError: If duplicate track IDs are found in the final list.
    """
    
    # Initialize variables
    track_ids = []
    track_genre = []
    seen_track_ids = set()

    for super_genre, sub_genres in genres_dict.items():
        print(f"Getting records for super genre: {super_genre}")
        super_genre_track_ids = []

        # Dictionary to track how many tracks we pulled per sub-genre
        sub_genre_counts = {sub_genre: 0 for sub_genre in sub_genres}

        # Loop until we hit the genre_record_limit for the super genre
        total_tracks_pulled = 0
        
        while total_tracks_pulled < genre_record_limit and sub_genres:
            # Calculate remaining tracks needed for the super genre
            tracks_needed = genre_record_limit - total_tracks_pulled

            # Shuffle sub-genres to randomize the pulls
            random.shuffle(sub_genres)

            for sub_genre in sub_genres[:]:
                
                # Adjust the batch size to ensure we don't exceed the genre_record_limit
                batch_size = min(pagination_limit, tracks_needed)

                # Fetch a batch of tracks for the sub-genre
                results = sp.search(q=f'genre:{sub_genre}', type='track', limit=batch_size, offset=sub_genre_counts[sub_genre])
                
                # If no items are returned, remove the sub-genre and move on
                if not results['tracks']['items']:
                    if debug:
                        print(f"No items for sub-genre: {sub_genre}, removing from sub-genres")
                    sub_genres.remove(sub_genre)  # Remove sub-genre if no more tracks are returned
                    continue  # Skip the rest of the code for this sub-genre
            
                # Add new track IDs that are not already seen, but ensure we don't exceed the genre_record_limit
                new_track_ids = [track['id'] for track in results['tracks']['items'] if track['id'] not in seen_track_ids]
                new_tracks_needed = genre_record_limit - total_tracks_pulled
                
                # Only add as many tracks as needed to reach the limit
                new_track_ids = new_track_ids[:new_tracks_needed]
                
                for track_id in new_track_ids:
                    # Add the new track to the super genre's collection
                    super_genre_track_ids.append(track_id)
                    track_genre.append(super_genre)  # Label the track with the super genre
                    seen_track_ids.add(track_id)
                
                # Update counts and totals
                sub_genre_counts[sub_genre] += len(new_track_ids)
                total_tracks_pulled += len(new_track_ids)

                if debug:
                    print(f"Fetched {len(new_track_ids)} new tracks for sub-genre: {sub_genre}")

                # If we've reached the limit for the super genre, stop
                if total_tracks_pulled >= genre_record_limit:
                    break
                
            # Check again if we've exhausted all sub-genres
            if not sub_genres:
                if debug:
                    print(f"All sub-genres for super genre exhausted.")
                break

        print(f"{len(super_genre_track_ids)} records found for super genre: {super_genre}\n")
        # Add the super genre track ids to the main list
        track_ids.extend(super_genre_track_ids)

    print("Total number of records:", len(track_ids), "\n")

    # Check for duplicates in the final list
    track_id_counts = Counter(track_ids)
    duplicates = {track_id: count for track_id, count in track_id_counts.items() if count > 1}

    # Assert no duplicates
    assert not duplicates, f"Duplicate track IDs found: {duplicates}"

    return track_ids, track_genre


In [29]:
def test_mixed_results_with_incomplete_pages():
    """
    Test the `get_genres_of_interest` function with mixed results and incomplete pages.
    This test checks the following:
    - The function correctly handles multiple genres with a moderate record limit.
    - Pagination is handled correctly by requesting a small number of tracks at a time.
    - The function returns the expected number of tracks for each genre.
    - There are no duplicate tracks in the results.
    - Tracks are evenly distributed across the specified super genres.
    - The function handles incomplete pages gracefully.
    Assertions:
    - The function should return exactly 40 tracks (20 for each super genre).
    - There should be no duplicate tracks in the returned list.
    - Each super genre should have exactly 20 tracks.
    - The function should handle cases where pages return 0 results (check debug logs).
    Prints:
    - A success message if all assertions pass.
    """
    
    genres_dict = {
        'rock': ['rock', 'alt-rock', 'hard-rock'],  # Use popular genres with many potential results
        'rap/hip-hop': ['rap', 'hip-hop']
    }
    genre_record_limit = 20  # Set a moderate limit
    pagination_limit = 5  # Request 5 tracks at a time to test pagination handling

    # Run the function with debug enabled to see the internal workings
    track_ids, track_genre = get_genres_of_interest(genres_dict, genre_record_limit, pagination_limit, debug=True)

    # Assertions
    # Ensure the function returns exactly 40 tracks (20 for each super genre)
    assert len(track_ids) == 40, f"Expected 40 tracks, but got {len(track_ids)}"
    
    # Ensure there are no duplicates
    assert len(track_ids) == len(set(track_ids)), "Duplicate tracks found"

    # Ensure tracks are evenly distributed across super genres
    assert track_genre.count('rock') == 20, f"Expected 20 tracks for 'rock', but got {track_genre.count('rock')}"
    assert track_genre.count('rap/hip-hop') == 20, f"Expected 20 tracks for 'rap/hip-hop', but got {track_genre.count('rap/hip-hop')}"

    # Check if the function handles incomplete pages well (check debug logs for 0 results cases)
    print("Test passed: Mixed results with incomplete pages handled successfully.")




In [30]:
def test_few_tracks_subgenres():
    """
    Test the `get_genres_of_interest` function with sub-genres that have fewer records.
    This test checks the behavior of the `get_genres_of_interest` function when some sub-genres
    have a limited number of tracks available. Specifically, it tests the following:
    - The function's ability to handle sub-genres with fewer records than the specified limit.
    - The function's pagination mechanism with a given pagination limit.
    - Ensures that there are no duplicate track IDs in the result.
    The test uses the following parameters:
    - `genres_dict`: A dictionary with genres and their respective sub-genres.
    - `genre_record_limit`: The maximum number of records to fetch per genre.
    - `pagination_limit`: The limit for pagination to test the function's efficiency.
    Assertions:
    - Ensures that there are no duplicate track IDs in the result.
    Prints:
    - A success message if the test passes without any assertion errors.
    """
    
    genres_dict = {
        'misc': ['bossanova', 'work-out'],  # These sub-genres are known to have fewer records
        'rock': ['rock', 'hard-rock']  # Assume these sub-genres have plenty of tracks
    }
    genre_record_limit = 30  # We expect the 'misc' sub-genres to not be able to fulfill this
    pagination_limit = 10  # Large enough to efficiently test pagination

    # Run the function with debug enabled to observe how it handles the limited sub-genres
    track_ids, track_genre = get_genres_of_interest(genres_dict, genre_record_limit, pagination_limit, debug=True)

    # Assertions
    
    # Ensure there are no duplicates
    assert len(track_ids) == len(set(track_ids)), "Duplicate tracks found"


    print(f"Test passed: 'misc' sub-genres with limited tracks handled successfully.")



In [31]:
def get_other_genres(genres_of_interest, genre_record_limit, already_gathered_tracks, debug=False):
    """
    Fetches tracks from genres not included in the genres_of_interest.
    This function aims to gather a specified number of tracks from genres that are not part of the provided genres_of_interest.
    It ensures an even distribution of tracks across the available genres and avoids duplicates.
    Parameters:
    genres_of_interest (dict): A dictionary where keys are super-genres and values are lists of sub-genres of interest.
    genre_record_limit (int): The total number of tracks to fetch.
    already_gathered_tracks (list or set): A list or set of track IDs that have already been gathered.
    debug (bool, optional): If True, prints debug information. Defaults to False.
    Returns:
    tuple: A tuple containing:
        - other_track_ids (list): A list of track IDs from the 'other' genres.
        - genre_labels (list): A list of genre labels corresponding to the fetched track IDs.
        - genre_counts (dict): A dictionary with genres as keys and the count of fetched tracks as values.
    Raises:
    AssertionError: If duplicate track IDs are found in the final list of track IDs.
    """

    print("Getting 'other' genres.")
    already_gathered_tracks = set(already_gathered_tracks)  # Ensure it's a set for fast lookup

    # Flatten the dictionary to get all sub-genres in genres_of_interest
    sub_genres_of_interest = {sub_genre for super_genre, sub_genres in genres_of_interest.items() for sub_genre in sub_genres}

    # All genres excluding genres_of_interest
    other_genres = [genre for genre in sp.recommendation_genre_seeds()['genres'] if genre not in sub_genres_of_interest]

    # Dictionary to store counts of tracks per genre
    genre_counts = {genre: 0 for genre in other_genres}

    # Set to hold unique track IDs, initialized with already gathered tracks
    other_track_ids = already_gathered_tracks.copy()

    # Track total number of new tracks pulled
    total_tracks_pulled = 0

    # Ensure even distribution across genres
    while total_tracks_pulled < genre_record_limit and other_genres:
        # Dynamically calculate batch size to pull evenly across remaining genres
        remaining_genres = len(other_genres)
        tracks_needed = genre_record_limit - total_tracks_pulled
        batch_size = math.ceil(tracks_needed / remaining_genres)

        # Shuffle genres to avoid bias
        random.shuffle(other_genres)

        # Iterate through genres and fetch tracks
        for genre in other_genres[:]:
            if total_tracks_pulled >= genre_record_limit:
                break  # Stop if we've reached the limit
            
            # Fetch a batch of tracks for the genre
            results = sp.search(q=f'genre:{genre}', type='track', limit=batch_size, offset=genre_counts[genre])
            
            # If no items are returned, remove the genre and move on
            if not results['tracks']['items']:
                if debug:
                    print(f"No items for genre: {genre}, removing from other_genres")
                other_genres.remove(genre)
                continue

            # Add new track IDs that are not already in the gathered or existing set
            new_track_ids = [track['id'] for track in results['tracks']['items'] if track['id'] not in other_track_ids]
            other_track_ids.update(new_track_ids)  # Automatically deduplicate

            # Update counts and totals
            genre_counts[genre] += batch_size
            total_tracks_pulled += len(new_track_ids)

            if debug:
                print(f"Fetched {len(new_track_ids)} new tracks for genre: {genre}")

            # If we've reached the genre record limit, stop
            if total_tracks_pulled >= genre_record_limit:
                break

    print(f"Total number of records: {len(other_track_ids) - len(already_gathered_tracks)} new tracks")

    if debug:
        # Print genre counts only for genres with tracks > 0
        print(f"\nGenre Counts:")
        for genre, count in genre_counts.items():
            if count > 0:
                print(f"{genre}: {count}")

    # Convert back to a list excluding already gathered tracks for returning
    other_track_ids = list(other_track_ids - already_gathered_tracks)
    genre_labels = ["other"] * len(other_track_ids)  # Create genre labels for the new tracks

    # Check for duplicates in the final list of track IDs
    track_id_counts = Counter(other_track_ids)
    duplicates = {track_id: count for track_id, count in track_id_counts.items() if count > 1}

    # Assert no duplicates
    assert not duplicates, f"Duplicate track IDs found: {duplicates}"

    return other_track_ids, genre_labels, genre_counts


In [32]:
def amend_sub_genres(sub_genres):
    """
    Input: A list of sub-genres
    Output: The same list with any sub-genres removed that do not return results from the Spotify API
    """
    sub_genres = sub_genres.copy()
    # Modify the sub_genres list in place
    before = len(sub_genres)
    print(f"Number of sub genres before check: {before}")
    
    # Create a copy of the list to avoid modifying it while iterating
    for genre in sub_genres[:]:
        if not test_genre_string(genre):
            sub_genres.remove(genre)
            print(f"Removed {genre} from sub_genres.")
    
    print(f"Number of sub genres after check: {len(sub_genres)}, {before - len(sub_genres)} removed.")
    
    return sub_genres

Here you can add any string to any list in the dictionary. 

In [33]:
## Actual call ##
genres_of_interest = {
    'rock': [
            'rock',
            'alt-rock',
            'hard-rock',
            'j-rock',
            'psych-rock',
            'punk-rock',
            'rock-n-roll',
            'rockabilly',
            'grunge',
            'punk'
            ],
    'pop': [
            "pop",
            "Dance Pop",
            "Electropop",
            "Indie Pop",
            "Synth-pop",
            "Pop Rock",
            "Teen Pop",
            "Power Pop",
            "Art Pop",
            "Pop Punk",
            "K-Pop",
            "J-Pop",
            "Latin Pop",
            "Dream Pop",
            "Bubblegum Pop",
            "Euro Pop",
            "Pop Rap",
            "Chamber Pop",
            "Baroque Pop",
            "Pop Soul",
            "Acoustic Pop",
            "j-pop",
            "k-pop",
            ],
    'rap/hip-hop': [
                "Hip Hop",
                "Hip-Hop",
                "Rap",
                "Trap",
                "Gangsta Rap",
                "East Coast Hip Hop",
                "West Coast Hip Hop",
                "Conscious Hip Hop",
                "Alternative Hip Hop",
                "Boom Bap",
                "Dirty South",
                "Crunk",
                "Drill",
                "Grime",
                "Cloud Rap",
                "Underground Hip Hop",
                "Emo Rap",
                "Hardcore Hip Hop",
                "Lofi Hip Hop",
                "Old School Hip Hop",
                "Christian Hip Hop",
                "Latin Hip Hop"
                ],
    'classical': [
                "Classical",
                "Baroque",
                "Romantic",
                "Classical",
                "Chamber Music",
                "Symphony",
                "Opera",
                "Choral",
                "Contemporary Classical",
                "Minimalism",
                "Orchestral",
                "Piano",
                "String Quartet",
                "Early Music",
                "Renaissance",
                "Modern Classical",
                "Neoclassical",
                "Impressionism",
                "Avant-Garde",
                "Sacred Classical",
                "Cantata",
                "Piano"
                ],
    'jazz': [
                "Jazz",
                "Bebop",
                "Swing",
                "Smooth Jazz",
                "Cool Jazz",
                "Hard Bop",
                "Free Jazz",
                "Fusion",
                "Modal Jazz",
                "Latin Jazz",
                "Avant-Garde Jazz",
                "Gypsy Jazz",
                "Vocal Jazz",
                "Jazz Funk",
                "Jazz Blues",
                "Soul Jazz",
                "Post-Bop",
                "Ragtime",
                "Big Band",
                "Dixieland",
                "Nu Jazz",
                "Jazz Fusion",
                ]
}

This validates each string in the lists per super genre. If the string is not a recognized genre, it gets removed from the super genre list.

In [None]:
for super_genre in genres_of_interest:
    print(f"\nChecking sub-genres for {super_genre}")
    genres_of_interest[super_genre] = amend_sub_genres(genres_of_interest[super_genre])

You can adjust the genre record limit, each super genre gets a maximum of genre_record_limit records.  
Pagination is passed to the api as the limit parameter. Documentation says the max should be 50, but It kept timing out on me, so I set it to 45.

In [12]:
genre_record_limit = 1500
pagination_limit = 45

In [13]:
track_ids, track_genre = get_genres_of_interest(genres_of_interest, genre_record_limit, pagination_limit)

Getting records for super genre: rock
1500 records found for super genre: rock

Getting records for super genre: pop
1500 records found for super genre: pop

Getting records for super genre: rap/hip-hop
1500 records found for super genre: rap/hip-hop

Getting records for super genre: classical
1500 records found for super genre: classical

Getting records for super genre: jazz
1500 records found for super genre: jazz

Total number of records: 7500 



In [14]:
# Assert length of track_ids is equal to genre_record_limit * number of super genres
assert len(track_ids) == genre_record_limit * len(genres_of_interest), f"Expected {genre_record_limit * len(genres_of_interest)} tracks, but got {len(track_ids)}"

# Assert no duplicates
assert len(track_ids) == len(set(track_ids)), "Duplicate tracks found"

In [15]:
# Get other genres
other_track_ids, other_genre_labels, other_genre_counts = get_other_genres(genres_of_interest, genre_record_limit, track_ids)

# This is helpful to see how many tracks were fetched for each super genre
for genre, count in other_genre_counts.items():
    if count > 0:
        print(f"{genre}: {count}")


Getting 'other' genres.
Total number of records: 1500 new tracks
acoustic: 22
afrobeat: 22
alternative: 22
ambient: 22
anime: 22
black-metal: 22
bluegrass: 22
blues: 23
brazil: 22
breakbeat: 22
british: 22
cantopop: 22
chicago-house: 22
children: 22
chill: 22
classical: 22
club: 22
comedy: 22
country: 22
dance: 22
dancehall: 22
death-metal: 22
deep-house: 22
detroit-techno: 22
disco: 22
drum-and-bass: 22
dub: 22
dubstep: 22
edm: 22
electro: 22
electronic: 23
emo: 22
folk: 22
forro: 22
french: 22
funk: 22
garage: 22
german: 22
gospel: 22
goth: 22
grindcore: 22
groove: 22
guitar: 22
happy: 22
hardcore: 22
hardstyle: 22
heavy-metal: 22
hip-hop: 22
honky-tonk: 22
house: 22
idm: 22
indian: 22
indie: 22
indie-pop: 22
industrial: 22
iranian: 22
j-dance: 22
j-idol: 23
jazz: 22
kids: 22
latin: 22
latino: 22
malay: 22
mandopop: 22
metal: 22
metalcore: 22
minimal-techno: 22
mpb: 22
new-age: 22
opera: 22
pagode: 22
party: 22
piano: 22
pop-film: 23
power-pop: 23
progressive-house: 22
r-n-b: 22
regg

In [17]:
# Assert no track_ids overlap between genres_of_interest and other genres
assert not set(track_ids).intersection(other_track_ids), "Overlap between genres_of_interest and other genres"

In [18]:
# Combine the two lists
all_track_ids = track_ids + other_track_ids
all_track_genre = track_genre + other_genre_labels

# Create a DataFrame
track_genres_df = pd.DataFrame({"track_id": all_track_ids, "genre": all_track_genre})

# Assert no duplicates
assert track_genres_df['track_id'].nunique() == len(track_genres_df), "Duplicate track IDs found in the final DataFrame"


In [19]:
# Get audio features for each track with a progress bar
track_features = []
for i in tqdm(range(0, len(track_genres_df), pagination_limit), desc="Fetching audio features"):
    features = sp.audio_features(all_track_ids[i:i+pagination_limit])
    
    # Raise error if no features are returned
    if not features:
        raise ValueError(f"No audio features returned for tracks: {all_track_ids[i:i+pagination_limit]}")
    
    for feature in features:
        # Raise error if no features are returned for individual tracks
        if not feature:
            raise ValueError(f"No audio features returned for track: {all_track_ids[i:i+pagination_limit]}")
        track_features.append(feature)



Fetching audio features:   0%|          | 0/200 [00:00<?, ?it/s]

Fetching audio features: 100%|██████████| 200/200 [00:28<00:00,  6.93it/s]


In [20]:
# To DataFrame
track_features_df = pd.DataFrame(track_features)
# Rename id to track_id
track_features_df.rename(columns={'id': 'track_id'}, inplace=True)

# Assert no duplicates
assert track_features_df['track_id'].nunique() == len(track_features_df), "Duplicate track IDs found in the final DataFrame"

# Assert same length as track_genres_df
assert len(track_features_df) == len(track_genres_df), "Length of track_features_df and track_genres_df do not match"

In [21]:
# Merge the two DataFrames
all_data = pd.merge(track_genres_df, track_features_df, on='track_id')

print(len(all_data))

9000


In [22]:
# Save the data
all_data.to_csv("spotify_data_more.csv", index=False)

Method for getting features via song title and artist name lookup:

In [23]:
def get_track_features(song_title, artist_name):
    # Search for the song using Spotipy's search function
    result = sp.search(q=f"track:{song_title} artist:{artist_name}", type='track', limit=1)
    
    if result['tracks']['items']:
        # Extract the track ID from the search result
        track = result['tracks']['items'][0]
        track_id = track['id']
        track_name = track['name']
        artist_name = track['artists'][0]['name']
        
        print(f"Found track: {track_name} by {artist_name}")
        
        # Use the track ID to get the song's features
        features = sp.audio_features(track_id)
        return features[0]  # Return the features dictionary
    else:
        print(f"No results found for {song_title} by {artist_name}")
        return None

# Example usage
song_title = "Spybreak-Short One"
artist_name = "Propellerheads"
features = get_track_features(song_title, artist_name)

if features:
    print("Audio Features:")
    print(features)


Found track: Spybreak! - Short One by Propellerheads
Audio Features:
{'danceability': 0.552, 'energy': 0.929, 'key': 1, 'loudness': -8.626, 'mode': 1, 'speechiness': 0.0458, 'acousticness': 7.14e-06, 'instrumentalness': 0.846, 'liveness': 0.18, 'valence': 0.466, 'tempo': 127.834, 'type': 'audio_features', 'id': '6AyXbkn4cwrErFIMbRBRDs', 'uri': 'spotify:track:6AyXbkn4cwrErFIMbRBRDs', 'track_href': 'https://api.spotify.com/v1/tracks/6AyXbkn4cwrErFIMbRBRDs', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/6AyXbkn4cwrErFIMbRBRDs', 'duration_ms': 244400, 'time_signature': 4}
