Imports

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import time
import os
from dotenv import load_dotenv
import pandas as pd

# Load environment variables
load_dotenv()

True

In [2]:
# Set up authentication
client_credentials_manager = SpotifyClientCredentials(
    client_id=os.getenv('SPOTIPY_CLIENT_ID'),
    client_secret=os.getenv('SPOTIPY_CLIENT_SECRET')
)

# Create Spotify client
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [3]:
# Read the CSV file
df_input = pd.read_csv('../data/3_combined_cleaned_df.csv')


In [4]:

# Initialize list to store track data
all_track_data = []

# Process each track
for index, row in df_input.iterrows():
    try:
        # Clean artist name by removing "Featuring" and anything after it
        artist_name = row['artist'].split('Featuring')[0].strip()
        
        # Clean track name by removing content in parentheses and square brackets
        track_name = row['title']
        # Remove content in parentheses
        if '(' in track_name:
            track_name = track_name.split('(')[0]
        # Remove content in square brackets
        if '[' in track_name:
            track_name = track_name.split('[')[0]
        track_name = track_name.strip()
        
        # Add 30-second pause every 50 tracks
        if index > 0 and index % 50 == 0:
            print(f"\nPausing for 30 seconds after processing {index} tracks...")
            time.sleep(30)
            print("Resuming processing...")
        
        # Create search query from cleaned title and artist
        search_query = f"track:{track_name} artist:{artist_name}"
        
        # Search for the track
        results = sp.search(q=search_query, type='track', limit=1)
        
        if results['tracks']['items']:
            track = results['tracks']['items'][0]
            
            # Get artist's genres
            artist_id = track['artists'][0]['id']
            artist_info = sp.artist(artist_id)
            genres = artist_info['genres']
            
            # Create track data dictionary
            track_data = {
                'name': track['name'],
                'artist': track['artists'][0]['name'],
                'uri': track['uri'],
                'popularity': track['popularity'],
                'album_name': track['album']['name'],
                'release_date': track['album']['release_date'],
                'album_art_url': track['album']['images'][0]['url'] if track['album']['images'] else None,
                'genres': ', '.join(genres) if genres else 'No genres available'
            }
            
            all_track_data.append(track_data)
            
            # Print progress
            if index % 10 == 0:
                print(f"Processed {index} tracks...")
            
            # Save intermediate results every 200 tracks
            if (index + 1) % 200 == 0:
                print(f"\nSaving intermediate results at {index + 1} tracks...")
                temp_df = pd.DataFrame(all_track_data)
                temp_df.to_csv('../data/spotify_track_data.csv', index=False)
                print("Intermediate results saved!")
            
            # Add a small delay between each track
            time.sleep(0.5)
            
        else:
            print(f"Track not found: {track_name} by {artist_name}")
            
    except Exception as e:
        print(f"Error processing track {track_name} by {artist_name}: {str(e)}")
        continue

# Create final DataFrame
df_spotify = pd.DataFrame(all_track_data)

# Display info about the results
print("\nProcessing complete!")
print(f"Successfully retrieved data for {len(df_spotify)} tracks")

# Save final results
df_spotify.to_csv('../data/spotify_track_data.csv', index=False)
print("\nFinal results saved to '../data/spotify_track_data.csv'")

Error processing track Die With A Smile by Lady Gaga & Bruno Mars: HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)
Error processing track Bad Dreams by Teddy Swims: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Processed 10 tracks...
Processed 20 tracks...
Processed 30 tracks...
Processed 40 tracks...

Pausing for 30 seconds after processing 50 tracks...
Resuming processing...
Processed 50 tracks...
Processed 60 tracks...
Processed 70 tracks...
Processed 80 tracks...
Track not found: 4x4xU by Lainey Wilson
Processed 90 tracks...

Pausing for 30 seconds after processing 100 tracks...
Resuming processing...
Processed 100 tracks...
Processed 110 tracks...
Processed 120 tracks...
Track not found: Get Together - Album Version by Randy Stonehill
Processed 130 tracks...
Processed 140 tracks...

Pausing for 30 seconds after processing 150 tracks...
Resuming processing...
Processed 150 tracks...
Processed 160

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': "track:Sleepy F%#&in' D artist:Sleepy D", 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Invalid string


Error processing track Sleepy F%#&in' D by Sleepy D: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=track%3ASleepy+F%25%23%26in%27+D+artist%3ASleepy+D&limit=1&offset=0&type=track:
 Invalid string, reason: None

Saving intermediate results at 1000 tracks...
Intermediate results saved!

Pausing for 30 seconds after processing 1000 tracks...
Resuming processing...
Processed 1000 tracks...
Processed 1010 tracks...
Processed 1020 tracks...
Track not found: Mothers Rastas by Taya wooden
Processed 1030 tracks...
Track not found: Ven a Mi by David Saylor
Processed 1040 tracks...
Track not found: Los! by Vorstadtkinder

Pausing for 30 seconds after processing 1050 tracks...
Resuming processing...
Processed 1050 tracks...
Track not found: The west's awake by Tony Malone And The Rapparees
Processed 1060 tracks...
Track not found: Could You Be Love by Mariano Yanani
Processed 1070 tracks...
Track not found: Frigide by Caniche Hara-Kiri
Processed 1080 tracks...
Track not found: Two

HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:100% Dundee artist:The Roots', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Invalid string


Error processing track 100% Dundee by The Roots: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=track%3A100%25+Dundee+artist%3AThe+Roots&limit=1&offset=0&type=track:
 Invalid string, reason: None
Processed 1710 tracks...
Track not found: Need Not by A Ghost Devotion
Track not found: Happy Beat by Edmundo Ros
Processed 1720 tracks...


HTTP Error for GET to https://api.spotify.com/v1/search with Params: {'q': 'track:100% artist:Cocoa Tea', 'limit': 1, 'offset': 0, 'type': 'track', 'market': None} returned 400 due to Invalid string


Error processing track 100% by Cocoa Tea: http status: 400, code:-1 - https://api.spotify.com/v1/search?q=track%3A100%25+artist%3ACocoa+Tea&limit=1&offset=0&type=track:
 Invalid string, reason: None
Processed 1730 tracks...
Track not found: Cuore Appassionato by Sergio Franchi
Processed 1740 tracks...
Track not found: Soul Etouffe by Jazz Crusaders

Pausing for 30 seconds after processing 1750 tracks...
Resuming processing...
Processed 1750 tracks...
Track not found: Don't Go by Wordsworth feat. Adanita Ross
Track not found: Myasorubka by Kruger
Processed 1760 tracks...
Track not found: Diamond In The Bluff by Memphis Sheiks
Processed 1770 tracks...
Track not found: No Mercy Remix by The Last Eighth feat. Dumi Right(Zimbabwe Legit) and Skillz
Processed 1780 tracks...
Track not found: Party Favours by Bionikworld
Processed 1790 tracks...

Saving intermediate results at 1800 tracks...
Intermediate results saved!

Pausing for 30 seconds after processing 1800 tracks...
Resuming processing.

In [None]:
# Search for the specific track
results = sp.search(q='track:Espresso artist:Sabrina Carpenter', type='track', limit=1)

if results['tracks']['items']:
    track = results['tracks']['items'][0]
    
    # Get artist's genres
    artist_id = track['artists'][0]['id']
    artist_info = sp.artist(artist_id)
    
    # Get audio features
    audio_features = sp.audio_features(track['id'])[0]
    
    print("Track Information:")
    print(f"Name: {track['name']}")
    print(f"Artist: {track['artists'][0]['name']}")
    print(f"Album: {track['album']['name']}")
    print(f"Release Date: {track['album']['release_date']}")
    print(f"Popularity: {track['popularity']}/100")
    print(f"URI: {track['uri']}")
    print(f"\nArtist Genres: {', '.join(artist_info['genres']) if artist_info['genres'] else 'No genres available'}")

else:
    print("Track not found")

In [None]:
# Test API connection
try:
    test_result = sp.search(q='track:Espresso artist:Sabrina Carpenter', type='track', limit=1)
    print("API Connection Test: Successful")
except Exception as e:
    print(f"API Connection Test Failed: {str(e)}")
    raise  # Stop execution if we can't even make a test request

In [4]:
# Read the CSV file
df_input = pd.read_csv('../data/3_combined_cleaned_df.csv')



In [None]:


# Test with just one row
test_row = df_input.iloc[0]
print("Testing with track:", test_row['title'], "by", test_row['artist'])

try:
    # Clean artist name
    artist_name = test_row['artist'].split('Featuring')[0].strip()
    print("Cleaned artist name:", artist_name)
    
    # Clean track name
    track_name = test_row['title']
    if '(' in track_name:
        track_name = track_name.split('(')[0]
    if '[' in track_name:
        track_name = track_name.split('[')[0]
    track_name = track_name.strip()
    print("Cleaned track name:", track_name)
    
    # Create and print search query
    search_query = f"track:{track_name} artist:{artist_name}"
    print("Search query:", search_query)
    
    # Search with timeout
    print("Sending API request...")
    results = sp.search(q=search_query, type='track', limit=1)
    print("Got results:", bool(results['tracks']['items']))
    
    if results['tracks']['items']:
        track = results['tracks']['items'][0]
        print("Found track:", track['name'], "by", track['artists'][0]['name'])
except Exception as e:
    print(f"Error occurred: {str(e)}")

In [None]:
# Test with first 5 rows
for index, row in df_input.head().iterrows():
    print(f"\nProcessing row {index}")
    try:
        # Clean artist name
        artist_name = row['artist'].split('Featuring')[0].strip()
        
        # Clean track name
        track_name = row['title']
        if '(' in track_name:
            track_name = track_name.split('(')[0]
        if '[' in track_name:
            track_name = track_name.split('[')[0]
        track_name = track_name.strip()
        
        print(f"Searching for: {track_name} by {artist_name}")
        
        # Search for the track
        results = sp.search(q=f"track:{track_name} artist:{artist_name}", type='track', limit=1)
        
        if results['tracks']['items']:
            print("Found match!")
        else:
            print("No match found")
            
    except Exception as e:
        print(f"Error: {str(e)}")
        continue

In [None]:
# Test with just the first row with detailed debugging
import requests.exceptions
from requests.exceptions import ReadTimeout, ConnectionError

test_row = df_input.iloc[0]
print("Test row data:")
print(f"Title: {test_row['title']}")
print(f"Artist: {test_row['artist']}")

try:
    # Clean artist name
    artist_name = test_row['artist'].split('Featuring')[0].strip()
    print("Cleaned artist name:", artist_name)
    
    # Clean track name
    track_name = test_row['title']
    if '(' in track_name:
        track_name = track_name.split('(')[0]
    if '[' in track_name:
        track_name = track_name.split('[')[0]
    track_name = track_name.strip()
    print("Cleaned track name:", track_name)
    
    # Create and print search query
    search_query = f"track:{track_name} artist:{artist_name}"
    print("Search query:", search_query)
    
    # Search with explicit timeout
    print("Sending API request...")
    results = sp.search(
        q=search_query, 
        type='track', 
        limit=1, 
        requests_timeout=5  # 5 second timeout
    )
    print("Request completed!")
    
    if results['tracks']['items']:
        track = results['tracks']['items'][0]
        print("Found track:", track['name'], "by", track['artists'][0]['name'])
    else:
        print("No matching tracks found")
        
except ReadTimeout:
    print("Request timed out after 5 seconds")
except ConnectionError:
    print("Connection error occurred")
except spotipy.exceptions.SpotifyException as e:
    print(f"Spotify API error: {str(e)}")
    if hasattr(e, 'http_status'):
        print(f"HTTP Status: {e.http_status}")
    if hasattr(e, 'headers'):
        print(f"Response Headers: {e.headers}")
except Exception as e:
    print(f"Unexpected error: {str(e)}")
    print(f"Error type: {type(e)}")