Spotify app redirect URI: http://localhost:8888/callback

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import requests
import os
import json
import base64

client_id = os.getenv('SPOTIFY_CLIENT_ID')
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')

## Data Collection

Gathering information from various sources.

I am using the Spotify Million Playlist dataset and the Spotify Web API to get the following features every song - 

1. track_id
2. artist_name
3. track_name
4. album_name
5. acousticness
6. danceability
7. energy
8. instrumentalness
9. liveness
10. loudness
11. speechiness
12. tempo
13. valence

I am building a dataframe that will be used to further build and optimize the machine learning models.


In [29]:
# Function to get an access token from Spotify
def get_spotify_token(client_id, client_secret):
    client_creds = f"{client_id}:{client_secret}"
    # client_creds_b64 = base64.b64encode(client_creds.encode()).decode()  # Ensure this is decoded back to string
    client_creds_b64 = "ZWUxYzg1OWZhMTgwNDIyMzhiODIwM2IxMWNlOTMzMjU6ZDUwMjgxOTM2OWJhNDVjYWExYWJlYWRmYjIxYTc3YWU="   # TODO

    url = 'https://accounts.spotify.com/api/token'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Authorization': f'Basic {client_creds_b64}'
    }
    payload = {'grant_type': 'client_credentials'}

    response = requests.post(url, headers=headers, data=payload)
    if response.status_code != 200:
        print("Failed to fetch token:")
        print("Status Code:", response.status_code)
        print("Response:", response.text)
    else:
        print(response.json())
    return response.json().get('access_token')

token = get_spotify_token(client_id, client_secret)
token

{'access_token': 'BQABxrYWlPKX-64eun0ZYWldYFAsW69c1GcEtvT8TkQaY0Glt5zb1j03QVT5bvw4AT51ql6KbLfFt1upjW9l-InBclFJbaYZzRNBKF2zmVTSNbEq-Kg', 'token_type': 'Bearer', 'expires_in': 3600}


'BQABxrYWlPKX-64eun0ZYWldYFAsW69c1GcEtvT8TkQaY0Glt5zb1j03QVT5bvw4AT51ql6KbLfFt1upjW9l-InBclFJbaYZzRNBKF2zmVTSNbEq-Kg'

In [19]:
# Example to fetch track based on query
def fetch_tracks(token, search_query):
    url = f"https://api.spotify.com/v1/search?q={search_query}&type=track&limit=20"
    headers = {'Authorization': f'Bearer {token}'}
    response = requests.get(url, headers=headers)
    tracks = response.json()
    return tracks

tracks = fetch_tracks(token, 'Beatles')
print(tracks)

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=Beatles&type=track&offset=0&limit=20', 'items': [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3WrFJ7ztbogyGnTHbHJFl2'}, 'href': 'https://api.spotify.com/v1/artists/3WrFJ7ztbogyGnTHbHJFl2', 'id': '3WrFJ7ztbogyGnTHbHJFl2', 'name': 'The Beatles', 'type': 'artist', 'uri': 'spotify:artist:3WrFJ7ztbogyGnTHbHJFl2'}], 'available_markets': ['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA', 'CL', 'CO', 'CR', 'CY', 'CZ', 'DK', 'DO', 'DE', 'EC', 'EE', 'SV', 'FI', 'FR', 'GR', 'GT', 'HN', 'HK', 'HU', 'IS', 'IE', 'IT', 'LV', 'LT', 'LU', 'MY', 'MT', 'MX', 'NL', 'NZ', 'NI', 'NO', 'PA', 'PY', 'PE', 'PH', 'PL', 'PT', 'SG', 'SK', 'ES', 'SE', 'CH', 'TW', 'TR', 'UY', 'US', 'GB', 'AD', 'LI', 'MC', 'ID', 'JP', 'TH', 'VN', 'RO', 'IL', 'ZA', 'SA', 'AE', 'BH', 'QA', 'OM', 'KW', 'EG', 'MA', 'DZ', 'TN', 'LB', 'JO', 'PS', 'IN', 'BY', 'KZ', 'MD', 'UA', 'AL', 'BA', 'HR', 'ME', 'MK', 'RS', 'SI', '

In [20]:
# Function to fetch track details and audio features
def fetch_track_data(track_id, token):
    headers = {'Authorization': f'Bearer {token}'}
    track_url = f"https://api.spotify.com/v1/tracks/{track_id}"
    features_url = f"https://api.spotify.com/v1/audio-features/{track_id}"
    
    # Fetch track details
    track_response = requests.get(track_url, headers=headers)
    track_details = track_response.json()
    
    # Fetch audio features
    features_response = requests.get(features_url, headers=headers)
    features_details = features_response.json()
    
    # Combine both responses
    track_data = {
        'track_id': track_id,
        'artist_name': track_details.get('artists', [{}])[0].get('name', ''),
        'track_name': track_details.get('name', ''),
        'album_name': track_details.get('album', {}).get('name', ''),
        'acousticness': features_details.get('acousticness', 0),
        'danceability': features_details.get('danceability', 0),
        'energy': features_details.get('energy', 0),
        'instrumentalness': features_details.get('instrumentalness', 0),
        'liveness': features_details.get('liveness', 0),
        'loudness': features_details.get('loudness', 0),
        'speechiness': features_details.get('speechiness', 0),
        'tempo': features_details.get('tempo', 0),
        'valence': features_details.get('valence', 0)
    }
    return track_data

In [28]:
# Function to transform and load Spotify Million Playlist Dataset into a dataframe for building machine learning model
# Number of songs is set to 5000
def process_playlists(token, path, max_songs=10):
    # set to store unique track URIs
    unique_tracks = set()
    data = []
    filenames = os.listdir(path)
    
    # Loop through all the json files at given location to parse then for playlist data
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            
            # Loop through all the playlists in the json file to get the track URIs
            for playlist in mpd_slice["playlists"]:
                
                # Loop through every track
                for track in playlist['tracks']:
                    track_id = track['track_uri'].split(':')[-1]
                    
                    # Check that the track hasn't already been processed
                    if track_id not in unique_tracks:
                        unique_tracks.add(track_id)
                        
                        # Function call to get all the features for the track by using the track URI and Spotify API call
                        track_data = fetch_track_data(track_id, token)
                        data.append(track_data)
                        
                        # Break loop if max song count reached
                        if len(data) >= max_songs:
                            break
                            
    return data, unique_tracks

data, unique_tracks = process_playlists(token, "/Users/noopurparagphadkar/Downloads/spotify_million_playlist_dataset/data/")
print(data)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [24]:
data, unique_tracks = process_playlists(token, "/Users/noopurparagphadkar/Downloads/spotify_million_playlist_dataset/data/")
print(data)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
pd.DataFrame(data)

## Data Preprocessing

Cleaning and organizing raw data into a suitable format for analysis.

## Exploratory Data Analysis (EDA)

Analyzing the data to summarize main characteristics and visualize trends.

## Feature Selection

## Model Building

Building models to make predictions or understand patterns.

## Model Evaluation

## Model Optimization