# Data Collection

## Importing libraries

In [1]:
import requests
import base64
from credentials import spotify_client_id, spotify_client_secret # stored my client ID and client secrect in credentials file.
from tqdm import tqdm
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth

## Getting an authorization token from Spotify
1. Go to the Spotify for Developers website and create an app.
2. Generate a client ID and a client secret for that app.
3. Use the `requests` library to send a POST request that obtains the access token.

In [2]:
# Replace with your own Client ID and Client Secret
CLIENT_ID = spotify_client_id
CLIENT_SECRET = spotify_client_secret

# The Basic auth header carries base64("<client_id>:<client_secret>")
client_credentials = f"{CLIENT_ID}:{CLIENT_SECRET}"
client_credentials_base64 = base64.b64encode(client_credentials.encode())

# Request the access token
token_url = 'https://accounts.spotify.com/api/token'
headers = {
    'Authorization': f'Basic {client_credentials_base64.decode()}'
}
data = {
    'grant_type': 'client_credentials'
}
# A timeout keeps the cell from hanging forever if the endpoint is unreachable.
response = requests.post(token_url, data=data, headers=headers, timeout=30)

if response.status_code == 200:
    access_token = response.json()['access_token']
    print("Access token obtained successfully.")
else:
    # Raise instead of calling exit(): exit() takes the notebook kernel down
    # (or is ignored, depending on the front end) and hides why the request
    # failed. An exception stops "Run All" at this cell and shows the status
    # code and response body returned by the token endpoint.
    raise RuntimeError(
        f"Error obtaining access token (HTTP {response.status_code}): {response.text}"
    )

Access token obtained successfully.


In [3]:
def get_trending_playlist_data(playlist_id, access_token):
    """Return a DataFrame with one row per track of a Spotify playlist.

    Parameters
    ----------
    playlist_id : str
        Spotify ID of the playlist to read.
    access_token : str
        Access token handed to ``spotipy.Spotify(auth=...)``.

    Returns
    -------
    pandas.DataFrame
        One row per playlist track with the columns ``Track Name``,
        ``Artists``, ``Album Name``, ``Album ID``, ``Track ID``,
        ``Popularity``, ``Release Date``, ``Duration (ms)``, ``Explicit``,
        ``External URLs`` and the audio-feature columns ``Danceability``
        through ``Tempo``. A value that could not be obtained is ``None``.

    Notes
    -----
    * Every page of the playlist is read, not only the first one.
    * Playlist entries that carry no track object are skipped. Tracks that
      have no ID are kept, but no per-track/album look-ups are made for them.
    * The album and track look-ups are best-effort: if one fails, the
      affected columns are ``None`` and processing continues. Errors from the
      playlist listing and from the audio-features call propagate.
    """
    # Set up Spotipy with the access token
    sp = spotipy.Spotify(auth=access_token)

    # Ask for `next` alongside the item fields so that paging can be followed;
    # a single playlist_tracks call only returns one page of items.
    page = sp.playlist_tracks(
        playlist_id,
        fields='items(track(id, name, artists, album(id, name))),next',
    )
    playlist_items = []
    while page:
        playlist_items.extend(page.get('items') or [])
        page = sp.next(page) if page.get('next') else None

    def _usable(spotify_id):
        # 'Not available' is kept as a sentinel for backward compatibility.
        return bool(spotify_id) and spotify_id != 'Not available'

    # (DataFrame column, key in the audio-features object), in output order
    audio_columns = (
        ('Danceability', 'danceability'),
        ('Energy', 'energy'),
        ('Key', 'key'),
        ('Loudness', 'loudness'),
        ('Mode', 'mode'),
        ('Speechiness', 'speechiness'),
        ('Acousticness', 'acousticness'),
        ('Instrumentalness', 'instrumentalness'),
        ('Liveness', 'liveness'),
        ('Valence', 'valence'),
        ('Tempo', 'tempo'),
    )

    # Extract relevant information and store in a list of dictionaries
    music_data = []
    for item in playlist_items:
        track = (item or {}).get('track')
        if not track:
            # The entry has no track object to describe; nothing to record.
            continue

        album = track.get('album') or {}
        track_id = track.get('id')
        album_id = album.get('id')

        # Get audio features for the track (the call returns a list)
        audio_features = None
        if _usable(track_id):
            features_list = sp.audio_features(track_id)
            audio_features = features_list[0] if features_list else None
        features = audio_features or {}

        # Get release date of the album (best-effort)
        release_date = None
        if _usable(album_id):
            try:
                release_date = sp.album(album_id)['release_date']
            except Exception:
                # `Exception` rather than a bare except, so Ctrl-C still
                # interrupts a long-running collection.
                release_date = None

        # Get popularity / explicit flag / URL from the full track object
        # (best-effort). Stored under its own name so it never shadows the
        # playlist item being iterated.
        track_details = {}
        if _usable(track_id):
            try:
                track_details = sp.track(track_id) or {}
            except Exception:
                track_details = {}

        track_data = {
            'Track Name': track.get('name'),
            'Artists': ', '.join(
                artist['name'] for artist in (track.get('artists') or [])
            ),
            'Album Name': album.get('name'),
            'Album ID': album_id,
            'Track ID': track_id,
            'Popularity': track_details.get('popularity'),
            'Release Date': release_date,
            'Duration (ms)': features.get('duration_ms'),
            'Explicit': track_details.get('explicit'),
            'External URLs': (track_details.get('external_urls') or {}).get('spotify'),
        }
        for column, feature_key in audio_columns:
            track_data[column] = features.get(feature_key)

        music_data.append(track_data)

    # Create a pandas DataFrame from the list of dictionaries
    df = pd.DataFrame(music_data)

    return df

In [4]:
# Spotify playlist IDs that the dataset is collected from
stash, bolly, hits, topglobal, topindia = (
    "63ZmJ7sbMDbxocch9W0Cll",
    "0ePwaivWKqORmpsGKvcid8",
    "37i9dQZF1DX0XUfTFmNBRM",
    "37i9dQZEVXbNG2KDcFcKOF",
    "37i9dQZEVXbMWDif5SCBJq",
)

# Order here is the order in which the playlists are fetched
playlists = [stash, bolly, hits, topglobal, topindia]

## Creating DataFrame of the songs

In [5]:
# Fetch one DataFrame per playlist, then concatenate once. Concatenating onto
# an empty DataFrame inside the loop re-copies the accumulated rows on every
# iteration and triggers pandas' empty-entry concat FutureWarning.
playlist_frames = [
    get_trending_playlist_data(playlist_id, access_token)
    for playlist_id in tqdm(playlists)
]

# pd.concat raises on an empty list, so fall back to an empty frame.
# The per-playlist row index is kept as-is, exactly as before.
df = pd.concat(playlist_frames) if playlist_frames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:46<00:00, 45.36s/it]


In [6]:
# (rows, columns) of the combined frame, before duplicates are dropped
df.shape

(335, 21)

In [7]:
# Drop rows that are exact duplicates of an earlier row (all columns equal),
# then show the resulting (rows, columns)
df = df.drop_duplicates()
df.shape

(313, 21)

## Saving the DataFrame to a CSV file

In [8]:
# Write the de-duplicated tracks to music_dataset.csv in the working directory.
# to_csv writes the index by default, so it becomes the first (unnamed) column.
df.to_csv("music_dataset.csv")