# Data Collection

## Importing libraries

In [1]:
import requests
import base64
from credentials import spotify_client_id, spotify_client_secret # stored my client ID and client secrect in credentials file.
from tqdm import tqdm
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyOAuth
import time

## Getting an authorization token from Spotify
1. Go to the Spotify for Developers website and create an app.
2. Generate a client ID and a client secret for the app.
3. Use the `requests` library to send a POST request that returns the access token.

In [2]:
# Replace with your own Client ID and Client Secret
CLIENT_ID = spotify_client_id
CLIENT_SECRET = spotify_client_secret

# The Authorization header below carries base64("<client_id>:<client_secret>")
client_credentials = f"{CLIENT_ID}:{CLIENT_SECRET}"
client_credentials_base64 = base64.b64encode(client_credentials.encode())

# Request the access token using the client-credentials grant
token_url = 'https://accounts.spotify.com/api/token'
headers = {
    'Authorization': f'Basic {client_credentials_base64.decode()}'
}
data = {
    'grant_type': 'client_credentials'
}
# Without a timeout, requests waits indefinitely if the endpoint never answers.
response = requests.post(token_url, data=data, headers=headers, timeout=30)

if response.status_code == 200:
    access_token = response.json()['access_token']
    print("Access token obtained successfully.")
else:
    print("Error obtaining access token.")
    # Raise instead of calling exit(): exit() ends the whole session without
    # saying why, whereas an exception stops this cell (and "Run All") and
    # shows the HTTP status and the error body returned by the server.
    raise RuntimeError(
        f"Spotify token request failed with HTTP {response.status_code}: {response.text}"
    )

Access token obtained successfully.


In [3]:
def get_trending_playlist_data(playlist_id, access_token):
    """Return a DataFrame with one row per track of a Spotify playlist.

    For every playlist entry three extra lookups are made: the track's audio
    features, its album (for the release date) and the full track object
    (popularity, explicit flag, Spotify URL). Each lookup is best-effort: when
    the id is missing or the request fails, the columns it feeds are left as
    None and the rest of the row is still produced.

    Parameters
    ----------
    playlist_id : str
        Id of the playlist to read.
    access_token : str
        Token handed to ``spotipy.Spotify(auth=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: Track Name, Artists, Album Name, Album ID, Track ID,
        Popularity, Release Date, Duration (ms), Explicit, External URLs,
        Danceability, Energy, Key, Loudness, Mode, Speechiness, Acousticness,
        Instrumentalness, Liveness, Valence, Tempo. The frame has these columns
        even when the playlist yields no rows.

    Raises
    ------
    Exception
        Errors from listing the playlist itself are not caught and propagate
        to the caller; only the per-track lookups are best-effort.
    """
    columns = [
        'Track Name', 'Artists', 'Album Name', 'Album ID', 'Track ID',
        'Popularity', 'Release Date', 'Duration (ms)', 'Explicit',
        'External URLs', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode',
        'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
        'Valence', 'Tempo',
    ]
    # DataFrame column -> key in the audio-features object.
    audio_feature_keys = {
        'Duration (ms)': 'duration_ms',
        'Danceability': 'danceability',
        'Energy': 'energy',
        'Key': 'key',
        'Loudness': 'loudness',
        'Mode': 'mode',
        'Speechiness': 'speechiness',
        'Acousticness': 'acousticness',
        'Instrumentalness': 'instrumentalness',
        'Liveness': 'liveness',
        'Valence': 'valence',
        'Tempo': 'tempo',
    }

    # Set up Spotipy with the access token
    sp = spotipy.Spotify(auth=access_token)

    def _lookup(fetch, spotify_id):
        """Call fetch(spotify_id); return None if the id is missing or the call fails."""
        if not spotify_id or spotify_id == 'Not available':
            return None
        try:
            return fetch(spotify_id)
        except Exception:
            # Best-effort, as before, but `Exception` (unlike a bare `except:`)
            # still lets KeyboardInterrupt stop a long-running collection.
            return None

    # Get the tracks from the playlist. `next` is requested alongside the items
    # so that the remaining pages can be followed; without it only the first
    # page of the playlist would be read.
    page = sp.playlist_tracks(
        playlist_id,
        fields='items(track(id, name, artists, album(id, name))),next',
    )
    playlist_items = []
    while page:
        playlist_items.extend(page.get('items') or [])
        page = sp.next(page) if page.get('next') else None

    # Extract relevant information and store in a list of dictionaries
    music_data = []
    for item in playlist_items:
        track = item.get('track')
        if not track:
            # An entry without a track object has nothing to describe; skip it
            # rather than fail on track['name'].
            continue

        album = track.get('album') or {}
        track_id = track.get('id')
        album_id = album.get('id')

        # audio_features returns a list (one entry per requested id); the entry
        # itself may be None, so normalise every failure mode to {}.
        features_list = _lookup(sp.audio_features, track_id)
        audio_features = (features_list[0] if features_list else None) or {}

        # Album lookup supplies the release date.
        album_info = _lookup(sp.album, album_id) or {}

        # Full track object supplies popularity / explicit / URL. It is kept in
        # its own variable so a failed lookup can no longer be confused with
        # the playlist item being iterated over.
        track_details = _lookup(sp.track, track_id) or {}

        track_data = {
            'Track Name': track.get('name'),
            'Artists': ', '.join(
                artist['name']
                for artist in (track.get('artists') or [])
                if artist.get('name')
            ),
            'Album Name': album.get('name'),
            'Album ID': album_id,
            'Track ID': track_id,
            'Popularity': track_details.get('popularity'),
            'Release Date': album_info.get('release_date'),
            'Explicit': track_details.get('explicit'),
            'External URLs': (track_details.get('external_urls') or {}).get('spotify'),
        }
        for column, key in audio_feature_keys.items():
            track_data[column] = audio_features.get(key)

        music_data.append(track_data)

    # Create a pandas DataFrame; `columns` fixes the column order and keeps the
    # schema stable for an empty playlist.
    return pd.DataFrame(music_data, columns=columns)

In [4]:
# List of playlists from spotify
# Each value is a playlist id that is later passed as `playlist_id` to
# get_trending_playlist_data; the variable names are the author's own labels
# for the playlists.
stash = "63ZmJ7sbMDbxocch9W0Cll"
bolly = "0ePwaivWKqORmpsGKvcid8"
hits = "37i9dQZF1DX0XUfTFmNBRM"
topglobal = "37i9dQZEVXbNG2KDcFcKOF"
topindia = "37i9dQZEVXbMWDif5SCBJq"
# Order here is the order in which the playlists are fetched and concatenated.
playlists = [stash, bolly, hits, topglobal, topindia]

## Creating DataFrame of the songs

In [5]:
# Fetch every playlist first and concatenate once at the end. Calling
# pd.concat inside the loop re-copies all rows collected so far on every
# iteration and starts from an empty seed frame.
playlist_frames = []
for playlist_id in tqdm(playlists):
    music_df = get_trending_playlist_data(playlist_id, access_token)
    playlist_frames.append(music_df)

# ignore_index=True gives the combined frame unique 0..n-1 row labels; without
# it each playlist's rows keep their own 0..k labels, so labels repeat.
# pd.concat raises on an empty list, hence the fallback to an empty frame.
df = pd.concat(playlist_frames, ignore_index=True) if playlist_frames else pd.DataFrame()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [03:46<00:00, 45.36s/it]


In [6]:
# (rows, columns) of the combined frame before duplicates are removed.
df.shape

(335, 21)

In [7]:
# Dropping duplicate songs: rows that are identical in every column are
# collapsed to their first occurrence.
df = df.drop_duplicates()
df.shape

(313, 21)

## Saving DataFrame in a csv.

In [8]:
# Write the de-duplicated frame to the working directory. to_csv defaults to
# index=True, so the row index is saved as the first, unnamed CSV column.
df.to_csv("music_dataset.csv")

## Adding additional playlists
#### Adding the "This Is …" artist playlists for a range of artists

In [4]:
# Playlist ids, one variable per artist; per the section heading these are the
# artists' "this is artist" playlists. They are collected into `playlist` in
# the next cell and fetched one by one.
# NOTE(review): `araina_grande` and `marshmellow` look like misspellings of
# the artist names; they are left unchanged because the list in the next cell
# refers to these exact identifiers.
adam_levine = "37i9dQZF1DZ06evO2rXdlu"
alan_walker = "37i9dQZF1DZ06evO4rvWRa"
araina_grande = "37i9dQZF1DX1PfYnYcpw8w"
avicii = "37i9dQZF1DZ06evO0PRpBu"
benny_blanco = "37i9dQZF1DZ06evO3j6tfn"
beyonce = "37i9dQZF1DX2oU49YwtXI2"
billie_eilish = "37i9dQZF1DX6cg4h2PoN9y"
bruno_mars = "37i9dQZF1DZ06evO03DwPK"
calvin_harris = "37i9dQZF1DZ06evO4vD8f6"
camila_cabello = "37i9dQZF1DZ06evO2yXXGB"
coldplay = "37i9dQZF1DXaQm3ZVg9Z2X"
dj_snake = "37i9dQZF1DZ06evO2YsPPW"
ed_sheeran = "37i9dQZF1DWWxPM4nWdhyI"
hailee_steinfeld = "37i9dQZF1DZ06evO3banUl"
halsey = "37i9dQZF1DZ06evO1ciP4I"
imagine_dragons = "37i9dQZF1DZ06evO2YqUuI"
justin_bieber ="37i9dQZF1DXc2aPBXGmXrt"
kygo = "37i9dQZF1DZ06evO1a5BFC"
linkin_park = "37i9dQZF1DZ06evO47cwRq"
magic = "37i9dQZF1DZ06evO0jjjFK"
maroon_5 = "37i9dQZF1DZ06evNZY5NHq"
marshmellow = "37i9dQZF1DZ06evO3Adu8w"
martin_garrix = "37i9dQZF1DX94qaYRnkufr"
miley_cyrus = "37i9dQZF1DZ06evO3wzrW0"
nirvana = "37i9dQZF1DZ06evO3M0Fbi"
one_republic = "37i9dQZF1DZ06evO3r3n5S"
rihanna = "37i9dQZF1DZ06evO3by276"
shawn_mendes = "37i9dQZF1DZ06evO4mwRSU"
taylor_swift = "37i9dQZF1DX5KpP2LN299J"
twenty_one_pilots = "37i9dQZF1DZ06evO2k3tf2"
the_weeknd = "37i9dQZF1DX6bnzK9KPvrz"
zayn = "37i9dQZF1DZ06evO3x2k4o"

In [5]:
# Artist playlist ids in fetch order. Note the singular name: this list is
# separate from the `playlists` list used for the first dataset.
playlist = [
    adam_levine,
    alan_walker,
    araina_grande,
    avicii,
    benny_blanco,
    beyonce,
    billie_eilish,
    bruno_mars,
    calvin_harris,
    camila_cabello,
    coldplay,
    dj_snake,
    ed_sheeran,
    hailee_steinfeld,
    halsey,
    imagine_dragons,
    justin_bieber,
    kygo,
    linkin_park,
    magic,
    maroon_5,
    marshmellow,
    martin_garrix,
    miley_cyrus,
    nirvana,
    one_republic,
    rihanna,
    shawn_mendes,
    taylor_swift,
    twenty_one_pilots,
    the_weeknd,
    zayn,
]

In [6]:
# Fetch each artist playlist, keeping going when one of them fails, and build
# the combined frame once at the end instead of re-concatenating per playlist.
artist_frames = []
for playlist_id in tqdm(playlist):
    try:
        music_df = get_trending_playlist_data(playlist_id, access_token)
    except Exception as error:
        # Best-effort: report the playlist and the reason, then move on.
        # Catching Exception rather than using a bare `except:` means an
        # interrupt (Ctrl-C / kernel interrupt) stops the loop instead of
        # being counted as one more failed playlist.
        tqdm.write(f"{playlist_id} failed: {error!r}")
        continue

    artist_frames.append(music_df)
    # tqdm.write prints without breaking up the progress bar.
    tqdm.write(f"{playlist_id} successful")
    # Pause after each successful playlist, as the original loop did
    # (presumably to stay under the API rate limit; confirm before tuning).
    time.sleep(10)

# ignore_index=True gives unique 0..n-1 row labels across playlists;
# pd.concat raises on an empty list, so fall back to an empty frame.
df_artists = pd.concat(artist_frames, ignore_index=True) if artist_frames else pd.DataFrame()

  0%|                                                                                           | 0/32 [00:00<?, ?it/s]

37i9dQZF1DZ06evO2rXdlu successful


  3%|██▌                                                                                | 1/32 [00:36<18:55, 36.61s/it]

37i9dQZF1DZ06evO4rvWRa successful


  6%|█████▏                                                                             | 2/32 [01:19<20:17, 40.58s/it]

37i9dQZF1DX1PfYnYcpw8w successful


  9%|███████▊                                                                           | 3/32 [02:20<24:04, 49.81s/it]

37i9dQZF1DZ06evO0PRpBu successful


 12%|██████████▍                                                                        | 4/32 [03:07<22:40, 48.60s/it]

37i9dQZF1DZ06evO3j6tfn successful


 16%|████████████▉                                                                      | 5/32 [03:36<18:43, 41.61s/it]

37i9dQZF1DX2oU49YwtXI2 successful


 19%|███████████████▌                                                                   | 6/32 [04:32<20:04, 46.32s/it]

37i9dQZF1DX6cg4h2PoN9y successful


 22%|██████████████████▏                                                                | 7/32 [05:21<19:42, 47.30s/it]

37i9dQZF1DZ06evO03DwPK successful


 25%|████████████████████▊                                                              | 8/32 [06:00<17:49, 44.55s/it]

37i9dQZF1DZ06evO4vD8f6 successful


 28%|███████████████████████▎                                                           | 9/32 [06:42<16:50, 43.93s/it]

37i9dQZF1DZ06evO2yXXGB successful


 31%|█████████████████████████▋                                                        | 10/32 [07:20<15:21, 41.90s/it]

37i9dQZF1DXaQm3ZVg9Z2X successful


 34%|████████████████████████████▏                                                     | 11/32 [08:00<14:29, 41.39s/it]

37i9dQZF1DZ06evO2YsPPW successful


 38%|██████████████████████████████▊                                                   | 12/32 [08:39<13:37, 40.86s/it]

37i9dQZF1DWWxPM4nWdhyI successful


 41%|█████████████████████████████████▎                                                | 13/32 [09:42<15:01, 47.44s/it]

37i9dQZF1DZ06evO3banUl successful


 44%|███████████████████████████████████▉                                              | 14/32 [10:12<12:36, 42.04s/it]

37i9dQZF1DZ06evO1ciP4I successful


 47%|██████████████████████████████████████▍                                           | 15/32 [10:52<11:45, 41.52s/it]

37i9dQZF1DZ06evO2YqUuI successful


 50%|█████████████████████████████████████████                                         | 16/32 [11:26<10:30, 39.42s/it]

37i9dQZF1DXc2aPBXGmXrt successful


 53%|███████████████████████████████████████████▌                                      | 17/32 [12:24<11:14, 44.98s/it]

37i9dQZF1DZ06evO1a5BFC successful


 56%|██████████████████████████████████████████████▏                                   | 18/32 [13:05<10:11, 43.70s/it]

37i9dQZF1DZ06evO47cwRq successful


 59%|████████████████████████████████████████████████▋                                 | 19/32 [13:45<09:13, 42.60s/it]

37i9dQZF1DZ06evO0jjjFK successful


 62%|███████████████████████████████████████████████████▎                              | 20/32 [14:15<07:44, 38.72s/it]

37i9dQZF1DZ06evNZY5NHq successful


 66%|█████████████████████████████████████████████████████▊                            | 21/32 [14:57<07:16, 39.67s/it]

37i9dQZF1DZ06evO3Adu8w successful


 69%|████████████████████████████████████████████████████████▍                         | 22/32 [15:40<06:48, 40.86s/it]

37i9dQZF1DX94qaYRnkufr successful


 72%|██████████████████████████████████████████████████████████▉                       | 23/32 [16:44<07:10, 47.81s/it]

37i9dQZF1DZ06evO3wzrW0 successful


 75%|█████████████████████████████████████████████████████████████▌                    | 24/32 [17:25<06:04, 45.56s/it]

37i9dQZF1DZ06evO3M0Fbi successful


 78%|████████████████████████████████████████████████████████████████                  | 25/32 [17:59<04:56, 42.29s/it]

37i9dQZF1DZ06evO3r3n5S successful


 81%|██████████████████████████████████████████████████████████████████▋               | 26/32 [18:41<04:12, 42.14s/it]

37i9dQZF1DZ06evO3by276 successful


 84%|█████████████████████████████████████████████████████████████████████▏            | 27/32 [19:20<03:26, 41.24s/it]

37i9dQZF1DZ06evO4mwRSU successful


 88%|███████████████████████████████████████████████████████████████████████▊          | 28/32 [20:03<02:47, 41.76s/it]

37i9dQZF1DX5KpP2LN299J successful


 91%|██████████████████████████████████████████████████████████████████████████▎       | 29/32 [21:13<02:30, 50.15s/it]

37i9dQZF1DZ06evO2k3tf2 successful


 94%|████████████████████████████████████████████████████████████████████████████▉     | 30/32 [21:45<01:29, 44.61s/it]

37i9dQZF1DX6bnzK9KPvrz successful


 97%|███████████████████████████████████████████████████████████████████████████████▍  | 31/32 [22:30<00:44, 44.84s/it]

37i9dQZF1DZ06evO3x2k4o successful


100%|██████████████████████████████████████████████████████████████████████████████████| 32/32 [23:01<00:00, 43.17s/it]


In [7]:
# (rows, columns) of the artist frame before duplicates are removed.
df_artists.shape

(1669, 21)

In [8]:
# Dropping duplicate songs: rows that are identical in every column are
# collapsed to their first occurrence.
df_artists = df_artists.drop_duplicates()
df_artists.shape

(1644, 21)

In [9]:
# Write the de-duplicated artist frame to the working directory. to_csv
# defaults to index=True, so the row index is saved as the first, unnamed column.
df_artists.to_csv("artist_music.csv")