## Importing the libraries

In [1]:
import os
import pandas as pd
import numpy as np
import json
import yaml
import re
from tqdm import tqdm
# import multiprocessing as mp
import time
# import random
import datetime
import spotipy
# import spotipy.oauth2 as oauth2
from spotipy.oauth2 import SpotifyOAuth,SpotifyClientCredentials

In [2]:
# Read the Spotify API credentials from the local YAML file and build a client
# authenticated through SpotifyClientCredentials.
# The context manager closes the file handle as soon as the YAML is parsed.
with open("spotify/spotify.yaml") as stream:
    spotify_details = yaml.safe_load(stream)
# NOTE(review): the two keys use different casing ('Client_id' vs
# 'client_secret') — confirm this matches the YAML file.
auth_manager = SpotifyClientCredentials(client_id=spotify_details['Client_id'],
                                        client_secret=spotify_details['client_secret'])
sp = spotipy.client.Spotify(auth_manager=auth_manager)

## Getting relevant info from dataset

The 1_Million_Playlists data is stored in JSON files, each containing 1000 playlists.
We process these files to extract the relevant data into a CSV file.
- We need each track's unique URI string to query its data with the Spotify client.

In [8]:
def get_unique_tracks(dataset_path, num_slices=10):
    """Collect the unique tracks found in the first ``num_slices`` slice files.

    Files in ``dataset_path`` whose names start with ``mpd.slice.`` and end
    with ``.json`` are read in order of filename length (ties keep the
    ``os.listdir`` order). Every entry of each playlist's ``tracks`` list
    becomes one row; rows are then de-duplicated on ``track_uri``, keeping the
    first occurrence.

    Args:
        dataset_path: Directory containing the slice JSON files.
        num_slices: Maximum number of slice files to read.

    Returns:
        pandas.DataFrame with one row per unique ``track_uri``. The index holds
        each row's position in the concatenated, pre-deduplication frame.

    Raises:
        ValueError: If no track was found (no matching file, or only playlists
            without tracks).
    """
    slices_read = 0
    slice_frames = []
    for fname in tqdm(sorted(os.listdir(dataset_path), key=len)):
        if not (fname.startswith("mpd.slice.") and fname.endswith(".json")):
            continue
        slices_read += 1
        with open(os.path.join(dataset_path, fname), encoding="utf-8") as f:
            current_slice = json.load(f)
        # Flatten per slice instead of waiting for a running counter to reach
        # exactly 1000 playlists, so a slice of any size is fully retained.
        track_rows = [track
                      for playlist in current_slice['playlists']
                      for track in (playlist.get('tracks') or [])]
        if track_rows:
            # Building the frame straight from the list of dicts avoids the
            # very slow per-row `apply(pd.Series)` expansion.
            slice_frames.append(pd.DataFrame(track_rows))
        if slices_read == num_slices:
            print("Done with json.Breaking loop\n")
            break
    if not slice_frames:
        raise ValueError(
            f"No tracks found in mpd.slice.*.json files under {dataset_path!r}")
    print("Joining")
    unique_tracks = pd.concat(slice_frames, ignore_index=True)
    print("Removing duplicates")
    unique_tracks = unique_tracks.drop_duplicates(subset=["track_uri"])
    return unique_tracks
# Path where the json files are extracted
dataset_path = 'dataset_200/'

In [9]:
# Build the unique-track table from the first 30 slice files and save it as CSV.
# to_csv is called without index=False, so the frame's index is written as an
# unnamed first column.
df = get_unique_tracks(dataset_path, num_slices=30)
df.to_csv('data/unique_tracks_mpd.csv')

  0%|          | 0/200 [00:00<?, ?it/s]

 14%|█▍        | 29/200 [10:44<1:03:21, 22.23s/it]

Done with json.Breaking loop

Joining





Removing duplicates


In [5]:
# Save the unique-track table as Parquet.
# NOTE(review): the recorded NameError shows this cell was run on a kernel where
# `df` did not exist yet; it needs the extraction cell to have run first.
df.to_parquet('data/unique_tracks_mpd_parquet.parquet')

NameError: name 'df' is not defined

In [None]:
# Reduce each URI column to its trailing run of word characters, i.e. the bare
# id at the end of the URI string.
for uri_column in ("track_uri", "artist_uri", "album_uri"):
    df[uri_column] = df[uri_column].apply(lambda uri: re.findall(r'\w+$', uri)[0])

In [6]:
# Keep only the three URI columns.
# NOTE(review): the recorded NameError shows this cell was run on a kernel where
# `df` did not exist yet.
df = df[['track_uri','artist_uri','album_uri']]

NameError: name 'df' is not defined

In [14]:
# df.to_csv('data/unique_tracks_mpd_drop.csv')
# df.to_parquet('data/unique_tracks_mpd_drop_parquet.parquet')

## Reading from extracted data

In [3]:
# Reload the reduced track table from Parquet.
# NOTE(review): the only visible write of this file is the commented-out
# to_parquet call in the previous section — confirm the file exists before running.
df=pd.read_parquet('data/unique_tracks_mpd_drop_parquet.parquet') 

In [4]:
# Preview the first rows of the reloaded table.
df.head()

Unnamed: 0,track_uri,artist_uri,album_uri
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K
1,6I9VzXrHxO9rA9A5euc8Ak,26dSoYclwsYLMAKD3tpOr4,0z7pVBGOD7HCIB7S8eLkLI
2,0WqIKmW4BTrj3eJFmnCKMv,6vWDO969PvNqNYHIOW5v0m,25hVFAxTlDvXbx2X2QkUkE
3,1AWQoqb9bSvzTjaLralEkT,31TPClRtHm23RisEBtV3X7,6QPkyl04rXwTGlGlcYaRoW
4,1lzr43nnXAijIGYnCT8M8H,5EvFsr3kj42KNv97ZEnqij,6NmFmPX56pcLBOFMhIiKvF


In [5]:
# List the column names of the reloaded table.
df.columns

Index(['track_uri', 'artist_uri', 'album_uri'], dtype='object')

In [6]:
# Distinct track and artist ids; the feature-extraction cells slice these
# arrays into batches for the Spotify API calls.
t_uri=df["track_uri"].unique()
a_uri=df["artist_uri"].unique()

# Feature extraction

Features are extracted with the Spotify API. Results are saved to CSV files and errors are written to log files.

## Audio features

In [15]:
# Trial run: fetch audio features for the first 200 track ids (two batches of
# 100) and append them to the CSV. A failed batch is logged and skipped.
audio_csv_path = "data/audio_features.csv"
audio_log_path = "data/log/audio_features_log.txt"
# Create the output/log folders up front so logging inside `except` cannot fail.
os.makedirs(os.path.dirname(audio_log_path), exist_ok=True)
e = 0  # number of failed batches
for i in tqdm(range(0, 200, 100)):
    try:
        track_feature = sp.audio_features(t_uri[i:i+100])
        # Drop None entries (ids the API returned no features for) so one
        # missing track does not make the whole batch fail.
        track_df = pd.DataFrame([feat for feat in track_feature if feat is not None])
        if not track_df.empty:
            # Write the header only while the file has no content yet. Tying it
            # to i == 0 duplicates the header on a re-run and omits it entirely
            # when the first batch fails.
            write_header = (not os.path.exists(audio_csv_path)
                            or os.path.getsize(audio_csv_path) == 0)
            track_df.to_csv(audio_csv_path, mode='a', header=write_header, index=False)
    except Exception as error:
        e += 1
        with open(audio_log_path, "a") as r:
            r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
        time.sleep(3)  # short pause before moving on to the next batch
# Append a summary line with the total number of failed batches.
with open(audio_log_path, "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')

  0%|          | 0/2 [00:00<?, ?it/s]

Max Retries reached
 50%|█████     | 1/2 [00:05<00:05,  5.63s/it]Max Retries reached
100%|██████████| 2/2 [00:10<00:00,  5.48s/it]


In [13]:
# Debug display of the most recent CSV text chunk.
# NOTE(review): `csv_data` is only assigned in the resume cell further down, so
# this cell raises NameError on a fresh top-to-bottom run.
csv_data

'0.904,0.813,4,-7.105,0,0.121,0.0311,0.00697,0.0471,0.81,125.461,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/audio-analysis/0UaMYEvWZi0ZqiDOoHU3YI,226864,4\r\n0.774,0.838,5,-3.914,0,0.114,0.0249,0.025,0.242,0.924,143.04,audio_features,6I9VzXrHxO9rA9A5euc8Ak,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/audio-analysis/6I9VzXrHxO9rA9A5euc8Ak,198800,4\r\n0.664,0.758,2,-6.583,0,0.21,0.00238,0.0,0.0598,0.701,99.259,audio_features,0WqIKmW4BTrj3eJFmnCKMv,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/audio-analysis/0WqIKmW4BTrj3eJFmnCKMv,235933,4\r\n0.892,0.714,4,-6.055,0,0.141,0.201,0.000234,0.0521,0.817,100.972,audio_features,1AWQoqb9bSvzTjaLralEkT,spotify:track:1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvzT

In [12]:
# Full run: fetch audio features for every track id in `t_uri`, 100 per
# request, and append them to the CSV. A failed batch is logged and skipped.
audio_csv_path = "data/audio_features.csv"
audio_log_path = "data/log/audio_features_log.txt"
# Create the output/log folders up front so logging inside `except` cannot fail.
os.makedirs(os.path.dirname(audio_log_path), exist_ok=True)
e = 0  # number of failed batches
for i in tqdm(range(0, len(t_uri), 100)):
    try:
        track_feature = sp.audio_features(t_uri[i:i+100])
        # Drop None entries (ids the API returned no features for) so one
        # missing track does not make the whole batch fail.
        track_df = pd.DataFrame([feat for feat in track_feature if feat is not None])
        if not track_df.empty:
            # Write the header only while the file has no content yet. Tying it
            # to i == 0 duplicates the header on a re-run and omits it entirely
            # when the first batch fails.
            write_header = (not os.path.exists(audio_csv_path)
                            or os.path.getsize(audio_csv_path) == 0)
            track_df.to_csv(audio_csv_path, mode='a', header=write_header, index=False)
        time.sleep(4)  # pause between successful requests
    except Exception as error:
        e += 1
        with open(audio_log_path, "a") as r:
            r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
        time.sleep(3)  # short pause before moving on to the next batch
# Append a summary line with the total number of failed batches.
with open(audio_log_path, "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')

 59%|█████▉    | 2003/3377 [2:23:00<1:37:43,  4.27s/it]Max Retries reached
 59%|█████▉    | 2004/3377 [2:23:05<1:44:46,  4.58s/it]Max Retries reached
 59%|█████▉    | 2005/3377 [2:23:11<1:51:56,  4.90s/it]Max Retries reached
 59%|█████▉    | 2006/3377 [2:23:16<1:54:35,  5.02s/it]Max Retries reached
 59%|█████▉    | 2007/3377 [2:23:21<1:56:30,  5.10s/it]Max Retries reached
 59%|█████▉    | 2008/3377 [2:23:26<1:57:56,  5.17s/it]Max Retries reached
 59%|█████▉    | 2009/3377 [2:23:32<2:00:50,  5.30s/it]Max Retries reached
 60%|█████▉    | 2010/3377 [2:23:38<2:02:56,  5.40s/it]Max Retries reached
 60%|█████▉    | 2011/3377 [2:23:44<2:06:28,  5.56s/it]Max Retries reached
 60%|█████▉    | 2012/3377 [2:23:49<2:06:46,  5.57s/it]Max Retries reached
 60%|█████▉    | 2013/3377 [2:23:55<2:09:04,  5.68s/it]Max Retries reached
 60%|█████▉    | 2014/3377 [2:24:01<2:10:42,  5.75s/it]Max Retries reached
 60%|█████▉    | 2015/3377 [2:24:07<2:11:44,  5.80s/it]Max Retries reached
 60%|█████▉    | 2016/337

KeyboardInterrupt: 

In [None]:
# Resume the audio-feature download from offset 200200 of `t_uri` after the
# interrupted full run, appending rows (no header) to the existing CSV.
resume_from = 200200  # first `t_uri` offset that still has to be fetched
audio_log_path = "data/log/audio_features_log.txt"
# Create the output/log folders up front so logging inside `except` cannot fail.
os.makedirs(os.path.dirname(audio_log_path), exist_ok=True)
e = 0  # number of failed batches
# The context manager closes (and flushes) the CSV even if the loop is
# interrupted. newline='' stops the text layer from translating the line
# endings already present in the string returned by to_csv (which would turn
# "\r\n" into "\r\r\n" on Windows).
with open('data/audio_features.csv', 'a', newline='') as f:
    for i in tqdm(range(resume_from, len(t_uri), 100)):
        try:
            # Slice `t_uri`, the array the loop bounds and the resume offset
            # refer to; `remaining_t_uri` is a different, shorter array.
            track_feature = sp.audio_features(t_uri[i:i+100])
            # Drop None entries (ids the API returned no features for).
            track_df = pd.DataFrame([feat for feat in track_feature if feat is not None])
            if track_df.empty:
                continue
            csv_data = track_df.to_csv(header=False, index=False)
            f.write(csv_data)
        except Exception as error:
            e += 1
            with open(audio_log_path, "a") as r:
                r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
            time.sleep(3)  # short pause before moving on to the next batch
# Append a summary line with the total number of failed batches.
with open(audio_log_path, "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')

## Track Features

In [23]:
# Load what has been downloaded so far: audio features and track features.
audio_feat_df_till_now=pd.read_csv("data/audio_features.csv")
track_feat_df_till_now=pd.read_csv("data/track_features.csv")
# track_features = sp.tracks(t_uri[50:100])

In [32]:
# Track ids that already have a row in the track-features CSV.
trackdf_t_uri_till_now=np.array(track_feat_df_till_now.track_uri)

In [36]:
# Track ids that already have audio features: take the third colon-separated
# field of the `uri` column (values look like "spotify:track:<id>").
audiodf_t_uri_till_now=np.array(audio_feat_df_till_now.uri.apply(lambda x:x.split(":")[2]))
audiodf_t_uri_till_now

array(['0UaMYEvWZi0ZqiDOoHU3YI', '6I9VzXrHxO9rA9A5euc8Ak',
       '0WqIKmW4BTrj3eJFmnCKMv', ..., '2PTvM4E7vKBfS6BQ5rOhjp',
       '7ygYTTGQA91BibWe5n24Ru', '2SMWGgAT5C6IghiEiguY7I'], dtype=object)

In [59]:
# Ids that have audio features but no track-features row yet.
# np.setdiff1d returns the sorted, unique values of the first array that are
# not present in the second.
# intersect = audiodf_t_uri_till_now[np.in1d(audiodf_t_uri_till_now, trackdf_t_uri_till_now)]
# mask1 = np.searchsorted(audiodf_t_uri_till_now, intersect)
# remaining_t_uri=np.delete(audiodf_t_uri_till_now,mask1)
remaining_t_uri = np.setdiff1d(audiodf_t_uri_till_now, trackdf_t_uri_till_now)

In [62]:
# Fetch release date and popularity for every id in `remaining_t_uri`, 50 per
# request, and append them to the track-features CSV. A failed batch is logged
# and skipped.
track_csv_path = "data/track_features.csv"
track_log_path = "data/log/track_features_log.txt"
# Create the output/log folders up front so logging inside `except` cannot fail.
os.makedirs(os.path.dirname(track_log_path), exist_ok=True)
e = 0  # number of failed batches
for i in tqdm(range(0, len(remaining_t_uri), 50)):
    try:
        batch_uris = remaining_t_uri[i:i+50]
        track_features = sp.tracks(batch_uris)
        # Rows are rebuilt from scratch for every batch, so a failure part-way
        # through one batch can never leak rows into the next batch's write.
        row_frames = []
        # Pair each requested id with its response entry (assumes the response
        # preserves request order, as the positional lookup did before).
        # zip also handles a final batch shorter than 50, which a fixed
        # range(50) turned into an IndexError and a lost batch.
        for uri, track in zip(batch_uris, track_features['tracks']):
            if track is None:  # no data returned for this id
                continue
            track_pop_df_new_row = pd.DataFrame({
                'track_uri': uri,
                'release_date': track['album']['release_date'],
                'pop': track["popularity"]
            }, index=[0])
            row_frames.append(track_pop_df_new_row)
        if row_frames:
            track_pop_df = pd.concat(row_frames, ignore_index=True)
            # Write the header only while the file has no content yet, so no
            # header row is appended into the middle of an existing CSV.
            write_header = (not os.path.exists(track_csv_path)
                            or os.path.getsize(track_csv_path) == 0)
            track_pop_df.to_csv(track_csv_path, mode='a', header=write_header, index=False)
        time.sleep(5)  # pause between successful requests
    except Exception as error:
        e += 1
        with open(track_log_path, "a") as r:
            r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
        time.sleep(3)  # short pause before moving on to the next batch
# Append a summary line with the total number of failed batches.
with open(track_log_path, "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')

  0%|          | 0/3100 [00:00<?, ?it/s]

100%|██████████| 3100/3100 [4:39:20<00:00,  5.41s/it]  


In [13]:
# Inspect the most recently built one-row frame from the track-features loop.
track_pop_df_new_row

Unnamed: 0,track_uri,release_date,pop
0,0TQDrX7JWalZLt2X6E7ZVj,2013-01-01,0


## Artist features

In [12]:
# One-off request for the first 50 artist ids, useful for inspecting the
# response; the loop cell that follows reassigns `artist_features` per batch.
artist_features = sp.artists(a_uri[0:50])

In [11]:
# Fetch popularity and genres for every artist id in `a_uri`, 50 per request,
# and append them to the artist-features CSV. A failed batch is logged and
# skipped.
artist_csv_path = "data/artist_features.csv"
artist_log_path = "data/log/artist_features_log.txt"
# Create the output/log folders up front so logging inside `except` cannot fail.
os.makedirs(os.path.dirname(artist_log_path), exist_ok=True)
e = 0  # number of failed batches
for i in tqdm(range(0, len(a_uri), 50)):
    try:
        batch_uris = a_uri[i:i+50]
        artist_features = sp.artists(batch_uris)
        rows = []
        # Pair each requested id with its response entry (assumes the response
        # preserves request order, as the positional lookup did before).
        # zip also handles a final batch shorter than 50, which a fixed
        # range(50) turned into a spurious logged IndexError.
        for artist_uri, artist in zip(batch_uris, artist_features['artists']):
            if artist is None:  # no data returned for this id
                continue
            artist_genres = artist["genres"]
            if artist_genres:
                # Join multi-word genres with underscores so the space-separated
                # list stays unambiguous.
                genres = " ".join(re.sub(' ', '_', genre) for genre in artist_genres)
            else:
                genres = "unknown"
            rows.append([artist_uri, artist["popularity"], genres])
        if rows:
            # The id column keeps the integer name 0 so the CSV header stays
            # "0,artist_pop,genres", matching previously written files.
            artist_df = pd.DataFrame(rows, columns=[0, "artist_pop", "genres"])
            # Write the header only while the file has no content yet, so a
            # re-run does not append a second header row.
            write_header = (not os.path.exists(artist_csv_path)
                            or os.path.getsize(artist_csv_path) == 0)
            artist_df.to_csv(artist_csv_path, mode='a', header=write_header, index=False)
        time.sleep(3)  # pause between successful requests
    except Exception as error:
        e += 1
        with open(artist_log_path, "a") as r:
            r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+": "+str(error)+'\n')
        time.sleep(3)  # short pause before moving on to the next batch
# Append a summary line with the total number of failed batches.
with open(artist_log_path, "a") as r:
    r.write(datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S")+" _________________________ "+"Total Number Of Errors : "+str(e)+" _________________________ "+'\n')

100%|██████████| 1254/1254 [1:11:39<00:00,  3.43s/it]


In [10]:
# Inspect the most recently built artist frame from the artist-features loop.
artist_df

Unnamed: 0,0,artist_pop,genres
0,4OrizGCKhOrW6iDDJHN9xd,48,brooklyn_indie dream_pop dreamo indie_rock ind...
