## Imports & Spotify authentication

In [None]:
import os
import pickle

from math import ceil

import pandas as pd
import spotipy

from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials

# Pull variables from a local `.env` file (python-dotenv) into the environment
# so the credentials below can be read from it.
load_dotenv()

# Application-level credentials; both values are None when the variables are unset.
auth_manager = SpotifyClientCredentials(
  client_id=os.getenv("SPOTIFY_CLIENT_ID"),
  client_secret=os.getenv("SPOTIFY_CLIENT_SECRET"),
)

# Shared API client used by every cell below.
sp = spotipy.Spotify(auth_manager=auth_manager)


## Step #1: Retrieve up to 1000 playlists for a specific search keyword

In [None]:
# Accumulator for every playlist found so far, keyed by playlist ID.
# It is filled by the search cell below; one entry looks like this:
#
# {
#   "4ETxM0kkS3RMPveAijToRh": {
#     "id": "4ETxM0kkS3RMPveAijToRh",
#     "keywords": {
#       "workout": 12, # <<< rank of the playlist in that keyword's search results
#       "yoga": 63
#     },
#     "name": "The playlist name",
#     "description": "The playlist description"
#   }
# }
playlists = {}


In [None]:
# Search term passed to the playlist search below.
# Examples: cooking, running, shower, yoga.
# NOTE(review): left empty here; presumably set by hand before each run — confirm.
keyword = ""


In [None]:
# We get 50 results per page, so we need 20 pages to get 1000 playlists.
# Each playlist is stored once in `playlists`; its rank for the current keyword
# (0-based position in the overall search result) is recorded under "keywords".
if not keyword:
  # Fail early with a clear message instead of an opaque API error
  raise ValueError("Set `keyword` to a non-empty search term before running the search")

for page in range(20):
  print(f"Keyword \"{keyword}\" | Page {page}")

  search_result = sp.search(
    q=keyword,
    limit=50,
    offset=page*50,
    type="playlist",
    # market="FR"
  )
  items = search_result["playlists"]["items"]

  # An empty page means the search has no further results: stop paging
  if not items:
    break

  for idx, playlist in enumerate(items):
    # Guard against null entries in the result list; they still occupy a
    # position, so `idx` keeps counting them and ranks stay aligned.
    if playlist is None:
      continue

    playlist_id = playlist["id"]
    rank = idx + page*50

    if playlist_id in playlists:
      # Already known (from another keyword or an earlier run): only record the rank
      playlists[playlist_id]["keywords"][keyword] = rank

    else:
      playlists[playlist_id] = {
        "id": playlist_id,
        "name": playlist["name"],
        "description": playlist["description"],
        "keywords": {
          keyword: rank
        }
      }


### Store the most relevant keyword (best search rank) as an attribute

In [None]:
# For each playlist, keep the keyword under which it ranked best
# (lowest rank value) together with that rank.
for playlist in playlists.values():
  keyword_ranks = playlist["keywords"]
  best_keyword = min(keyword_ranks, key=keyword_ranks.get)

  playlist["best_kw"] = best_keyword
  playlist["best_kw_rank"] = keyword_ranks[best_keyword]


### Store playlists on disk, as `dict` as well as `DataFrame`

In [None]:
# `playlists` is keyed by playlist ID, so DataFrame() yields one column per
# playlist: transpose to get one row per playlist and drop the ID index
# (every record already carries its own "id" field).
playlists_df = pd.DataFrame(playlists).T.reset_index(drop=True)

# Make sure the output folder exists so the writes below do not fail
os.makedirs("data", exist_ok=True)

with open('data/playlists_dict.pkl', 'wb') as f:
  pickle.dump(playlists, f)

playlists_df.to_pickle("data/playlists_df.pkl.gz", compression="gzip")


## Step #2: Retrieve tracks from the playlists we have (⚠️ very long)

We use `playlists_df` to get the playlist IDs.  
If you load the playlists from a past computation, uncomment the following cell:

In [None]:
# playlists_df = pd.read_pickle("data/playlists_df.pkl.gz")


In [None]:
# Accumulator for every track found so far, keyed by track ID.
# It is filled by `retrieve_tracks_from_playlist`; one entry looks like this:
#
# {
#   "7EW6TtHJIa2zyydF6dwNBs": {
#     "id": "7EW6TtHJIa2zyydF6dwNBs",
#     "name": "Track title",
#     "artists": "Justin Bieber &&& Rihanna", # artist names joined with " &&& "
#     "popularity": 45,
#     "playlist_ids": [] # IDs of the playlists the track was found in
#   }
# }
#
# TODO: add a "genres" attribute (not populated yet). Ex: Pop &&& Rock &&& Variété
tracks = {}


### Build a dictionary of track IDs from the playlists

In [None]:
def retrieve_tracks_from_playlist(playlist_id, n_page, all_tracks):
  """Fetch one page of a playlist's tracks and merge them into `all_tracks`.

  Args:
    playlist_id: ID of the playlist to read.
    n_page: Zero-based page index; the request offset is `n_page * 100`.
    all_tracks: Dict of tracks keyed by track ID. It is updated in place.

  Returns:
    A `(all_tracks, total)` tuple, where `total` is the "total" field of the
    API response, or 0 when the request failed.
  """
  print(f"Playlist \"{playlist_id}\" | Page {n_page+1}")

  try:
    playlist_tracks = sp.playlist_tracks(
      playlist_id=playlist_id,
      offset=n_page*100,
      # market="FR"
    )
  except Exception as error:
    # Best effort: a failing page is reported and skipped, not fatal.
    # Catching `Exception` (instead of a bare `except:`) lets KeyboardInterrupt
    # through, so this very long retrieval can still be stopped by hand.
    print(f"Playlist \"{playlist_id}\" | Page {n_page+1} skipped: {error!r}")
    return all_tracks, 0

  for item in playlist_tracks["items"]:
    track = item["track"]

    # In rare cases, the track is empty
    if track is None:
      print(item)
      continue

    # NOTE(review): the ID itself can be None (presumably entries without a
    # Spotify ID — confirm); such tracks all share the None key, which is
    # filtered out before the audio-features step.
    track_id = track["id"]

    # If the track already exists, we just append the current playlist ID to the playlist_ids attribute
    if track_id in all_tracks:
      all_tracks[track_id]["playlist_ids"].append(playlist_id)

    # If the track does not exist, we build it
    else:
      all_tracks[track_id] = {
        "id": track_id,
        "name": track["name"],
        "artists": " &&& ".join([artist["name"] for artist in track["artists"]]),
        "popularity": track["popularity"],
        "playlist_ids": [playlist_id]
      }

  return all_tracks, playlist_tracks["total"]


In [None]:
for playlist_id in playlists_df["id"]:
  # The first page also reports how many tracks the playlist holds in total
  # (0 when the request failed).
  tracks, total_tracks = retrieve_tracks_from_playlist(playlist_id, 0, tracks)

  # A single page holds up to 100 tracks: nothing more to fetch
  if total_tracks <= 100:
    continue

  # Larger playlists are paginated, so request the remaining pages the same way
  total_pages = ceil(total_tracks / 100)

  for page in range(1, total_pages):
    tracks, _ = retrieve_tracks_from_playlist(playlist_id, page, tracks)


### Store tracks on disk as `dict`

In [None]:
# Make sure the output folder exists so the write below does not fail
os.makedirs("data", exist_ok=True)

# Persist the raw tracks dict (keyed by track ID) for later reuse
with open('data/tracks_dict.pkl', 'wb') as f:
  pickle.dump(tracks, f)


## Step #3: Retrieve audio features from all the tracks we have (by batch)

We use the `tracks` dict to get the track IDs.  
If you load the tracks from a past computation, uncomment the following cell:

In [None]:
# with open('data/tracks_dict.pkl', 'rb') as f:
#   tracks = pickle.load(f)


In [None]:
# Every key of `tracks` is a track ID; entries stored under None are left out
track_ids = [track_id for track_id in tracks if track_id is not None]

# Split the IDs into consecutive batches of 100 tracks (one API call per batch)
tracks_batches = [
  track_ids[start:start+100]
  for start in range(0, len(track_ids), 100)
]

# Collect the audio features of all batches in one flat list
audio_features = []

for batch in tracks_batches:
  audio_features.extend(sp.audio_features(tracks=batch))


### Store audio features on disk, as `dict` as well as `DataFrame`

In [None]:
# Drop the None placeholders before building the table
audio_features = [x for x in audio_features if x is not None]
audio_features_df = pd.DataFrame(audio_features)

# Make sure the output folder exists so the writes below do not fail
os.makedirs("data", exist_ok=True)

with open('data/audio_features_dict.pkl', 'wb') as f:
  pickle.dump(audio_features, f)

audio_features_df.to_pickle("data/audio_features_df.pkl.gz", compression="gzip")
