## Spotify authentication

In [None]:
import os

import pandas as pd
import pickle
import spotipy

from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials

# Load variables from a local .env file (if present) into the environment,
# so the Spotify credentials below can be read from it.
load_dotenv()

# App-level credentials: client ID and secret are read from the environment.
auth_manager = SpotifyClientCredentials(
  client_id=os.getenv("SPOTIFY_CLIENT_ID"),
  client_secret=os.getenv("SPOTIFY_CLIENT_SECRET"),
)

# Single API client reused by every step of the notebook.
sp = spotipy.Spotify(auth_manager=auth_manager)


## Step #1: Retrieve up to 1000 playlists for a specific search keyword

In [None]:
# Registry of every playlist found so far, keyed by playlist ID.
# It is filled keyword by keyword, so a playlist found through several
# keywords keeps one rank per keyword. Shape of an entry:
#
# {
#   "4ETxM0kkS3RMPveAijToRh": {
#     "id": "4ETxM0kkS3RMPveAijToRh",
#     "keywords": {
#       "workout": 12, # <<< associated rank in search result
#       "yoga": 63
#     },
#     "name": "The playlist name",
#     "description": "The playlist description"
#   }
# }
playlists = {}


In [None]:
# Search term passed to the playlist search below. It is empty by default:
# set it to the keyword to crawl, then re-run the search cell once per keyword.
keyword = "" # cooking, running, shower, yoga


In [None]:
# We get 50 results per page, so we need 20 pages to get (up to) 1000 playlists.
# Each playlist found is stored in `playlists` together with its rank
# (0-based position in the search result) for the current keyword.
for page in range(20):
  print(f"Keyword \"{keyword}\" | Page {page}")

  search_result = sp.search(
    q=keyword,
    limit=50,
    offset=page*50,
    type="playlist",
    market="FR"
  )

  items = search_result["playlists"]["items"]

  # An empty page means the search has no more results: stop requesting pages.
  if not items:
    break

  for idx, playlist in enumerate(items):
    # Defensive: a null entry in the result list would crash on playlist["id"].
    # Skipped entries still consume their position, so the ranks of the
    # remaining playlists match the raw search result.
    if playlist is None:
      continue

    playlist_id = playlist["id"]
    rank = idx + page*50

    if playlist_id in playlists:
      # Already known (found through another keyword or a previous run):
      # only record the rank for this keyword.
      playlists[playlist_id]["keywords"][keyword] = rank

    else:
      playlists[playlist_id] = {
        "id": playlist_id,
        "name": playlist["name"],
        "description": playlist["description"],
        "keywords": {
          keyword: rank
        }
      }


### Store the playlists on disk

#### Store the most relevant keyword as an attribute

In [None]:
# For every playlist, keep the keyword with the lowest (= best) search rank
# as direct attributes, next to the full per-keyword ranking.
# Only the values are needed, so the loop no longer binds the builtin name `id`.
for entry in playlists.values():
  keyword_ranks = entry["keywords"]
  best_keyword = min(keyword_ranks, key=keyword_ranks.get)

  entry["best_kw"] = best_keyword
  entry["best_kw_rank"] = keyword_ranks[best_keyword]


#### Store _also_ as a DataFrame in a compressed pickle

In [None]:
# `playlists` maps playlist ID -> attributes, which pandas reads as one column
# per playlist; transposing yields one row per playlist. The ID index is then
# dropped, since each row already carries its own "id" attribute.
playlists_df = (
  pd.DataFrame(playlists)
  .transpose()
  .reset_index(drop=True)
)


In [None]:
# Make sure the output directory exists: neither open() nor to_pickle()
# creates missing parent directories.
os.makedirs("data", exist_ok=True)

# Raw dictionary, as built by the search step.
with open('data/playlists_dict.pkl', 'wb') as handle:
  pickle.dump(playlists, handle)

# Same data as a DataFrame, gzip-compressed.
playlists_df.to_pickle("data/playlists_df.pkl.gz", compression="gzip")


## Step #2: Retrieve tracks from the playlists we have

In [None]:
# Registry of every track found in the crawled playlists, keyed by track ID.
# "playlist_ids" lists the playlists the track was seen in. Shape of an entry:
#
# {
#   "7EW6TtHJIa2zyydF6dwNBs": {
#     "id": "7EW6TtHJIa2zyydF6dwNBs",
#     "name": "Track title",
#     "artist": "Track artist", # TODO (handle the case of multiple artists)
#     "genres": "Rock,Pop", # TODO
#     "popularity": 45,
#     "playlist_ids": []
#   }
# }
tracks = {}

In [None]:
from math import ceil

# Page size assumed by the pagination below: sp.playlist_tracks is called
# without an explicit limit and pages are requested 100 tracks apart.
TRACKS_PER_PAGE = 100


def _register_tracks(items, playlist_id):
  """Record the tracks of one playlist page in the global `tracks` dict.

  Args:
    items: the "items" list of a `sp.playlist_tracks` response.
    playlist_id: ID of the playlist the page belongs to.

  Entries without a usable track are printed and skipped.
  """
  for item in items:
    track = item["track"]

    # In rare cases, the track is empty. A track can also come without an ID;
    # storing it under the key None would later put None into the
    # audio-features batches, so it is skipped as well.
    if track is None or track["id"] is None:
      print(item)
      continue

    track_id = track["id"]

    # If the track already exists, we just append the current playlist ID
    # to the playlist_ids attribute
    if track_id in tracks:
      tracks[track_id]["playlist_ids"].append(playlist_id)

    # If the track does not exist, we build it
    else:
      tracks[track_id] = {
        "id": track_id,
        "name": track["name"],
        # "artist": artist, # TODO
        "popularity": track["popularity"],
        "playlist_ids": [playlist_id]
      }


for playlist_id in playlists:
  print(f"Playlist \"{playlist_id}\" | Page 1")

  first_page = sp.playlist_tracks(
    playlist_id=playlist_id
  )
  _register_tracks(first_page["items"], playlist_id)

  # If the playlist contains more than 100 tracks, it's paginated.
  # So we need to request the remaining pages the same way.
  # For 100 tracks or fewer, total_pages is at most 1 and the loop is skipped.
  total_pages = ceil(first_page["total"] / TRACKS_PER_PAGE)

  for page in range(1, total_pages):
    print(f"Playlist \"{playlist_id}\" | Page {page+1}")

    next_page = sp.playlist_tracks(
      playlist_id=playlist_id,
      offset=page*TRACKS_PER_PAGE
    )
    _register_tracks(next_page["items"], playlist_id)


## Step #3: Retrieve audio features from all the tracks we have (by batch)

In [None]:
# Tracks without an ID may have been stored under the key None; such a key
# is not a valid track ID to send to the API, so it is left out of the batches.
track_ids = [track_id for track_id in tracks if track_id is not None]

# Build batches of 100 tracks
tracks_batches = [track_ids[i:i+100] for i in range(0, len(track_ids), 100)]


In [None]:
audio_features = []

for batch in tracks_batches:
  # TODO: Some batches don't work. Why?
  # Best effort: a failing batch is reported and skipped so the remaining
  # batches are still fetched. Catching Exception (instead of a bare except)
  # keeps Ctrl+C / kernel interrupts working, and printing the error shows
  # why the batch failed.
  try:
    audio_features += sp.audio_features(tracks=batch)
  except Exception as error:
    print(f"{type(error).__name__}: {error}")
    print(batch)


## Step #4: Store the data

In [None]:
# TODO