# Fetching Spotify Playlists

In [1]:
import spotipy
from spotipy import util
from spotipy.client import SpotifyException
from collections import Counter
from gensim.utils import tokenize
import time
from tqdm.notebook import tqdm
import json
import os
import sqlite3

## Connect to the Spotify API

In [2]:
# Spotify credentials are read from the environment so that secrets are never
# stored in (or committed with) the notebook. The variable names follow
# spotipy's own convention. Export them before starting the kernel, e.g.
#   export SPOTIPY_CLIENT_ID=...  SPOTIPY_CLIENT_SECRET=...
# NOTE(review): the client secret that used to be hardcoded here should be
# rotated in the Spotify developer dashboard, since it has been published.
CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')
# The user id and the local OAuth callback are not secrets; keep the previous
# values as defaults but allow overriding them per environment.
USER_ID = os.environ.get('SPOTIFY_USER_ID', '22molgh77xg6wc4mdb4vle7ay')
URI = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://127.0.0.1:8889/callback')

In [3]:
def get_spotify_token(username, client_id, client_secret, redirect_uri):
    """Obtain a Spotify access token with the 'user-library-read' scope.

    The token is requested through ``util.prompt_for_user_token``. If that
    first request fails for any reason, the local ``.cache-<username>`` file
    (presumably spotipy's token cache -- confirm against the installed
    version) is deleted when present and the request is made once more.

    Args:
        username: Spotify user id the token is requested for.
        client_id: Client id of the Spotify application.
        client_secret: Client secret of the Spotify application.
        redirect_uri: Redirect URI registered for the application.

    Returns:
        Whatever ``util.prompt_for_user_token`` returns (the access token).

    Raises:
        Any exception raised by the second token request.
    """
    def request_token():
        return util.prompt_for_user_token(
            username=username,
            scope='user-library-read',
            client_id=client_id,
            client_secret=client_secret,
            redirect_uri=redirect_uri,
        )

    try:
        return request_token()
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupts are
        # not swallowed and turned into a second interactive prompt.
        try:
            os.remove(f'.cache-{username}')
        except FileNotFoundError:
            # Nothing cached yet; a missing file must not mask the auth error.
            pass
        return request_token()

In [4]:
# Authenticate and build the API client that the fetching cells use.
spotify_token = get_spotify_token(
    username=USER_ID,
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri=URI,
)
session = spotipy.Spotify(auth=spotify_token)

## Fetch playlists by common words

In [5]:
def _next_page_with_retry(session, page, tries=3, delay=0.2):
    """Return the page following `page`, retrying on SpotifyException.

    Makes up to `tries` attempts, sleeping `delay` seconds between attempts,
    and re-raises the last error once the attempts are used up.
    """
    for attempt in range(1, tries + 1):
        try:
            return session.next(page)
        except SpotifyException:
            if attempt == tries:
                raise
            # Only wait when another attempt will actually follow.
            time.sleep(delay)


def find_playlists(session, w, max_count=5000):
    """Yield playlists returned by a Spotify playlist search for `w`.

    Results are requested 50 at a time and follow-up pages are fetched via
    ``session.next`` until there is no further page or `max_count` playlists
    have been yielded.

    Args:
        session: Client object providing ``search`` and ``next``.
        w: Search query.
        max_count: Maximum number of playlists to yield. ``None`` means no
            limit; zero or a negative number yields nothing.

    Yields:
        The playlist entries from each result page, skipping empty entries.

    Raises:
        SpotifyException: for any API error other than HTTP 404, including a
            page fetch that still fails after three attempts. A 404 simply
            ends the generator.
    """
    if max_count is not None and max_count <= 0:
        # Previously a non-positive limit never hit the `== 0` check and
        # therefore behaved like "unlimited".
        return
    remaining = max_count
    try:
        res = session.search(w, limit=50, type='playlist')
        while res:
            for playlist in res['playlists']['items']:
                if not playlist:
                    # Defensive: a null entry would crash callers that index
                    # into the playlist (e.g. playlist['id']).
                    continue
                yield playlist
                if remaining is not None:
                    remaining -= 1
                    if remaining == 0:
                        return
            res = _next_page_with_retry(session, res['playlists'])
    except SpotifyException as e:
        if e.http_status == 404:
            return
        raise

In [6]:
# Peek at the first search hit to see what a playlist payload looks like.
# The `None` default keeps the cell from raising StopIteration when the
# search returns no playlists.
next(find_playlists(session, 'summer'), None)

{'collaborative': False,
 'description': '',
 'external_urls': {'spotify': 'https://open.spotify.com/playlist/4hBOn8ZGSnvOPjS4IATrWY'},
 'href': 'https://api.spotify.com/v1/playlists/4hBOn8ZGSnvOPjS4IATrWY',
 'id': '4hBOn8ZGSnvOPjS4IATrWY',
 'images': [{'height': 640,
   'url': 'https://mosaic.scdn.co/640/ab67616d0000b27365bd6926749cb73b572e4e63ab67616d0000b2736fd1d4267dd3ccb7f8b2de66ab67616d0000b2738fa61ea1950383f2927999a4ab67616d0000b273b1c5b432c7b0ec91e1017353',
   'width': 640},
  {'height': 300,
   'url': 'https://mosaic.scdn.co/300/ab67616d0000b27365bd6926749cb73b572e4e63ab67616d0000b2736fd1d4267dd3ccb7f8b2de66ab67616d0000b2738fa61ea1950383f2927999a4ab67616d0000b273b1c5b432c7b0ec91e1017353',
   'width': 300},
  {'height': 60,
   'url': 'https://mosaic.scdn.co/60/ab67616d0000b27365bd6926749cb73b572e4e63ab67616d0000b2736fd1d4267dd3ccb7f8b2de66ab67616d0000b2738fa61ea1950383f2927999a4ab67616d0000b273b1c5b432c7b0ec91e1017353',
   'width': 60}],
 'name': 'Kygo, FlicFlac, De Hofnar (Sum

In [7]:
# Crawl state shared with the collection loop.
word_counts = Counter({'a': 1})  # word frequencies; seeded with 'a' so the crawl has a first search term
playlists = {}                   # playlist id -> {'owner', 'name', 'id'}
words_seen = set()               # words that have already been used as a search query
count = 0                        # NOTE(review): not used in the cells visible here
dupes = 0                        # number of search hits that were already in `playlists`

In [8]:
# Target number of distinct playlists. The crawl stops choosing new search
# words once this many are collected, so the final count can overshoot.
MAX_NUM_PLAYLISTS = 25_000

In [9]:
%%time

# Snowball crawl: search for the most frequent word not used yet, store every
# new playlist found, and feed the words of the playlist names back into
# `word_counts` so they become future search terms.
with tqdm(total=MAX_NUM_PLAYLISTS) as pbar:
    num_playlists = len(playlists)
    while num_playlists < MAX_NUM_PLAYLISTS:
        # Most frequent word that has not been searched yet.
        word = next(
            (w for w, _ in word_counts.most_common() if w not in words_seen),
            None,
        )
        if word is None:
            # Every known word has been searched. Without this exit the
            # while-loop would spin forever without making progress.
            break
        words_seen.add(word)
        print('word>', word)
        for playlist in find_playlists(session, word):
            if playlist['id'] in playlists:
                dupes += 1
            elif playlist['name'] and playlist['owner']:
                playlists[playlist['id']] = {
                  'owner': playlist['owner']['id'],
                  'name': playlist['name'],
                  'id': playlist['id'],
                }
                num_playlists += 1
                if num_playlists > MAX_NUM_PLAYLISTS:
                    # The results of the current word are consumed to the
                    # end, so grow the bar instead of overflowing it.
                    pbar.total = num_playlists
                    pbar.refresh()
                pbar.update(1)
                for token in tokenize(playlist['name'], lowercase=True):
                    word_counts[token] += 1

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 7.87 µs


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))

word> a
word> is
word> this
word> y
word> and
word> en
word> s
word> songs
word> the
word> of
word> best
word> hits
word> music
word> top
word> playlist
word> pop
word> rock
word> de
word> e
word> soundtrack



## Fetch and save tracks by playlist

In [10]:
def track_yielder(session, playlist):
    """Yield the track objects of `playlist`, one result page at a time.

    `playlist` is a dict with at least 'owner' and 'id'. Only the fields
    id, name, artists(name, id) and duration_ms are requested per track.
    Entries whose 'track' value is empty are skipped.

    Paging contract:
      * the generator ends when there is no next page or the next page has
        no items;
      * a SpotifyException with a 4xx status while fetching a follow-up page
        ends the generator silently (tracks yielded so far are kept);
      * other SpotifyExceptions are retried, sleeping one second after each
        failure, and re-raised after the third failure.

    NOTE(review): the silent 4xx stop also covers 401 and 429, so such a
    failure during paging truncates the playlist without any signal.
    """
    page = session.user_playlist_tracks(
        playlist['owner'],
        playlist['id'],
        fields='items(track(id, name, artists(name, id), duration_ms)),next',
    )
    while page:
        for entry in page['items']:
            item = entry['track']
            if item:
                yield item

        attempts_left = 3
        while attempts_left > 0:
            try:
                page = session.next(page)
            except SpotifyException as err:
                if 400 <= err.http_status <= 499:
                    return
                attempts_left -= 1
                time.sleep(1)
                if attempts_left == 0:
                    raise err
            else:
                if not page or not page.get('items'):
                    return
                # Success: leave the retry loop and process the new page.
                attempts_left = 0

In [11]:
# playlists['3ZL3B9QN8N5uSTc7xw0LP7']['name']

In [None]:
# Output locations, all relative to the notebook's working directory.
SONGS_DB_PATH, PLAYLISTS_PATH, SONG_IDS_PATH = (
    'data/songs.db',          # SQLite database with one row per distinct track
    'data/playlists.ndjson',  # one JSON playlist record per line
    'data/songs_ids.txt',     # one line of space-separated track ids per playlist
)

In [13]:
%%time

# Fetch the tracks of every collected playlist and persist them:
#   * SONGS_DB_PATH  - SQLite table `songs` with one row per distinct track
#   * PLAYLISTS_PATH - one JSON playlist record per line
#   * SONG_IDS_PATH  - the matching line of space-separated track ids
# Lines in the two text files correspond to each other by position.

# The output directory may not exist on a fresh checkout.
for out_path in (SONGS_DB_PATH, PLAYLISTS_PATH, SONG_IDS_PATH):
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)

# Always start from an empty database.
if os.path.isfile(SONGS_DB_PATH):
    os.remove(SONGS_DB_PATH)
conn = sqlite3.connect(SONGS_DB_PATH)
c = conn.cursor()
c.execute('CREATE TABLE songs (id text primary key, name text, artist text)')
c.execute('CREATE INDEX name_idx on songs(name)')

# Resume helper: with `skip = True`, playlists are ignored until
# `failing_playlist_id` is reached. With `skip = False` it is a no-op.
failing_playlist_id = '3ZL3B9QN8N5uSTc7xw0LP7'
skip = False

tracks_seen = set()
with open(PLAYLISTS_PATH, 'w') as fout_playlists, open(SONG_IDS_PATH, 'w') as fout_song_ids:
    for playlist in tqdm(playlists.values()):
        skip &= playlist['id'] != failing_playlist_id
        if skip:
            continue
        # Two attempts: if the first one fails with 401 (expired token) the
        # token is refreshed and the same playlist is fetched again instead
        # of being dropped.
        for attempt in range(2):
            try:
                track_ids = []
                for track in track_yielder(session, playlist):
                    track_id = track['id']
                    if not track_id:
                        continue
                    if track_id not in tracks_seen:
                        artists = track['artists']
                        # A track without artists must not discard the whole
                        # playlist; store NULL for the artist instead.
                        artist_name = artists[0]['name'] if artists else None
                        c.execute("INSERT INTO songs VALUES (?, ?, ?)",
                                  (track_id, track['name'], artist_name))
                        tracks_seen.add(track_id)
                    track_ids.append(track_id)
                fout_playlists.write(json.dumps(playlist) + '\n')
                fout_song_ids.write(' '.join(track_ids) + '\n')
                conn.commit()
                break
            except Exception as e:
                # Best effort: report the problem and carry on with the crawl.
                print(e)
                if getattr(e, 'http_status', None) != 401:
                    break
                spotify_token = get_spotify_token(USER_ID, CLIENT_ID, CLIENT_SECRET, URI)
                session = spotipy.Spotify(auth=spotify_token)
conn.commit()

HBox(children=(FloatProgress(value=0.0, max=25859.0), HTML(value='')))

retrying ...1secs
http status: 401, code:-1 - https://api.spotify.com/v1/users/sthiesenhusen/playlists/3OMVLKaUnIPF5ieWRhE0gV/tracks?limit=100&offset=0&fields=items%28track%28id%2C+name%2C+artists%28name%2C+id%29%2C+duration_ms%29%29%2Cnext:
 The access token expired
http status: 401, code:-1 - https://api.spotify.com/v1/users/veuk23tdrdsul8axmugb6zlh0/playlists/1DlhV7GRg3YSpw8gWzFfke/tracks?limit=100&offset=0&fields=items%28track%28id%2C+name%2C+artists%28name%2C+id%29%2C+duration_ms%29%29%2Cnext:
 The access token expired
http status: 401, code:-1 - https://api.spotify.com/v1/users/gani5k9nl6jc4r65fkurpr52n/playlists/6zlC0yNxedMUUZ8dzZ4yMn/tracks?limit=100&offset=0&fields=items%28track%28id%2C+name%2C+artists%28name%2C+id%29%2C+duration_ms%29%29%2Cnext:
 The access token expired
http status: 401, code:-1 - https://api.spotify.com/v1/users/1110837082/playlists/00PpvaeG1xXISODkgPFkee/tracks?limit=100&offset=0&fields=items%28track%28id%2C+name%2C+artists%28name%2C+id%29%2C+duration_ms%2