In [None]:
import numpy as np
import string
from typing import List

# string clean-up helpers
# Every ASCII punctuation character, kept in a set for fast membership tests.
punctuation_set = set(string.punctuation)
def strip_punctuation(s):
    """Return `s` with every character found in `punctuation_set` removed."""
    kept_chars = [char for char in s if char not in punctuation_set]
    return ''.join(kept_chars)

In [None]:
# returns a TF-IDF matrix (one row per word, one column per document)
# also returns the corresponding word for each row
def create_tfidf_for_documents(documents: List[str]):
    """Build a TF-IDF matrix for a collection of documents.

    Each document is lower-cased, stripped of punctuation and split on runs of
    whitespace. Term frequency is the word's count divided by the number of
    words in the document; inverse document frequency is
    ``log10(num_docs / number_of_docs_containing_the_word)``.

    Args:
        documents: the raw document strings.

    Returns:
        A tuple ``(tfidf_mat, unique_words)``. ``tfidf_mat`` is a float array
        of shape ``(num_words, num_docs)``, so ``tfidf_mat[:, j]`` is the
        vector for ``documents[j]``. ``unique_words`` is the alphabetically
        sorted vocabulary; ``unique_words[i]`` labels row ``i``.
    """

    # list of word lists, one per document
    # split() with no argument splits on any whitespace run, so repeated,
    # leading or trailing spaces (and empty documents) never yield '' as a word
    doc_word_lists = [strip_punctuation(doc.lower()).split() for doc in documents]

    # sorted so that the row order is reproducible between runs
    # (plain set iteration order changes with string hash randomisation)
    unique_words = sorted({word for doc_word_list in doc_word_lists for word in doc_word_list})
    word_to_index = {word: index for index, word in enumerate(unique_words)}

    # want a row for each word, col for each doc
    num_words = len(unique_words)
    num_docs = len(documents)

    tf_mat = np.zeros((num_words, num_docs))
    # number of documents containing each word; global across documents
    doc_freq = np.zeros(num_words)

    for doc_index, doc_word_list in enumerate(doc_word_lists):
        # a document without any words keeps an all-zero column
        # (and would otherwise divide by zero below)
        if not doc_word_list:
            continue

        # count word instances for tf
        for word in doc_word_list:
            tf_mat[word_to_index[word], doc_index] += 1

        # each document contributes at most once to a word's document frequency
        for word in set(doc_word_list):
            doc_freq[word_to_index[word]] += 1

        # normalize TF by word count
        tf_mat[:, doc_index] /= len(doc_word_list)

    # every vocabulary word occurs in at least one document, so doc_freq >= 1
    idf_mat = np.log10(num_docs / doc_freq)

    # scale each word's row of tf_mat by that word's idf
    tfidf_mat = tf_mat * idf_mat[:, np.newaxis]

    return (tfidf_mat, unique_words)



In [None]:
# Three tiny sample documents: two about pets with names, one about a puppy.
docs = [
    "My dog is named Fido. Fido likes to bark and runs fast.",
    "My cat is named Puru. Puru likes to purr and is fluffy.",
    "My puppy is cute and can bark loudly",
]

mat, mat_words = create_tfidf_for_documents(docs)

# Show every word next to its row of per-document TF-IDF scores.
for word, word_row in zip(mat_words, mat):
    print(word, "\t", word_row)

In [None]:
# Dot product of two document columns: column 0 is the dog document,
# column 1 the cat document and column 2 the puppy document.
print("dog vs puppy is {}".format(np.dot(mat[:, 0], mat[:, 2])))
print("cat vs puppy is {}".format(np.dot(mat[:, 1], mat[:, 2])))

In [None]:
from scrape import SpotifyScraper

# connect to spotify API
# SpotifyScraper is imported from the `scrape` module (presumably project-local).
# NOTE(review): how authenticate() obtains its credentials is not visible here --
# confirm they come from the environment/config and are not hard-coded.
scraper = SpotifyScraper()
scraper.authenticate()


In [None]:
# this is vampire weekend's artist ID, but you can find other artist IDs by
# just going to spotify, copying the artist's share link, and extracting the ID from there!
artistId = '5BvJzeQpmsdsFp4HGUYUEx'
# Judging by how the three results are used in the following cells:
#   discography  -- maps an album ID to the track IDs on that album
#   trackToName  -- maps a track ID to its name
#   albumToName  -- maps an album ID to its name
# NOTE(review): shapes inferred from usage only; confirm against scrape.py.
discography, trackToName, albumToName = scraper.getArtistDiscography(artistId=artistId)
print(discography)


In [None]:
import json

# Swap the raw IDs for names: album name -> list of that album's track names.
readable_discography = {
    albumToName[album_id]: [trackToName[track_id] for track_id in track_ids]
    for album_id, track_ids in discography.items()
}
print(json.dumps(readable_discography, indent=4))


In [None]:
# accumulates the matching playlists across every query below
runningPlaylistResults = {}

# set of all of our tracks!
# used to figure out if a playlist has relevant songs
tracks = {song for songs in discography.values() for song in songs}
try:
    # every genre is queried 4 times
    # NOTE(review): presumably repeated calls surface additional playlists -- confirm
    for i in range(4):
        for genre in ["indie", "indie rock", "indie pop", "2000s indie", "indie punk"]:
            # this will query for playlists with names similar to the above phrases
            # ensures that each playlist has at least two tracks from our list
            # and adds the results to runningPlaylistResults
            runningPlaylistResults = scraper.playlistQuery(genre, tracks, runningPlaylistResults)
except Exception as err:
    # Only ordinary errors are caught: interrupting the kernel (KeyboardInterrupt)
    # must still stop the cell. Whatever was collected before the failure is
    # still held in runningPlaylistResults.
    print("Uh oh, oopsie while scraping, probably got rate limited!")
    # show the real cause so failures other than rate limiting are not hidden
    print("Underlying error: {!r}".format(err))


In [None]:
# Alternative view that keeps each playlist's other fields next to the track names:
# print({k:{**v, 'tracks': [trackToName[t] for t in v['tracks']]} for k,v in runningPlaylistResults.items()})

# One list of track names per matched playlist.
print(json.dumps(
    [
        [trackToName[track_id] for track_id in playlist['tracks']]
        for playlist in runningPlaylistResults.values()
    ],
    indent=4,
))