In [1]:
import numpy as np
import string
from typing import List

# boring string formatting stuff ;)
punctuation_set = set(string.punctuation)
def strip_punctuation(s):
    return ''.join(ch for ch in s if ch not in punctuation_set)

In [2]:
# returns an array of vectors, with each vector representing a given document
# also returns corresponding words per row
def create_tfidf_for_documents(documents: List[str]):

    # array of array of words in a given doc
    # make sure we sanitize words by removing uppercase & punctuation
    doc_word_lists = [strip_punctuation(doc.lower()).split(' ') for doc in documents]

    # gets us unique words from within our docs
    unique_words = [*{word for doc_word_list in doc_word_lists for word in doc_word_list}]
    word_to_index = {word: index for index, word in enumerate(unique_words)}

    # want a row for each word, col for each doc
    num_words = len(unique_words)
    num_docs = len(documents)

    tf_mat = np.zeros((num_words, num_docs))
    # idf is global across documents
    idf_mat = np.zeros(num_words)

    for doc_index, doc_word_list in enumerate(doc_word_lists):
        seen_words = set()

        # count word instances for tf
        for word in doc_word_list:
            word_index = word_to_index[word]
            tf_mat[word_index][doc_index] += 1

            # don't want to double count IDF in the same document
            if word not in seen_words:
                idf_mat[word_index] += 1
                seen_words.add(word)

        # normalize TF by word count
        tf_mat[:, doc_index] /= len(doc_word_list)

    # calculate log of idf once we've calculated
    for word_index, idf in enumerate(idf_mat):
        idf_mat[word_index] = np.log10(num_docs / idf)


    # row-wise multiplication within idf_mat!
    tfidf_mat = tf_mat * idf_mat[:, np.newaxis]

    return (tfidf_mat, unique_words)



In [3]:
docs = [
    "My dog is named Fido. Fido likes to bark and runs fast.",
    "My cat is named Puru. Puru likes to purr and is fluffy.",
    "My puppy is cute and can bark loudly",
]

mat, mat_words = create_tfidf_for_documents(docs)

for (index, word) in enumerate(mat_words):
    print(word, "\t", mat[index])

bark 	 [0.01467427 0.         0.02201141]
is 	 [0. 0. 0.]
puppy 	 [0.         0.         0.05964016]
can 	 [0.         0.         0.05964016]
purr 	 [0.        0.0397601 0.       ]
dog 	 [0.0397601 0.        0.       ]
named 	 [0.01467427 0.01467427 0.        ]
to 	 [0.01467427 0.01467427 0.        ]
my 	 [0. 0. 0.]
cute 	 [0.         0.         0.05964016]
likes 	 [0.01467427 0.01467427 0.        ]
cat 	 [0.        0.0397601 0.       ]
and 	 [0. 0. 0.]
puru 	 [0.         0.07952021 0.        ]
loudly 	 [0.         0.         0.05964016]
fast 	 [0.0397601 0.        0.       ]
fluffy 	 [0.        0.0397601 0.       ]
fido 	 [0.07952021 0.         0.        ]
runs 	 [0.0397601 0.        0.       ]


In [4]:
print("dog vs puppy is {}".format(mat[:,0].dot(mat[:,2])))
print("cat vs puppy is {}".format(mat[:,1].dot(mat[:,2])))

dog vs puppy is 0.00032300136995640666
cat vs puppy is 0.0


In [5]:
from scrape import SpotifyScraper

# connect to spotify API
scraper = SpotifyScraper()
scraper.authenticate()


Successfully authenticated


In [12]:
# this is vampire weekend, but you can find other album IDs by
# just going to spotify, copying the share link, and extracting the deets from there!
artistId = '5BvJzeQpmsdsFp4HGUYUEx'
discography, trackToName, albumToName = scraper.getArtistDiscography(artistId=artistId)
print(discography)


{'5WWu3iYAXLgqghjU9696Nk': {'5gidPD39f4Byrnt3Vrhdpj', '2uERPFSsDSlaXojdVqK2kj', '4IA82QQCoPtGN9xDHflcZA', '6ZlRHL9KmkWpezz7DkLbf1', '2psTJZYW6M9xP54ffEsChA', '4cih4QDlMIZ882wNQUhIlS', '0txKAVGacfraQqJWpjALTp', '56sTbfLiRzNIzwpEYkV4mr', '2zqCw4g4w9ZLzwHtw1I7e9', '6EPwUdSSi8ExTZvoR7ITg3', '14C0Ix91OXbQtAiQNq4FYF', '3G9ZfSWFKqtkTA1BlUVF31', '6QpHEf0YSddrcKnV1U5wKM', '7xFYOlTj6m05q2CgyxLXST', '1syUUTdDrjijrA3aqLxrBg', '55wBlx4cbmvYGqfVbKMBW4', '3Cnp5gloHCyXZsMimaHKTt', '12Fpw64lbhzYT5pRSleG4V', '2Z30tAv32BImOPtlyFzFy0', '6Xry4UUgAfbZTAynZl2Ou3', '3hSoQunmNwmgjt6RIKuUsJ'}, '1A3nVEWRJ8yvlPzawHI1pQ': {'5c2fEICoqVYuTjAGEieB2F', '2WaaqcSdsLliuWTnmY4rLC', '15E2P2ToSLNnZvcOEQra6a', '4dRqYKhLVujxiBXcq50YzG', '528bxUV84T4T8HHBFSpcaV', '1ze0Z3ZnIDITxj13NRok4z', '5jd6XPyXSY4jQVG3SlXP1b', '7lQgoAWAFAo0XW7dW2TL1y', '3ujuDsiyBLAXAB6dtNwpGu', '1MkiAWVcbxZ7NzMuAuWqJS', '2G5ZpNvgSOOcBCpLcOfvJy', '3VORkvig6GJheAu4x3I3oC', '39exKIvycQDgs4T6uXdyu0', '2YegRTu9IjiUg4VWfnXyM3', '4Ww4jdVXTyvSnryNTtReRJ', '6UryEVk

In [13]:
import json
readable_discography = {albumToName[albumId]:[trackToName[track] for track in tracks] for (albumId, tracks) in discography.items()}
print(json.dumps(readable_discography, indent=4))


{
    "Father of the Bride": [
        "Stranger",
        "Rich Man",
        "Hold You Now (feat. Danielle Haim)",
        "This Life",
        "Bambina",
        "Spring Snow",
        "How Long?",
        "Jerusalem, New York, Berlin",
        "Flower Moon (feat. Steve Lacy)",
        "We Belong Together (feat. Danielle Haim)",
        "My Mistake",
        "Big Blue",
        "Harmony Hall",
        "Sympathy",
        "Married in a Gold Rush (feat. Danielle Haim)",
        "2021",
        "Sunflower (feat. Steve Lacy)",
        "Unbearably White"
    ],
    "Modern Vampires of the City": [
        "Everlasting Arms",
        "Unbelievers",
        "Hudson",
        "Worship You",
        "Hannah Hunt",
        "Obvious Bicycle",
        "Don't Lie",
        "Ya Hey",
        "Young Lion",
        "Step",
        "Ya Hey - 'Paranoid Styles' Mix",
        "Unbelievers - 'Seeburg Drum Machine' Mix",
        "Finger Back",
        "Diane Young"
    ],
    "Contra": [
        "Diploma

In [14]:
runningPlaylistResults = {}

# set of all of our tracks!
# used to figure out if a playlist has relevant songs
tracks = {song for songs in discography.values() for song in songs}
try:
    for i in range(4):
        for genre in ["indie", "indie rock", "indie pop", "2000s indie", "indie punk"]:
            # this will query for playlists with names similar to the above phrases
            # ensures that each playlist has at least two tracks from our list
            # and adds the results to runningPlaylistResults
            runningPlaylistResults = scraper.playlistQuery(genre, tracks, runningPlaylistResults)
except:
    print("Uh oh, oopsie while scraping, probably got rate limited!")


.+.+............................................................................................................................................................................................................................................................................................................................+.....................................................................................................................................................+..................................................................+.................................................................................................................................+......................................................................+..............................................................................................................+......................................+..+.......+..+.......+.........+...................................+..........................{'37i9dQZF1DWTc5Q

In [18]:
# print({k:{**v, 'tracks': [trackToName[t] for t in v['tracks']]} for k,v in runningPlaylistResults.items()})
print(json.dumps([[trackToName[t] for t in v['tracks']] for k,v in runningPlaylistResults.items()], indent=4))

[
    [
        "Sunflower (feat. Steve Lacy)",
        "Harmony Hall"
    ],
    [
        "A-Punk",
        "Mansard Roof",
        "Oxford Comma",
        "Cape Cod Kwassa Kwassa",
        "The Kids Don't Stand a Chance"
    ],
    [
        "A-Punk",
        "Diane Young"
    ],
    [
        "Walcott",
        "Harmony Hall"
    ],
    [
        "The Kids Don't Stand a Chance",
        "A-Punk",
        "Campus",
        "This Life"
    ],
    [
        "Campus",
        "A-Punk"
    ],
    [
        "Walcott",
        "The Kids Don't Stand a Chance",
        "Oxford Comma"
    ],
    [
        "Cape Cod Kwassa Kwassa",
        "Oxford Comma"
    ],
    [
        "Sunflower (feat. Steve Lacy)",
        "Unbearably White",
        "This Life",
        "Harmony Hall"
    ],
    [
        "Harmony Hall",
        "This Life"
    ],
    [
        "Hannah Hunt",
        "Step",
        "Diane Young",
        "Unbelievers"
    ],
    [
        "A-Punk",
        "Cousins"
    ],
    [
   