In [1]:
import json
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
import scipy.sparse as sp
import pandas as pd
import time
%matplotlib inline

# Helper Functions
- These are not project task specific, generic, utility functions.

In [2]:
def keysofdict(d):
    rv = []
    for k in d.keys():
        rv.append(k)
    return rv

def lookup(d, l):
    rv = []
    for e in l:
        rv.append(d[e])
    return rv

def index2binary(indices, size):
    rv = np.zeros([size,1])
    rv[indices] = 1
    return rv

# 1. Read JSON in to variable
- Provide the path to main directory containing all songs

In [189]:
lastfmdir = '/Users/oeken/Downloads/lastfm_subset/'
jsons = []

for root, dirs, files in os.walk(lastfmdir):
    jsons += [root + '/' + s for s in files]
    

print('How many songs?',len(jsons))

How many songs? 9330


In [191]:
%%time
# Read the jsons to the variable data
data = []
for jsondir in jsons:
    with open(jsondir,'r') as f:
        jsonstr = f.read()
        jsondict = json.loads(jsonstr)
        data.append(jsondict)

CPU times: user 2.31 s, sys: 1.05 s, total: 3.37 s
Wall time: 5.59 s


# 2. Assign Song IDs

In [195]:
track2id = {}  # mapping: track string --> Track ID
track2info = {}  # mapping: track string --> Track info
id2track = {}  # mapping: Track ID --> track string


INDEX = 0
for jsondict in data:
    ar = jsondict['artist']
    ti = jsondict['title']
    tr = jsondict['track_id']
    if not tr in track2id:
        track2id[tr] = INDEX
        track2info[tr] = [ar, ti]
        id2track[INDEX] = tr
        INDEX += 1

print('How many are songs assigned an ID?',INDEX)

How many are songs assigned an ID? 9330


In [198]:
# Test to see if we can fetch information we need
for i in range(5):    
    print(i, id2track[i], track2info[id2track[i]])

0 TRAAAAW128F429D538 ['Casual', "I Didn't Mean To"]
1 TRAAABD128F429CF47 ['The Box Tops', 'Soul Deep']
2 TRAAADZ128F9348C2E ['La Sonora Santanera', 'Amor De Cabaret']
3 TRAAAEF128F4273421 ['Adam Ant', 'Something Girls']
4 TRAAAFD128F92F423A ['Gob', 'Face the Ashes']


# 3. Eliminate Unknown + Dissimilar ( $\theta$ ) Songs 
- Here set the parameter $\theta$
- An unknown song is the song that does not an associated file with it in the given directory
- Dissimilar song is the song that that not satisfy the thresholding ( $\theta$ ) rule 

In [206]:
theta = 0
nomc = 0
discarded_songs = 0 #missing_count
for jsondict in data:
    distilled_similars = []
    for similartrack in jsondict['similars']:        
        tr = similartrack[0]
        sim = similartrack[1]
        if tr in track2id and sim >= theta:  
            distilled_similars.append(similartrack)
            nomc += 1
        else:
            discarded_songs += 1
    jsondict['similars'] = distilled_similars


In [207]:
print('How many songs are discarded?',discarded_songs)

How many songs are discarded? 0


# 4. Eliminate Weak  ($\gamma$) Tags and Assign IDs
- Here set the parameter $\gamma$

In [210]:
gamma = 0

tag2id = {}  # Mapping: tag name --> Tag ID
id2tag = {}  # Mapping: tag ID --> tag name

discarded_tags = 0
INDEX_TAG = 0
for jsondict in data:
    distilled_tags = []
    for tag in jsondict['tags']:                
        if int(tag[1]) >= gamma:            
            distilled_tags.append(tag)
            if not tag[0] in tag2id:  
                tag2id[tag[0]] = INDEX_TAG
                id2tag[INDEX_TAG] = tag[0]
                INDEX_TAG += 1
            else:
                discarded_tags += 1    
    jsondict['tags'] = distilled_tags


In [209]:
print('How many tags are kept?',INDEX_TAG)

How many tags kept? 33355


In [211]:
print('How many tags are discarded?',discarded_tags)

How many tags are discarded? 65943


**Example: ** *Tags of the first song after discarding the weak tags*

In [16]:
data[0]['tags']

[['Bay Area', '100'],
 ['hieroglyiphics', '100'],
 ['classic', '50'],
 ['Hip-Hop', '50'],
 ['stream', '50'],
 ['OG', '50'],
 ['1979-2006: A Hip-Hop Odyssey - 800 Tracks In A 48 Minute Mix', '50'],
 ['heiroglyphics', '50'],
 ['oaksterdamn', '50'],
 ['heard on Pandora', '0']]

# 5. Form Song Id - Track Id Dictionaries

In [212]:
trackid2tagid = {}  # mapping: track ID --> tag ID
tagid2trackid = {}  # mapping: tag ID --> track ID

for jsondict in data:
    track = jsondict['track_id']
    trackid = track2id[track]
    if not trackid in trackid2tagid:
        trackid2tagid[trackid] = set()

    for tag in jsondict['tags']:
        tagid = tag2id[tag[0]]
        trackid2tagid[trackid].add(tagid)
        if not tagid in tagid2trackid:
            tagid2trackid[tagid] = set()            
        tagid2trackid[tagid].add(trackid)        

**Example:** *Tag IDs of the first song*

In [214]:
trackid2tagid[0]

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}

**Example:**
- The first tag
- Song IDs that contain this tag
- Information about these songs

In [22]:
print(id2tag[0])
ss = list(tagid2trackid[0])
print(ss)
for el in ss:    
    print(track2info[id2track[el]])

Bay Area
[0, 4224, 303, 2739, 2646, 4824, 8474, 1150]
['Casual', "I Didn't Mean To"]
['Metallica', 'Metal Militia']
['Casual', 'OAKtown']
['Joe Satriani', 'Midnight']
['Dead Kennedys', 'Kill the Poor']
['Dead Kennedys', 'Halloween']
['Cake', 'Arco Arena']
['Del tha Funkee Homosapien', 'Mistadobalina']


Legend:
- $A$ is the Google matrix (the transition kernel)
- $G$ is the column-normalised adjacency matrix (surfer matrix)
- $\beta$ is the chance of not teleporting
- $e$ is the teleport vector (normalized)
- $E$ is the teleport matrix (simply concatenated $e$'s )


$$A = \beta G + (1 - \beta) E$$
$$Ap = \beta (Gp) + (1 - \beta) (Ep)$$
$$Ap = \beta (Gp) + (1 - \beta) e$$

**Note: **Do not forget to normalize $Ap$ each time before assigning as the "new" $p$ since there are dead-end (not pointing to any other node) nodes in the graph and their columns do not sum to 1.

# 6. Form $G$ (Surfer Matrix)

In [215]:
%%time
row_ind = []
col_ind = []
value = []

for jsondict in data: 
    col_trackid = track2id[jsondict['track_id']]    
    for similar in jsondict['similars']:
        row_trackid = track2id[similar[0]]
        col_ind.append(col_trackid)
        row_ind.append(row_trackid)
        value.append(1)

CPU times: user 81.1 ms, sys: 349 ms, total: 430 ms
Wall time: 573 ms


In [216]:
%%time
G = sp.csc_matrix((value, (row_ind, col_ind)), shape=(INDEX,INDEX), dtype=np.float64)

CPU times: user 13 ms, sys: 6.6 ms, total: 19.6 ms
Wall time: 22.5 ms


In [217]:
%%time
G = normalize(G, axis=0, norm='l1')

CPU times: user 2.03 ms, sys: 1.76 ms, total: 3.79 ms
Wall time: 3.37 ms


# 7. Form $e$ (Teleport Vector)
- Here set the Stag variable with desired genres

In [230]:
#Stag = []  # empty means all songs
Stag = ['rock']  # empty means all songs

S = set()
if(len(Stag) == 0):
    S = keysofdict(id2track)
else:    
    Stagid = lookup(tag2id, Stag)
    for tagid in Stagid:
        S = S.union(tagid2trackid[tagid])

S = list(S)
e = index2binary(S, INDEX)
e = normalize(e, axis=0, norm='l1')

print('Sum of teleport vector is',np.sum(e))


Sum of teleport vector is 1.0


# 9. Apply Topic Specific PageRank
- Here set the parameter $\beta$

In [231]:
beta = 0.8
p = np.ones([INDEX, 1])
P = p * (1/INDEX)

In [232]:
%%time
# Ideally repeat until convergence
for i in range(10):    
    term = beta * G.dot(p) + (1 - beta) * e
    p_new = normalize(term, axis=0, norm='l1')
    print(np.linalg.norm(p_new-p))
    p = p_new

96.5815758378
0.0192671311161
0.0130985971504
0.00903930843148
0.006833106789
0.00542903025697
0.00438528391022
0.00360293501918
0.00304371676222
0.00256402734721
CPU times: user 13 ms, sys: 3.47 ms, total: 16.5 ms
Wall time: 19.3 ms


In [233]:
print('What does the last Ap sum to?', np.sum(term))
print('After normalization', np.sum(p))

What does the last Ap sum to? 0.920075129131
After normalization 1.0


In [234]:
best = np.sort(p.T,1)  # contains ranking scores
best = np.fliplr(best)
best = best.T

besti = np.argsort(p.T,1) # contains ranking indices
besti = np.fliplr(besti)
besti = besti.T

In [235]:
besttrack = lookup(id2track, besti[:,0])
bestinfo = lookup(track2info, besttrack)
bestinfo = np.reshape(np.asarray(bestinfo), (INDEX, 2))
besttrack = np.reshape(np.asarray(besttrack), (INDEX,-1))
out = np.hstack([bestinfo, besttrack, besti, best])
pd.DataFrame(out).head(20)

Unnamed: 0,0,1,2,3,4
0,Y&T,Contagious,TRAWEDC128F14A2EBA,6034,0.0114794025529441
1,House of Lords,Rock Bottom,TRAENUU128F424F50C,1259,0.0105166545006728
2,Maroon 5,This Love,TRAZOSB128F9302A07,6964,0.0100618912239044
3,Radiohead,15 Step,TRARTEH128F423DBC1,4848,0.0094079227503656
4,Franz Ferdinand,Do You Want To,TRAZASM128F932FBEE,6816,0.0093321827013164
5,The White Stripes,Blue Orchid,TRBHLDQ128F423EF10,9066,0.009271863904613
6,3 Doors Down,Here Without You,TRBDWDQ128F1455787,8095,0.008020242547244
7,Death From Above 1979,Romantic Rights,TRADKAX128E0786B92,958,0.0076707829599138
8,Aerosmith,Crazy,TRBGPJP128E078ED20,8810,0.0073427322770147
9,Aerosmith,Crazy,TRAZDPO128E078ECE6,6852,0.0073427322770147


In [236]:
# Observation
lookup(track2info, lookup(id2track,tagid2trackid[tag2id['hiphop']]))

[['Run-D.M.C.', "Run's House"],
 ['G-Unit', "Poppin' Them Thangs"],
 ['Rihanna', 'SOS'],
 ['Ms. Dynamite', 'All I Ever'],
 ['Beastie Boys', "Eugene's Lament"],
 ['k-os', 'Man I Used To Be'],
 ['Eminem', 'The Way I Am'],
 ['The Game', 'Da Shit'],
 ['A Tribe Called Quest', 'Like It Like That'],
 ['K.I.Z.', 'Freiwild'],
 ['EPMD', "I'm Mad"],
 ['Samy Deluxe', 'Die Meisten'],
 ['Usher', "Hey Daddy (Daddy's Home)"],
 ['Krayzie Bone', 'Let It Burn'],
 ['Dendemann', 'Dendemänner braucht das Land'],
 ['Xzibit', 'Carry The Weight'],
 ['Massive Töne',
  'Notify The President Feat. Celestial Souljahz - Shabazz The Disciple & Freestyle'],
 ['The Sugarhill Gang', '8th Wonder'],
 ['Aesop Rock', 'Coma'],
 ['The Game', 'Scream On Em'],
 ['Dilated Peoples', 'World On Wheels'],
 ['G-Unit', 'Tony Yayo Explosion'],
 ['OutKast', "Happy Valentine's Day"],
 ['Eko Fresh', 'Hartz IV'],
 ['The Roots', '100% Dundee'],
 ['Kanye West', 'School Spirit Skit 2'],
 ['Dynamite Deluxe', 'Zapzap'],
 ['Eric B. & Rakim', 'F