In [1]:
import pandas as pd
import numpy as np
from scipy import sparse

In [2]:
# Column layouts of the two header-less, tab-separated input files.
playscols = ['usersha1', 'mbid', 'artistname', 'plays']
profilecols = ['usersha1', 'gender', 'age', 'country', 'registration']

playsdf = pd.read_csv(
    'usersha1-artmbid-artname-plays.tsv',
    sep='\t',
    names=playscols,
    index_col=False,
)
profiledf = pd.read_csv(
    'usersha1-profile.tsv',
    sep='\t',
    names=profilecols,
    index_col=False,
)

# Left join: every play row is kept and gets its user's profile columns when one exists.
df = pd.merge(playsdf, profiledf, on=['usersha1'], how='left')

# Row filters. All four masks are evaluated on the merged frame before any row is dropped.
has_mbid = df['mbid'].notnull()                  # artists without an mbid are usually nonsense
has_registration = df['registration'].notnull()  # profiles without registration look like dataset artifacts
known_artist = df['artistname'] != '[unknown]'   # placeholder artist name
# True for rows whose artist name occurs on more than one row of the merged frame.
# NOTE(review): because this is computed before the other filters are applied, an artist
# can still end up with a single row afterwards.
artist_repeats = df.duplicated(subset='artistname', keep=False)

df = df[has_mbid & has_registration & known_artist & artist_repeats]

In [3]:
# Preview of the cleaned frame followed by three headline counts.
display(df.head(10))

artist_total = df['artistname'].unique().size
user_total = df['usersha1'].unique().size
entry_total = df.shape[0]

display("Unique Artists: {}".format(artist_total))
display("Unique Users: {}".format(user_total))
display("Entries: {}".format(entry_total))

Unnamed: 0,usersha1,mbid,artistname,plays,gender,age,country,registration
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137,f,22.0,Germany,"Feb 1, 2007"
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099,f,22.0,Germany,"Feb 1, 2007"
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897,f,22.0,Germany,"Feb 1, 2007"
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717,f,22.0,Germany,"Feb 1, 2007"
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706,f,22.0,Germany,"Feb 1, 2007"
5,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691,f,22.0,Germany,"Feb 1, 2007"
6,00000c289a1829a808ac09c00daf10bc3c4e223b,6531c8b1-76ea-4141-b270-eb1ac5b41375,magica,545,f,22.0,Germany,"Feb 1, 2007"
7,00000c289a1829a808ac09c00daf10bc3c4e223b,21f3573f-10cf-44b3-aeaa-26cccd8448b5,the black dahlia murder,507,f,22.0,Germany,"Feb 1, 2007"
8,00000c289a1829a808ac09c00daf10bc3c4e223b,c5db90c4-580d-4f33-b364-fbaa5a3a58b5,the murmurs,424,f,22.0,Germany,"Feb 1, 2007"
9,00000c289a1829a808ac09c00daf10bc3c4e223b,0639533a-0402-40ba-b6e0-18b067198b73,lunachicks,403,f,22.0,Germany,"Feb 1, 2007"


'Unique Artists: 133086'

'Unique Users: 358854'

'Entries: 17238522'

In [4]:
# Number of rows per artist name (most frequent first), exported for inspection outside the notebook.
artist_column = df['artistname']
uniqueartists = artist_column.value_counts()
uniqueartists.to_csv('uniqueartists.csv')

  


In [41]:
# Restrict the data to the 10000 artist names with the most rows.
artist_frequencies = df['artistname'].value_counts()  # sorted by descending row count
topartists = artist_frequencies.head(10000)

is_top_artist = df['artistname'].isin(topartists.index)
subsetdf = df[is_top_artist]
subsetdf.head(10)

Unnamed: 0,usersha1,mbid,artistname,plays,gender,age,country,registration
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099,f,22.0,Germany,"Feb 1, 2007"
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897,f,22.0,Germany,"Feb 1, 2007"
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717,f,22.0,Germany,"Feb 1, 2007"
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706,f,22.0,Germany,"Feb 1, 2007"
5,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691,f,22.0,Germany,"Feb 1, 2007"
7,00000c289a1829a808ac09c00daf10bc3c4e223b,21f3573f-10cf-44b3-aeaa-26cccd8448b5,the black dahlia murder,507,f,22.0,Germany,"Feb 1, 2007"
8,00000c289a1829a808ac09c00daf10bc3c4e223b,c5db90c4-580d-4f33-b364-fbaa5a3a58b5,the murmurs,424,f,22.0,Germany,"Feb 1, 2007"
9,00000c289a1829a808ac09c00daf10bc3c4e223b,0639533a-0402-40ba-b6e0-18b067198b73,lunachicks,403,f,22.0,Germany,"Feb 1, 2007"
10,00000c289a1829a808ac09c00daf10bc3c4e223b,a342964d-ca53-4e54-96dc-e8501851e77f,walls of jericho,393,f,22.0,Germany,"Feb 1, 2007"
11,00000c289a1829a808ac09c00daf10bc3c4e223b,f779ed95-66c8-4493-9f46-3967eba785a8,letzte instanz,387,f,22.0,Germany,"Feb 1, 2007"


In [None]:
# Disabled experiment, kept as a bare string literal so none of it executes:
# pivot the subset into a user x artist count table, convert its first 100 rows
# to a sparse CSR matrix and multiply that matrix by its own transpose.
'''
dummydf = subsetdf.pivot_table(index='usersha1', columns='artistname', aggfunc='size', fill_value=0)
dummy_sparse = sparse.csr_matrix(dummydf.head(100))
dummy_sparse.dot(dummy_sparse.T)
'''

In [11]:
# Disabled experiment, kept as a bare string literal so none of it executes:
# self-merge the first 100000 rows of the subset on user, cross-tabulate the
# resulting artist pairs, then divide the table by the per-artist row counts.
'''
counts = subsetdf['artistname'].value_counts()
dd = pd.merge(subsetdf.head(100000), subsetdf.head(100000), on='usersha1')
crosstab = pd.crosstab(dd['artistname_x'], dd['artistname_y'])
ct_div = crosstab.div(counts)
'''

In [42]:
from tqdm import tqdm_notebook 

# Artist x artist co-occurrence counts, accumulated chunk by chunk so that the
# per-user self-merge never has to hold the whole dataset at once.
CHUNK_ROWS = 100000  # target number of subsetdf rows per self-merge pass

crosstab = pd.DataFrame(
    np.zeros((len(topartists), len(topartists))),
    columns=topartists.index,
    index=topartists.index,
)

# Assign every row to a chunk based on the position of its user's FIRST row, so that
# all rows of one user always land in the same chunk. Chunking purely by row position
# cuts a user's rows in two at each chunk boundary, and artist pairs that straddle the
# cut are then never counted.
# NOTE(review): chunk sizes stay close to CHUNK_ROWS only if each user's rows are
# contiguous in subsetdf (they are in the previews shown above) - confirm for the full data.
row_positions = pd.Series(np.arange(len(subsetdf)))
first_row_of_user = row_positions.groupby(subsetdf['usersha1'].to_numpy()).transform('min')
chunk_ids = first_row_of_user.to_numpy() // CHUNK_ROWS

for chunk_id, chunkdf in tqdm_notebook(subsetdf.groupby(chunk_ids)):
    # Self-join on user: one row per (user, artist_x, artist_y) combination in this chunk.
    dd = pd.merge(chunkdf, chunkdf, on='usersha1')
    crosstab_tmp = pd.crosstab(dd['artistname_x'], dd['artistname_y'])
    # Align on artist labels; pairs absent from this chunk contribute 0.
    crosstab = crosstab.add(crosstab_tmp, fill_value=0)
crosstab

HBox(children=(IntProgress(value=0, max=145), HTML(value='')))

Unnamed: 0,!!!,#####,(+44),(hed) planet earth,*nsync,*shels,+/-,+44,...and oceans,...and you will know us by the trail of dead,...,菅野よう子,近藤浩治,雅-miyavi-,高木正勝,鷺巣詩郎,동방신기,비,소녀시대,신화,이효리
!!!,2408.0,1.0,0.0,3.0,4.0,2.0,15.0,5.0,0.0,109.0,...,7.0,4.0,1.0,6.0,3.0,2.0,0.0,1.0,0.0,0.0
#####,1.0,652.0,5.0,31.0,0.0,0.0,0.0,14.0,0.0,3.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
(+44),0.0,5.0,255.0,4.0,2.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(hed) planet earth,3.0,31.0,4.0,1463.0,4.0,3.0,1.0,15.0,2.0,14.0,...,4.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
*nsync,4.0,0.0,2.0,4.0,2024.0,0.0,2.0,24.0,0.0,1.0,...,8.0,5.0,7.0,0.0,2.0,46.0,16.0,18.0,15.0,13.0
*shels,2.0,0.0,0.0,3.0,0.0,266.0,0.0,1.0,0.0,18.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+/-,15.0,0.0,0.0,1.0,2.0,0.0,324.0,1.0,0.0,23.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
+44,5.0,14.0,0.0,15.0,24.0,1.0,1.0,2644.0,0.0,22.0,...,7.0,7.0,6.0,0.0,1.0,3.0,0.0,1.0,0.0,1.0
...and oceans,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,353.0,2.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...and you will know us by the trail of dead,109.0,3.0,3.0,14.0,1.0,18.0,23.0,22.0,2.0,3695.0,...,13.0,4.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Persist the co-occurrence table to 'crosstab.hd5' under the key 'artists'.
crosstab.to_hdf('crosstab.hd5', key='artists')

In [42]:
def crosscorr(first, second, df):
    """Return the fraction of ``first``'s listener rows whose user also has ``second``.

    Parameters
    ----------
    first, second : str
        Artist names, compared for equality against ``df['artistname']``.
    df : pandas.DataFrame
        Frame with at least the columns ``'artistname'`` and ``'usersha1'``.

    Returns
    -------
    float
        (rows for ``second`` whose user appears among ``first``'s rows) divided by
        (rows for ``first``). NaN when ``first`` has no rows at all, because the
        ratio is undefined in that case (previously this raised ZeroDivisionError).

    Notes
    -----
    Both counts are row counts, not distinct-user counts.
    """
    first_listeners = df.loc[df['artistname'] == first, 'usersha1']
    first_count = len(first_listeners)
    if first_count == 0:
        return float('nan')

    shared_rows = (df['artistname'] == second) & df['usersha1'].isin(first_listeners)
    # int() keeps the result a plain Python float, as before.
    return int(shared_rows.sum()) / first_count
    
# Example: share of 'the beatles' rows whose user also has a 'radiohead' row, over the full cleaned frame.
crosscorr('the beatles', 'radiohead', df)

0.379041248606466

## Get Tracks for Artists

In [5]:
import sys
from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
from multiprocessing import Pool
from get_track_urls import get_track_urls
from tqdm import tnrange, tqdm_notebook

import os

# Spotify API credentials come from the environment instead of being embedded in the
# notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names spotipy
# itself looks for. A KeyError here means they have not been exported.
# SECURITY: the id/secret pair that used to be hardcoded in this cell has been exposed
# and should be rotated.
client_id = os.environ["SPOTIPY_CLIENT_ID"]
client_secret = os.environ["SPOTIPY_CLIENT_SECRET"]

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
# NOTE(review): `sp` is not referenced again in this cell; get_track_urls is imported
# from its own module and presumably builds its own client - confirm before removing.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

artists = subsetdf['artistname'].unique()

# Look up track URLs for every artist using 5 worker processes. The context manager
# shuts the pool down once the results have been fully collected into the list.
with Pool(5) as p:
    r = list(tqdm_notebook(p.imap(get_track_urls, artists), total=len(artists)))

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [6]:
import itunes
import time

# For artists that came back without any URL, fall back to the iTunes search API and
# keep the preview URLs of the first three hits.
for entry in tqdm_notebook(r):
    artistname = entry['artistname']
    if entry['urls']:
        continue  # already has at least one URL

    try:
        tracks = itunes.search_track(query=artistname)
        entry['urls'] = [track.json['previewUrl'] for track in tracks[:3]]
    except Exception as e:
        # Best effort: keep going, but report what failed instead of only the artist name.
        print("{}: {!r}".format(artistname, e))
    finally:
        # Throttle after every lookup attempt, including failed ones.
        time.sleep(5)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

godsmack



In [7]:
# Spot check: print the collected URLs for one well-known artist.
for record in tqdm_notebook(r):
    name, track_urls = record['artistname'], record['urls']
    if name != "the beatles":
        continue
    print(track_urls)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

['https://audio-ssl.itunes.apple.com/apple-assets-us-std-000001/AudioPreview71/v4/46/48/7d/46487d90-d40c-7c47-7285-5edbfd0fd2c0/mzaf_5516723347634890825.plus.aac.p.m4a', 'https://audio-ssl.itunes.apple.com/apple-assets-us-std-000001/Music/v4/d5/c8/10/d5c81035-a242-c354-45cf-f634e4127f43/mzaf_1171292596660883824.plus.aac.p.m4a', 'https://audio-ssl.itunes.apple.com/apple-assets-us-std-000001/Music4/v4/a0/05/df/a005df47-d4d5-1fd1-eefc-77553fa59689/mzaf_5617395189778548804.plus.aac.p.m4a']



In [8]:
import json
# Cache the collected artist/URL records on disk. The context manager closes the file
# even if serialisation fails part-way.
with open("tracks.json", "w") as f:
    json.dump(r, f)

In [7]:
import json
# Reload the cached artist/URL records. The context manager closes the file even if
# parsing fails.
with open("tracks.json", "r") as f:
    r = json.load(f)

In [10]:
# Flatten the per-artist records into (artistname, url) pairs, skipping None URLs,
# and report every artist that has fewer than three URL entries.
alltracks = []

for record in r:
    track_urls = record['urls']
    url_count = len(track_urls)
    if url_count < 3:
        print(record['artistname'], url_count)
    alltracks.extend(
        (record['artistname'], url) for url in track_urls if url is not None
    )

le tigre 1
simian mobile disco 1
patrick wolf 1
audioslave 2
children of bodom 1
dragonforce 1
devendra banhart 2
max richter 1
elliott smith 2
tv on the radio 1
the cinematic orchestra 2
beck 1
the decemberists 1
michael jackson 2
volbeat 1
scorpions 1
gentleman 2
slayer 1
bullet for my valentine 1
norther 2
stratovarius 2
leona lewis 2
a-ha 2
colbie caillat 1
jennifer lopez 2
angels & airwaves 1
fatboy slim 2
mando diao 2
dashboard confessional 2
taking back sunday 1
andrew bird 2
common 1
jay-z 1
2pac 2
talib kweli 2
bill evans 1
john coltrane 1
the game 1
groove coverage 2
eels 1
john legend 2
frank sinatra 1
akon 1
black rebel motorcycle club 2
the velvet underground 1
saosin 1
buckethead 2
aerosmith 2
a perfect circle 1
interpol 2
hatebreed 2
gamma ray 2
enter shikari 2
danny elfman 1
new found glory 1
less than jake 1
the specials 2
thursday 1
weezer 1
feist 1
rufus wainwright 1
johnny cash 2
nina simone 1
marvin gaye 2
eric clapton 1
billie holiday 1
thelonious monk 2
belle and

## Download tracks

In [4]:
from download_tracks import download_tracks
from multiprocessing import Pool
from tqdm import tnrange, tqdm_notebook

# Download every (artistname, url) pair in `alltracks` using 25 worker processes.
# The context manager shuts the pool down once all results have been collected.
# NOTE: this rebinds `r`, which earlier cells use for the artist/URL records.
with Pool(25) as p:
    r = list(tqdm_notebook(p.imap(download_tracks, alltracks), total=len(alltracks)))

HBox(children=(IntProgress(value=0, max=2783), HTML(value='')))

In [3]:
from make_spec import make_spectrogram
import glob
from multiprocessing import Pool
from tqdm import tnrange, tqdm_notebook

# Every .mp3 anywhere below tracks/.
files = glob.glob('tracks/**/*.mp3', recursive=True)

# Run make_spectrogram over all files using 3 worker processes. The imap iterator has
# to be consumed (here via list()): otherwise the progress bar never advances past 0
# and the cell returns without waiting for the workers to finish.
with Pool(3) as p:
    spectrogram_results = list(tqdm_notebook(p.imap(make_spectrogram, files), total=len(files)))

HBox(children=(IntProgress(value=0, max=2777), HTML(value='')))

0/|/  0%|| 0/2777 [00:00<?, ?it/s]