In [1]:
import pandas as pd
import networkx as nx
import collections
import matplotlib.pyplot as plt
import matplotlib
from tqdm import tqdm
import os
import json
import sys
import sqlite3
import urllib.request, json 
import datetime
import numpy as np

In [2]:
# Several copies of this notebook can run in parallel: give every copy its own SPLIT_NR
# and the same TOTAL_SPLIT_NR.
# SPLIT_NR is 0-based (0 .. TOTAL_SPLIT_NR - 1): it selects both the row of apikey.txt
# and the share of the unique tracks that this copy processes.
# apikey.txt must hold at least TOTAL_SPLIT_NR API keys (one per row, no header).
SPLIT_NR = 0
TOTAL_SPLIT_NR = 3

In [3]:
# Load the raw last.fm 1K listening log (tab-separated, no header row).
# on_bad_lines='warn' skips malformed rows while still reporting them; it replaces the
# `error_bad_lines=False` flag, which was deprecated in pandas 1.3 and removed in pandas 2.0.
listenings = pd.read_csv('lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv',
                         sep='\t', on_bad_lines='warn', header=None,
                         names=['userid','timestamp','artid','artname','traid','traname'])

In [4]:
# convert timestamp's datatype and use it as index
listenings['timestamp'] = pd.to_datetime(listenings['timestamp'])
listenings.index = listenings['timestamp']

In [5]:
# Preview the first rows to check that the columns and the timestamp index were parsed as expected.
listenings.head()

Unnamed: 0_level_0,userid,timestamp,artid,artname,traid,traname
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2009-05-04 23:08:57+00:00,user_000001,2009-05-04 23:08:57+00:00,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
2009-05-04 13:54:10+00:00,user_000001,2009-05-04 13:54:10+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2009-05-04 13:52:04+00:00,user_000001,2009-05-04 13:52:04+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
2009-05-04 13:42:52+00:00,user_000001,2009-05-04 13:42:52+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
2009-05-04 13:42:11+00:00,user_000001,2009-05-04 13:42:11+00:00,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)


In [6]:
# Preview the last rows as well, to see the end of the loaded log.
listenings.tail()

Unnamed: 0_level_0,userid,timestamp,artid,artname,traid,traname
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-01-27 22:02:35+00:00,user_001000,2008-01-27 22:02:35+00:00,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,a490cabc-1e5c-4807-86c7-740c31a50009,Please Be Patient With Me
2008-01-27 21:56:52+00:00,user_001000,2008-01-27 21:56:52+00:00,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3e92e447-9e1f-440d-bc00-6734469880c5,Shake It Off
2008-01-27 21:52:36+00:00,user_001000,2008-01-27 21:52:36+00:00,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,93d044e6-1bbb-46a6-ac8e-283382a89e6f,Side With The Seeds
2008-01-27 21:49:12+00:00,user_001000,2008-01-27 21:49:12+00:00,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,5ac4386f-6146-4389-a762-4b43f362d2c8,Sky Blue Sky
2008-01-27 21:43:14+00:00,user_001000,2008-01-27 21:43:14+00:00,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3acc99bc-a349-420f-ad28-7095eb3533c9,Impossible Germany


## Querying the last.fm API to get the release years of the songs
The last.fm API key needs to be stored in an `apikey.txt` file (one key per row, no header).

NOT USED:
* you would need to get the album of each song and then the release date of each album, which would take well over 100 hours of constant querying
* not every song has an album, so the data would not be perfect

In [7]:
# apikey.txt holds one API key per row and has no header; this copy of the notebook
# uses the key in row SPLIT_NR of the first (and only) column.
apiKeyTable = pd.read_csv('apikey.txt', header=None)
apiKey = apiKeyTable.loc[SPLIT_NR, 0]

In [8]:
# Distinct (artist name, track name) pairs; the first occurrence of each pair is the row kept.
trackColumns = ['artname', 'traname']
uniqueTracksFromListenings = listenings.loc[:, trackColumns].drop_duplicates(keep='first')

In [9]:
# Take this notebook copy's share of the unique tracks: contiguous section SPLIT_NR out of
# TOTAL_SPLIT_NR sections. Section sizes follow np.array_split: the first `extraRows`
# sections get one additional row when the row count is not evenly divisible.
# Positional slicing is used instead of np.array_split(DataFrame, ...) because the latter goes
# through DataFrame.swapaxes, which is deprecated in recent pandas releases.
if not 0 <= SPLIT_NR < TOTAL_SPLIT_NR:
    raise IndexError(f"SPLIT_NR must be in [0, {TOTAL_SPLIT_NR}), got {SPLIT_NR}")

baseRows, extraRows = divmod(len(uniqueTracksFromListenings), TOTAL_SPLIT_NR)
splitStart = SPLIT_NR * baseRows + min(SPLIT_NR, extraRows)
splitStop = splitStart + baseRows + (1 if SPLIT_NR < extraRows else 0)

# .copy() detaches the share from the parent frame so later edits to dfSplit cannot
# touch (or warn about touching) uniqueTracksFromListenings.
dfSplit = uniqueTracksFromListenings.iloc[splitStart:splitStop].copy()

In [10]:
# Discard pairs with a missing artist or track name: both values are needed to build the query.
# Rebinding (rather than inplace=True) avoids mutating a slice of the parent frame and keeps
# the cell safe to re-run.
dfSplit = dfSplit.dropna()

In [11]:
# Ask last.fm's track.getInfo for every (artist, track) pair of this split and remember the
# album mbid the response reports. The lookup is best-effort: a track whose request fails or
# whose response has no track -> album -> mbid entry is skipped.
ablumIds = {}
# Number of skipped tracks per exception class, so failures are visible after the run
# instead of disappearing silently.
lookupErrors = collections.Counter()
# Upper bound (seconds) for a single request, so one stalled connection cannot hang the whole run.
REQUEST_TIMEOUT_S = 30

for artname, traname in tqdm(zip(dfSplit['artname'],
                                 dfSplit['traname']),
                             total=len(dfSplit), position=0, leave=True):
    # URL-encode both names so spaces, '&' and non-ASCII characters survive in the query string.
    artnameEnc = urllib.parse.quote_plus(artname)
    tranameEnc = urllib.parse.quote_plus(traname)

    try:
        with urllib.request.urlopen(f"http://ws.audioscrobbler.com/2.0/?method=track.getInfo&api_key={apiKey}&artist={artnameEnc}&track={tranameEnc}&format=json",
                                    timeout=REQUEST_TIMEOUT_S) as url:
            data = json.loads(url.read().decode())
        # Keyed by the plain concatenation artist name + track name.
        ablumIds[artname+traname] = data['track']['album']['mbid']
    except Exception as err:
        # `Exception` (not a bare except) keeps the skip-on-failure behaviour while letting
        # KeyboardInterrupt / SystemExit stop the multi-hour loop.
        lookupErrors[type(err).__name__] += 1


100%|██████████| 499573/499573 [27:55:48<00:00,  4.97it/s]    


In [12]:
# Number of entries stored in ablumIds, i.e. artist+track keys whose lookup yielded an album id.
len(ablumIds)

330957

In [13]:
# Number of distinct album ids among the stored values.
# NOTE(review): if the API can return an empty mbid string, all such entries count as one id here — confirm.
len(set(ablumIds.values()))

90244

In [14]:
# Persist this split's lookups as a two-column CSV: the artist+track key and its album id.
results = pd.DataFrame({
    'art_tra_name': list(ablumIds.keys()),
    'album_id': list(ablumIds.values()),
})
# Create the output folder first so a missing directory cannot fail the export
# after the long-running crawl has finished.
os.makedirs('api_collected_data', exist_ok=True)
results.to_csv(f'api_collected_data/ablumIdPart{SPLIT_NR}.csv',index=False)