In [118]:
!mamba install -y pandarallel


                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (0.17.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████


Looking for: ['pandarallel']

conda-forge/noarch       [<=>                 ] (00m:00s) 
conda-forge/noarch     

In [45]:
import requests
import pandas as pd
import json
import numpy as np
from bs4 import BeautifulSoup
from tqdm.auto import tqdm
from time import sleep
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True, nb_workers=16)
tqdm.pandas()

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [52]:
BASE_URL = 'https://musicbrainz.org'
SEARCH_URL = f'{BASE_URL}/search'

YEAR = 2000
RELEASES_QUERY = f'firstreleasedate:{YEAR} AND type:(-Compilation -DJ-mix -Soundtrack -Remix -Live -Spokenword)'
ERROR = 'We could not fetch the document from the search server'

In [66]:
def get_rows(page):
    for i in range(5):
        params = {
            'query': RELEASES_QUERY,
            'type': 'release_group',
            'limit': '100',
            'method': 'advanced',
            'page': page,
        }
        resp = requests.get(SEARCH_URL, params)
        rows = BeautifulSoup(resp.text).find('tbody')
        
        if not rows:
            if ERROR in resp.text:
                print(f'error, retrying ({i})')
                sleep(1)
                continue

            return None

        return rows.find_all('tr')
    

In [63]:
def row2release(row):
    release, artist, rtype = row.find_all('td')
    return {
        'release_type': rtype.text,
        'release_name': release.text,
        'release_link': release.find('a')['href'],
        'artist_name': artist.text,
        'artist_link': artist.find('a')['href'],
    }

In [60]:
pages = {}

In [65]:
for page in tqdm(range(1, 100000)):
    if page in pages:
        continue

    rows = get_rows(page)
    if not rows:
        break

    pages[page] = [row2release(r) for r in rows]

  0%|          | 0/99999 [00:00<?, ?it/s]

error, retrying (0)
<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="width=device-width, initial-scale=1" name="viewport"/><link href="/static/images/favicons/apple-touch-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/><link href="/static/images/favicons/apple-touch-icon-60x60.png" rel="apple-touch-icon" sizes="60x60"/><link href="/static/images/favicons/apple-touch-icon-72x72.png" rel="apple-touch-icon" sizes="72x72"/><link href="/static/images/favicons/apple-touch-icon-76x76.png" rel="apple-touch-icon" sizes="76x76"/><link href="/static/images/favicons/apple-touch-icon-114x114.png" rel="apple-touch-icon" sizes="114x114"/><link href="/static/images/favicons/apple-touch-icon-120x120.png" rel="apple-touch-icon" sizes="120x120"/><link href="/static/images/favicons/apple-touch-icon-144x144.png" rel="apple-touch-icon" sizes="144x144"/><link href="/static/images/favicons/apple-touch-icon-152x152.png" rel="

In [67]:
df = pd.concat(pd.DataFrame(p) for p in pages.values())
df.head()

Unnamed: 0,release_type,release_name,release_link,artist_name,artist_link
0,Album,Lenka Dusilová,/release-group/2a23335a-5590-30c3-b25c-884cabe...,Lenka Dusilová,/artist/4d60cb9c-81e2-4ae2-9976-b640b545a303
1,Album,Ça parle au diable,/release-group/71058210-8408-3aed-8878-3ad2e4a...,Mes Aïeux,/artist/a320d461-a689-4946-a686-70a6eaebdffb
2,Album,Afterlife Kingdom,/release-group/3d648595-b55e-3968-a333-390d257...,Satanic Slaughter,/artist/1fd06f01-fb68-43ee-a392-af7336250df1
3,Album,Call Me Al!,/release-group/53dd9a96-e8ea-3200-abee-8911b55...,Alain Caron,/artist/f95594fd-8ba6-4540-8854-7c149153a891
4,Album,Nowhere Near Here,/release-group/a8ce8cf1-3def-30af-b387-65acd97...,Alex Woodard,/artist/265b1f16-f15c-484f-a6b7-2be20dcf6a9e


In [68]:
len(df)

33541

In [69]:
df.to_pickle(f'{YEAR}/releases.pickle')

In [110]:
def get_links(artist_link):
    for i in range(5):
        resp = requests.get(f'{BASE_URL}{artist_link}')
        links = BeautifulSoup(resp.text).find(class_='external_links')
        if not links:
            if resp.status_code == 200:
                return {}
            
            print(f'{BASE_URL}{artist_link}')
            print(f'error, retrying ({i})')
            sleep(1)
            continue

        links = {
            li['class'][0].replace('-favicon', ''): li.find('a')['href']
            for li in links.find_all('li')[:-1]
        }
        return links


In [122]:
df['links'] = df['artist_link'].parallel_apply(get_links)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1678), Label(value='0 / 1678'))), …

In [133]:
df = pd.concat([
    df.drop(['links'], axis=1).reset_index(drop=True),
    pd.json_normalize(df['links']).reset_index(drop=True)
], axis=1)

In [134]:
df.to_pickle(f'{YEAR}/releases.pickle')

In [139]:
list(df.columns)

['release_type',
 'release_name',
 'release_link',
 'artist_name',
 'artist_link',
 'discogs',
 'wikipedia',
 'viaf',
 'wikidata',
 'home',
 'itunes',
 'deezer',
 'spotify',
 'allmusic',
 'metalarchives',
 'songkick',
 'bnfcatalogue',
 'dnb',
 'instagram',
 'youtube',
 'facebook',
 'twitter',
 'bandsintown',
 'amazon',
 'genius',
 'imdb',
 'lastfm',
 'rateyourmusic',
 'reverbnation',
 'secondhandsongs',
 'setlistfm',
 'whosampled',
 'worldcat',
 'loc',
 'myspace',
 'generasia',
 'vgmdb',
 'musiksammler',
 'applemusic',
 'bandcamp',
 'vimeo',
 'amazonmusic',
 'mora',
 'no',
 'utanet',
 'encyclopedisque',
 'youtubemusic',
 'weibo',
 'snac',
 'imslp',
 'openlibrary',
 'trove',
 'ibdb',
 'classicalarchives',
 'soundcloud',
 'muzikum',
 'blog',
 'rockcomar',
 'progarchives',
 'fortyfiveworlds',
 'animenewsnetwork',
 'muziekweb',
 'cinii',
 'musicapopularcl',
 'anisongeneration',
 'fortyfivecat',
 'imvdb',
 'tidal',
 'baidu',
 'ocremix',
 'musixmatch',
 'napster',
 'qobuz',
 'spiritofrock',


In [147]:
pd.set_option('display.max_columns', 130)
df.describe()

Unnamed: 0,release_type,release_name,release_link,artist_name,artist_link,discogs,wikipedia,viaf,wikidata,home,itunes,deezer,spotify,allmusic,metalarchives,songkick,bnfcatalogue,dnb,instagram,youtube,facebook,twitter,bandsintown,amazon,genius,imdb,lastfm,rateyourmusic,reverbnation,secondhandsongs,setlistfm,whosampled,worldcat,loc,myspace,generasia,vgmdb,musiksammler,applemusic,bandcamp,vimeo,amazonmusic,mora,no,utanet,encyclopedisque,youtubemusic,weibo,snac,imslp,openlibrary,trove,ibdb,classicalarchives,soundcloud,muzikum,blog,rockcomar,progarchives,fortyfiveworlds,animenewsnetwork,muziekweb,cinii,musicapopularcl,anisongeneration,fortyfivecat,imvdb,tidal,baidu,ocremix,musixmatch,napster,qobuz,spiritofrock,ircam,quebecinfomusique,beatport,psydb,cdjapan,utaitedb,recochoku,residentadvisor,overture,mixcloud,junodownload,livefans,bookbrainz,lieder,utamap,vk,twitch,pinterest,spiritofmetal,finnmusic,rolldabeats,vkdb,dahr,jlyric,tower,dailymotion,archive,jazzmusicarchives,tiktok,cpdl,linkedin,utaten,directlyrics,bigcartel,patreon,vocadb,ric,dogmazic,videogamin,castalbums,dhhu,operabase,mainlynorfolk,thesession,piosenki,wikisource,songfacts,traxsource,rockipedia,kickstarter,gutenberg,theatricalia
count,33541,33541,33541,33541,33541,29519,4841,9890,19754,14767,5244,6745,9634,12567,1692,6491,5326,5873,2595,4577,7934,5192,1242,1359,2321,5960,7997,11735,305,2789,2214,1714,6380,5755,7843,322,1457,2857,1243,2046,126,1256,100,2601,31,86,97,57,2218,1343,1595,1706,1065,285,2600,2573,732,72,198,169,242,583,69,14,180,791,1263,1481,13,24,510,444,150,210,16,36,581,23,180,24,59,239,251,40,399,236,30,15,38,272,39,54,345,24,181,51,8,42,29,35,36,42,27,2,26,7,98,15,13,2,1,5,3,33,1,5,1,3,1,6,2,7,9,2,4,2
unique,27,24057,24970,20627,19614,16774,2851,4833,10241,8293,2187,2654,4085,6330,1111,3000,2369,2611,1250,2247,4165,2583,542,308,890,2551,3364,5876,183,1005,941,547,2881,2608,4349,116,268,1161,558,1144,61,190,31,756,9,43,52,27,723,261,347,393,177,41,1184,1103,328,54,103,49,44,168,18,11,64,318,465,550,8,15,178,155,59,79,9,18,234,13,68,10,18,100,1,18,141,80,13,3,11,121,9,18,145,11,68,23,2,14,12,13,19,24,9,1,19,2,25,6,9,1,1,3,2,8,1,4,1,3,1,2,1,4,4,1,2,1
top,Album,[untitled],/release-group/3ef443d3-0e58-4bdc-9804-e7c6b97...,Various Artists,/artist/89ad4ac3-39f7-470e-963a-56509c546377,https://www.discogs.com/artist/194,//en.wikipedia.org/wiki/Ludwig_van_Beethoven,https://viaf.org/viaf/12304462/,//www.wikidata.org/wiki/Q3108914,http://www.tonyoconnor.com.au/,https://itunes.apple.com/us/artist/id11862,https://www.deezer.com/artist/5080,https://open.spotify.com/artist/0LyfQWJT6nXafL...,https://www.allmusic.com/artist/mn0000075140,https://www.metal-archives.com/bands/Nightwish/39,https://www.songkick.com/artists/556956,https://catalogue.bnf.fr/ark:/12148/cb118897907,http://d-nb.info/gnd/11850553X,https://www.instagram.com/speedguru666/,//www.youtube.com/channel/UCB4ceM1ygPEMWu3_-2-...,https://www.facebook.com/pages/Beethoven/11066...,https://twitter.com/limpbizkit,https://www.bandsintown.com/a/292,//www.amazon.com/-/e/B0017PCMX0?tag=musicbrain...,https://genius.com/artists/Johann-sebastian-bach,https://www.imdb.com/name/nm0001925/,https://www.last.fm/music/Various+Artists,https://rateyourmusic.com/artist/johann_sebast...,https://www.reverbnation.com/delasoul,https://secondhandsongs.com/artist/4192,https://www.setlist.fm/setlists/johann-sebasti...,https://www.whosampled.com/Johann-Sebastian-Bach/,https://www.worldcat.org/identities/lccn-n7902...,https://id.loc.gov/authorities/names/n79021425,https://myspace.com/limpbizkit,https://www.generasia.com/wiki/Himuro_Kyosuke,https://vgmdb.net/artist/1718,https://www.musik-sammler.de/artist/johann-seb...,https://music.apple.com/us/artist/12374,//kawabata-makoto.bandcamp.com/,https://vimeo.com/thedandywarhols,https://music.amazon.com/artists/B000QJO93Y,https://mora.jp/artist/11490/,https://soundtrackcollector.com/composer/4977/,https://www.uta-net.com/artist/1822/,http://www.encyclopedisque.fr/artiste/2361.html,https://music.youtube.com/channel/UCaqljTNgQHS...,https://www.weibo.com/linkinparkofficial,http://snaccooperative.org/ark:/99166/w63x84mt,"https://imslp.org/wiki/Category:Bach,_Johann_S...",https://openlibrary.org/works/OL443516A,https://nla.gov.au/nla.party-789063,https://ibdb.com/person.php?id=10027,https://www.classicalarchives.com/composer/330...,https://soundcloud.com/makoto-kawabata-1,http://muzikum.eu/en/122-6434/ludwig-van-beeth...,http://speedguru.blog45.fc2.com/,http://rock.com.ar/artistas/los-violadores,https://www.progarchives.com/artist.asp?id=2407,http://www.45worlds.com/classical/composer/bee...,https://www.animenewsnetwork.com/encyclopedia/...,https://www.muziekweb.nl/Link/M00000235994/CLA...,https://ci.nii.ac.jp/author/DA02483346?l=en,http://www.musicapopular.cl/artista/francesca-...,http://anison.info/data/person/35211.html,http://www.45cat.com/artist/limp-bizkit,https://imvdb.com/n/limp-bizkit,//tidal.com/artist/3901460,https://baike.baidu.com/item/John%20zorn,https://ocremix.org/artist/10319/the-cynic-pro...,https://www.musixmatch.com/artist/Prince,https://us.napster.com/artist/the-beatles,https://www.qobuz.com/gb-en/interpreter/the-be...,https://www.spirit-of-rock.com/en/band/Prince,http://brahms.ircam.fr/krzysztof-penderecki,http://www.qim.com/artistes/biographie.asp?art...,https://www.beatport.com/artist/prince/73994,http://www.psydb.net/artists/i/infected-mushro...,https://www.cdjapan.co.jp/person/96621,https://utaitedb.net/Ar/20957,https://recochoku.jp/artist/2000000011/,https://ra.co/dj/lusine,https://overture.doremus.org/artist/269cec9d-5...,http://www.mixcloud.com/soniqueclarke,https://www.junodownload.com/artists/Prince/re...,https://www.livefans.jp/artists/2099,https://bookbrainz.org/author/e53b85da-6147-46...,http://www.lieder.net/lieder/get_settings.html...,http://www.utamap.com/searchartist.php?artisti...,https://vk.com/limpbizkit,https://www.twitch.tv/deftonesofficial,https://www.pinterest.com/xtina866/,http://www.spirit-of-metal.com/groupe-groupe-L...,http://www.finnmusic.net/main.php?61505e5c0155...,http://www.rolldabeats.com/artist/digital,https://www.vkdb.jp/LUNA+SEA.html,http://adp.library.ucsb.edu/index.php/talent/d...,http://j-lyric.net/artist/a00055b/,https://tower.jp/artist/365201,https://www.dailymotion.com/christinaaguilera,//archive.org/details/RyanAdams,https://www.jazzmusicarchives.com/artist/enric...,https://tiktok.com/@moby,http://cpdl.org/wiki/index.php/Charles_Hubert_...,https://de.linkedin.com/in/ameliacuni,https://utaten.com/artist/%E5%80%89%E6%A9%8B%E...,http://www.directlyrics.com/christina-aguilera...,https://suicidecommando.bigcartel.com/,https://www.patreon.com/montyharper,https://vocadb.net/Ar/5841,http://www.rockinchina.com/w/New_Pants,http://play.dogmazic.net/index.php#artists.php...,http://videogam.in/people/Yasunori_Mitsuda,http://castalbums.org/people/The-Beatles/118334,http://dhhu.dk/wiki/index.php?title=Kongehuset,https://operabase.com/artists/1890,https://mainlynorfolk.info/folk/records/blazin...,https://thesession.org/recordings/artists/1381,https://bibliotekapiosenki.pl/zespoly/Cracow_K...,https://en.wikisource.org/wiki/Author:Billy_Bragg,http://www.songfacts.com/facts-suicidal_tenden...,https://www.traxsource.com/artist/36837,https://www.rockipedia.no/artister/a-ha-15645/,https://www.kickstarter.com/profile/rockyvotolato,"http://www.gutenberg.org/author/Ian,+Janis",http://theatricalia.com/person/26hm
freq,21526,23,4,731,731,731,60,251,731,22,251,731,731,251,9,251,251,251,15,15,60,15,15,251,251,251,731,251,7,251,251,251,251,251,15,12,251,251,65,15,8,731,7,251,7,6,6,9,251,251,251,251,251,39,15,60,15,3,12,60,65,60,20,2,10,15,15,19,5,3,12,12,12,12,5,5,12,7,9,5,7,7,251,5,12,11,8,9,7,15,7,10,15,4,12,5,6,5,6,10,4,4,6,2,4,4,10,3,2,2,1,2,2,12,1,2,1,1,1,3,2,2,5,2,2,2


In [157]:
df[(df.spotify.notna()|df.youtube.notna()|df.bandcamp.notna()|df.lastfm.notna()|df.wikipedia.notna())]

Unnamed: 0,release_type,release_name,release_link,artist_name,artist_link,discogs,wikipedia,viaf,wikidata,home,itunes,deezer,spotify,allmusic,metalarchives,songkick,bnfcatalogue,dnb,instagram,youtube,facebook,twitter,bandsintown,amazon,genius,imdb,lastfm,rateyourmusic,reverbnation,secondhandsongs,setlistfm,whosampled,worldcat,loc,myspace,generasia,vgmdb,musiksammler,applemusic,bandcamp,vimeo,amazonmusic,mora,no,utanet,encyclopedisque,youtubemusic,weibo,snac,imslp,openlibrary,trove,ibdb,classicalarchives,soundcloud,muzikum,blog,rockcomar,progarchives,fortyfiveworlds,animenewsnetwork,muziekweb,cinii,musicapopularcl,anisongeneration,fortyfivecat,imvdb,tidal,baidu,ocremix,musixmatch,napster,qobuz,spiritofrock,ircam,quebecinfomusique,beatport,psydb,cdjapan,utaitedb,recochoku,residentadvisor,overture,mixcloud,junodownload,livefans,bookbrainz,lieder,utamap,vk,twitch,pinterest,spiritofmetal,finnmusic,rolldabeats,vkdb,dahr,jlyric,tower,dailymotion,archive,jazzmusicarchives,tiktok,cpdl,linkedin,utaten,directlyrics,bigcartel,patreon,vocadb,ric,dogmazic,videogamin,castalbums,dhhu,operabase,mainlynorfolk,thesession,piosenki,wikisource,songfacts,traxsource,rockipedia,kickstarter,gutenberg,theatricalia
0,Album,Lenka Dusilová,/release-group/2a23335a-5590-30c3-b25c-884cabe...,Lenka Dusilová,/artist/4d60cb9c-81e2-4ae2-9976-b640b545a303,https://www.discogs.com/artist/557781,//en.wikipedia.org/wiki/Lenka_Dusilov%C3%A1,https://viaf.org/viaf/85482923/,//www.wikidata.org/wiki/Q3490011,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Album,Ça parle au diable,/release-group/71058210-8408-3aed-8878-3ad2e4a...,Mes Aïeux,/artist/a320d461-a689-4946-a686-70a6eaebdffb,https://www.discogs.com/artist/2209784,//en.wikipedia.org/wiki/Mes_A%C3%AFeux,,//www.wikidata.org/wiki/Q2567516,http://mesaieux.qc.ca/,https://itunes.apple.com/ca/artist/id139482404,https://www.deezer.com/artist/13597,https://open.spotify.com/artist/2heZLxgJjmZjVP...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Album,Call Me Al!,/release-group/53dd9a96-e8ea-3200-abee-8911b55...,Alain Caron,/artist/f95594fd-8ba6-4540-8854-7c149153a891,https://www.discogs.com/artist/267096,//en.wikipedia.org/wiki/Alain_Caron_(bassist),https://viaf.org/viaf/37110944/,//www.wikidata.org/wiki/Q714021,https://www.alaincaronofficial.com/,,,,,,https://www.songkick.com/artists/172864,https://catalogue.bnf.fr/ark:/12148/cb13986530j,http://d-nb.info/gnd/134861655,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Album,Nowhere Near Here,/release-group/a8ce8cf1-3def-30af-b387-65acd97...,Alex Woodard,/artist/265b1f16-f15c-484f-a6b7-2be20dcf6a9e,https://www.discogs.com/artist/732422,,,,,,,,,,,,,https://www.instagram.com/thealexwoodard/,//www.youtube.com/user/MoreForTheSender,https://www.facebook.com/thealexwoodard,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,Album,Paivepo,/release-group/0e4b08cd-ae95-3922-802f-c49fa8d...,Oliver Mtukudzi,/artist/566fb0fe-cf3a-48d1-a4ea-e9e6aaf7236e,https://www.discogs.com/artist/680928,//en.wikipedia.org/wiki/Oliver_Mtukudzi,https://viaf.org/viaf/261832847/,//www.wikidata.org/wiki/Q983221,,https://itunes.apple.com/artist/id56551652,,https://open.spotify.com/artist/0HC2dfJHpORLT2...,https://www.allmusic.com/artist/mn0000471887,,https://www.songkick.com/artists/27496-oliver-...,,http://d-nb.info/gnd/134736583,,//www.youtube.com/user/OliverMtukudzi,,https://twitter.com/tukuofficial,https://www.bandsintown.com/a/60617,//www.amazon.fr/-/e/B000APG3BU?tag=music083d-21,https://genius.com/artists/Oliver-mtukudzi,https://www.imdb.com/name/nm0616186/,https://www.last.fm/music/Oliver+Mtukudzi,https://rateyourmusic.com/artist/oliver-mtukudzi,https://www.reverbnation.com/artist_3936035,https://secondhandsongs.com/artist/14883,https://www.setlist.fm/setlists/oliver-mtukudz...,https://www.whosampled.com/Oliver-Mtukudzi/,https://www.worldcat.org/identities/lccn-no000...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33536,Single,Stiff Upper Lip,/release-group/5259dd42-3636-31c5-9311-d011b25...,AC/DC,/artist/66c662b6-6e2f-4930-8610-912e24c63ed1,https://www.discogs.com/artist/84752,,https://viaf.org/viaf/144189738/,//www.wikidata.org/wiki/Q27593,http://www.acdc.com/,https://itunes.apple.com/us/artist/id5040714,https://www.deezer.com/artist/115,https://open.spotify.com/artist/711MCceyCBcFnz...,https://www.allmusic.com/artist/mn0000574772,,https://www.songkick.com/artists/276130-acdc,https://catalogue.bnf.fr/ark:/12148/cb13901479r,http://d-nb.info/gnd/1221288-X,https://www.instagram.com/acdc/,//www.youtube.com/user/acdcVEVO,https://www.facebook.com/acdc,https://twitter.com/AC_DC,https://www.bandsintown.com/a/26647,,https://genius.com/artists/Ac-dc,https://www.imdb.com/name/nm0009540/,https://www.last.fm/music/AC%2FDC,https://rateyourmusic.com/artist/ac_dc,,https://secondhandsongs.com/artist/1516,https://www.setlist.fm/setlists/acdc-23d6807b....,https://www.whosampled.com/ACDC/,https://www.worldcat.org/identities/lccn-n7801...,https://id.loc.gov/authorities/names/n78011846,https://myspace.com/acdc,,,https://www.musik-sammler.de/artist/ac-dc/,https://music.apple.com/us/artist/5040714,,,https://music.amazon.com/artists/B00136B4M8,,http://musicmoz.org/Bands_and_Artists/A/AC-DC/,,,,,,,,https://nla.gov.au/nla.party-783697,,,https://soundcloud.com/acdcofficial,,,,,,,,,,,http://www.45cat.com/artist/acdc,https://imvdb.com/n/acdc,//listen.tidal.com/artist/945,,,https://www.musixmatch.com/artist/AC-DC,https://us.napster.com/artist/acdc,,,,,,,,,,,,,,https://www.livefans.jp/artists/21822,,,,,,,http://www.spirit-of-metal.com/groupe-groupe-A...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33537,Album,California Dreamin,/release-group/ed938e59-3c6c-4d64-8df2-55984bd...,The Mamas & the Papas,/artist/ff294730-0315-440d-a543-54005779c15b,https://www.discogs.com/artist/230387,,https://viaf.org/viaf/229179345/,//www.wikidata.org/wiki/Q211277,,,https://www.deezer.com/artist/539,https://open.spotify.com/artist/1bs7HoMkSyQwco...,https://www.allmusic.com/artist/mn0000059293,,https://www.songkick.com/artists/230617,https://catalogue.bnf.fr/ark:/12148/cb139048140,http://d-nb.info/gnd/5534784-8,,,,,,,,https://www.imdb.com/name/nm2135108/,,https://rateyourmusic.com/artist/the_mamas_and...,,,,,https://www.worldcat.org/identities/lccn-n8521...,https://id.loc.gov/authorities/names/n85213584,,,,https://www.musik-sammler.de/artist/the-mamas-...,,,,,,http://musicmoz.org/Bands_and_Artists/M/Mamas_...,,,,,,,,https://nla.gov.au/nla.party-1016214,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33538,EP,Stiff Upper Lip,/release-group/61f736e8-b7dd-48c6-b164-1d2f91c...,AC/DC,/artist/66c662b6-6e2f-4930-8610-912e24c63ed1,https://www.discogs.com/artist/84752,,https://viaf.org/viaf/144189738/,//www.wikidata.org/wiki/Q27593,http://www.acdc.com/,https://itunes.apple.com/us/artist/id5040714,https://www.deezer.com/artist/115,https://open.spotify.com/artist/711MCceyCBcFnz...,https://www.allmusic.com/artist/mn0000574772,,https://www.songkick.com/artists/276130-acdc,https://catalogue.bnf.fr/ark:/12148/cb13901479r,http://d-nb.info/gnd/1221288-X,https://www.instagram.com/acdc/,//www.youtube.com/user/acdcVEVO,https://www.facebook.com/acdc,https://twitter.com/AC_DC,https://www.bandsintown.com/a/26647,,https://genius.com/artists/Ac-dc,https://www.imdb.com/name/nm0009540/,https://www.last.fm/music/AC%2FDC,https://rateyourmusic.com/artist/ac_dc,,https://secondhandsongs.com/artist/1516,https://www.setlist.fm/setlists/acdc-23d6807b....,https://www.whosampled.com/ACDC/,https://www.worldcat.org/identities/lccn-n7801...,https://id.loc.gov/authorities/names/n78011846,https://myspace.com/acdc,,,https://www.musik-sammler.de/artist/ac-dc/,https://music.apple.com/us/artist/5040714,,,https://music.amazon.com/artists/B00136B4M8,,http://musicmoz.org/Bands_and_Artists/A/AC-DC/,,,,,,,,https://nla.gov.au/nla.party-783697,,,https://soundcloud.com/acdcofficial,,,,,,,,,,,http://www.45cat.com/artist/acdc,https://imvdb.com/n/acdc,//listen.tidal.com/artist/945,,,https://www.musixmatch.com/artist/AC-DC,https://us.napster.com/artist/acdc,,,,,,,,,,,,,,https://www.livefans.jp/artists/21822,,,,,,,http://www.spirit-of-metal.com/groupe-groupe-A...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
33539,Single,Satellite Blues,/release-group/9d7d4079-3c30-3352-ba6e-2bb479b...,AC/DC,/artist/66c662b6-6e2f-4930-8610-912e24c63ed1,https://www.discogs.com/artist/84752,,https://viaf.org/viaf/144189738/,//www.wikidata.org/wiki/Q27593,http://www.acdc.com/,https://itunes.apple.com/us/artist/id5040714,https://www.deezer.com/artist/115,https://open.spotify.com/artist/711MCceyCBcFnz...,https://www.allmusic.com/artist/mn0000574772,,https://www.songkick.com/artists/276130-acdc,https://catalogue.bnf.fr/ark:/12148/cb13901479r,http://d-nb.info/gnd/1221288-X,https://www.instagram.com/acdc/,//www.youtube.com/user/acdcVEVO,https://www.facebook.com/acdc,https://twitter.com/AC_DC,https://www.bandsintown.com/a/26647,,https://genius.com/artists/Ac-dc,https://www.imdb.com/name/nm0009540/,https://www.last.fm/music/AC%2FDC,https://rateyourmusic.com/artist/ac_dc,,https://secondhandsongs.com/artist/1516,https://www.setlist.fm/setlists/acdc-23d6807b....,https://www.whosampled.com/ACDC/,https://www.worldcat.org/identities/lccn-n7801...,https://id.loc.gov/authorities/names/n78011846,https://myspace.com/acdc,,,https://www.musik-sammler.de/artist/ac-dc/,https://music.apple.com/us/artist/5040714,,,https://music.amazon.com/artists/B00136B4M8,,http://musicmoz.org/Bands_and_Artists/A/AC-DC/,,,,,,,,https://nla.gov.au/nla.party-783697,,,https://soundcloud.com/acdcofficial,,,,,,,,,,,http://www.45cat.com/artist/acdc,https://imvdb.com/n/acdc,//listen.tidal.com/artist/945,,,https://www.musixmatch.com/artist/AC-DC,https://us.napster.com/artist/acdc,,,,,,,,,,,,,,https://www.livefans.jp/artists/21822,,,,,,,http://www.spirit-of-metal.com/groupe-groupe-A...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
df['has_links'] = (df.spotify.notna()|df.youtube.notna()|df.bandcamp.notna()|df.lastfm.notna()|df.wikipedia.notna())

In [None]:
df.to_pickle(f'{YEAR}/releases.pickle')

In [32]:
df = pd.read_pickle(f'{YEAR}/releases.pickle')

In [30]:
FIXES = {
    'https://open.spotify.com/album/33eUZ9K00JkqFCg8DRNeIC': 'https://open.spotify.com/artist/0JQAeDYaigl5nor0kUP44X',
    'https://open.spotify.com/artist/4LDRJyWHn88OpklMQ0HVQ5': 'https://open.spotify.com/artist/2pktDyTJxrLJo7dL2qT1pA',
    
}

def get_spotify_listeners(link):
    if pd.isna(link) or '/user/' in link:
        return None
    
    link = FIXES.get(link, link)

    resp = requests.get(link)
    page = BeautifulSoup(resp.text)
    listeners = page.find('meta', attrs={'name': 'description'})
    if not listeners:
        print(link)
        return 0
    
    listeners = listeners['content'].split()[-3]


    mult = {'K': 1000, 'M': 1000000}
    if listeners[-1] in mult:
        number = int(float(listeners[:-1]) * mult[listeners[-1]])
    else:
        # print(link)
        number = int(float(listeners))

    return number

def get_lastfm_listeners(link):
    link = '/'.join(link.split('/')[:5])
    if pd.isna(link):
        return None

    resp = requests.get(link)
    page = BeautifulSoup(resp.text)
    listeners = page.find('abbr', class_='js-abbreviated-counter')
    if not listeners:
        print(link)
        return 0
    
    listeners = listeners['title'].replace(',', '')
    return int(listeners)

In [8]:
spotify_links = df[df['spotify'].notna()]['spotify'].values
spotify_listeners = {}

In [28]:
for link in tqdm(spotify_links):
    if link in spotify_listeners:
        continue
    spotify_listeners[link] = get_spotify_listeners(link)

  0%|          | 0/9634 [00:00<?, ?it/s]

https://open.spotify.com/artist/3XsLl5bKLvszmBjKahoye3
https://open.spotify.com/artist/5lntHrcFictDwCXOpdwHE9
https://open.spotify.com/artist/0X20O9P2OiPChrCBYStQPe
https://open.spotify.com/artist/3vfvURSGZedbEUNt3PruCj
https://open.spotify.com/artist/4zZwqW9MVgUhmpcVeFfPEy
https://open.spotify.com/artist/68W5IWYUEsW0ahQS7LPp9x
https://open.spotify.com/artist/5EkZih2ZseQG4hJ2cC0Yew
https://open.spotify.com/artist/7rz2w4y86wgga7Z6nrJKty
https://open.spotify.com/artist/2Mm87pumoq58SN6trDSBdv
https://open.spotify.com/artist/4I9DdwiQL1fPfPsGtjd7p5
https://open.spotify.com/artist/4LDRJyWHn88OpklMQ0HVQ5
https://open.spotify.com/artist/39p2cjqAtQYpPDksq6D4NH
https://open.spotify.com/artist/7lybaCk8qwGc2pl69Hz612
https://open.spotify.com/artist/4u2wOdwLhuXyftoeae69yY
https://open.spotify.com/artist/2p0Waz6D8QKzx1MeSlJEm8
https://open.spotify.com/artist/20NsAIv2s2oL0WPDVYhdC5


In [31]:
reparse = [
    'https://open.spotify.com/artist/7wCjDgV6nqBsHguQXPAaIM',
    'https://open.spotify.com/artist/3Ri72CuuQSCLLkDRJgniFU',
    'https://open.spotify.com/artist/68W5IWYUEsW0ahQS7LPp9x',
    'https://open.spotify.com/artist/7rz2w4y86wgga7Z6nrJKty',
    'https://open.spotify.com/artist/2p0Waz6D8QKzx1MeSlJEm8',
    'https://open.spotify.com/artist/3SXDCIdqI1AR686ukKtKCq',
]

# bi-2 https://open.spotify.com/artist/3SXDCIdqI1AR686ukKtKCq
# dolphin https://open.spotify.com/artist/2pktDyTJxrLJo7dL2qT1pA

for link in tqdm(reparse):
    spotify_listeners[link] = get_spotify_listeners(link)

  0%|          | 0/6 [00:00<?, ?it/s]

In [39]:
with open(f'{YEAR}/spotify.json', 'w') as f:
    f.write(json.dumps(spotify_listeners))

In [37]:
df.loc[df['artist_name']=='Би-2', 'spotify'] = 'https://open.spotify.com/artist/3SXDCIdqI1AR686ukKtKCq'
df[df['artist_name']=='Би-2']['spotify']

4726     https://open.spotify.com/artist/3SXDCIdqI1AR68...
10461    https://open.spotify.com/artist/3SXDCIdqI1AR68...
30874    https://open.spotify.com/artist/3SXDCIdqI1AR68...
Name: spotify, dtype: object

In [41]:
df2 = pd.DataFrame(spotify_listeners.items(), columns=['spotify', 'spotify_listeners'])
df2

Unnamed: 0,spotify,spotify_listeners
0,https://open.spotify.com/artist/2heZLxgJjmZjVP...,70700.0
1,https://open.spotify.com/artist/0HC2dfJHpORLT2...,93100.0
2,https://open.spotify.com/artist/7FQRbf8gbKw8KZ...,175600.0
3,https://open.spotify.com/artist/0kSHocNBxQeP9p...,81600.0
4,https://open.spotify.com/artist/0yaejWkRQYl6PA...,3300.0
...,...,...
4081,https://open.spotify.com/artist/711MCceyCBcFnz...,22800000.0
4082,https://open.spotify.com/artist/3U2U4TR03ZuSts...,99200.0
4083,https://open.spotify.com/artist/1bs7HoMkSyQwco...,5300000.0
4084,https://open.spotify.com/artist/0AuhzXNEVx1LGy...,547.0


In [43]:
df = df.merge(df2, on='spotify', how='left')

In [None]:
lastfm_links = df[df['lastfm'].notna()]['lastfm'].values
lastfm_listeners = {}

In [None]:
for link in tqdm(lastfm_links):
    if link in lastfm_listeners:
        continue
    lastfm_listeners[link] = get_lastfm_listeners(link)

In [None]:
with open(f'{YEAR}/lastfm.json', 'w') as f:
    f.write(json.dumps(lastfm_listeners))

In [None]:
df2 = pd.DataFrame(lastfm_listeners.items(), columns=['lastfm', 'lastfm_listeners'])
df2

In [None]:
df = df.merge(df2, on='lastfm', how='left')

In [50]:
df['listeners'] = np.nanmax(df[['spotify_listeners', 'lastfm_listeners']], axis=1)
df

  df['listeners'] = np.nanmax(df[['spotify_listeners', 'lastfm_listeners']], axis=1)


Unnamed: 0,release_type,release_name,release_link,artist_name,artist_link,discogs,wikipedia,viaf,wikidata,home,...,songfacts,traxsource,rockipedia,kickstarter,gutenberg,theatricalia,has_links,lastfm_listeners,spotify_listeners,listeners
0,Album,Lenka Dusilová,/release-group/2a23335a-5590-30c3-b25c-884cabe...,Lenka Dusilová,/artist/4d60cb9c-81e2-4ae2-9976-b640b545a303,https://www.discogs.com/artist/557781,//en.wikipedia.org/wiki/Lenka_Dusilov%C3%A1,https://viaf.org/viaf/85482923/,//www.wikidata.org/wiki/Q3490011,,...,,,,,,,True,,,
1,Album,Ça parle au diable,/release-group/71058210-8408-3aed-8878-3ad2e4a...,Mes Aïeux,/artist/a320d461-a689-4946-a686-70a6eaebdffb,https://www.discogs.com/artist/2209784,//en.wikipedia.org/wiki/Mes_A%C3%AFeux,,//www.wikidata.org/wiki/Q2567516,http://mesaieux.qc.ca/,...,,,,,,,True,,70700.0,70700.0
2,Album,Afterlife Kingdom,/release-group/3d648595-b55e-3968-a333-390d257...,Satanic Slaughter,/artist/1fd06f01-fb68-43ee-a392-af7336250df1,https://www.discogs.com/artist/278391,,,//www.wikidata.org/wiki/Q2705128,http://hem.passagen.se/ztefdark/,...,,,,,,,False,,,
3,Album,Call Me Al!,/release-group/53dd9a96-e8ea-3200-abee-8911b55...,Alain Caron,/artist/f95594fd-8ba6-4540-8854-7c149153a891,https://www.discogs.com/artist/267096,//en.wikipedia.org/wiki/Alain_Caron_(bassist),https://viaf.org/viaf/37110944/,//www.wikidata.org/wiki/Q714021,https://www.alaincaronofficial.com/,...,,,,,,,True,,,
4,Album,Nowhere Near Here,/release-group/a8ce8cf1-3def-30af-b387-65acd97...,Alex Woodard,/artist/265b1f16-f15c-484f-a6b7-2be20dcf6a9e,https://www.discogs.com/artist/732422,,,,,...,,,,,,,True,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33536,Single,Stiff Upper Lip,/release-group/5259dd42-3636-31c5-9311-d011b25...,AC/DC,/artist/66c662b6-6e2f-4930-8610-912e24c63ed1,https://www.discogs.com/artist/84752,,https://viaf.org/viaf/144189738/,//www.wikidata.org/wiki/Q27593,http://www.acdc.com/,...,,,,,,,True,3189422.0,22800000.0,22800000.0
33537,Album,California Dreamin,/release-group/ed938e59-3c6c-4d64-8df2-55984bd...,The Mamas & the Papas,/artist/ff294730-0315-440d-a543-54005779c15b,https://www.discogs.com/artist/230387,,https://viaf.org/viaf/229179345/,//www.wikidata.org/wiki/Q211277,,...,,,,,,,True,,5300000.0,5300000.0
33538,EP,Stiff Upper Lip,/release-group/61f736e8-b7dd-48c6-b164-1d2f91c...,AC/DC,/artist/66c662b6-6e2f-4930-8610-912e24c63ed1,https://www.discogs.com/artist/84752,,https://viaf.org/viaf/144189738/,//www.wikidata.org/wiki/Q27593,http://www.acdc.com/,...,,,,,,,True,3189422.0,22800000.0,22800000.0
33539,Single,Satellite Blues,/release-group/9d7d4079-3c30-3352-ba6e-2bb479b...,AC/DC,/artist/66c662b6-6e2f-4930-8610-912e24c63ed1,https://www.discogs.com/artist/84752,,https://viaf.org/viaf/144189738/,//www.wikidata.org/wiki/Q27593,http://www.acdc.com/,...,,,,,,,True,3189422.0,22800000.0,22800000.0


In [51]:
df.to_pickle(f'{YEAR}/releases.pickle')
df.fillna('').to_excel(f'{YEAR}/releases.xlsx')