In [437]:
import requests
import pandas as pd
import numpy as np
import json
import re

In [17]:
from pathlib import Path
import requests_cache
import time
from IPython.core.display import clear_output

In [18]:
## Please insert your API key into the configuration file in order to access LastFM public API
cwd = Path.cwd()
path_to_config = cwd / "Inputs/configpp.yml"

In [412]:
#Loading config file

import yaml
with open(path_to_config, 'r') as ymlfile:
    config = yaml.load(ymlfile)

  """


In [413]:
#API key extraction
apikey = config['lastfm'].get('apikey')

In [414]:
# Countries dataframe
countriesdf = pd.read_csv("countries_cut_one.csv", sep =";", encoding='latin-1')
countriesdf.columns = ['CountryName', 'CountryNameUrl']
countriesdf['CountryId'] = range(1, countriesdf.shape[0] + 1)
countriesdf = countriesdf[['CountryId', 'CountryName', 'CountryNameUrl']]

# LastFMLink object

In [417]:
# LastFMLink object

class LastFMLink:
    '''
    LastFM object working with LastFM API accessible links. API key has to be provided.
    '''
    
    
    def __init__(self, apikey, format = 'json', limit = 50):
        self.apikey = apikey
        self.limit = limit
        if format == 'json':
            self.format = 'json'
    
    
    def GeoGetTop(self, method = 'artist', countries = 'Czech+republic'):
        '''
        Obtain links containing info about most popular artists/tracks in given countries. 
        Countries: A list of country names, as defined by the ISO 3166-1 country names standard, multi-word names shall be seperated by '+'.
        Method: Either 'artist' or 'track' in order to obtain most popular artists or tracks in given countries.
        '''
        if method == 'artist':
            self.method = 'geo.gettopartists'
            if self.format == 'json':
                self.links = 'https://ws.audioscrobbler.com/2.0/?method=' + self.method + '&limit=' + str(self.limit) + '&country=' + countries + '&api_key=' + self.apikey + '&format=' + self.format
                return(self.links)
            
        elif method == 'track':
            self.method = 'geo.gettoptracks'
            if self.format == 'json':
                self.links = 'https://ws.audioscrobbler.com/2.0/?method=' + self.method + '&limit=' + str(self.limit) + '&country=' + countries + '&api_key=' + self.apikey + '&format=' + self.format
                return self.links  
            
        else:
            print('Choose either "artist" or "track" method, in order obtain most popular artists or tracks, respectively.')
    
    
    def ArtistGetInfo(self, artists = 'beach+house', autocorrect = 0):
        '''
        Obtain links containing info about given artists. Autocorrect transforms misspelled artist names into correct artist names, options are 1 for the enablement of autocorrect and 0 for otherwise.
        '''
        self.method = 'artist.getinfo'
        if self.format == 'json':
                self.links = 'https://ws.audioscrobbler.com/2.0/?method=' + self.method + '&limit=' + str(self.limit) + '&artist=' + artists + '&api_key=' + self.apikey + '&format=' + self.format + '&autocorrect=' + str(autocorrect)
                return self.links

## Creation of a dictionary containing links to jsonfiles

In [418]:
# Links dictionary
last_fm_links = {'geo_top_artists': 0, 'geo_top_tracks': 0, 'artists_info': 0}

In [419]:
# We would like 101 entries from each file
lnk = LastFMLink(apikey, limit = 101)

In [420]:
# Creating links to access most popular artists and tracks on the country level
last_fm_links['geo_top_artists'] = lnk.GeoGetTop(countries = countriesdf['CountryNameUrl'], method = 'artist')
last_fm_links['geo_top_tracks'] = lnk.GeoGetTop(countries = countriesdf['CountryNameUrl'], method = 'track')

In [436]:
class LastFMDownloader: 
    '''
    Downloader class for collection of data and storage of results.
    '''
    
    
    def __init__(self, allowLog = True):
        '''
        Initilization of Downloader object. API key has to be provided. 
        '''
        self.allowLog = allowLog
        self.limit = 50
        self.jsonlist = []
        if self.allowLog:
            print('Downloader initialized.')
    
    
    def LoadCountryList(self, countryid = 0, countryname = 'Czech Republic'):
        '''
        Specifies country id and country names of GetDfGeo methods.
        '''
        self.countryid = countryid
        self.countryname = countryname
    
    
    def RequestJson(self, links):
        '''
        Requests JSON files from given links and returns a list of JSON files stored in jsonlist attribute of the downloader.
        '''
        ## dodelat provide 1 link only
        requests_cache.install_cache() # storing previous requests
        number_of_files = len(links)
        self.jsonlist = []
        if len(links) >= 2:
            file = 1
            for link in links:
                print('Requesting file number {} out of {}'.format(file, number_of_files))
                clear_output(wait = True)
                jsonfile = requests.get(link).json()
                self.jsonlist.append(jsonfile)
                if not getattr(jsonfile, 'from_cache', False):
                    print('File has not been requested yet. Please wait half a second.')
                    time.sleep(0.25)
                file = file + 1
            clear_output(wait = True)
            print("All requested files are now available and stored in the 'jsonlist' attribute. Use one of the 'GetDf' methods to obtain the relevant data.")
        
    
    def GetDfGeoTopArtists(self): 
        '''
        Provides data concerning most popular artists in given countries stored in a pandas dataframe. 
        LoadCountryList() method has to be called before-hand, in order to specify id of countries and country names.
        '''
        ListGeoTopArtists = []
        for country in range(len(self.countryid)): # x number of countries range len countries
            bad_entry = 0
            for ranking in range(self.limit): # z number of ranks
                try:
                    row = []
                    rank = ranking + 1
                    countryid = self.countryid[country]
                    countryname = self.countryname[country]
                    artist = self.jsonlist[country]['topartists']['artist'][ranking]['name']
                    artistid = self.jsonlist[country]['topartists']['artist'][ranking]['mbid']
                    listeners = self.jsonlist[country]['topartists']['artist'][ranking]['listeners']
                    row.extend([countryid, countryname, rank, artist, artistid, listeners])
                    ListGeoTopArtists.append(row)
                except:
                    if bad_entry > 0:
                        break
                    else:
                        countryid = self.countryid[country]
                        countryname = self.countryname[country]
                        row = [countryid, countryname]
                        row.extend([None] * 4)
                        ListGeoTopArtists.append(row)
                        bad_entry = bad_entry + 1
        self.DfGeoTopArtists = pd.DataFrame.from_records(ListGeoTopArtists)
        self.DfGeoTopArtists.columns = ['CountryId', 'Country', 'Rank', 'Artist', 'ArtistId', 'Listeners']
        print("Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopArtists' attribute as a pandas dataframe.")
        print("Here are last 5 entries of the dataframe:")
        return self.DfGeoTopArtists.tail()
    
    
    def GetDfGeoTopTracks(self): 
        '''
        Provides data concerning most popular tracks in given countries stored in a pandas dataframe. 
        LoadCountryList() method has to be called before-hand, in order to specify id of countries and country names.
        '''
        ListGeoTopTracks = []
        for country in range(len(self.countryid)): # x number of countries range len countries
            bad_entry = 0
            for ranking in range(self.limit): # z number of ranks
                try:
                    row = []
                    rank = ranking + 1
                    countryid = self.countryid[country]
                    countryname = self.countryname[country]
                    track = self.jsonlist[country]['tracks']['track'][ranking]['name']
                    duration = self.jsonlist[country]['tracks']['track'][ranking]['duration']
                    artist = self.jsonlist[country]['tracks']['track'][ranking]['artist']['name']
                    artistid = self.jsonlist[country]['tracks']['track'][ranking]['artist']['mbid']
                    row.extend([countryid, countryname, rank, track, duration, artist, artistid])
                    ListGeoTopTracks.append(row)
                except:
                    if bad_entry > 0:
                        break
                    else:
                        countryid = self.countryid[country]
                        countryname = self.countryname[country]
                        row = [countryid, countryname]
                        row.extend([None] * 5)
                        ListGeoTopTracks.append(row)
                        bad_entry = bad_entry + 1
        self.DfGeoTopTracks = pd.DataFrame.from_records(ListGeoTopTracks)
        self.DfGeoTopTracks.columns = ['CountryId', 'Country', 'Rank', 'Track', 'Duration', 'Artist', 'ArtistId']
        print("Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopTracks' attribute as a pandas dataframe.")
        print("Here are last 5 entries of the dataframe:")
        return self.DfGeoTopTracks.tail()
    
    
    def GetDfArtistInfo(self): 
        '''
        Provides data concerning artists stored in a pandas dataframe. 
        '''
        ListArtistInfo = []
        for artist in range(len(self.jsonlist)): # x number of countries range len countries
            try:
                row = []
                artistname = self.jsonlist[artist]['artist']['name']
                listeners = self.jsonlist[artist]['artist']['stats']['listeners']
                playcount = self.jsonlist[artist]['artist']['stats']['playcount']
                tag1 = self.jsonlist[artist]['artist']['tags']['tag'][0]['name']
                tag2 = self.jsonlist[artist]['artist']['tags']['tag'][1]['name']
                tag3 = self.jsonlist[artist]['artist']['tags']['tag'][2]['name']
                tag4 = self.jsonlist[artist]['artist']['tags']['tag'][3]['name']
                tag5 = self.jsonlist[artist]['artist']['tags']['tag'][4]['name']
                try:
                    artistid = self.jsonlist[artist]['artist']['mbid']
                except:
                    artistid = ''
                row.extend([artistid, artistname, listeners, playcount, tag1, tag2, tag3, tag4, tag5])
                ListArtistInfo.append(row)
            except:
                pass
        self.DfArtistInfo = pd.DataFrame.from_records(ListArtistInfo)
        self.DfArtistInfo.columns = ['ArtistId', 'Artist', 'Listeners', 'Scrobbles', 'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']
        print("Data concerning most popular artists in given countries are now available and stored in 'DfArtistInfo' attribute as a pandas dataframe.")
        print("Here are last 5 entries of the dataframe:")
        return self.DfArtistInfo.tail()

tady pouzivam ty fajnovy veci co jsem si definoval nahore
<br>
<br>
<br>
<br>
<br>
<br>
<br>

In [422]:
dwn = LastFMDownloader(apikey)

Downloader initialized.


In [423]:
# Setting number of inputs per jsonfile to be the very same
dwn.limit = lnk.limit

In [424]:
dwn.LoadCountryList(countryid = countriesdf['CountryId'], countryname = countriesdf['CountryName'])

In [425]:
dwn.RequestJson(links = last_fm_links['geo_top_artists'])

All requested files are now available and stored in the 'jsonlist' attribute.


In [426]:
dwn.GetDfGeoTopArtists()

Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopArtists' attribute as a pandas dataframe.
Here are last 5 entries of the dataframe:


Unnamed: 0,CountryId,Country,Rank,Artist,ArtistId,Listeners
19612,236,Zimbabwe,97.0,Janet Jackson,6be2828f-6c0d-4059-99d4-fa18acf1a296,1317931
19613,236,Zimbabwe,98.0,Scissor Sisters,4236d929-9a81-4c8e-97c3-8d3306780f50,1265642
19614,236,Zimbabwe,99.0,Roxette,d3b2711f-2baa-441a-be95-14945ca7e6ea,1188769
19615,236,Zimbabwe,100.0,Destiny's Child,a796b92e-c137-4895-9c89-10f900617a4f,1865668
19616,236,Zimbabwe,101.0,Colbie Caillat,efc8a006-d0c6-4a9b-8cb1-91ca770fa2b9,1445146


In [427]:
dwn.RequestJson(links = last_fm_links['geo_top_tracks'])

All requested files are now available and stored in the 'jsonlist' attribute.


In [428]:
dwn.GetDfGeoTopTracks()

Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopTracks' attribute as a pandas dataframe.
Here are last 5 entries of the dataframe:


Unnamed: 0,CountryId,Country,Rank,Track,Duration,Artist,ArtistId
19645,236,Zimbabwe,97.0,breathin,0,Ariana Grande,f4fdbb4c-e4b7-47a0-b83b-d91bbfcfa387
19646,236,Zimbabwe,98.0,Shut Up and Dance,199,Walk the Moon,d4aad415-9cd0-4845-9b05-0416fdcc9fc4
19647,236,Zimbabwe,99.0,Delicate,0,Taylor Swift,20244d07-534f-4eff-b4d4-930878889970
19648,236,Zimbabwe,100.0,Single Ladies (Put a Ring on It),195,Beyoncé,859d0860-d480-4efd-970c-c05d5f1776b8
19649,236,Zimbabwe,101.0,I Want You Back,180,The Jackson 5,e5257dc5-1edd-4fca-b7e6-1158e00522c8


In [429]:
# Creating links to access info about most popular artists on the global level
ArtistsArray = np.append(dwn.DfGeoTopArtists.Artist.unique().astype(str), dwn.DfGeoTopTracks.Artist.unique().astype(str))
ArtistsArray = np.unique(ArtistsArray)
ListOfArtistsUrl = [re.sub('\s+', '+', str(x)) for x in ArtistsArray]
last_fm_links['artists_info'] = LastFMLink(apikey).ArtistGetInfo(artists = pd.Series(ListOfArtistsUrl))

In [431]:
dwn.RequestJson(links = last_fm_links['artists_info'])

All requested files are now available and stored in the 'jsonlist' attribute.


In [432]:
dwn.GetDfArtistInfo()

Data concerning most popular artists in given countries are now available and stored in 'DfArtistInfo' attribute as a pandas dataframe.
Here are last 5 entries of the dataframe:


Unnamed: 0,ArtistId,Artist,Listeners,Scrobbles,Tag1,Tag2,Tag3,Tag4,Tag5
2130,86119d30-d930-4e65-a97a-e31e22388166,陳奕迅,70272,5542236,chinese,Hong Kong,pop,cantonese,C-pop
2131,3a71842d-b291-4991-a30d-9d8328a26d3e,陳綺貞,22755,1925018,indie,indie pop,chinese,taiwan,folk
2132,176650bf-db9c-48dd-8c01-376104966997,電気グルーヴ,45454,2499786,techno,japanese,electronic,electronica,dance
2133,3821e3ac-4d91-40b8-a669-f58d1fe2c0c4,魏如萱,11307,422174,indie,taiwanese,chinese,indie pop,taiwan
2134,a13a4783-fb66-4687-8ba7-394637ccd9e0,麥浚龍,5754,408910,Hong Kong,pop,cantonese,chinese,juno mak


In [434]:
# Saving all dataframes to csv
dwn.DfGeoTopTracks.to_csv('geo_top_tracks.csv', encoding='utf-8-sig')
dwn.DfGeoTopArtists.to_csv('geo_top_artists.csv', encoding='utf-8-sig')
dwn.DfArtistInfo.to_csv('artist_info.csv', encoding='utf-8-sig')