In [3]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import re
from bs4 import BeautifulSoup

In [17]:
from pathlib import Path
import requests_cache
import time
from IPython.core.display import clear_output

In [18]:
cwd = Path.cwd()
path_to_config = cwd / "Inputs/configpp.yml"

In [19]:
#Loading config file

import yaml
with open(path_to_config, 'r') as ymlfile:
    config = yaml.load(ymlfile)

  """


In [20]:
#API key extraction
apikey = config['lastfm'].get('apikey')

In [333]:
# Countries dataframe
countriesdf = pd.read_csv("countries_cut_one.csv", sep =";", encoding='latin-1')
countriesdf.columns = ['CountryName', 'CountryNameUrl']
countriesdf['CountryId'] = range(1, countriesdf.shape[0] + 1)
countriesdf = countriesdf[['CountryId', 'CountryName', 'CountryNameUrl']]

In [367]:
countriesdf = countriesdf.head(n = 10)

In [368]:
countriesdf

Unnamed: 0,CountryId,CountryName,CountryNameUrl
0,1,Taiwan,Taiwan
1,2,Afghanistan,Afghanistan
2,3,Albania,Albania
3,4,Algeria,Algeria
4,5,American Samoa,American+Samoa
5,6,Andorra,Andorra
6,7,Angola,Angola
7,8,Anguilla,Anguilla
8,9,Antarctica,Antarctica
9,10,Antigua and Barbuda,Antigua+and+Barbuda


# LastFMLink object
# chci linkssss


In [369]:
# LastFMLink object

class LastFMLink:
    '''
    LastFM object working with LastFM API accessible links. API key has to be provided.
    '''
    
    
    def __init__(self, apikey, format = 'json', limit = 50):
        self.apikey = apikey
        self.limit = limit
        if format == 'json':
            self.format = 'json'
    
    
    def GeoGetTop(self, method = 'artist', countries = 'Czech+republic'):
        '''
        Obtain links containing info about most popular artists/tracks in given countries. 
        Countries: A list of country names, as defined by the ISO 3166-1 country names standard, multi-word names shall be seperated by '+'.
        Method: Either 'artist' or 'track' in order to obtain most popular artists or tracks in given countries.
        '''
        if method == 'artist':
            self.method = 'geo.gettopartists'
            if self.format == 'json':
                self.links = 'https://ws.audioscrobbler.com/2.0/?method=' + self.method + '&limit=' + str(self.limit) + '&country=' + countries + '&api_key=' + self.apikey + '&format=' + self.format
                return(self.links)
            
        elif method == 'track':
            self.method = 'geo.gettoptracks'
            if self.format == 'json':
                self.links = 'https://ws.audioscrobbler.com/2.0/?method=' + self.method + '&limit=' + str(self.limit) + '&country=' + countries + '&api_key=' + self.apikey + '&format=' + self.format
                return self.links  
            
        else:
            print('Choose either "artist" or "track" method, in order obtain most popular artists or tracks, respectively.')
    
    
    def ArtistGetInfo(self, artists = 'beach+house', autocorrect = 0):
        '''
        Obtain links containing info about given artists. Autocorrect transforms misspelled artist names into correct artist names, options are 1 for the enablement of autocorrect and 0 for otherwise.
        '''
        self.method = 'artist.getinfo'
        if self.format == 'json':
                self.links = 'https://ws.audioscrobbler.com/2.0/?method=' + self.method + '&limit=' + str(self.limit) + '&artist=' + artists + '&api_key=' + self.apikey + '&format=' + self.format + '&autocorrect=' + str(autocorrect)
                return self.links

## Creation of a dictionary containing links to jsonfiles

In [370]:
# Links dictionary
last_fm_links = {'geo_top_artists': 0, 'geo_top_tracks': 0, 'artists_info': 0}

In [371]:
# We would like 101 entries from each file
lnk = LastFMLink(apikey, limit = 101)

In [372]:
# Creating links to access most popular artists and tracks on the country level
last_fm_links['geo_top_artists'] = lnk.GeoGetTop(countries = countriesdf['CountryNameUrl'], method = 'artist')
last_fm_links['geo_top_tracks'] = lnk.GeoGetTop(countries = countriesdf['CountryNameUrl'], method = 'track')

In [392]:
class LastFMDownloader: 
    '''
    Downloader class for collection of data and storage of results.
    '''
    
    
    def __init__(self, allowLog = True):
        '''
        Initilization of Downloader object. API key has to be provided. 
        '''
        self.allowLog = allowLog
        self.limit = 999
        self.jsonlist = []
        if self.allowLog:
            print('Downloader initialized.')
    
    
    def LoadCountryList(self, countryid = 0, countryname = 'Czech Republic'):
        '''
        Specifies country id and country names of GetDfGeo methods.
        '''
        self.countryid = countryid
        self.countryname = countryname
    
    
    def RequestJson(self, links):
        '''
        Requests JSON files from given links and returns a list of JSON files stored in jsonlist attribute of the downloader.
        '''
        ## dodelat provide 1 link only
        requests_cache.install_cache() # storing previous requests
        number_of_files = len(links)
        self.jsonlist = []
        if len(links) >= 2:
            file = 1
            for link in links:
                print('Requesting file number {} out of {}'.format(file, number_of_files))
                clear_output(wait = True)
                jsonfile = requests.get(link).json()
                self.jsonlist.append(jsonfile)
                if not getattr(jsonfile, 'from_cache', False):
                    print('File has not been requested yet. Please wait half a second.')
                    time.sleep(0.25)
                file = file + 1
            clear_output(wait = True)
            print("All requested files are now available and stored in the 'jsonlist' attribute.")
        
    
    def GetDfGeoTopArtists(self): 
        '''
        Provides data concerning most popular artists in given countries stored in a pandas dataframe. 
        LoadCountryList() method has to be called before-hand, in order to specify id of countries and country names.
        '''
        #self.DfGeoTopArtists = pd.DataFrame(columns=['CountryId', 'Country', 'Rank', 'Artist', 'ArtistId', 'Listeners'])
        #self.DfGeoTopTracks = pd.DataFrame(columns=['CountryId', 'Country', 'Rank', 'Track', 'Duration', 'Artist', 'ArtistId'])
        #self.DfArtistInfo = pd.DataFrame(columns=['ArtistId', 'Artist', 'Listeners', 'Scrobbles', 'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5'])
        ListGeoTopArtists = []
        for country in range(len(self.countryid)): # x number of countries range len countries
            bad_entry = 0
            for ranking in range(self.limit): # z number of ranks
                try:
                    row = []
                    rank = ranking + 1
                    countryid = self.countryid[country]
                    countryname = self.countryname[country]
                    artist = self.jsonlist[country]['topartists']['artist'][ranking]['name']
                    artistid = self.jsonlist[country]['topartists']['artist'][ranking]['mbid']
                    listeners = self.jsonlist[country]['topartists']['artist'][ranking]['listeners']
                    row.extend([countryid, countryname, rank, artist, artistid, listeners])
                    ListGeoTopArtists.append(row)
                except:
                    if bad_entry > 0:
                        break
                    else:
                        countryid = self.countryid[country]
                        countryname = self.countryname[country]
                        row = [countryid, countryname]
                        row.extend([None] * 4)
                        ListGeoTopArtists.append(row)
                        bad_entry = bad_entry + 1
        self.DfGeoTopArtists = pd.DataFrame.from_records(ListGeoTopArtists)
        self.DfGeoTopArtists.columns = ['CountryId', 'Country', 'Rank', 'Artist', 'ArtistId', 'Listeners']
        print("Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopArtists' attribute as a pandas dataframe.")
        print("Here are last 5 entries of the dataframe:")
        return self.DfGeoTopArtists.tail()
    
    
    def GetDfGeoTopTracks(self): 
        '''
        Provides data concerning most popular tracks in given countries stored in a pandas dataframe. 
        LoadCountryList() method has to be called before-hand, in order to specify id of countries and country names.
        '''
        ListGeoTopTracks = []
        for country in range(len(self.countryid)): # x number of countries range len countries
            bad_entry = 0
            for ranking in range(self.limit): # z number of ranks
                try:
                    row = []
                    rank = ranking + 1
                    countryid = self.countryid[country]
                    countryname = self.countryname[country]
                    track = self.jsonlist[country]['tracks']['track'][ranking]['name']
                    duration = self.jsonlist[country]['tracks']['track'][ranking]['duration']
                    artist = self.jsonlist[country]['tracks']['track'][ranking]['artist']['name']
                    artistid = self.jsonlist[country]['tracks']['track'][ranking]['artist']['mbid']
                    row.extend([countryid, countryname, rank, track, duration, artist, artistid])
                    ListGeoTopTracks.append(row)
                except:
                    if bad_entry > 0:
                        break
                    else:
                        countryid = self.countryid[country]
                        countryname = self.countryname[country]
                        row = [countryid, countryname]
                        row.extend([None] * 5)
                        ListGeoTopTracks.append(row)
                        bad_entry = bad_entry + 1
        self.DfGeoTopTracks = pd.DataFrame.from_records(ListGeoTopTracks)
        self.DfGeoTopTracks.columns = ['CountryId', 'Country', 'Rank', 'Track', 'Duration', 'Artist', 'ArtistId']
        print("Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopTracks' attribute as a pandas dataframe.")
        print("Here are last 5 entries of the dataframe:")
        return self.DfGeoTopTracks.tail()
    
    
    def GetDfArtistInfo(self): 
        '''
        Provides data concerning most popular artists in given countries stored in a pandas dataframe. 
        LoadCountryList() method has to be called before-hand, in order to specify id of countries and country names.
        '''
        #self.DfArtistInfo = pd.DataFrame(columns=['CountryId', 'Country', 'Rank', 'Artist', 'ArtistId', 'Listeners'])
        #self.DfGeoTopTracks = pd.DataFrame(columns=['CountryId', 'Country', 'Rank', 'Track', 'Duration', 'Artist', 'ArtistId'])
        #self.DfArtistInfo = pd.DataFrame(columns=['ArtistId', 'Artist', 'Listeners', 'Scrobbles', 'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5'])
        ListArtistInfo = []
        for artist in range(len(self.jsonlist)): # x number of countries range len countries
            try:
                row = []
                artistname = self.jsonlist[artist]['artist']['name']
                listeners = self.jsonlist[artist]['artist']['stats']['listeners']
                playcount = self.jsonlist[artist]['artist']['stats']['playcount']
                tag1 = self.jsonlist[artist]['artist']['tags']['tag'][0]['name']
                tag2 = self.jsonlist[artist]['artist']['tags']['tag'][1]['name']
                tag3 = self.jsonlist[artist]['artist']['tags']['tag'][2]['name']
                tag4 = self.jsonlist[artist]['artist']['tags']['tag'][3]['name']
                tag5 = self.jsonlist[artist]['artist']['tags']['tag'][4]['name']
                try:
                    artistid = self.jsonlist[artist]['artist']['mbid']
                except:
                    artistid = ''
                row.extend([artistid, artistname, listeners, playcount, tag1, tag2, tag3, tag4, tag5])
                ListArtistInfo.append(row)
            except:
                pass
        self.DfArtistInfo = pd.DataFrame.from_records(ListArtistInfo)
        self.DfArtistInfo.columns = ['ArtistId', 'Artist', 'Listeners', 'Scrobbles', 'Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']
        print("Data concerning most popular artists in given countries are now available and stored in 'DfArtistInfo' attribute as a pandas dataframe.")
        print("Here are last 5 entries of the dataframe:")
        return self.DfArtistInfo.tail()

tady pouzivam ty fajnovy veci co jsem si definoval nahore
<br>
<br>
<br>
<br>
<br>
<br>
<br>

In [393]:
dwn = LastFMDownloader(apikey)

Downloader initialized.


In [394]:
# Setting number of inputs per jsonfile to be the very same
dwn.limit = lnk.limit

In [395]:
dwn.LoadCountryList(countryid = countriesdf['CountryId'], countryname = countriesdf['CountryName'])

In [396]:
dwn.RequestJson(links = last_fm_links['geo_top_artists'])

All requested files are now available and stored in the 'jsonlist' attribute.


In [397]:
dwn.GetDfGeoTopArtists()

Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopArtists' attribute as a pandas dataframe.
Here are last 5 entries of the dataframe:


Unnamed: 0,CountryId,Country,Rank,Artist,ArtistId,Listeners
905,10,Antigua and Barbuda,97.0,Beach House,d5cc67b8-1cc4-453b-96e8-44487acdebea,1267582
906,10,Antigua and Barbuda,98.0,Fleetwood Mac,bd13909f-1c29-4c27-a874-d4aaf27c5b1a,2246317
907,10,Antigua and Barbuda,99.0,Twenty One Pilots,a6c6897a-7415-4f8d-b5a5-3a5e05f3be67,852513
908,10,Antigua and Barbuda,100.0,Foals,6a65d878-fcd0-42cf-aff9-ca1d636a8bcc,1508787
909,10,Antigua and Barbuda,101.0,Grimes,7e5a2a59-6d9f-4a17-b7c2-e1eedb7bd222,942100


In [399]:
dwn.RequestJson(links = last_fm_links['geo_top_tracks'])

All requested files are now available and stored in the 'jsonlist' attribute.


In [400]:
dwn.GetDfGeoTopTracks()

Data concerning most popular artists in given countries are now available and stored in 'DfGeoTopTracks' attribute as a pandas dataframe.
Here are last 5 entries of the dataframe:


Unnamed: 0,CountryId,Country,Rank,Track,Duration,Artist,ArtistId
905,10,Antigua and Barbuda,97.0,Radioactive,187,Imagine Dragons,012151a8-0f9a-44c9-997f-ebd68b5389f9
906,10,Antigua and Barbuda,98.0,Pumped Up Kicks,236,Foster the People,e0e1a584-dd0a-4bd1-88d1-c4c62895039d
907,10,Antigua and Barbuda,99.0,Rolling in the Deep,229,Adele,cc2c9c3c-b7bc-4b8b-84d8-4fbd8779e493
908,10,Antigua and Barbuda,100.0,The Scientist,309,Coldplay,cc197bad-dc9c-440d-a5b5-d52ba2e14234
909,10,Antigua and Barbuda,101.0,Somebody That I Used To Know,294,Gotye,6f6fd596-76e0-4b82-aa37-f558ac2d337b


In [401]:
# Creating links to access info about most popular artists on the global level
ArtistsArray = np.append(dwn.DfGeoTopArtists.Artist.unique().astype(str), dwn.DfGeoTopTracks.Artist.unique().astype(str))
ArtistsArray = np.unique(ArtistsArray)
ListOfArtistsUrl = [re.sub('\s+', '+', str(x)) for x in ArtistsArray]
last_fm_links['artists_info'] = LastFMLink(apikey).ArtistGetInfo(artists = pd.Series(ListOfArtistsUrl))

In [402]:
dwn.RequestJson(links = last_fm_links['artists_info'])

All requested files are now available and stored in the 'jsonlist' attribute.


In [404]:
dwn.GetDfArtistInfo()

Data concerning most popular artists in given countries are now available and stored in 'DfArtistInfo' attribute as a pandas dataframe.
Here are last 5 entries of the dataframe:


Unnamed: 0,ArtistId,Artist,Listeners,Scrobbles,Tag1,Tag2,Tag3,Tag4,Tag5
476,03a265bd-30be-4929-a220-0f583cc5160a,蔡健雅,46430,1322830,chinese,pop,singapore,Tanya Chua,mandarin
477,a8fd2a44-2f72-47ea-a6ce-96e8bd7059ec,蘇打綠,20459,858972,taiwan,indie,indie pop,chinese,pop
478,86119d30-d930-4e65-a97a-e31e22388166,陳奕迅,70272,5542236,chinese,Hong Kong,pop,cantonese,C-pop
479,3a71842d-b291-4991-a30d-9d8328a26d3e,陳綺貞,22755,1925018,indie,indie pop,chinese,taiwan,folk
480,3821e3ac-4d91-40b8-a669-f58d1fe2c0c4,魏如萱,11307,422174,indie,taiwanese,chinese,indie pop,taiwan


In [410]:
dwn.DfArtistInfo[dwn.DfArtistInfo.Artist == 'Alan Walker']

Unnamed: 0,ArtistId,Artist,Listeners,Scrobbles,Tag1,Tag2,Tag3,Tag4,Tag5
12,,Alan Walker,479116,9924536,electronic,House,EDM,dance,norwegian


In [1304]:
# Saving all dataframes to csv
dwn.DfGeoTopTracks.to_csv('geo_top_tracks.csv', encoding='utf-8-sig')
dwn.DfGeoTopArtists.to_csv('geo_top_artists.csv', encoding='utf-8-sig')
dwn.DfArtistInfo.to_csv('artist_info.csv', encoding='utf-8-sig')

# dalsi shity

In [680]:
pd.read_json(json.dumps(dwn.jsonlist)).iloc[1,0]['@attr']['country']

'Curaçao'

In [740]:
# for loop x in range len countries - vezmu vsechny zeme
# for loop z in range(self.limit) - vezmu top XX artistu
x = 2
z = 1
pd.read_json(json.dumps(dwn.jsonlist)).iloc[x,0]['artist'][z]['name']

'Queen'

In [687]:
pd.read_json(json.dumps(dwn.jsonlist)).iloc[x,0]['artist'][z]['mbid']

'420ca290-76c5-41af-999e-564d7c71f1a7'

In [688]:
pd.read_json(json.dumps(dwn.jsonlist)).iloc[x,0]['artist'][z]['listeners']

'4139663'

In [690]:
pd.read_json(json.dumps(dwn.jsonlist))

Unnamed: 0,topartists
0,"{'artist': [{'name': 'The Beatles', 'listeners..."
1,"{'artist': [], '@attr': {'country': 'Curaçao',..."
2,"{'artist': [{'name': 'Coldplay', 'listeners': ..."


In [694]:
ij = pd.read_json(json.dumps(dwn.jsonlist)).iloc[x,0]['artist'][z]

In [695]:
ij['name']

'Queen'

In [816]:
topArtists = pd.DataFrame(columns=['CountryId', 'Country', 'Rank', 'Artist', 'ArtistId', 'Listeners'])
table = []
for z in dwn.jsonlist:           
    try:
        row = []
        artist = dwn.jsonlist[x]['topartists']['artist'][z]['name']
        rank = dwn.jsonlist[x]['topartists']['artist'][z]['mbid']
        listeners = dwn.jsonlist[x]['topartists']['artist'][z]['listeners']
        row.append(rank, artist, listeners)
        table.append(row)
    except:
        pass
print(table)

[]


In [872]:
table = []
z = 1
## geo top tracks
ListGeoTopArtists = []
for country in range(3): # x number of countries range len countries
    print(dwn.countryid[country])
    print(dwn.countryname[country])
    for ranking in range(dwn.limit): # z number of ranks
        try:
            row = []
            rank = ranking + 1
            countryid = dwn.countryid[country]
            countryname = dwn.countryname[country]
            artist = dwn.jsonlist[country]['topartists']['artist'][ranking]['name']
            artistid = dwn.jsonlist[country]['topartists']['artist'][ranking]['mbid']
            listeners = dwn.jsonlist[country]['topartists']['artist'][ranking]['listeners']
            row.extend([countryid, countryname, rank, artist, artistid, listeners])
            ListGeoTopArtists.append(row)
        except:
            row = [None] * 6
            ListGeoTopArtists.append(row)
#return self. = pd.DataFrame.from_records(ListGeoTopArtists)
    #ListGeoTopArtists.append(ListCountryTopArtists)
#print(ListCountryTopArtists)
#print(ListGeoTopArtists)

1
Taiwan
2
Afghanistan
3
Albania


In [873]:
ListGeoTopArtists

[[1,
  'Taiwan',
  1,
  'The Beatles',
  'b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d',
  '3783733'],
 [1,
  'Taiwan',
  2,
  'Nirvana',
  '9282c8b4-ca0b-4c6b-b7e3-4f7762dfc4d6',
  '4362266'],
 [1,
  'Taiwan',
  3,
  'Daft Punk',
  '056e4f3e-d505-4dad-8ec1-d04f521cbb56',
  '3864620'],
 [1, 'Taiwan', 4, 'U2', 'a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432', '3547039'],
 [1, 'Taiwan', 5, 'Sia', '2f548675-008d-4332-876c-108b0c7ab9c5', '2214644'],
 [1,
  'Taiwan',
  6,
  'Fleetwood Mac',
  'bd13909f-1c29-4c27-a874-d4aaf27c5b1a',
  '2246300'],
 [1,
  'Taiwan',
  7,
  'Justin Timberlake',
  '596ffa74-3d08-44ef-b113-765d43d12738',
  '2894937'],
 [1,
  'Taiwan',
  8,
  'Elton John',
  'b83bc61f-8451-4a5d-8b8e-7e9ed295e822',
  '2612368'],
 [1,
  'Taiwan',
  9,
  'The Police',
  '9e0e2b01-41db-4008-bd8b-988977d6019a',
  '2349324'],
 [1,
  'Taiwan',
  10,
  'Jason Derülo',
  '6de0f914-3e60-4418-be3b-42e0feb6eb4d',
  '1932949'],
 [1, 'Taiwan', 11, 'Cardi B', '', '402655'],
 [1,
  'Taiwan',
  12,
  'Arctic Monkeys

In [874]:
df = pd.DataFrame.from_records(ListGeoTopArtists)

In [875]:
df

Unnamed: 0,0,1,2,3,4,5
0,1.0,Taiwan,1.0,The Beatles,b10bbbfc-cf9e-42e0-be17-e2c3e1d2600d,3783733
1,1.0,Taiwan,2.0,Nirvana,9282c8b4-ca0b-4c6b-b7e3-4f7762dfc4d6,4362266
2,1.0,Taiwan,3.0,Daft Punk,056e4f3e-d505-4dad-8ec1-d04f521cbb56,3864620
3,1.0,Taiwan,4.0,U2,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,3547039
4,1.0,Taiwan,5.0,Sia,2f548675-008d-4332-876c-108b0c7ab9c5,2214644
5,1.0,Taiwan,6.0,Fleetwood Mac,bd13909f-1c29-4c27-a874-d4aaf27c5b1a,2246300
6,1.0,Taiwan,7.0,Justin Timberlake,596ffa74-3d08-44ef-b113-765d43d12738,2894937
7,1.0,Taiwan,8.0,Elton John,b83bc61f-8451-4a5d-8b8e-7e9ed295e822,2612368
8,1.0,Taiwan,9.0,The Police,9e0e2b01-41db-4008-bd8b-988977d6019a,2349324
9,1.0,Taiwan,10.0,Jason Derülo,6de0f914-3e60-4418-be3b-42e0feb6eb4d,1932949


In [879]:
df.iloc[50,:]

0     NaN
1    None
2     NaN
3    None
4    None
5    None
Name: 50, dtype: object

In [846]:
for i in range(3):
    print(i)


0
1
2


In [774]:
z = 1
for x in range(3):
    try:
        z = 1
        print(x)
        print(dwn.jsonlist[x]['topartists']['artist'][z]['name'])
    except:
        pass

0
Nirvana
1
2
Queen


In [773]:
dwn.jsonlist[0]['topartists']['artist'][1]['name']

'Nirvana'

In [779]:
%%timeit
pd.read_json(json.dumps(dwn.jsonlist)).iloc[x,0]['artist'][z]['name']

3.76 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [780]:
%%timeit
dwn.jsonlist[x]['topartists']['artist'][z]['name']

216 ns ± 66.8 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [592]:
LastFMLink(apikey).GeoGetTop(countries = countriesdf['CountryNameUrl'], method = 'artist')[57]

'https://ws.audioscrobbler.com/2.0/?method=geo.gettopartists&limit=50&country=Czech+republic&api_key=24a6e38c235025362537c2bcd85e66d9&format=json'