# Dataset preparation
After collecting a lot of data from different sources it must have been merged to create one dataset, useful for machine learning tasks. It has been done in few iterations, depending on the available type of information. As pointed out before some artists had both Musicbrainz and Spotify IDs, so it was very easy to match those entries from two databases. For the rest of them more complicated approach needed to be applied: first, similarity between names was computed and sets of the artists with most similar names were created. In each of those sets (usually 5-10 artists) similarities between albums' names were computed and the artist entries with the biggest number of albums with similarities above some predefined threshold were connected to each other. 

In [4]:
from pymongo import MongoClient
import time

import spotipy
import spotipy.oauth2 as oauth2
import spotipy.util as util

import pandas as pd
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.feature_extraction.text import TfidfTransformer



from IPython.display import clear_output

In [6]:
db_client = MongoClient('localhost', 27017)
db = db_client.musicdata

In [7]:
spotify_artists = db.get_collection("artists_spotify_bynames")
spotify_artists_direct = db.get_collection("artist_spotify")
spotify_albums = db.get_collection("albums_spotify")
spotify_artist_with_albums = db.get_collection("spotify_artist_albums")

musicbrainz_albums = db.get_collection("albums_musicbrainz")
musicbrainz_artists = db.get_collection("artists_mb")
musicbrainz_artists_with_albums = db.get_collection("mb_artists_albums")

In [3]:
SPOTIPY_CLIENT_ID = 'd70e66cbc22e4e1287c50c97b54c2457'
SPOTIPY_CLIENT_SECRET = 'e20d58cae494486392723da75a3ef393'
SPOTIPY_REDIRECT_URI = 'http://localhost/?code=...'

credentials = oauth2.SpotifyClientCredentials(
client_id="d70e66cbc22e4e1287c50c97b54c2457",
client_secret="e20d58cae494486392723da75a3ef393")

token = credentials.get_access_token()
print(token)
spotify = spotipy.Spotify(auth=token)

BQBcHg2JXWGKyfTRXSuSAHqphGh7sfwtETsiQwRV0GCIMM0McmBrUBhem2ttVP5w1yHmm1h7KO1UNAekKvQ


## Find matches in Musicbrainz and Spotify

### Prepare collections with albums


In [12]:
#create new mongodb collection with albums grouped by artists

for artist in spotify_artists.find():
    artist_albums = spotify_albums.find_one({"_id": artist['_id']})
    artist["albums"] = artist_albums
    
    spotify_artist_with_albums.insert_one(artist)

In [15]:
# do the same with artist from another mongodb collection (Spotify with direct Musicbrainz ids)

for artist in spotify_artists_direct.find():
    artist_albums = spotify_albums.find_one({"_id": artist['_id']})
    artist["albums"] = artist_albums
    
    spotify_artist_with_albums.update_one({"_id": artist["_id"]}, {"$set": artist}, upsert=True)

  import sys


In [20]:
# musicbrainz --mongo document size is too big  - cannot add albums -> perform 'join' on the app-level

for artist in musicbrainz_artists.find():
    artist_albums = []
    for album in musicbrainz_albums.find({"artist_id": artist['_id']}):
        artist_albums.append(album)
    artist['albums'] = artist_albums
    musicbrainz_artists_with_albums.insert_one(artist)
    
    

DocumentTooLarge: BSON document too large (19586565 bytes) - the connected server supports BSON document sizes up to 16793598 bytes.

In [20]:
# try to download albums where value for "albums" is "null" - there might have been some error 
# because some of the artists does not have albums but they should
keys = ['album_group', 'album_type','artists', 'available-markets', 'id','name', 'release_date', 'release_date_precision', 'type']
i = 0

for artist in spotify_artist_with_albums.find({'$and': [{"albums": None}, {"popularity" : { "$gt": 0}}]}):
    i += 1
    clear_output(wait=True)
    print(i)
    fetched = spotify.artist_albums(artist["_id"], limit=50, offset = 0)
    
    items = fetched['items']
    albums = []
    for it in items:
        album = {k:v for k, v in it.items() if k in keys}
        albums.append(album)
        
    artist['albums'] = albums
    #print(albums)
    spotify_artist_with_albums.update_one({"_id": artist["_id"]}, {"$set": artist}, upsert=True)
    

In [9]:
# try to again download genres (due to error in Spotify API) - not sure if everything have been correctly fetched

for artist in spotify_artist_with_albums.find({'$and': [{"genres": {"$size": 0}}, {"popularity" : { "$gt": 0}}]})[1000:]:

    spotify_artist = spotify.artist(artist['_id'])
    if len(spotify_artist['genres']) > 0:
        artist['genres'] = spotify_artist['genres']
        print(artist['genres'])
        spotify_artist_with_albums.update_one({"_id": artist["_id"]}, {"$set": artist}, upsert=True)


CursorNotFound: cursor id 60522759289 not found

### Add musicbrainz ids to spotify artists for artists with direct match


In [8]:
import editdistance as ed
import helpers

max_ed_artist = 5
max_ed_album = 5
number_of_albums = 1

In [None]:
confilicts_nb = 0
#first add  mb ids to the artists with direct ids from mb (should be added earlier)
#there was a mistake in previous step while downloading data - only mb_name was saved and it is not unique
for artist in spotify_artist_with_albums.find({"mb_name": {"$exists": True}}):
    res = musicbrainz_artists.find({"name": artist['mb_name']})
    if res.count() > 1:
        print(artist)
        for r in res:
            print(artist['mb_name'], r)
        confilicts_nb += 1
        break
    
print(confilicts_nb)

In [37]:
#repair the mistake - search musicbrainz artist again and add mb_id to spotify artists
for artist in musicbrainz_artists.find({'urls': {'$exists': True}}):
    for i in range(0, len(artist['urls'])):
        if 'spotify' in artist['urls'][i]:
            spotify_id = helpers.get_id(artist['urls'][i])[:22]
            spotify_artist = spotify_artist_with_albums.find_one({'_id': spotify_id})
            if spotify_artist != None:
                spotify_artist['mb_id'] = artist['_id']
                spotify_artist_with_albums.update_one({"_id": spotify_artist["_id"]}, {"$set": spotify_artist}, upsert=True)
            #spotify_collection.update_one({'_id':spotify_artist['_id']}, {"$set": spotify_artist}, upsert=True)
    

The number below is the number of Spotify artists for whom matching Musicbrainz artist has not been found yet.

In [39]:
spotify_artist_with_albums.find({"mb_id": {"$exists": False}}).count()

1015875

### Find Musicbrainz IDs for the rest of artists (withoud direct mapping)

In [46]:
skipped = 0
with_one_entry = 0


for artist_spotify in spotify_artist_with_albums.find({"mb_id": {"$exists": False}}):
    mb_infos = []
    #print(artist_spotify)
    if artist_spotify.get('mb_info', None) != None:
         for info in artist_spotify['mb_info']:
                ed_dist = ed.eval(artist_spotify['name'], info['mb_name'])
                if  ed_dist <= max_ed_artist:
                    info['artist_distance'] = ed_dist
                    mb_infos.append(info)
    else:
        continue
        
        
    #find albums for filtered artists
    spotify_album_names = []
    sp_albums = artist_spotify.get('albums', None)
    if sp_albums != None:
    
        if type(sp_albums) is dict:
            sp_albums = sp_albums.get('albums', None)
            if sp_albums == None:
                continue
            sp_albums = sp_albums[0]

        for album in sp_albums:        
            spotify_album_names.append(album['name'])
   
    for info in mb_infos:
        mb_album_names = []
        similar_albumes = 0
        for album in musicbrainz_albums.find({'artist_id': info['mb_id']}):
            mb_album_names.append(album['name'])
        for mb_album in mb_album_names:
            for spotify_album in spotify_album_names:
                edist = ed.eval(mb_album, spotify_album)
                if edist <= max_ed_album:
                    similar_albumes +=1
        info['similar_albumes'] = similar_albumes
    #print(artist_spotify['name'], mb_infos)

    #if there are no entries in mb_infos - skip this artist
    if len(mb_infos) == 0:
        skipped +=1
        continue
    
    #if there is only one entry in mb_infos - add this mb_id
    if len(mb_infos) == 1:
        with_one_entry +=1
        artist_spotify['mb_id'] = mb_infos[0]['mb_id']
        spotify_artist_with_albums.update_one({"_id": artist_spotify["_id"]}, {"$set": artist_spotify}, upsert=True)
        continue
        
    mb_id = ''
    #most important is similarity in albums so take the mb_id from artist with highest number of similar albums
    max_ind = 0
    curr_val = 0
    for i in range(0, len(mb_infos)):
        if mb_infos[i]['similar_albumes'] > curr_val:
            max_ind = i
            curr_val = mb_infos[i]['similar_albumes']
                
    #it may happen that the highest value was 0 - then find artist with lowest artist_distance
    if curr_val == 0:
        min_ind = 0
        min_val = max_ed_artist
        for i in range(0, len(mb_infos)):
            if mb_infos[i]['artist_distance'] < min_val:
                min_ind = i
                min_val = mb_infos[i]['artist_distance']
        mb_id = mb_infos[min_ind]['mb_id']
    else:
        mb_id = mb_infos[max_ind]['mb_id']
    
    artist_spotify['mb_id'] = mb_id
    spotify_artist_with_albums.update_one({"_id": artist_spotify["_id"]}, {"$set": artist_spotify}, upsert=True)
        

## Download popular songs for artists

For each artist download top songs according to Spotify. This ranking is based on the popularity of the song and newer songs are more likely to be on top of that list. Number of downloaded songs is not constant for each artist.

In [6]:
spotify_songs = db.get_collection('spotify_songs')
#spotify_artist_with_albums.find({'$and': [{'mb_id': {'$exists': True}}, {'popularity': {'$gt': 0}}]}).count()

In [12]:
not_fetched = []

In [18]:


for artist in spotify_artist_with_albums.find({'$and': [{'mb_id': {'$exists': True}}, {'popularity': {'$gt': 0}}]}, {'_id':1})[375395:]:
    tracks = []
    songs = {}
    
    try:
        songs = spotify.artist_top_tracks(artist['_id'])
    except spotipy.client.SpotifyException as err:
        if err.code == 401:
            print("Spotipy 401 error:", err.msg)
            not_fetched.append(artist['_id'])
            time.sleep(60)
            continue
        else:
            print("Spotipy error:", err.code, err.http_status, err.msg)
            token = credentials.get_access_token()
            print(token)
            not_fetched.append(artist['_id'])
            spotify = spotipy.Spotify(auth=token)
            continue
    except Exception as err:
        print("Exception {1}", err)
        not_fetched.append(artist['_id'])
        time.sleep(20)
        continue
    
    for t in songs['tracks']:
        track = {}
        track['id'] = t['id']
        track['name'] = t['name']
        track['album_id'] = t['album']['id']
        tracks.append(track)
    
    artist_with_songs = {}
    artist_with_songs['_id'] = artist['_id']
    artist_with_songs['tracks'] = tracks
    
    spotify_songs.update_one({"_id": artist_with_songs["_id"]}, {"$set": artist_with_songs}, upsert=True)
    

Spotipy error: -1 401 https://api.spotify.com/v1/artists/4Bpdg6qEaZGeBQupDLnLkV/top-tracks?country=US:
 The access token expired
BQAy4L18YJ322IlBUxbviZMAYFtTMlYkqZ5UUQ_Z_cpoSRnxgj-xDPmHNMdzi4o2qYeugVjWMdXCr-ymuZo


In [19]:
print(len(not_fetched))

26


In [20]:
nf = []
for aid in not_fetched:
    tracks = []
    songs = {}
    
    try:
        songs = spotify.artist_top_tracks(aid)
    except spotipy.client.SpotifyException as err:
        if err.code == 401:
            print("Spotipy 401 error:", err.msg)
            nf.append(artist['_id'])
            time.sleep(60)
            continue
        else:
            print("Spotipy error:", err.code, err.http_status, err.msg)
            token = credentials.get_access_token()
            print(token)
            nf.append(aid)
            spotify = spotipy.Spotify(auth=token)
            continue
    except Exception as err:
        print("Exception {1}", err)
        nf.append(aid)
        time.sleep(20)
        continue
    
    for t in songs['tracks']:
        track = {}
        track['id'] = t['id']
        track['name'] = t['name']
        track['album_id'] = t['album']['id']
        tracks.append(track)
    
    artist_with_songs = {}
    artist_with_songs['_id'] = aid
    artist_with_songs['tracks'] = tracks
    
    spotify_songs.update_one({"_id": artist_with_songs["_id"]}, {"$set": artist_with_songs}, upsert=True)
    
    

In [6]:
all_songs = []
for artist in spotify_songs.find():
    for track in artist['tracks']:
        all_songs.append(track['id'])

In [8]:
print(all_songs[:10])
print(len(all_songs))

['1yh793k8lDYfXv3DhHaIXC', '0TCt7OFRdD8PQ6vTRQxNgQ', '0d2yIwPD2dLNFqrOye5qN2', '2xB0yPkGvwuPXiaOVHYnwX', '74VcKerwPq5WSOkXSCZ6vi', '6sJrooys9qvVVn1boqSxil', '2zHBwNDwNRO10HlLvFV7ik', '7MW95avtOxEPirfBY3aA6Y', '4ncCavpXH5dKwBebPRZJWq', '5opPmdS9Et0QFxRku2a9ln']
3132728


In [5]:
err_ind = []
spotify_songs_features = db.get_collection('spotify_songs_features')

In [15]:
for i in range(0, len(all_songs), 50):
    try:
        songs = spotify.audio_features(all_songs[i:i+50])
        spotify_songs_features.insert_many(songs)
    except spotipy.client.SpotifyException as err:
        if err.code == 401:
            print("Spotipy 401 error:", err.msg)
            err_ind.append(i)
            time.sleep(60)
            continue
        else:
            print("Spotipy error:", err.code, err.http_status, err.msg)
            token = credentials.get_access_token()
            print(token)
            err_ind.append(i)
            spotify = spotipy.Spotify(auth=token)
            continue
    except Exception as err:
        print("Exception {1}", err)
        err_ind.append(i)
        time.sleep(20)
        continue
    

Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.Ra

BQCHIFCa5x113MRKw0WpAWv69urV5qOVnlD1Z7bFytwda1xYeCvaR439jsy85WU5e99G-_fg07zRtpF0MCE
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMappin

Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.Ra

BQD-raEEjL_W4wLilJpXPWQFHErcRNI2KrDJ0Lzx2_ocEVbdNBgbrBeTorNSWbs75DQ1MmqmnXrFgN-b5Ac
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMappin

Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception {1} document must be an instance of dict, bson.son.SON, bson.raw_bson.Ra

In [16]:
#find all ids that were not found yet


213


In [24]:
all_ids = []
fetched_ids = []


In [25]:
for song in spotify_songs.find():
    for track in song['tracks']:
        all_ids.append(track['id'])

for song in spotify_songs_features.find():
    fetched_ids.append(song['id'])

In [26]:
print(len(all_ids), len(fetched_ids))

3132728 3131380


In [27]:
not_fetched = list(set(all_ids)-set(fetched_ids))
print(len(not_fetched))

280


In [29]:
last_songs = []
for song_id in not_fetched:
    try:
        song = spotify.audio_features(song_id)
        spotify_songs_features.insert_one(song[0])
    except spotipy.client.SpotifyException as err:
        if err.code == 401:
            print("Spotipy 401 error:", err.msg)
            last_songs.append(song_id)
            time.sleep(60)
            continue
        else:
            print("Spotipy error:", err.code, err.http_status, err.msg)
            token = credentials.get_access_token()
            print(token)
            last_songs.append(song_id)
            spotify = spotipy.Spotify(auth=token)
            continue
    except Exception as err:
        print("Exception: ", err)
        last_songs.append(song_id)
        #time.sleep(20)
        continue
    

Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument,

Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument,

Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument,

Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument,

Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument, or a type that inherits from collections.MutableMapping
Exception:  document must be an instance of dict, bson.son.SON, bson.raw_bson.RawBSONDocument,

## Finding and transforming lyrics
 Lyrics downloaded from musiXmatch (https://labrosa.ee.columbia.edu/millionsong/musixmatch)
 

In [35]:
import pandas as pd
import numpy as np

In [95]:
#load file with all artist for which we have at least one lyric
column_names = ['tid','artist_name','title','tid_mxm','artist_name_mxm','title_mxm' ]
mxm_matches = pd.read_csv("mxm_779k_matches.txt", sep="<SEP>", skiprows=18, engine='python', names=column_names)
mxm_matches = mxm_matches.drop(columns=["artist_name_mxm", "title_mxm"])

In [97]:
mxm_matches.head(5)

Unnamed: 0,tid,artist_name,title,tid_mxm
0,TRMMMKD128F425225D,Karkkiautomaatti,Tanssi vaan,4418550
1,TRMMMRX128F93187D9,Hudson Mohawke,No One Could Ever,8898149
2,TRMMMCH128F425532C,Yerba Brava,Si Vos Querés,9239868
3,TRMMMXN128F42936A5,David Montgomery,"Symphony No. 1 G minor ""Sinfonie Serieuse""/All...",5346741
4,TRMMMBB12903CB7D21,Kris Kross,2 Da Beat Ch'yall,2511405


In [107]:
len(mxm_matches)

779056

In [82]:
#load two datasets with lyrics and merge them into one set
row_list = []
with open("mxm_dataset_train.txt") as f:
    for i in range(18):
        f.readline()
    for line in f:
        splitted_line = line.split(",", 2)
        row_list.append(splitted_line)        

        
lyrics_train = pd.DataFrame(row_list,columns=['tid', 'tid_mxm', 'words'])



In [83]:
row_list_test = []
with open("mxm_dataset_test.txt") as f:
    for i in range(18):
        f.readline()
    for line in f:
        splitted_line = line.split(",", 2)
        row_list_test.append(splitted_line)        

        
lyrics_test = pd.DataFrame(row_list_test,columns=['tid', 'tid_mxm', 'words'])

In [84]:
lyrics = lyrics_test.append(lyrics_train)

In [85]:
lyrics.head()

Unnamed: 0,tid,tid_mxm,words
0,TRAABRX12903CC4816,1548880,"2:19,4:7,5:6,10:1,12:13,13:6,17:4,18:6,22:1,23..."
1,TRAADFO128F92E1E91,5325944,"1:79,2:66,3:15,4:7,5:8,6:9,7:5,8:5,9:4,10:57,1..."
2,TRAADQW128F427CE68,3811449,"1:3,2:3,3:2,7:4,8:1,9:3,11:1,12:1,59:1,131:2,1..."
3,TRAADRX12903D0EFE8,5583484,"1:1,6:5,7:1,10:1,41:1,47:1,102:1,112:1,128:3,1..."
4,TRAAEJQ128F92C484E,9124657,"1:28,2:7,3:12,4:3,5:4,6:3,7:1,8:11,9:3,10:1,11..."


In [106]:
len(lyrics)

237662

In [88]:
#load artist list with artists musicbrainz ids (downloaded from million song dataset website)
artists = pd.read_csv("unique_artists.txt",names=["artist_id", "mb_id", "tid", "artist_name"])
artists = artists.drop(columns=["tid"])

In [89]:
artists.head()

Unnamed: 0,artist_id,mb_id,artist_name
0,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols
1,AR003FB1187B994355,1dbd2d7b-64c8-46aa-9f47-ff589096d672,The Feds
2,AR006821187FB5192B,94fc1228-7032-4fe6-a485-e122e5fbee65,Stephen Varcoe/Choir of King's College_ Cambri...
3,AR009211187B989185,9dfe78a6-6d91-454e-9b95-9d7722cbc476,Carroll Thompson
4,AR009SZ1187B9A73F4,8cd574c0-b9f7-4998-94f4-654dffaecdf2,Gorodisch


In [98]:
#merge tables with artist and songs, the add lyrics - merge by artist_name since this value in musiXmatch was 
#taken directly from "1 million songs dataset", where Musicbrainz IDs are available for each artist

artist_matches = artists.merge(mxm_matches, how="left", on="artist_name")


In [100]:
artist_matches.head(5)

Unnamed: 0,artist_id,mb_id,artist_name,tid,title,tid_mxm
0,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRMUOZE12903CDF721,A Picture Of You,9407789.0
1,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRGXEWJ128F92EBFDE,For You,9099120.0
2,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRHVHHP12903CDF72E,Gonna Cry,9407794.0
3,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRHXTRF128F92EBFE8,He'll Never Come Back,9099126.0
4,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRCJERN12903CDF727,Another Boy,9407791.0


In [104]:
len(artist_matches)

752928

In [101]:
artists_songs = artist_matches.merge(lyrics, how="left", on="tid")

In [103]:
artists_songs.head(50)

Unnamed: 0,artist_id,mb_id,artist_name,tid,title,tid_mxm_x,tid_mxm_y,words
0,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRMUOZE12903CDF721,A Picture Of You,9407789.0,,
1,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRGXEWJ128F92EBFDE,For You,9099120.0,,
2,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRHVHHP12903CDF72E,Gonna Cry,9407794.0,,
3,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRHXTRF128F92EBFE8,He'll Never Come Back,9099126.0,,
4,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRCJERN12903CDF727,Another Boy,9407791.0,,
5,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRCKKEO128F92EBFE5,Baby Gene,9099125.0,,
6,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRRIYNQ128F92EBFF5,Baby You're Phrasing is Bad,9099130.0,,
7,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRQAUBZ12903CDF733,The Beating Of My Heart,9407796.0,,
8,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRQOCHU128F92EBFE4,True True Lovin',9099124.0,,
9,AR002UA1187B9A637D,7752a11c-9d8b-4220-ac44-e4a04cc8471d,The Bristols,TRIGHNJ12903CDF722,Baby I Got News For You,9407790.0,,


In [110]:
artists_songs = artists_songs.dropna(how="any")

In [111]:
artists_songs.to_csv("songs_lyrics.csv")

## Download songs features
To have even more data, songs' features were also downloaded. Those 'features' are defined by Spotify and they characterize the music. Among them there are such features as danceability, instrumentalness, energy etc.

In [4]:
songs = db.get_collection("spotify_songs")
songs_features = db.get_collection("spotify_songs_features")

In [5]:
artist_song = []
for artist in songs.find():
    for track in artist['tracks']:       
        artist_song.append([artist['_id'], track['id']])

In [6]:
artist_song_df = pd.DataFrame(artist_song, columns = ['artist_id', 'song_id'])

In [7]:
artist_song_df.head(20)

Unnamed: 0,artist_id,song_id
0,3XSyTI9ct70ZheMESAv2st,1yh793k8lDYfXv3DhHaIXC
1,3XSyTI9ct70ZheMESAv2st,0TCt7OFRdD8PQ6vTRQxNgQ
2,3XSyTI9ct70ZheMESAv2st,0d2yIwPD2dLNFqrOye5qN2
3,3XSyTI9ct70ZheMESAv2st,2xB0yPkGvwuPXiaOVHYnwX
4,3XSyTI9ct70ZheMESAv2st,74VcKerwPq5WSOkXSCZ6vi
5,3XSyTI9ct70ZheMESAv2st,6sJrooys9qvVVn1boqSxil
6,3XSyTI9ct70ZheMESAv2st,2zHBwNDwNRO10HlLvFV7ik
7,3XSyTI9ct70ZheMESAv2st,7MW95avtOxEPirfBY3aA6Y
8,3XSyTI9ct70ZheMESAv2st,4ncCavpXH5dKwBebPRZJWq
9,3XSyTI9ct70ZheMESAv2st,5opPmdS9Et0QFxRku2a9ln


In [25]:
columns = ['song_id', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
          'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms']

In [8]:
songs_table = []
for song in songs_features.find():
    item = [song['id'], song['danceability'], song['energy'], song['key'], song['loudness'],
           song['mode'], song['speechiness'], song['acousticness'], song['instrumentalness'],
           song['liveness'], song['valence'], song['tempo'], song['duration_ms']]
    songs_table.append(item)
    

AssertionError: 12 columns passed, passed data had 13 columns

In [26]:
song_features_fd = pd.DataFrame(songs_table, columns = columns)

In [27]:
artist_song_df = artist_song_df.merge(song_features_fd, how="left", on="song_id")

In [31]:
artist_song_df = artist_song_df.drop_duplicates(subset=['song_id'])

In [33]:
artist_song_df.head()

Unnamed: 0,artist_id,song_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,3XSyTI9ct70ZheMESAv2st,1yh793k8lDYfXv3DhHaIXC,0.236,0.162,5.0,-14.536,1.0,0.0305,0.842,2e-06,0.0739,0.23,91.43,222120.0
2,3XSyTI9ct70ZheMESAv2st,0TCt7OFRdD8PQ6vTRQxNgQ,0.625,0.567,4.0,-7.385,1.0,0.104,0.596,0.0,0.199,0.882,134.169,191413.0
6,3XSyTI9ct70ZheMESAv2st,0d2yIwPD2dLNFqrOye5qN2,0.529,0.477,8.0,-8.237,1.0,0.212,0.736,0.0,0.0434,0.452,123.5,170093.0
7,3XSyTI9ct70ZheMESAv2st,2xB0yPkGvwuPXiaOVHYnwX,0.438,0.364,2.0,-12.162,1.0,0.0496,0.715,0.0,0.107,0.792,166.378,179800.0
8,3XSyTI9ct70ZheMESAv2st,74VcKerwPq5WSOkXSCZ6vi,0.525,0.571,6.0,-8.935,0.0,0.155,0.607,0.0,0.224,0.932,171.027,105160.0


In [34]:
artist_song_df.to_csv("artist_song_features.csv")

## Create dataframe with summary for artist albums

This part is going to be useful for predicting career length of artist to define this length. In all cases, career length is the number of years between first and last album.

In [6]:
import pandas as pd
import numpy as np

In [7]:
artist_albumes = db.get_collection("spotify_artist_albums")

In [10]:
albumes_table = pd.DataFrame(columns=['artist_id','first_album','last_album','number_of_albums','titles'])
art = []
for artist in artist_albumes.find():
    artist_dict = {}
    titles = []
    dates = []
    
    sp_albums = artist.get('albums', None)
    if sp_albums != None:
    
        if type(sp_albums) is dict:
            sp_albums = sp_albums.get('albums', None)
            if sp_albums == None:
                continue
            sp_albums = sp_albums[0]
            if len(sp_albums) == 0:
                continue
            
            for album in sp_albums:
                #print(album)
                dates.append(int(album['release_date'][:4]))
                titles.append(album['name'])

            artist_dict['artist_id'] = artist['_id']
            artist_dict['first_album'] = np.array(dates).min()
            artist_dict['last_album'] = np.array(dates).max()
            artist_dict['number_of_albums'] = len(dates)
            artist_dict['titles'] = titles
            art.append(artist_dict)


In [14]:
albumes_table = albumes_table.append(art, ignore_index= True)

In [15]:
albumes_table.to_csv("data/albums_info.csv")

## Add geolocation data


 ### Download geolocation data   

In [8]:
location_collection = db.get_collection("artist_geolocation")

In [5]:
#from 1Msongs database

unique_artists = pd.read_csv('unique_artists.txt', sep=",", header=None)
unique_artists.columns = ["artist_id", "artist_mbid", "track", "name"]

artist_location = pd.read_csv('artist_location.txt', sep="\v", header=None)
artist_location.columns = ["artist_id", "latitude", "longtitude", "name","place"]


joined = pd.merge(unique_artists, artist_location, on='artist_id')
print(joined.shape)
print(unique_artists.shape)
print(artist_location.shape)

print(joined[:10])



(13850, 8)
(44745, 4)
(13850, 5)
            artist_id                           artist_mbid  \
0  AR003FB1187B994355  1dbd2d7b-64c8-46aa-9f47-ff589096d672   
1  AR00A6H1187FB5402A  312c14d9-7897-4608-944a-c5b1c76ae682   
2  AR00DG71187B9B7FCB  cbd80a34-d210-49f7-ac50-3bc81868cc50   
3  AR00FVC1187FB5BE3E  ecd6ec3e-18d3-45e3-91a0-06e7beeb5f0a   
4  AR00J9R1187B98D920  41da462e-e441-4bc0-952b-5a66387df5be   
5  AR00JIO1187B9A5A15  5251b5a0-3e3b-4d07-a152-585009575310   
6  AR00M9H1187B9B59CA  5560d4ec-1f1f-4ae5-8010-fc58ad431eef   
7  AR00MBZ1187B9B5DB1  ff748426-8873-4725-bdc7-c2b18b510d41   
8  AR00QZQ1187B98EBDB  9a470310-b4c4-42d9-b432-a780fff5ce9d   
9  AR00RAE1187FB41114  6954c529-a901-4d94-87cf-032653a0feec   

                track                          name_x  latitude  longtitude  \
0  TRWDPFR128F93594A6                        The Feds  39.10295   -94.58306   
1  TRWNEQX12903CB84FB                     The Meatmen  42.73383   -84.59334   
2  TRMRKVR128F4254367               

In [7]:
#add this data to mongodb
for idx, row in joined.iterrows():
    artist = {}
    artist['_id'] = row['artist_mbid']
    artist['name'] = row['name_x']
    artist['latitude'] = row['latitude']
    artist['longtitude'] = row['longtitude']
    artist['place'] = row['place']
    location_collection.update_one({"_id": artist["_id"]}, {"$set": artist}, upsert=True)

In [9]:
countries = pd.read_csv("Country_List_ISO_3166_Codes_Latitude_Longitude.csv")

In [10]:
countries

Unnamed: 0,Country,Alpha-2 code,Alpha-3 code,Numeric code,Latitude (average),Longitude (average),Icon
0,Albania,AL,ALB,8,41.0000,20.0000,
1,Algeria,DZ,DZA,12,28.0000,3.0000,
2,American Samoa,AS,ASM,16,-14.3333,-170.0000,
3,Andorra,AD,AND,20,42.5000,1.6000,
4,Angola,AO,AGO,24,-12.5000,18.5000,
5,Anguilla,AI,AIA,660,18.2500,-63.1667,
6,Antarctica,AQ,ATA,10,-90.0000,0.0000,
7,Antigua and Barbuda,AG,ATG,28,17.0500,-61.8000,
8,Argentina,AR,ARG,32,-34.0000,-64.0000,
9,Armenia,AM,ARM,51,40.0000,45.0000,


In [10]:
# from musicbrainz get country of origin, find its latitude and longtitude and add this data to the database
for mb_artist in musicbrainz_artists.find():
    #print(mb_artist)
    if type(mb_artist['area']) == str:
        continue
    if mb_artist['area'].get('iso_366_1') == None:
        continue
    #if location_collection.find({"_id": mb_artist['_id']}).count() > 0:
    #   continue
    try:
        artist = {}
        artist['_id'] = mb_artist['_id']
        #print(mb_artist['area'])
        artist['country'] = mb_artist['area']['iso_366_1']
        artist['latitude'] = countries[countries['Alpha-2 code']== mb_artist['area']['iso_366_1']]['Latitude (average)'].values[0]
        artist['longtitude'] = countries[countries['Alpha-2 code']== mb_artist['area']['iso_366_1']]['Longitude (average)'].values[0]
        location_collection.update_one({"_id": artist['_id']}, {"$set": artist}, upsert=True)
        #print(artist)
        
    except Exception as err:
        print(err)
        continue

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with

In [23]:
#save created collection as csv to be easily read with pandas

df = pd.DataFrame(list(location_collection.find()))

In [24]:
df.to_csv("data/artist_localization.csv",index=False)

In [17]:
#create dictionaries to translate between ISO-2 and ISO-3 country codes
country_codes1 = countries[['Alpha-2 code', 'Alpha-3 code']].set_index('Alpha-2 code')['Alpha-3 code'].to_dict()
country_codes2 = countries[['Alpha-2 code', 'Alpha-3 code']].set_index('Alpha-3 code')['Alpha-2 code'].to_dict()

In [19]:
country_codes ={**country_codes1, **country_codes2}

In [21]:
import pickle

In [22]:
pickle.dump(country_codes, open("geonames/country_codes.pkl", "wb"))

## Dataset preparation
Data downloading and merging process resulted in creation of few files in '.csv' format. They consist of different information but now they can be easily merged using 'spotify id'. The goal of this part is to create only one file with transformed features so that they would be useful for classification algorithms.
What still needs some preprocessing are music genres (one-hot encoding will be applied) and songs lyrics, which will be transformed from bag of words format to tf-idf matrices.

In [5]:
#load the data
main_dataset = pd.read_csv("dataset.csv")
songs_lyrics = pd.read_csv("songs_lyrics.csv")
songs_features = pd.read_csv("artist_song_features.csv")

#delete columns that are not needed
main_dataset = main_dataset.drop(columns=["genres", "Unnamed: 0", "name"])
songs_lyrics = songs_lyrics.drop(columns =["artist_name", "title","tid_mxm_x","tid_mxm_y", "artist_id", "tid", "Unnamed: 0"])
songs_features = songs_features.drop(columns = ["song_id","Unnamed: 0"])

In [6]:
main_dataset.head()

Unnamed: 0,_id,followers,mb_id,popularity,reduced_genres,gender,type,begin_date_year
0,3XSyTI9ct70ZheMESAv2st,3752.0,290e5513-55d3-426f-a4ba-1ac3ce589107,61.0,['broadway'],Male,Person,1980.0
1,2jYHSJBXjusgmYdrNeaRmg,1361.0,09cddf26-fe4a-493e-bb1d-64723892ed8d,29.0,['jazz'],Male,Person,1960.0
2,5LmehwqsJa7a4Ya5SaqXpx,1147.0,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,37.0,['doo-wop'],unknown,Group,1956.0
3,10BFTSAfLauhKVmdby4zac,0.0,3a8ecc10-f888-48c0-9674-5c0ccd1fe93f,0.0,[],unknown,Group,1965.0
4,4hwEAtCJZa1LMgbuRpUWJB,4.0,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,0.0,[],unknown,Group,1956.0


In [7]:
songs_lyrics.head()

Unnamed: 0,mb_id,words
0,7752a11c-9d8b-4220-ac44-e4a04cc8471d,"1:22,2:4,3:28,4:2,5:1,6:3,7:13,8:3,9:16,10:2,1..."
1,7752a11c-9d8b-4220-ac44-e4a04cc8471d,"1:9,2:3,3:8,4:5,5:9,6:4,7:3,8:5,9:10,11:2,12:2..."
2,312c14d9-7897-4608-944a-c5b1c76ae682,"2:1,3:3,4:1,5:1,6:1,7:1,10:1,11:1,12:3,13:1,14..."
3,312c14d9-7897-4608-944a-c5b1c76ae682,"1:3,2:1,3:3,4:2,6:4,7:1,8:1,9:5,10:1,14:1,15:6..."
4,312c14d9-7897-4608-944a-c5b1c76ae682,"1:4,2:27,4:7,5:5,6:5,7:2,8:6,9:1,10:4,11:2,12:..."


In [8]:
songs_features.head()

Unnamed: 0,artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,3XSyTI9ct70ZheMESAv2st,0.236,0.162,5.0,-14.536,1.0,0.0305,0.842,2e-06,0.0739,0.23,91.43,222120.0
1,3XSyTI9ct70ZheMESAv2st,0.625,0.567,4.0,-7.385,1.0,0.104,0.596,0.0,0.199,0.882,134.169,191413.0
2,3XSyTI9ct70ZheMESAv2st,0.529,0.477,8.0,-8.237,1.0,0.212,0.736,0.0,0.0434,0.452,123.5,170093.0
3,3XSyTI9ct70ZheMESAv2st,0.438,0.364,2.0,-12.162,1.0,0.0496,0.715,0.0,0.107,0.792,166.378,179800.0
4,3XSyTI9ct70ZheMESAv2st,0.525,0.571,6.0,-8.935,0.0,0.155,0.607,0.0,0.224,0.932,171.027,105160.0


### Create dataset with mean values of songs features and lyrics
Songs dataset consist of varying number of songs for each artist. In this approach songs features will be averaged for each artist. Similar situation is with songs' lyrics, and in this case word frequencies in lyrics will be averaged (in respect to artist) and then tfidf matrix will be computed.

In [10]:
#compute mean of songs features for each artis
mean_features = songs_features.groupby(['artist_id']).mean()

#merge main dataset with songs features by
artist_songs = main_dataset.merge(mean_features, how="left", left_on="_id", right_index=True)

#replace genders names by integer values
genders = {"Female":1, "Male":2, "unknown": 0}
artist_songs['gender'].replace(genders, inplace=True)

#check the number of artists with all information
len(artist_songs)

739298

In [11]:
artist_songs['age'] = 2018-artist_songs['begin_date_year']

artist_songs.head(5)

Unnamed: 0,_id,followers,mb_id,popularity,reduced_genres,gender,type,begin_date_year,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,age
0,3XSyTI9ct70ZheMESAv2st,3752.0,290e5513-55d3-426f-a4ba-1ac3ce589107,61.0,['broadway'],2,Person,1980.0,0.5459,0.4401,...,-9.4249,0.8,0.1402,0.7565,3.18e-07,0.1588,0.6575,125.3684,153180.0,38.0
1,2jYHSJBXjusgmYdrNeaRmg,1361.0,09cddf26-fe4a-493e-bb1d-64723892ed8d,29.0,['jazz'],2,Person,1960.0,0.6975,0.5541,...,-11.2578,0.4,0.04146,0.311739,0.71967,0.09388,0.6947,103.1691,324042.8,58.0
2,5LmehwqsJa7a4Ya5SaqXpx,1147.0,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,37.0,['doo-wop'],0,Group,1956.0,0.5071,0.55695,...,-10.1384,1.0,0.06511,0.7269,0.09358214,0.20124,0.8238,139.8446,141725.6,62.0
3,10BFTSAfLauhKVmdby4zac,0.0,3a8ecc10-f888-48c0-9674-5c0ccd1fe93f,0.0,[],0,Group,1965.0,,,...,,,,,,,,,,53.0
4,4hwEAtCJZa1LMgbuRpUWJB,4.0,ca3f3ee1-c4a7-4bac-a16a-0b888a396c6b,0.0,[],0,Group,1956.0,,,...,,,,,,,,,,62.0


In [12]:
#save dataframe with averaged songs features for each artist in .csv file
artist_songs.to_csv("data/artist_songs.csv", index = False)

In [None]:
#transform words string to vectors and then add them to create only one vector for each artist
def string_to_dict(str_dict):
    res = ast.literal_eval('{'+str_dict+'}')
    return res

def lyric_to_vector(lyric):
    dic_vec = string_to_dict(lyric.strip())
    return csr_matrix((np.array(list(dic_vec.values())), (np.zeros(len(dic_vec)), np.array(list(dic_vec.keys())) -1)), shape=(1,5000))

In [None]:
songs_vectors_d = songs_lyrics
songs_vectors_d['words'] = songs_lyrics['words'].apply(lyric_to_vector)

#sum words frequencies in lyrics and create lyrics matrix as sparse vectors
songs_vector_sum = songs_vectors_d.groupby("mb_id").agg(['sum', 'count'])
songs_vector_sum.columns=songs_vector_sum.columns.droplevel()
songs_vector_sum['mean'] = songs_vector_sum['sum']/songs_vector_sum['count']

lyrics_matrix = scipy.sparse.vstack(songs_vector_sum['mean'].values)

#transform a count matrix to a normalized tf-idf representation
vectorizer = TfidfTransformer(norm = 'l2', use_idf = True)
vectorizer.fit(lyrics_matrix)

lyrics_tfidf = vectorizer.transform(lyrics_matrix)

songs_vector_sum['tfidf'] = list(lyrics_tfidf)
artist_songs = artist_songs.merge(songs_vector_sum[['tfidf']], how='left', left_on='mb_id', right_index=True)

In [None]:
artist_songs.loc[0]['tfidf']

In [None]:
lyrics_tfidf = artist_songs['tfidf']

lyrics_tfidf = lyrics_tfidf.fillna(0)

In [None]:
for i, row in enumerate(lyrics_tfidf[:]):
    if type(row) == int:
        lyrics_tfidf[i] = csr_matrix(np.zeros((1,5000)))
    else:
        lyrics_tfidf[i] = row

lyrics_tfidf_sparse = scipy.sparse.vstack(lyrics_tfidf.values)

In [None]:
#save created sparse matrix
save_npz("data/lyrics_tfidf.npz", lyrics_tfidf_sparse)

### Create and save sparse vectors with music genres

Since music genres are now formatted as array of strings, one-hot encoding must be applied. First, a dictionary with all genres with assigned integer is created, then OHE are made by setting vectors elements to 1 on indexes that corresponds to artist's genres. This operation results in set of vectors with zero's on almost all positions. To save the storage space they are transformed into sparse matrix and saved on the disk for later use.

In [None]:
#create dictionary for genres to create one-hot vector for each artist
genres = []
for item in artist_songs.iterrows():
    genres.append(item[1]['reduced_genres'])

all_genres = []
for sublist in genres:  
    if type(sublist) != float:
        sublist = ast.literal_eval(sublist)
        for item in sublist:
            all_genres.append(item)
all_genres = list(set(all_genres))
print(len(all_genres))

genres_dict = {}
for i in range(0, len(all_genres)):
    genres_dict[all_genres[i]] = i

#save created dictionary
pickle.dump(genres_dict, open('data/genres_dict.pkl', 'wb'))

genres_dict = pickle.load(open("genres_dict.pkl", 'rb'))

In [None]:
def genres_string_to_vec(genres):
    genres_vector = np.zeros((len(genres_dict)))
    if type(genres) != str:
        return genres_vector
    genres_list = ast.literal_eval(genres)
    for genre in genres_list:
        genres_vector[genres_dict[genre]] = 1
    return csr_matrix(genres_vector)

genres_vectors = artist_songs['reduced_genres'].apply(genres_string_to_vec)

genres_sparse = scipy.sparse.vstack(genres_vectors.values)

save_npz("data/genres_sparse.npz", genres_sparse)

### Create dataset with one vector for each song
It means that one artist can have few vectors - each for one of his songs. Songs features are not averaged as in previous dataset.
Songs lyrics are a little bit problematic - not always there is direct matching between songs ids with both lyrics and songs' features. Two approaches can be tested: first would be to create lyrics vectors in the same way as before and then same vector will be added for each song of one artist, second approach is to use only vectors where both information is available.

In [None]:
#merge songs features with main dataset
songs = songs_features.merge(main_dataset, how="left", right_on="_id", left_on="artist_id")

songs.head()

#create genres sparse matrix for songs dataframe
songs_genres_vector = songs['reduced_genres'].apply(genres_string_to_vec)

songs_genres_sparse = scipy.sparse.vstack(songs_genres_vector.values)

save_npz("data/songs_genres_sparse.npz", songs_genres_sparse)

songs_genres_sparse.shape

#create sparse matrix 