# Data gathering

## Concatenate all CSVs

In [1]:
import pandas as pd

In [2]:
# Load each decade's CSV, tag its rows with the two-character decade label
# (kept as a string, e.g. '00'), and stack everything into a single frame
# with a fresh 0..n-1 index.
decades = ('60', '70', '80', '90', '00', '10')
df_list = [
    pd.read_csv(f'data/dataset-of-{decade}s.csv').assign(decade=decade)
    for decade in decades
]
df1 = pd.concat(df_list, ignore_index=True)

In [3]:
# Show the concatenated frame; the last expression of a cell is rendered as its output.
df1

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,decade
0,Jealous Kind Of Fella,Garland Green,spotify:track:1dtKN6wwlolkM8XZy2y9C1,0.417,0.620,3,-7.727,1,0.0403,0.4900,0.000000,0.0779,0.8450,185.655,173533,3,32.94975,9,1,60
1,Initials B.B.,Serge Gainsbourg,spotify:track:5hjsmSnUefdUqzsDogisiX,0.498,0.505,3,-12.475,1,0.0337,0.0180,0.107000,0.1760,0.7970,101.801,213613,4,48.82510,10,0,60
2,Melody Twist,Lord Melody,spotify:track:6uk8tI6pwxxdVTNlNOJeJh,0.657,0.649,5,-13.392,1,0.0380,0.8460,0.000004,0.1190,0.9080,115.940,223960,4,37.22663,12,0,60
3,Mi Bomba Sonó,Celia Cruz,spotify:track:7aNjMJ05FvUXACPWZ7yJmv,0.590,0.545,7,-12.058,0,0.1040,0.7060,0.024600,0.0610,0.9670,105.592,157907,4,24.75484,8,0,60
4,Uravu Solla,P. Susheela,spotify:track:1rQ0clvgkzWr001POOPJWx,0.515,0.765,11,-3.515,0,0.1240,0.8570,0.000872,0.2130,0.9060,114.617,245600,4,21.79874,14,0,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41101,Lotus Flowers,Yolta,spotify:track:4t1TljQWJ6ZuoSY67zVvBI,0.172,0.358,9,-14.430,1,0.0342,0.8860,0.966000,0.3140,0.0361,72.272,150857,4,24.30824,7,0,10
41102,Calling My Spirit,Kodak Black,spotify:track:2MShy1GSSgbmGUxADNIao5,0.910,0.366,1,-9.954,1,0.0941,0.0996,0.000000,0.2610,0.7400,119.985,152000,4,32.53856,8,1,10
41103,Teenage Dream,Katy Perry,spotify:track:55qBw1900pZKfXJ6Q9A2Lc,0.719,0.804,10,-4.581,1,0.0355,0.0132,0.000003,0.1390,0.6050,119.999,227760,4,20.73371,7,1,10
41104,Stormy Weather,Oscar Peterson,spotify:track:4o9npmYHrOF1rUxxTVH8h4,0.600,0.177,7,-16.070,1,0.0561,0.9890,0.868000,0.1490,0.5600,120.030,213387,4,21.65301,14,0,10


In [4]:
# Persist the combined dataset; index=False keeps the row index out of the file.
df1.to_csv('data/dataset1.csv', index=False)

## Get tags from the Last.fm API and add a genre attribute to the data

In [5]:
import os
import pylast
import re
import time
from concurrent.futures import ThreadPoolExecutor
from threading import Lock
from dotenv import load_dotenv
from tqdm import tqdm

In [6]:
# Reload the combined dataset from the CSV written in the previous section.
# NOTE(review): read_csv infers dtypes, so the string decade labels (e.g. '00')
# presumably come back as integers here — confirm that is acceptable downstream.
df2 = pd.read_csv('data/dataset1.csv')

In [7]:
# Pull the Last.fm credentials from a local .env file / the process environment.
# Any variable that is not set comes back as None.
load_dotenv()
api_key = os.environ.get('LASTFM_API_KEY')
api_secret = os.environ.get('LASTFM_API_SECRET')
username = os.environ.get('LASTFM_USERNAME')
# The network object is given the MD5 hash of the password, not the plain text.
password_hash = pylast.md5(os.environ.get('LASTFM_PASSWORD'))

# Shared Last.fm client used by the genre lookup below.
network = pylast.LastFMNetwork(api_key=api_key, api_secret=api_secret,
                               username=username, password_hash=password_hash)

In [8]:
# Held by get_genre while it appends to error_log.txt from the worker threads.
file_lock = Lock()

In [11]:
def get_genre(ind):
    """Look up the Last.fm top tags for one row of ``df2`` and map them to a genre.

    Args:
        ind: Index label of the row in ``df2``; the row's ``artist`` and
            ``track`` values identify the song to look up.

    Returns:
        The first entry of ``real_genres`` whose pattern fully matches one of
        the track's top tags (tags are tested in the order they are returned,
        genres in list order), or ``None`` when no tag matches or the lookup
        was skipped because of an error.

    Side effects:
        Appends one line to ``error_log.txt`` (under ``file_lock``) for every
        failed request. Rate-limit errors are retried after a 10 second pause;
        every other Last.fm, network or malformed-response error is logged and
        the row is skipped, so a single failed request cannot abort the whole
        threaded run.
    """
    author = df2.loc[ind, 'artist']
    song_name = df2.loc[ind, 'track']
    # patterns[i] recognises real_genres[i]; only the pop pattern also accepts
    # compound tags such as "indie pop" or "pop-rock".
    patterns = ['rock', 'metal', 'country', '(.+[ -]+|)pop([ -]+.+|)', 'funk', 'hip-hop', 'jazz', 'blues', 'techno']
    real_genres = ['rock', 'metal', 'country', 'pop', 'funk', 'hip-hop', 'jazz', 'blues', 'techno']
    track = network.get_track(author, song_name)
    tags = []
    while True:
        try:
            tags = track.get_top_tags()
            break
        except pylast.WSError as err:
            retry = 'Rate Limit Exceeded' in repr(err)
            if retry:
                message = f'{ind}: Rate Limit Exceeded, retrying...'
            else:
                message = f'{ind}: {err}, skipping'
        except (pylast.NetworkError, pylast.MalformedResponseError) as err:
            # Transport-level failures used to propagate out of the worker and
            # make executor.map raise, discarding every result gathered so far.
            retry = False
            message = f'{ind}: {err}, skipping'
        # Only reached after a failed request: record it, then retry or give up.
        with file_lock:
            with open('error_log.txt', 'a') as f:
                print(message, file=f)
        if not retry:
            break
        time.sleep(10)  # Sleep for 10 seconds to reset the rate limit
    for tag in tags:
        tag_name = str(tag.item).lower()
        for genre, pattern in zip(real_genres, patterns):
            if re.fullmatch(pattern, tag_name):
                return genre
    return None

In [12]:
# Truncate the error log left over from any previous run.
with open('error_log.txt', 'w'):
    pass
found_genres = []
# Look up genres on a small thread pool; executor.map keeps the results in the
# same order as df2.index, and tqdm reports progress as they are consumed.
with ThreadPoolExecutor(max_workers=5) as executor:
    genre_results = executor.map(get_genre, df2.index)
    progress = tqdm(genre_results, total=len(df2.index))
    found_genres = list(progress)

100%|████████████████████████████████████████████████████████████████████████████| 41106/41106 [35:40<00:00, 19.20it/s]


In [14]:
# executor.map yields results in input order, so the list lines up row-for-row with df2.
df2['genre'] = found_genres

In [16]:
# Summarise the genre column; `count` only includes rows where a genre was found.
df2['genre'].describe()

count     20307
unique        9
top         pop
freq       7181
Name: genre, dtype: object

In [19]:
# Save the genre-annotated dataset; index=False keeps the row index out of the file.
df2.to_csv('data/dataset2.csv', index=False)