# AcousticBrainz feature 
By Victor Badenas

### Plotting and analysis of the features of the most frequent genres present in AllMusic, Discogs, LastFM and TagTraum

First the environment is set:

In [1]:
import csv
import os
import pandas as pd
from collections import defaultdict
import sys
import numpy as np
import matplotlib.pyplot as plt
import json
import time
from IPython.display import clear_output

# Folders used by the notebook, both resolved against the current working directory.
IMAGE_FOLDER = os.path.join(os.path.abspath(""), "Output Plots")
DATA_FOLDER = os.path.join(os.path.abspath(""), "Data Files")

# Create each folder only when it is not already present on disk.
if not os.path.isdir(IMAGE_FOLDER):
    os.makedirs(IMAGE_FOLDER)
if not os.path.isdir(DATA_FOLDER):
    os.makedirs(DATA_FOLDER)

# Part 1: Computing the Intersection

### Load all four datasets

Load them row by row using a reader object, as we are only interested in the sound_id column, and store each sound_id in a set:

In [2]:
# Collect the first column of every row of the AllMusic training file into a set.
allmusic_ids = set()
file = os.path.join(DATA_FOLDER,"acousticbrainz-mediaeval2017-allmusic-train.tsv")
# The csv module asks for files to be opened with newline='' so that it handles line endings itself.
with open(file, newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for sound in reader:
        # A blank line is parsed as an empty row; indexing it would raise IndexError.
        if sound:
            allmusic_ids.add(sound[0])
# Drop the "recordingmbid" entry (presumably the header cell of the id column).
allmusic_ids.discard("recordingmbid")

KeyboardInterrupt: 

In [None]:
# Collect the first column of every row of the Discogs training file into a set.
discogs_ids = set()
file = os.path.join(DATA_FOLDER,"acousticbrainz-mediaeval2017-discogs-train.tsv")
# The csv module asks for files to be opened with newline='' so that it handles line endings itself.
with open(file, newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for sound in reader:
        # A blank line is parsed as an empty row; indexing it would raise IndexError.
        if sound:
            discogs_ids.add(sound[0])
# Drop the "recordingmbid" entry (presumably the header cell of the id column).
discogs_ids.discard("recordingmbid")

In [None]:
# Collect the first column of every row of the LastFM training file into a set.
lastfm_ids = set()
file = os.path.join(DATA_FOLDER,"acousticbrainz-mediaeval2017-lastfm-train.tsv")
# The csv module asks for files to be opened with newline='' so that it handles line endings itself.
with open(file, newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for sound in reader:
        # A blank line is parsed as an empty row; indexing it would raise IndexError.
        if sound:
            lastfm_ids.add(sound[0])
# Drop the "recordingmbid" entry (presumably the header cell of the id column).
lastfm_ids.discard("recordingmbid")

In [None]:
# Collect the first column of every row of the TagTraum training file into a set.
tagtraum_ids = set()
file = os.path.join(DATA_FOLDER,"acousticbrainz-mediaeval2017-tagtraum-train.tsv")
# The csv module asks for files to be opened with newline='' so that it handles line endings itself.
with open(file, newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    for sound in reader:
        # A blank line is parsed as an empty row; indexing it would raise IndexError.
        if sound:
            tagtraum_ids.add(sound[0])
# Drop the "recordingmbid" entry (presumably the header cell of the id column).
tagtraum_ids.discard("recordingmbid")

Compute the intersection for the ids of all four datasets

In [None]:
# Ids that appear in every one of the four id sets.
intersection_ids = allmusic_ids.intersection(discogs_ids, lastfm_ids, tagtraum_ids)

# Print Part 1

In [None]:
print(len(intersection_ids))
# Show only a small, deterministic sample: displaying the whole set would put
# every id of the intersection into the notebook output.
sorted(intersection_ids)[:10]

# Part 2: Get only one database and get the intersection info

### Load the LastFM dataset

Load it as a pandas DataFrame, then compute the difference between the ids of the whole dataset and the intersection, so that we are left only with the rows whose ids are in intersection_ids

In [None]:
file = os.path.join(DATA_FOLDER,"acousticbrainz-mediaeval2017-lastfm-train.tsv")
# read_csv is given the path and opens/closes the file itself, so the extra
# open() that was never read from is not needed.
# index_col=0 makes the first column of the file the index of the dataframe.
lastfm_sounds = pd.read_csv(file,sep='\t',index_col=0,low_memory=False)

Remove the items in the dataframe that are not in the intersection set.

In [None]:
# Index labels of lastfm_sounds that are missing from intersection_ids ...
diff = set(lastfm_sounds.index.tolist()).difference(intersection_ids)
# ... are removed, leaving only the rows whose label is in the intersection.
lastfm_sounds = lastfm_sounds.drop(labels=diff)

# Print Part 2

In [None]:
# (number of rows, number of columns) of the reduced table, then the table itself.
print((len(lastfm_sounds.index), len(lastfm_sounds.columns)))
lastfm_sounds

Save the intersection pandas dataframe to a tsv file so that the cells above only need to be computed once

In [None]:
# Write the reduced LastFM table to disk as a tab-separated file.
file = os.path.join(
    DATA_FOLDER,
    "acousticbrainz-mediaeval2017-lastfm-train-intersection.tsv",
)
with open(file, "w") as tsvfile:
    lastfm_sounds.to_csv(path_or_buf=tsvfile, sep='\t')

### Load if the intersection has already been computed and stored

In [None]:
# Read the stored intersection table back; its first column becomes the index.
file = os.path.join(
    DATA_FOLDER,
    "acousticbrainz-mediaeval2017-lastfm-train-intersection.tsv",
)
with open(file) as tsvfile:
    lastfm_sounds = pd.read_csv(tsvfile, sep='\t', index_col=0, low_memory=False)

# Part 3: Reducing the dataset to the most occurrent

### Shortening the pandas matrix to the sounds with the most frequent genres

First we count the occurrences of each genre and store the counts in a defaultdict:

In [None]:
#column labels of the table; every column but the first is scanned below
col_names = list(lastfm_sounds)

#occurrence counter, missing keys start at 0
genere_counter = defaultdict(int)

for col in col_names[1:]:

    #walk over every value stored in this column
    for genere in lastfm_sounds[col].tolist():

        #values without a split method (e.g. NaN) cannot carry a subgenre
        try:
            parts = genere.split("---")
        except AttributeError:
            continue

        #count the full "genre---subgenre" string only when the piece right after
        #the first "---" exists and is not empty
        if len(parts) > 1 and parts[1] != '':
            genere_counter[genere] += 1

Then the N most frequent subgenres are selected and converted to a list of genre---subgenre strings

In [None]:
N = 20 #how many genres to consider

#build the {Genere: Count} table in one go. Passing the column names (and the
#integer dtype of Count) up front keeps this cell working when genere_counter is
#empty, where assigning two column names to a frame without columns would fail.
generespd = (
    pd.DataFrame(list(genere_counter.items()), columns=["Genere","Count"])
    .astype({"Count": "int64"})
    .set_index("Genere")
    #keep only the N rows with the largest Count
    .nlargest(N, "Count")
)

#convert the index to list in order to compare further on
most_frequent = generespd.index.tolist()

Keep only the sounds that have at least one of the most_frequent subgenres in the genres matrix, and delete the genres that are not on the list even if the sound has one of the most_frequent subgenres, as the subgenres that are not in the list are not relevant for the exercise:

In [None]:
#True where a cell holds exactly one of the most frequent genre---subgenre strings.
#Exact membership replaces the regular expression that was assembled from the raw
#genre names: those names were not escaped, and the pattern accepted any value that
#merely starts with a listed name followed by a word boundary, so a different genre
#could be relabelled as a listed one. It also works on columns that hold no strings.
is_most_frequent = lastfm_sounds.isin(most_frequent)

#keep matching cells, turn everything else into NaN (result is a new dataframe),
#then delete all rows and columns filled exclusively with NaN
lastfm_sounds_intersected_N = (
    lastfm_sounds.where(is_most_frequent)
    .dropna(how='all')
    .dropna(axis='columns',how='all')
)

# Print Part 3

In [None]:
# (number of rows, number of columns), then the table ordered by its index.
print((len(lastfm_sounds_intersected_N.index), len(lastfm_sounds_intersected_N.columns)))
lastfm_sounds_intersected_N.sort_index()

### Store the values of the matrix that has the N most frequent generes

In [None]:
# Write the table restricted to the most frequent genres as a tab-separated file.
file = os.path.join(
    DATA_FOLDER,
    "acousticbrainz-mediaeval2017-lastfm-train-intersection-20.tsv",
)
with open(file, "w") as tsvfile:
    lastfm_sounds_intersected_N.to_csv(path_or_buf=tsvfile, sep='\t')

### Open the file if the code above has already been run

In [None]:
# Read the stored table back; its first column becomes the index.
file = os.path.join(
    DATA_FOLDER,
    "acousticbrainz-mediaeval2017-lastfm-train-intersection-20.tsv",
)
with open(file) as tsvfile:
    lastfm_sounds_intersected_N = pd.read_csv(tsvfile, sep='\t', index_col=0, low_memory=False)

# Part 4: Prepare the features' data to be plotted 

### Open csv Files with the Features

After this, load the csv file with the features ...

In [None]:
# Read the selected-features csv; its first column becomes the index.
file = os.path.join(
    DATA_FOLDER,
    "acousticbrainz-mediaeval2017-train-amplab2019-selected-features-mbid.csv",
)
with open(file) as csvfile:
    selected_features = pd.read_csv(csvfile, index_col=0, low_memory=False)
# Displayed as the cell output: (rows, columns) of the loaded table.
selected_features.shape

... and compute the difference as is done in the LastFM dataset part, to drop the sounds that are not in the list of the sounds containing the N most frequent subgenres:

In [None]:
# Index labels of selected_features that do not appear in lastfm_sounds_intersected_N ...
diff_features = set(selected_features.index.tolist()).difference(
    lastfm_sounds_intersected_N.index.tolist()
)
# ... are removed from the features table.
selected_features = selected_features.drop(labels=diff_features)

# Print Part4

In [None]:
# (number of rows, number of columns), then the first rows ordered by index.
print((len(selected_features.index), len(selected_features.columns)))
selected_features.sort_index().head()

# Part 5: Plot of the features by genre 

### Computation of the features in a dictionary of genres

In [None]:
#initialize dictionary: information[sub_genre][feature] -> list of feature values
information = {}

#loop through subgenres to create an entry in the dictionary for each subgenre where in each entry, a dictionary of
#the features will be created in order to get a dictionary of features for each genre in an organised manner.
for i,sub_genre in enumerate(most_frequent):

    print("Processing {0} out of {1}".format(i+1,len(most_frequent)))

    #rows of the genre table that hold exactly this subgenre in any column.
    #An equality test is used instead of a regular expression built from the genre
    #name: the name was not escaped and the pattern also accepted longer genre
    #strings that merely start with it.
    has_sub_genre = lastfm_sounds_intersected_N.isin([sub_genre]).any(axis=1)
    sub_genre_ids = lastfm_sounds_intersected_N.index[has_sub_genre]

    #feature rows of those sounds, selected once per subgenre instead of once per feature
    sub_genre_features = selected_features.loc[selected_features.index.isin(sub_genre_ids)]

    #one list of values per feature column, in the row order of selected_features
    information[sub_genre] = {
        feature: sub_genre_features[feature].tolist()
        for feature in list(sub_genre_features)
    }

    #wipe the progress line before the next iteration prints its own
    clear_output()

print("Done!")

### Plot of the features distribution

Once the dictionary containing all features for all subgenres has been built as a nested dictionary (subgenre, feature, data), the plots are computed from this hierarchy.

First, a function is defined that draws a bar plot from a dictionary containing the number of times each value of a discrete feature occurs.

In [None]:
def plotbar(feature_name,counter,percentage,directory):
    """Draw a grouped bar plot of a discrete feature per genre and save it as a png.

    Parameters
    ----------
    feature_name : str
        Used as the plot title and as the base name of the png file.
    counter : dict
        Maps each genre to a mapping {feature value: number of appearances}.
    percentage : bool
        When true, the bars of each genre are scaled to percentages of that
        genre's total; otherwise the raw counts are plotted.
    directory : str
        Folder in which "<feature_name>.png" is written.

    Raises
    ------
    ValueError
        If counter holds no feature value at all, so there is nothing to draw.
    """

    file = os.path.join(directory,"{}.{}".format(feature_name,'png'))

    #get the genre strings and ints(for multibar plot purposes)
    genre_index_str = list(counter.keys())
    genre_index_int = np.arange(len(genre_index_str))

    #legend entries: every value seen in ANY genre, in first-seen order.
    #Taking them from the first genre only would silently leave out values
    #that this genre happens not to contain.
    indexes_str = list(dict.fromkeys(
        index for sub_genre in genre_index_str for index in counter[sub_genre]
    ))
    if not indexes_str:
        raise ValueError("counter holds no values to plot for {}".format(feature_name))

    #initialize plot and axes objects
    fig, ax = plt.subplots()
    opacity = 0.8

    #set the value of the bar width according to the number of bar that have to be plotted in each x value
    bar_width = 1/(1.25*len(indexes_str))

    #total count of each genre, used to normalize if desired
    norm_array = [
        sum(counter[sub_genre].get(index, 0) for index in indexes_str)
        for sub_genre in genre_index_str
    ]

    #one bar series per legend value, each shifted by one bar width
    for position, index in enumerate(indexes_str):

        #.get keeps a missing value at 0 without requiring a defaultdict
        val = [counter[sub_genre].get(index, 0) for sub_genre in genre_index_str]

        if percentage:
            #a genre without any data stays at 0 instead of dividing by zero
            val = [100*x/norm if norm else 0 for x,norm in zip(val,norm_array)]

        ax.bar(genre_index_int + position*bar_width, val, bar_width, alpha=opacity, label=index)

    ax.set_xlabel('Genre')
    ax.set_ylabel('Percentage' if percentage else 'Appearances')
    ax.set_title(feature_name)

    #bars are centred on their x position, so a group of n bars spans from
    #x to x + (n-1)*bar_width and its middle is x + (n-1)/2*bar_width
    ax.set_xticks(genre_index_int + (len(indexes_str)-1)/2*bar_width)
    ax.set_xticklabels(genre_index_str)
    plt.setp(ax.xaxis.get_majorticklabels(),rotation=45,ha="right")
    ax.legend()
    fig.set_size_inches(15,10)
    fig.savefig(file, dpi=100)
    plt.show()

    #release the figure: this function is called once per feature
    plt.close(fig)

Define a function for the boxplot: given a list of arrays of data and a list of labels, draw a horizontal box plot

In [None]:
def plotbox(feature_name,list_data,list_names,directory):
    """Draw one horizontal box per data series and save the plot as a png.

    Parameters
    ----------
    feature_name : str
        Used as the plot title and as the base name of the png file.
    list_data : list
        One sequence of values per box.
    list_names : list
        Label of each box, in the same order as list_data.
    directory : str
        Folder in which "<feature_name>.png" is written.
    """

    file = os.path.join(directory,"{}.{}".format(feature_name,'png'))

    #initalize figure and axis
    fig, ax = plt.subplots()

    #set title
    ax.set_title(feature_name)

    #plot the data with the list given horizontally
    ax.boxplot(list_data,labels=list_names,vert=False)

    fig.set_size_inches(15,10)
    fig.savefig(file, dpi=100)
    plt.show()

    #release the figure: this function is called once per feature, and figures
    #that are never closed stay in memory
    plt.close(fig)

Do one plot or the other depending on the feature to plot. 

A png image of each plot is also saved to the output image folder.

In [None]:
#loop through the features
feature_names = list(selected_features)
for i,feature in enumerate(feature_names):

    #the total is the number of features being iterated, not the number of genres
    print("Processing {0} out of {1}".format(i+1,len(feature_names)))

    #if the feature selected is the tonal key or the tonal scale:
    #- loop for each subgenre
    #- count the times a feature is repeated for each genre
    #- call plotbar function
    if feature in ('tonal.key_key', 'tonal.key_scale'):
        counter = {}
        for sub_genre in most_frequent:
            counter[sub_genre] = defaultdict(int)
            for item in information[sub_genre][feature]:
                counter[sub_genre][item] += 1
        plotbar( feature_name = feature, counter = counter,percentage = True, directory = IMAGE_FOLDER)

    #if the feature is any other:
    #- get the data and store it in a list of arrays of data synced with the most_frequent features labels
    #- call plotbox function
    else:
        list_names = most_frequent
        list_data = [information[sub_genre][feature] for sub_genre in most_frequent]

        plotbox( feature_name=feature, list_data=list_data, list_names=list_names, directory = IMAGE_FOLDER)

    #clears the output of this iteration (progress line and displayed plot);
    #the png written by the plot function stays on disk
    clear_output()

print("Done!")

# Part 6: Plot of High-Level data

The code below scans the files in the folder containing the high level features and stores the features as follows:
- indexes are mbid
- each column is a feature
- only files whose id is also in selected_features are read, so that we don't waste resources on the others.

## DISCLAIMER: this takes a long time (around 1h15 on an i7-6700HQ). If a different genre selection method is used, this part must be executed again; if the same selection method is used, just skip this cell and load the tsv file from disk

In [None]:
#value to read for each classifier, paired position by position with labels_to_extract
features_to_extract_from_json = [ "aggressive", "happy", "sad", "party", "relaxed","instrumental", "voice", "female", "male"]

#classifier entry of the json "highlevel" section in which each value above is looked up
labels_to_extract = ["mood_aggressive","mood_happy","mood_sad","mood_party","mood_relaxed","voice_instrumental", "voice_instrumental", "gender", "gender"]

sound_id_highlevel = selected_features.index.tolist()
#set of the same ids: one exact lookup per file instead of a substring scan over
#the whole list (ids are matched exactly against the index further on anyway)
wanted_ids = set(sound_id_highlevel)

folder_highlevel_features = os.path.join(DATA_FOLDER,"acousticbrainz-mediaeval-train-intersection-highlevel")

#one dict per json file; the dataframe is built once at the end because growing a
#dataframe with concat for every file copies all previous rows each time
records = []

numfolders = sum(1 for _ in os.walk(folder_highlevel_features))
i = 1

for subdir, _, files in os.walk(folder_highlevel_features):

    #time.clock was removed in Python 3.8; perf_counter is the replacement for timing
    starttime = time.perf_counter()
    print("{}/{}".format(str(i),str(numfolders)))

    for json_name in files:

        filename, file_extension = os.path.splitext(json_name)

        #only json files whose name (without extension) is one of the wanted ids
        if file_extension != ".json" or filename not in wanted_ids:
            continue

        with open(os.path.join(subdir,json_name)) as jsonfile:
            json_dict = json.load(jsonfile)["highlevel"]

        record = {"mbid": filename}
        for label,feature in zip(labels_to_extract,features_to_extract_from_json):
            record[feature] = json_dict[label]["all"][feature]
        records.append(record)

    #rough estimate of the remaining time: duration of this folder times folders left
    time_expected = (time.perf_counter()-starttime)*(numfolders-i)
    print(str(time_expected))
    i += 1

#explicit columns keep the frame valid (and "mbid" present) even when no file matched;
#the feature columns are in alphabetical order
highlevel_features = pd.DataFrame(
    records, columns=["mbid"] + sorted(features_to_extract_from_json)
).set_index("mbid").sort_index()

Saving the highlevel_features to a tsv file

In [None]:
# Write the high-level feature table to disk as a tab-separated file.
file = os.path.join(
    DATA_FOLDER,
    "acousticbrainz-mediaeval-train-intersection-highlevel-selectedfeatures.tsv",
)
with open(file, "w") as tsvfile:
    highlevel_features.to_csv(path_or_buf=tsvfile, sep='\t')

Read the highlevel_features from a tsv file

In [None]:
# Read the stored high-level feature table back; its first column becomes the index.
file = os.path.join(
    DATA_FOLDER,
    "acousticbrainz-mediaeval-train-intersection-highlevel-selectedfeatures.tsv",
)
with open(file) as tsvfile:
    highlevel_features = pd.read_csv(tsvfile, sep='\t', index_col=0, low_memory=False)

Computation of the features in a dictionary of genres as before

In [None]:
#initialize dictionary: information_highlevel[sub_genre][feature] -> list of feature values
information_highlevel = {}

#loop through subgenres to create an entry in the dictionary for each subgenre where in each entry, a dictionary of
#the features will be created in order to get a dictionary of features for each genre in an organised manner.
for i,sub_genre in enumerate(most_frequent):

    print("Processing {0} out of {1}".format(i+1,len(most_frequent)))

    #rows of the genre table that hold exactly this subgenre in any column.
    #An equality test is used instead of a regular expression built from the genre
    #name: the name was not escaped and the pattern also accepted longer genre
    #strings that merely start with it.
    has_sub_genre = lastfm_sounds_intersected_N.isin([sub_genre]).any(axis=1)
    sub_genre_ids = lastfm_sounds_intersected_N.index[has_sub_genre]

    #high-level rows of those sounds, selected once per subgenre instead of once per feature
    sub_genre_features = highlevel_features.loc[highlevel_features.index.isin(sub_genre_ids)]

    #one list of values per feature column, in the row order of highlevel_features
    information_highlevel[sub_genre] = {
        feature: sub_genre_features[feature].tolist()
        for feature in list(sub_genre_features)
    }

    #wipe the progress line before the next iteration prints its own
    clear_output()

print("Done!")

Create plots for each feature

In [None]:
#one box plot per high-level feature column
for feature in list(highlevel_features):

    #labels of the boxes: the most frequent subgenres
    list_names = most_frequent

    #data of the boxes, in the same order as the labels
    list_data = [information_highlevel[sub_genre][feature] for sub_genre in most_frequent]

    plotbox(
        feature_name=feature,
        list_data=list_data,
        list_names=list_names,
        directory=IMAGE_FOLDER,
    )

## Now the same is done for the instrumental, voice, female and male features

Same procedure as before: scan all the json files searching for the features