# Data Enrichment
Goal: Take the MIDI statistics already computed for the MMD dataset and enrich them as follows:
- add
	- all discogs
		- genre
		- subgenre
	- artist name
	- track name
- replace
	- instrument numbers with instrument names
- recalculate
	- tempo

In [1]:
# import necessary libraries
import pandas as pd
import json
import os
import pretty_midi

# load the stats data
stats = pd.read_csv('../data/statistics_v1.csv')


def _load_jsonl(path):
    """Read a JSON Lines file and return a list with one decoded object per line.

    The file is opened as UTF-8 explicitly instead of relying on the platform
    default encoding, because the scraped titles and artists contain
    non-ASCII characters. Blank lines are skipped so that a trailing newline
    or an empty line does not make ``json.loads`` fail.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# load the jsonl files
genre = _load_jsonl('../data/mmd_matches/MMD_audio_matched_genre.jsonl')
names = _load_jsonl('../data/mmd_matches/MMD_scraped_title_artist.jsonl')
scraped_genre = _load_jsonl('../data/mmd_matches/MMD_scraped_genre.jsonl')

In [2]:
# Turn the instrument numbers into instrument names for easier reading
def get_instrument_names(instrument_numbers):
    """Translate a comma separated string of instrument numbers into names.

    Args:
        instrument_numbers: comma separated instrument numbers, e.g.
            ``"0, 24, 33"``. Whitespace around the commas is optional.
            A missing value (None / NaN) is also accepted.

    Returns:
        The matching instrument names joined with ``', '``, in the same order
        as the input. An empty or whitespace-only string yields ``''``; a
        missing value is returned unchanged so it stays missing in the frame.

    Raises:
        ValueError: if a non-empty token is not an integer (raised by
            ``int``); ``pretty_midi`` may also reject a number it does not
            know.
    """
    # an empty CSV cell reaches this function as NaN, which has no .split()
    if pd.isna(instrument_numbers):
        return instrument_numbers

    # split on the bare comma and strip, so "0,1" and "0, 1" both work;
    # empty tokens (empty string, trailing comma) are ignored
    tokens = (token.strip() for token in instrument_numbers.split(','))
    return ', '.join(
        pretty_midi.program_to_instrument_name(int(token))
        for token in tokens
        if token
    )

# swap every row's instrument-number string for the matching instrument names
stats['instruments'] = stats['instruments'].map(get_instrument_names)

# show the first row as a quick check of the converted column
stats.head(1)

Unnamed: 0.1,Unnamed: 0,md5,n_instruments,n_unique_instruments,instruments,instrument_families,number_of_instrument_families,n_notes,n_unique_notes,average_n_unique_notes_per_instrument,...,n_key_changes,n_tempo_changes,tempo_estimate,main_time_signature,n_time_signature_changes,track_length_in_seconds,lyrics_nb_words,lyrics_unique_words,lyrics_bool,genre_discogs
0,0,ac45c832a78728aa8822a3df637682c3,14,13,"Acoustic Grand Piano, Pad 1 (new age), FX 1 (r...","Synth Pad, Guitar, Bass, Synth Effects, Chroma...",10,3643,58,11.214286,...,1,4,243,4/4,1,221,223,95,True,electronic


In [3]:
# One small frame per song record; reset_index turns each frame's row labels
# into an ordinary column so they survive the concatenation below
genre_dfs = [pd.DataFrame(record).reset_index() for record in genre]

# Stack the per-song frames (columns a record lacks are filled with nulls),
# discard the tagtraum and lastfm columns, name the three remaining columns
# by position, drop incomplete rows and renumber the index
genre_df = (
    pd.concat(genre_dfs, ignore_index=True)
    .drop(columns=['genre_tagtraum', 'genre_lastfm'])
    .set_axis(['genre', 'md5', 'genre_count'], axis=1)
    .dropna()
    .reset_index(drop=True)
)

# the genre label is split on '---' into a genre part and a style part
genre_style = genre_df['genre'].str.split('---', expand=True)
genre_df[['genre', 'style']] = genre_style

# rows left with a null (e.g. a label without a style part) are removed
genre_df = genre_df.dropna()

# consensus_genre is True on every row whose genre_count equals the highest
# genre_count of its md5; ties therefore mark several rows per md5
top_count = genre_df.groupby('md5')['genre_count'].transform('max')
genre_df['consensus_genre'] = genre_df['genre_count'].eq(top_count)

display(genre_df)

# export genre_df to csv
# genre_df.to_csv('../data/mmd_genre_discogs.csv', index=False)

Unnamed: 0,genre,md5,genre_count,style,consensus_genre
1,electronic,ac45c832a78728aa8822a3df637682c3,49.0,synth-pop,True
3,"folk, world, & country",ac45c832a78728aa8822a3df637682c3,1.0,country,False
6,electronic,a8751ddd06d17d27685354258760b4e9,2.0,big beat,True
7,electronic,a8751ddd06d17d27685354258760b4e9,2.0,breaks,True
8,electronic,a8751ddd06d17d27685354258760b4e9,1.0,deep house,False
...,...,...,...,...,...
218793,rock,0e76c737c1f27ac0e5e1e7455a63c8ec,1.0,death metal,True
218794,rock,0e76c737c1f27ac0e5e1e7455a63c8ec,1.0,folk metal,True
218796,jazz,0559531dd9382e9f7315011549e5de50,1.0,easy listening,True
218799,rock,02a04218c03da2ecbfc06b1a73717045,1.0,doom metal,True


In [4]:
# Create a dataframe with the names data: one row per scraped record.
# Later cells read its 'md5' and 'title_artist' columns.
names_df = pd.DataFrame(names)

def get_artist_and_titles(row):
    """Extract the artist and title from one scraped ``title_artist`` value.

    Only the first entry of ``row`` is used. That entry is read as a pair
    whose element 0 is the title and element 1 is the artist.

    Args:
        row: the ``title_artist`` value of one record, an indexable
            collection of entries.

    Returns:
        tuple: ``(artist, title)``.
            * complete entry: ``(entry[1], entry[0])``
            * entry with a single element: that element is returned as the
              artist and the title is ``'unknown'``
            * no usable entry (``row`` missing or empty, or an empty entry):
              ``(None, 'unknown')``, so the artist shows up as a null value
              instead of the whole run failing on one bad record
    """
    lookup_errors = (IndexError, KeyError, TypeError)

    try:
        first = row[0]
    except lookup_errors:
        # row is empty or not indexable at all (e.g. None / NaN)
        return None, 'unknown'

    try:
        return first[1], first[0]
    except lookup_errors:
        pass

    # Only one element is available: keep the original behavior of
    # reporting it as the artist with an unknown title.
    try:
        return first[0], 'unknown'
    except lookup_errors:
        return None, 'unknown'

# Run the extractor over every title_artist value; each result is an
# (artist, title) pair
title_artist = names_df['title_artist'].map(get_artist_and_titles)

# unpack the pairs into their own columns, then retire the raw column
names_df['artist'] = title_artist.map(lambda pair: pair[0])
names_df['title'] = title_artist.map(lambda pair: pair[1])
names_df.drop(columns='title_artist', inplace=True)

names_df

# export names_df to csv
# names_df.to_csv('../data/mmd_title_artist.csv', index=False)

Unnamed: 0,md5,artist,title
0,be51e8259904879e561850e140e997dd,Henry Thomas Smart,'Tis break of day
1,272257791018b3c151029b49734b26a6,Fabio Fresi,Ave Maria
2,2596615b7dc995ff733995786fb6dc64,Heinrich Schütz,Lukaspassion
3,17b545f91caf271920727ee61f21e494,Johann Sebastian Bach,"An Wasserflüssen Babylon, BWV 267"
4,4f8ee1160e87907d56d864b5dd75b61a,Heinrich Schütz,Matthäuspassion
...,...,...,...
221499,bc4efc1870e5c27ff6cbfe782adb6452,Who,Behind Blue Eyes
221500,17622f3371012cea61a29c7703006bef,Who,Eminence Front
221501,5cdece727722c41ad3b6757ff9f963d5,Who,Eminence Front2
221502,4c15a1200083f9523be9fa4c62d4edc3,Wild Thing,


In [5]:
# Preview the first rows of names_df to check the extracted artist/title columns
names_df.head()

Unnamed: 0,md5,artist,title
0,be51e8259904879e561850e140e997dd,Henry Thomas Smart,'Tis break of day
1,272257791018b3c151029b49734b26a6,Fabio Fresi,Ave Maria
2,2596615b7dc995ff733995786fb6dc64,Heinrich Schütz,Lukaspassion
3,17b545f91caf271920727ee61f21e494,Johann Sebastian Bach,"An Wasserflüssen Babylon, BWV 267"
4,4f8ee1160e87907d56d864b5dd75b61a,Heinrich Schütz,Matthäuspassion


In [6]:
# Create a dataframe with the scraped genre data
scraped_genre_df = pd.DataFrame(scraped_genre)


def _first_or_none(value):
    """Return ``value[0]``, or None when value is missing, empty or not indexable."""
    try:
        return value[0]
    except (IndexError, KeyError, TypeError):
        return None


# Keep only the first element of each 'genre' cell (presumably the list of
# genre labels for the file -- confirm against the JSONL), then explode so
# every label gets its own row. Cells without a usable first element become
# null instead of aborting the whole cell.
scraped_genre_df['genre'] = scraped_genre_df['genre'].apply(_first_or_none)
scraped_genre_df = scraped_genre_df.explode('genre')

# left merge: every names_df row is kept, with a null genre where nothing
# was scraped for that md5
name_genres_df = names_df.merge(scraped_genre_df, on='md5', how='left')

# drop rows where artist is null
name_genres_df = name_genres_df.dropna(subset=['artist'])

# count distinct md5
name_genres_df['md5'].nunique()

# export name_genres_df to csv
# name_genres_df.to_csv('../data/mmd_scraped_artist_genre.csv', index=False)

221504

In [7]:
# join the discogs genres with artist/title on md5; no 'how' is given, so
# pandas' default join type applies
discogs_genre_names = genre_df.merge(names_df, on='md5')

# restrict to the rows flagged as consensus genre
is_consensus = discogs_genre_names['consensus_genre'].eq(True)
discogs_genre_names = discogs_genre_names[is_consensus]

# export discogs_genre_names to csv
# discogs_genre_names.to_csv('../data/mmd_discogs_genre_with_artists.csv', index=False)

discogs_genre_names

Unnamed: 0,genre,md5,genre_count,style,consensus_genre,artist,title
0,electronic,a8751ddd06d17d27685354258760b4e9,2.0,big beat,True,Bloodbath,Buried By The Dead (2)
1,electronic,a8751ddd06d17d27685354258760b4e9,2.0,breaks,True,Bloodbath,Buried By The Dead (2)
3,rock,a9544a68c5ca8e32c224f25d5a06619d,29.0,punk,True,Ramones (The),BlitzKrieg Bop (2)
5,rock,a077f8d661f6853d92f47c2e08b723b9,4.0,indie rock,True,Pixies,The Happening
9,rock,976ba9b2721959b1a733ae3fbf3bd50c,5.0,hard rock,True,Supergrass,Grace
...,...,...,...,...,...,...,...
59008,electronic,0e59b6273b770ea36e53f73612f16d7f,1.0,tech house,True,Beherit,Sadomatic_Rites
59009,rock,054ec142fbe2338acb7c8a743fe64928,1.0,southern rock,True,Madonna,Over and Over.1
59010,jazz,0559531dd9382e9f7315011549e5de50,1.0,easy listening,True,Hank Williams,Too Many Parties and Too Many Pals
59011,rock,02a04218c03da2ecbfc06b1a73717045,1.0,doom metal,True,"Umbreit, Karl Gottlieb",24 Orgelstücke verschiedener Art


In [8]:
# outer join of genres and names: an md5 present in only one frame is kept
merged_df_full = genre_df.merge(names_df, on='md5', how='outer')

# attach that metadata to the stats rows (left join keeps every stats md5)
# and remove the 'Unnamed: 0' column
stats_enriched = (
    stats.merge(merged_df_full, on='md5', how='left')
    .drop(columns='Unnamed: 0')
)

# rows without a merged genre fall back to their genre_discogs value
fallback_genre = stats_enriched['genre_discogs']
stats_enriched['genre'] = stats_enriched['genre'].fillna(fallback_genre)

# rows without a consensus flag are marked True so the filter below keeps them
stats_enriched['consensus_genre'] = stats_enriched['consensus_genre'].fillna(True)

# genre_discogs is no longer needed once the fallback has been applied
stats_enriched = stats_enriched.drop(columns='genre_discogs')

# retain only the consensus rows
stats_enriched = stats_enriched[stats_enriched['consensus_genre'].eq(True)]

stats_enriched


# Export stats_enriched to csv
# stats_enriched.to_csv('../data/statistics_v2.csv', index=False)

Unnamed: 0,md5,n_instruments,n_unique_instruments,instruments,instrument_families,number_of_instrument_families,n_notes,n_unique_notes,average_n_unique_notes_per_instrument,average_note_duration,...,track_length_in_seconds,lyrics_nb_words,lyrics_unique_words,lyrics_bool,genre,genre_count,style,consensus_genre,artist,title
0,ac45c832a78728aa8822a3df637682c3,14,13,"Acoustic Grand Piano, Pad 1 (new age), FX 1 (r...","Synth Pad, Guitar, Bass, Synth Effects, Chroma...",10,3643,58,11.214286,0.605142,...,221,223,95,True,electronic,49.0,synth-pop,True,,
2,ac547baef487103fd9de9f49d2f6e204,15,12,"Lead 2 (sawtooth), Acoustic Grand Piano, Brass...","Piano, Ensemble, Brass, Organ, Percussive, Syn...",9,8167,55,8.200000,0.109172,...,264,0,0,False,electronic,54.0,disco,True,,
10,a861dcbbbcece2ba042648c20d3425a8,10,10,"Acoustic Guitar (steel), Lead 3 (calliope), Ro...","Piano, Synth Lead, Bass, Guitar, Organ",5,7691,47,13.600000,0.535591,...,305,534,176,True,rock,6.0,southern rock,True,,
12,a077f8d661f6853d92f47c2e08b723b9,5,5,"String Ensemble 1, Electric Bass (finger), Ele...","Piano, Bass, Guitar, Ensemble",4,3386,30,7.800000,0.378581,...,257,0,0,False,rock,4.0,indie rock,True,Pixies,The Happening
13,aa42e141c3e3ff1f46e9bc676ca61ebb,2,2,"Lead 6 (voice), Pad 4 (choir)","Synth Lead, Synth Pad",2,3290,33,33.000000,0.123359,...,201,0,0,False,classical,11.0,baroque,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60926,0e76c737c1f27ac0e5e1e7455a63c8ec,6,6,"Electric Bass (pick), Electric Piano 2, Trombo...","Bass, Brass, Piano, Ensemble, Organ",5,2676,56,18.833333,0.331527,...,344,0,0,False,rock,1.0,death metal,True,,
60927,0e76c737c1f27ac0e5e1e7455a63c8ec,6,6,"Electric Bass (pick), Electric Piano 2, Trombo...","Bass, Brass, Piano, Ensemble, Organ",5,2676,56,18.833333,0.331527,...,344,0,0,False,rock,1.0,folk metal,True,,
60928,0559531dd9382e9f7315011549e5de50,7,6,"Acoustic Guitar (steel), Acoustic Grand Piano,...","Piano, Organ, Bass, Guitar",4,3917,45,20.000000,0.504030,...,206,0,0,False,jazz,1.0,easy listening,True,Hank Williams,Too Many Parties and Too Many Pals
60929,02a04218c03da2ecbfc06b1a73717045,2,2,"Flute, French Horn","Pipe, Brass",2,723,51,31.500000,0.590501,...,133,0,0,False,rock,1.0,doom metal,True,"Umbreit, Karl Gottlieb",24 Orgelstücke verschiedener Art


In [9]:
import plotly.express as px

# tally how often each style occurs within each genre
style_counts = discogs_genre_names.groupby('genre')['style'].value_counts()
genre_styles = style_counts.reset_index(name='count')

# only styles that occur more than 100 times are plotted
genre_styles = genre_styles.loc[genre_styles['count'] > 100]

# bar chart with the styles on the y-axis and their counts on the x-axis,
# coloured by genre
fig = px.bar(
    genre_styles,
    x='count',
    y='style',
    color='genre',
    title='Styles per genre',
    height=2000,
    width=1000,
)

# write each bar's value next to the bar, formatted with the '.2s' spec
fig.update_traces(texttemplate='%{value:.2s}', textposition='outside')

fig.show()

# export plot to png
fig.write_image("../data/styles_per_genre.png")

genre_styles

Unnamed: 0,genre,style,count
10,classical,baroque,759
11,classical,classical,754
12,classical,romantic,682
13,classical,modern,412
14,classical,contemporary,214
...,...,...,...
229,rock,doom metal,129
230,rock,southern rock,121
231,rock,hardcore,104
276,stage & screen,soundtrack,372


In [14]:
# number of distinct files (md5) for every artist / genre combination
per_artist_genre = name_genres_df.groupby(['artist', 'genre'])['md5'].nunique()
artist_genre = per_artist_genre.reset_index(name='count')

# largest counts first
artist_genre = artist_genre.sort_values('count', ascending=False)

# export to csv
artist_genre.to_csv('../data/artist_genre.csv', index=False)

# number of distinct values in each column
artist_genre.nunique()

artist    15278
genre       130
count       218
dtype: int64