In [1]:
import yaml
import json 
import collections

In [2]:
def flatten_tree(elem, path, branches):
    """Flatten nested lists/dictionaries into lists of strings (branches).

    Walks ``elem`` depth-first. Every dictionary key extends the current
    path, lists are traversed without extending the path, and any other
    value is treated as a leaf that closes one branch.

    Args:
        elem: The (sub)tree to walk: a dict, a list, or a leaf value.
        path: Keys visited on the way down to ``elem``. Any falsy value
            (``None`` or ``[]``) means "at the root".
        branches: Output list, mutated in place. One list of strings is
            appended per leaf, ordered from the outermost key to the leaf.

    Returns:
        None. Results are accumulated in ``branches``.
    """
    if not path:
        path = []

    if isinstance(elem, dict):
        for key, subtree in elem.items():
            # Keys are stringified exactly like leaves, so a key that was
            # parsed as a number/bool still yields an all-string branch.
            flatten_tree(subtree, path + [str(key)], branches)
    elif isinstance(elem, list):
        # A list only groups siblings; it adds no level to the path.
        for item in elem:
            flatten_tree(item, path, branches)
    else:
        branches.append(path + [str(elem)])

def load_genre_taxonomy(taxonomy_file):
    """Load a YAML genre tree and map every genre to its top-level genre.

    Args:
        taxonomy_file: Path of the YAML file holding the nested genre tree.

    Returns:
        dict: ``{genre: root}`` for every genre found on a root-to-leaf
        branch of the tree, where ``root`` is the first element of that
        branch. A top-level genre maps to itself. When a genre occurs on
        several branches, the branch flattened last wins.
    """
    import yaml

    # yaml.load() without an explicit Loader is deprecated since PyYAML 5.1
    # and raises TypeError on PyYAML 6. safe_load parses plain
    # lists/dicts/scalars and refuses to construct arbitrary Python objects.
    with open(taxonomy_file, 'r') as tree_file:
        genres_tree = yaml.safe_load(tree_file)

    branches = []
    flatten_tree(genres_tree, [], branches)

    # Every node on a branch (sub-genre, sub-sub-genre, ...) is mapped to
    # the branch's first element, i.e. the top-level genre, not to its
    # direct parent.
    genre_d = {}
    for branch in branches:
        root = branch[0]
        # Leaf-first order keeps the key insertion order of the mapping
        # the same as walking the branch from the leaf upwards.
        for genre in reversed(branch):
            genre_d[genre] = root
    return genre_d

In [3]:
genres_file = 'jamendo-genres-original.json'

# Read the raw annotations; the loop further down iterates them as
# {id: iterable of genre tags}. The context manager closes the handle as
# soon as parsing is done instead of leaving it open in the kernel.
with open(genres_file) as genres_fh:
    original_genres = json.load(genres_fh)

# Lookup table {genre: top-level genre} built from the YAML genre tree.
taxonomy = load_genre_taxonomy("jamendo-beets-genres-tree.yaml")

In [4]:
# Cleaned annotations: {id: [[top-level genre, '100'], ...]}.
data = dict()

for idx, tags in original_genres.items():
    # Replace every known tag by its top-level genre; tags missing from
    # the taxonomy are dropped.
    # NOTE(review): '100' is a constant string paired with every genre,
    # presumably a weight/confidence expected downstream; confirm.
    # NOTE(review): several tags can map to the same top-level genre, so
    # an entry may contain repeated pairs; confirm this is intended.
    vals = [[taxonomy[tag], '100'] for tag in tags if tag in taxonomy]
    # Entries that end up with no known genre are left out entirely.
    if vals:
        data[idx] = vals

# Close the output explicitly so the JSON is fully flushed to disk before
# any later cell or external tool reads it.
with open('jamendo-genres-clean.json', 'w') as clean_fh:
    json.dump(data, clean_fh)

In [5]:
import numpy as np

# Spot-check: print ten randomly chosen entries (drawn with replacement)
# before and after the mapping.
# A locally seeded generator makes the preview identical on every
# "Restart & Run All" without touching NumPy's global random state.
rng = np.random.RandomState(0)
idxs = list(data.keys())

# randint raises ValueError on an empty range, so skip the preview when
# no entry survived the cleaning step.
if idxs:
    for i in rng.randint(0, len(idxs), 10):
        print('#'*20)
        print('source')
        print(original_genres[idxs[i]])

        print('target')
        print(data[idxs[i]])
    
    

####################
source
['classical']
target
[['classical', '100']]
####################
source
['soundtrack', 'electronic', 'electropop', 'ambient']
target
[['electronic', '100'], ['electronic', '100'], ['electronic', '100']]
####################
source
['electronic', 'instrumentalpop']
target
[['electronic', '100']]
####################
source
['soundtrack', 'electronic', 'easylistening', 'ambient']
target
[['electronic', '100'], ['easy listening', '100'], ['electronic', '100']]
####################
source
['electronic', 'ambient']
target
[['electronic', '100'], ['electronic', '100']]
####################
source
['pop']
target
[['pop', '100']]
####################
source
['pop', 'rock', 'poprock']
target
[['pop', '100'], ['rock', '100'], ['rock', '100']]
####################
source
['atmospheric', 'choir', 'darkambient']
target
[['electronic', '100']]
####################
source
['electronic', 'lounge', 'ambient']
target
[['electronic', '100'], ['electronic', '100']]
############