In [1]:
import os
import pandas
import numpy as np
from gensim.models import KeyedVectors

In [2]:
# input
# Embedding file, loaded below with KeyedVectors.load_word2vec_format.
IN_PATH = 'embeddings/lakh.emb'
# Per-track metadata table, loaded below with pandas.read_csv.
CSV_PATH = 'embeddings/lakh.csv'

# output
# NOTE(review): IMG_PATH is not referenced in the cells visible here; presumably an image output folder — confirm.
IMG_PATH = './img-lakh'
# Directory that receives the exported .txt files (created on demand by the export cell).
DATASET_PATH = './dataset-lakh'

In [3]:
# Read the metadata table (the first CSV column becomes the index) and put a
# '?' placeholder wherever a value is missing.
itl = (
    pandas.read_csv(CSV_PATH, index_col=0)
    .replace(np.nan, '?', regex=True)
)
itl.head()

Unnamed: 0,id,song_name,album_name,artist_name,artist_mb,tag_echo,tag_mbz,year
0,R/R/U/TRRRUFD12903CD7092,Wastelands,Alien 4,Hawkwind,5a28f8c2-31fb-4047-ae57-c5c326989262,space rock,british,1994
1,R/R/U/TRRRUTV12903CEA11B,Runaway,Songs of Del Shannon,Del Shannon,2e885bfb-1f59-49cf-8d51-e743445e1b48,ballad,classic pop and rock,1961
2,R/R/U/TRRRUJO128E07813E7,Have You Met Miss Jones? (Swing When Version),Swing When You're Winning,Robbie Williams,db4624cf-0e44-481e-a9dc-2142b833ec2f,british pop,pop,2001
3,R/R/I/TRRRIYO128F428CF6F,Goodbye,Bittersweet,Volebeats,eb567c55-368d-4b85-b969-ca9e3252f9cb,alternative country,?,0
4,R/R/I/TRRRILO128F422FFED,La Colegiala,Musica Tropical De Colombia 5,Rodolfo Y Su Tipica Ra7,ead8d6d9-e58b-4dd8-916f-cf7f359db38e,cumbia,?,1997


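We try to merge strings with similar values (e.g. "hip-hop" and "hip hop") using the Levenshtein distance. Note that a distance of 1 can also merge unrelated tags (e.g. "france" and "trance", or "funk rock" and "punk rock"), so the returned mappings should be reviewed.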

In [4]:
def merge_near_string(what, distance=1, min_length=5):
    """Merge near-identical string values of one column of `itl`, in place.

    Two distinct values are considered the same label when their Levenshtein
    distance is at most `distance`. The value met first (in the order returned
    by `Series.unique()`) is kept and every later near-duplicate is rewritten
    to it.

    Parameters
    ----------
    what : str
        Name of the `itl` column to normalise.
    distance : int, default 1
        Maximum Levenshtein distance at which two values are merged.
    min_length : int, default 5
        Values shorter than this are never merged (nor used as merge targets),
        since short strings collide too easily.

    Returns
    -------
    dict
        Mapping ``{replaced value: value kept}`` that was applied to the column.

    Notes
    -----
    The match is purely lexical, so semantically different labels that happen
    to be one edit apart are merged as well; inspect the returned mapping.
    """
    # Import the function by name: inside this function the identifier
    # `distance` is the threshold parameter, so `import distance` would
    # overwrite it (and silently ignore the caller's argument).
    from distance import levenshtein

    max_distance = distance

    # Filter once instead of re-checking the length inside the nested loop.
    # Non-string values are skipped rather than crashing on len().
    candidates = [
        value for value in itl[what].unique()
        if isinstance(value, str) and len(value) >= min_length
    ]

    replace_list = {}
    for canonical in candidates:
        # A value already scheduled for replacement cannot become a target.
        if canonical in replace_list:
            continue
        for variant in candidates:
            if variant == canonical or variant in replace_list:
                continue
            # The edit distance is never smaller than the length difference,
            # so this cheap test avoids most Levenshtein computations.
            if abs(len(canonical) - len(variant)) > max_distance:
                continue
            if levenshtein(canonical, variant) > max_distance:
                continue
            replace_list[variant] = canonical

    # Applied one pair at a time, in insertion order, so that chained
    # replacements (a -> b, then b -> c) resolve exactly as they were found.
    for variant, canonical in replace_list.items():
        itl[what] = itl[what].replace(variant, canonical)
    return replace_list

In [5]:
# Normalises itl['tag_mbz'] in place; the returned replacement mapping is shown as the cell output.
merge_near_string('tag_mbz')

{'funk rock': 'punk rock',
 'electronica': 'electronic',
 'italia': 'italian',
 'hip hop': 'hip-hop',
 'hiphop': 'hip-hop',
 'australian': 'australia',
 'australie': 'australia',
 'france': 'trance',
 'synth-pop': 'synthpop',
 'orchestral': 'orchestra',
 'popera': 'opera',
 'austria': 'austrian',
 'dark wave': 'darkwave',
 'post rock': 'post-rock',
 'argentine': 'argentina',
 'post-hardcore': 'post hardcore',
 'electropop': 'electro pop'}

In [6]:
# Normalises itl['tag_echo'] in place; the returned replacement mapping is shown as the cell output.
merge_near_string('tag_echo')

{'hip pop': 'hip hop',
 'electronica': 'electronic',
 'freakbeat': 'breakbeat',
 'new rave': 'new wave'}

In [7]:
# New column `ids`: the part of `id` that follows its last '/'.
itl['ids'] = itl['id'].map(lambda track_path: track_path.rsplit('/', 1)[-1])
itl.head()

Unnamed: 0,id,song_name,album_name,artist_name,artist_mb,tag_echo,tag_mbz,year,ids
0,R/R/U/TRRRUFD12903CD7092,Wastelands,Alien 4,Hawkwind,5a28f8c2-31fb-4047-ae57-c5c326989262,space rock,british,1994,TRRRUFD12903CD7092
1,R/R/U/TRRRUTV12903CEA11B,Runaway,Songs of Del Shannon,Del Shannon,2e885bfb-1f59-49cf-8d51-e743445e1b48,ballad,classic pop and rock,1961,TRRRUTV12903CEA11B
2,R/R/U/TRRRUJO128E07813E7,Have You Met Miss Jones? (Swing When Version),Swing When You're Winning,Robbie Williams,db4624cf-0e44-481e-a9dc-2142b833ec2f,british pop,pop,2001,TRRRUJO128E07813E7
3,R/R/I/TRRRIYO128F428CF6F,Goodbye,Bittersweet,Volebeats,eb567c55-368d-4b85-b969-ca9e3252f9cb,alternative country,?,0,TRRRIYO128F428CF6F
4,R/R/I/TRRRILO128F422FFED,La Colegiala,Musica Tropical De Colombia 5,Rodolfo Y Su Tipica Ra7,ead8d6d9-e58b-4dd8-916f-cf7f359db38e,cumbia,?,1997,TRRRILO128F422FFED


In [8]:
ids = itl['ids'].values
# Set copy of the ids: constant-time membership tests instead of scanning the
# whole array once per embedding key.
id_set = set(ids)

midi_embedding = KeyedVectors.load_word2vec_format(IN_PATH)

# gensim >= 4.0 exposes the vocabulary as `index_to_key`; older releases only
# have `index2entity`. Support both so the cell runs on either version.
embedding_keys = getattr(midi_embedding, 'index_to_key', None)
if embedding_keys is None:
    embedding_keys = midi_embedding.index2entity

# Keep only the embedding keys that have a metadata row, in embedding order.
uris = [key for key in embedding_keys if key in id_set]

# Vectors aligned index-by-index with `uris`.
vectors = [midi_embedding.get_vector(key) for key in uris]

In [9]:
def extract(what, x, t=None):
    """Return one property of a track, looked up by id, as a string.

    Parameters
    ----------
    what : str
        Name of the `itl` column to read.
    x : str
        Value to match against the `ids` column; the first matching row is used.
    t : type, optional
        When given, the stored value must be exactly of this type; otherwise
        the placeholder '?' is returned.

    Returns
    -------
    str
        `str()` of the stored value, or '?' when the type check fails.

    Raises
    ------
    IndexError
        If no row of `itl` has `ids == x`.
    """
    item = itl.loc[itl['ids'] == x, what].values[0]
    if t and type(item) != t:
        return '?'
    # Built-in str: the `np.str` alias was deprecated in NumPy 1.20 and
    # removed in 1.24.
    return str(item)

Clean the data and export it as plain-text files for TensorFlow

In [10]:
def vec_to_string(vector):
    """Render `vector` as the `str()` of each element, separated by single spaces."""
    return ' '.join(map(str, vector))

In [11]:
def _write_lines(filename, lines):
    """Write `lines` to DATASET_PATH/filename, newline-separated, no trailing newline."""
    # Explicit UTF-8 so the output does not depend on the platform's default
    # encoding (tag and title strings may contain non-ASCII characters).
    with open(os.path.join(DATASET_PATH, filename), 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))


# exist_ok avoids the check-then-create race and also creates missing parents.
os.makedirs(DATASET_PATH, exist_ok=True)

# Every file has one line per entry of `uris`, in the same order, so line i of
# each file describes the same track.
_write_lines('id.txt', uris)
_write_lines('vectors.txt', [vec_to_string(vector) for vector in vectors])

# t=str: values that are not plain strings are written as '?'.
# (Built-in str replaces the `np.str` alias, which NumPy 1.24 removed.)
_write_lines('tag_mbz.txt', [extract('tag_mbz', uri, t=str) for uri in uris])
_write_lines('tag_echo.txt', [extract('tag_echo', uri, t=str) for uri in uris])

_write_lines('year.txt', [extract('year', uri) for uri in uris])