In [1]:
import pandas as pd
import sqlite3
from scipy import sparse as sp
import numpy as np
import h5py

## Load relevant data

In [2]:
# Build lookup tables between track ids and MxM ids from the matches file.
# The field separator is the multi-character string '<SEP>', which only the
# Python parser engine supports; naming the engine explicitly avoids the
# ParserWarning pandas raises when it has to fall back from the C engine.
mxm_matches = pd.read_csv(
    '/hdd/mxm_779k_matches.txt',
    sep='<SEP>', skiprows=18, header=None, engine='python'
)
# Column 0 is used as the track id and column 3 as the MxM id.
tid2mxm = dict(mxm_matches[[0, 3]].values)
mxm2tid = {mxm:tid for tid, mxm in tid2mxm.items()}

# Open read-only explicitly: the default mode has differed between h5py
# versions, and this cell only reads from the file.
# NOTE(review): mxm_tids is not used again in the cells visible here.
with h5py.File('/hdd/data/text_feat_test.h5', 'r') as hf:
    mxm_tids = hf['features']['ids'][:]

  after removing the cwd from sys.path.


In [3]:
# Load the three tables of the Last.fm tag database into memory.
# sqlite3's connection context manager only commits or rolls back the open
# transaction; it does not close the connection, so close it explicitly.
conn = sqlite3.connect('/hdd/data/lastfm_tags.db')
try:
    cur = conn.cursor()
    # The first two columns are shifted by -1 so they can be used as 0-based
    # positions (presumably they are 1-based row ids into `tids` and `tags`
    # -- confirm against the database schema).
    # Iterating the cursor directly avoids materialising a second full list.
    tid_tag = [
        (r[0]-1, r[1]-1, r[2]) for r in
        cur.execute('SELECT * FROM tid_tag')
    ]
    tids = [r[0] for r in cur.execute('SELECT * FROM tids')]
    tags = [r[0] for r in cur.execute('SELECT * FROM tags')]
finally:
    conn.close()

In [4]:
# Number of rows in tid_tag at this point (as loaded, before any filtering).
len(tid_tag)

8598630

### Keep only tracks that have an MxM match

In [5]:
# Track ids that appear both in the tag database and in the MxM matches.
target_tids = set(tids) & set(tid2mxm)

# Positions (within `tids`) of those tracks; tid_tag rows refer to tracks
# by this position in their first field.
target_indices = {
    position
    for position, track_id in enumerate(tids)
    if track_id in target_tids
}

# Drop every row whose track is not one of the targets.
tid_tag = [entry for entry in tid_tag if entry[0] in target_indices]

In [6]:
# Number of rows left after restricting tid_tag to the target tracks.
len(tid_tag)

8267618

### Filter out tags that don't belong to the top-50 list

In [15]:
# Number of most frequent tags to keep.
K = 50

# Count how many rows of tid_tag mention each tag index.
tag_pop = {}
for row in tid_tag:
    tag_pop[row[1]] = tag_pop.get(row[1], 0) + 1

# Tag indices ordered by descending row count, truncated to the first K.
# Kept as a list because its order defines the new tag numbering later on.
top_tags = [t for t, c in sorted(tag_pop.items(), key=lambda x:-x[1])[:K]]

# Test membership against a set: scanning the list for each of the millions
# of rows costs O(K) per row, a set lookup is O(1).
top_tag_set = set(top_tags)
tid_tag = [r for r in tid_tag if r[1] in top_tag_set]

In [16]:
# Number of rows left after restricting tid_tag to the K most frequent tags.
len(tid_tag)

1319704

### Remap track and tag indices to new contiguous ones

In [17]:
# Assign consecutive ids (0, 1, 2, ...) to the kept tracks and tags, in the
# iteration order of `target_indices` and `top_tags` respectively.
unique_tracks = dict(zip(target_indices, range(len(target_indices))))
unique_tags = dict(zip(top_tags, range(len(top_tags))))

# Rewrite each row with the new ids; the third field is carried over as is.
tid_tag = [
    (unique_tracks[track], unique_tags[tag], value)
    for track, tag, value in tid_tag
]

In [None]:
# Assemble the sparse track-by-tag matrix from the re-indexed rows.
I, J, V = zip(*tid_tag)

# Pass the shape explicitly. When it is inferred from the largest index seen,
# tracks at the end of the numbering that have no remaining tag rows are
# silently dropped, and the matrix no longer lines up with the id maps.
Y = sp.coo_matrix(
    (V, (I, J)),
    shape=(len(target_indices), len(top_tags))
).tocsr()

# New id -> original tag string / track id. These are derived from
# `top_tags` and `target_indices` (the same enumeration order that produced
# the new ids) instead of inverting `unique_tags` / `unique_tracks` in place,
# so re-running this cell yields the same maps rather than corrupting them.
unique_tags = {i: tags[t] for i, t in enumerate(top_tags)}
unique_tracks = {i: tids[t] for i, t in enumerate(target_indices)}

## Save the data

In [23]:
# Variable-length string dtype used for both label arrays.
vlen_str = h5py.special_dtype(vlen=str)
n_tracks, n_tags = Y.shape

# Row labels (one per matrix row) and column labels (one per matrix column),
# looked up by position in the id -> label maps.
track_list = np.array(
    [unique_tracks[row] for row in range(n_tracks)], dtype=vlen_str
)
tag_list = np.array(
    [unique_tags[col] for col in range(n_tags)], dtype=vlen_str
)

# The CSR matrix is stored as its three component arrays, followed by the
# two label arrays.
datasets = (
    ('data', Y.data),
    ('indices', Y.indices),
    ('indptr', Y.indptr),
    ('tracks', track_list),
    ('tags', tag_list),
)
with h5py.File('/hdd/data/autotagging_data.h5', 'w') as hf:
    for name, values in datasets:
        hf.create_dataset(name, data=values)