In [1]:
import sys
sys.path.append('..')

import numpy as np

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score

from lyricpsych.utils import load_csr_data, prepare_feature
from lyricpsych.files import mxm2msd as mxm2msd_fn

# Build the mxm -> msd lookup from the comma-separated mapping file whose
# path is returned by `mxm2msd_fn()`, together with the inverse msd -> mxm
# lookup. The file is read inside a `with` block so the handle is closed
# deterministically rather than being left open for the rest of the session.
with open(mxm2msd_fn()) as mapping_file:
    mxm2msd = dict(
        line.strip('\n').split(',') for line in mapping_file
    )
msd2mxm = {msd: mxm for mxm, msd in mxm2msd.items()}

In [2]:
# Load the autotagging data. The loader's three return values are unpacked as
# the label matrix, the track identifiers and the tag identifiers.
# NOTE(review): presumably the rows of `labels` follow `tracks` and the
# columns follow `tags` -- confirm against `load_csr_data`.
autotag_path = '/hdd/data/autotagging_data.h5'
labels, tracks, tags = load_csr_data(autotag_path, 'tracks', 'tags')

In [3]:
# Prepare the text features. `feature` is indexed by feature-group name and
# `track2id` maps a track to its row index (both as used by the next cell).
# NOTE(review): the second argument is passed as None; its meaning is defined
# by `prepare_feature` -- confirm against lyricpsych.utils.
text_feat_path = '/hdd/data/text_feat_test.h5'
feature, track2id, pca, sclr = prepare_feature(text_feat_path, None)

In [4]:
# Assemble the design matrix by stacking the three feature groups column-wise,
# then reorder its rows to follow `tracks`.
# NOTE(review): this presumes the rows of `labels` are ordered like `tracks`,
# so that row i of X and row i of y describe the same track -- confirm.
feature_groups = ('linguistic', 'personality', 'topic')
X = np.concatenate([feature[name] for name in feature_groups], axis=1)
X = X[[track2id[t] for t in tracks]]

# Binarize the labels: every stored entry of the sparse matrix becomes 1.
# This is done on a copy so that `labels` stays exactly as it was loaded;
# assigning `y = labels` and writing to `y.data` would overwrite the raw
# matrix in place through the shared reference.
y = labels.copy()
y.data[:] = 1
y = y.toarray()

# Keep only the rows that have at least one positive label.
whitelist = np.where(y.sum(1) > 0)[0]
X = X[whitelist]
y = y[whitelist]

In [5]:
# Random 80/20 train/test split over the rows of X / y.
# A dedicated, seeded generator is used instead of the global numpy RNG so
# that the split -- and therefore every score reported below -- is the same
# after a kernel restart.
split_rng = np.random.RandomState(0)
rnd_idx = split_rng.permutation(X.shape[0])
n_train = int(X.shape[0] * 0.8)

train_idx, test_idx = rnd_idx[:n_train], rnd_idx[n_train:]
Xtr, Xts = X[train_idx], X[test_idx]
ytr, yts = y[train_idx], y[test_idx]

In [None]:
# Model A: MLP with one hidden layer of 100 units, fitted directly on the
# 2-D binary label matrix.
# `random_state` pins the weight initialisation and the validation split used
# by early stopping, so re-running the cell yields the same model.
# NOTE(review): scikit-learn only uses `learning_rate='adaptive'` when
# solver='sgd'; with the default solver it has no effect -- confirm intent.
# The predictions are stored in `p`, which the evaluation cell reads; the
# other model cells assign to the same name, so the last one run wins.
nn = MLPClassifier(
    hidden_layer_sizes=(100,),
    learning_rate_init=0.001,
    learning_rate='adaptive',
    early_stopping=True,
    alpha=0.01,
    random_state=0,
)
nn.fit(Xtr, ytr)
p = nn.predict_proba(Xts)

In [7]:
# Model B: one-vs-rest random forest, i.e. one 100-tree forest per label
# column. `random_state` makes the fitted forests, and hence the scores,
# reproducible across runs.
# The predictions are stored in `p`, which the evaluation cell reads; the
# other model cells assign to the same name, so the last one run wins.
rf = OneVsRestClassifier(
    RandomForestClassifier(n_estimators=100, random_state=0)
)
rf.fit(Xtr, ytr)
p = rf.predict_proba(Xts)

In [None]:
# Model C: one-vs-rest logistic regression (lbfgs solver), one binary
# classifier per label column. `fit` returns the fitted estimator, so
# construction and training are chained.
# The predictions are stored in `p`, which the evaluation cell reads.
ovr = OneVsRestClassifier(
    LogisticRegression(solver='lbfgs')
).fit(Xtr, ytr)
p = ovr.predict_proba(Xts)

In [8]:
# Score the predictions currently held in `p` against the held-out labels.
#   auc_tag: ROC-AUC computed per label column, then averaged ('macro').
#   auc_trk: ROC-AUC computed per row, then averaged ('samples').
# `average` must be passed by keyword: it is a keyword-only parameter in
# current scikit-learn, and passing it positionally raises a TypeError.
auc_tag = roc_auc_score(yts, p, average='macro')
auc_trk = roc_auc_score(yts, p, average='samples')

print('AUC_c: {:.4f} - AUC_t: {:.4f}'.format(auc_tag, auc_trk))

AUC_c: 0.6121 - AUC_t: 0.7187
