# Content-based Recommendation

## Part1: profiling the tracks

In the first part we try to extract features that represents the content of a certain track. We tried two kind of features: lyrics features and audio features. Finally, the lyrics of a song is reduced to a 4-entry profile and the audio features of a song is reduced to a 9-entry profile.

### Part1.1: baseline model for emotion recognition

In this part we looked for a way to represent and model the lyrics so that it can be used to predict emotion of a track. We compared two representations: bag of words + term frequency inverse document frequency vs. raw lyrics + feature hashing. We used these two representations as input to train a classifier that differentiate happy songs and sad songs and compared the performance of different classification models (SVM, random forest, naive Bayes, logistic regression). We finally chose raw lyrics + feature hasing + random forest as our baseline model.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import sqlite3
import json
from scipy.sparse                    import csr_matrix, vstack
from sklearn.feature_extraction.text import TfidfTransformer, HashingVectorizer
from sklearn.svm                     import SVC
from sklearn.ensemble                import RandomForestClassifier
from sklearn.naive_bayes             import GaussianNB
from sklearn.linear_model            import LogisticRegression

In [2]:
MXM_PATH  = 'databases/mxm_dataset.db'
TAG_PATH  = 'databases/lastfm_tags.db'
META_PATH = 'databases/track_metadata.db'
WIKI_PATH = 'databases/lyricwiki.db'

conn_mxm  = sqlite3.connect(MXM_PATH)
conn_tag  = sqlite3.connect(TAG_PATH)
conn_meta = sqlite3.connect(META_PATH)
conn_wiki = sqlite3.connect(WIKI_PATH)

#### BOW + TF-IDF

In [3]:
"""Get the list of 5000 words used for musiXmatch's BOW representation"""
def get_mxm_vocab(conn_mxm):
    sql  = "SELECT * FROM words"
    res  = conn_mxm.execute(sql)
    data = res.fetchall()
    mxm_vocab = [t[0] for t in data]
    return mxm_vocab

In [4]:
mxm_vocab = get_mxm_vocab(conn_mxm)
mxm_dict  = {mxm_vocab[i] : i for i in range(len(mxm_vocab))}

In [5]:
"""Get the BOW regresentation of the tracks in the form of a sparse matrix"""
def get_bagofwords(tids, mxm_dict, conn_mxm):
    bows = []
    for tid in tids:
        sql  = "SELECT word, count FROM lyrics WHERE track_id='{}'".format(tid)
        res  = conn_mxm.execute(sql)
        data = res.fetchall()
        col  = np.array([mxm_dict[t[0]] for t in data], dtype=np.int16)
        row  = np.zeros(len(col),                       dtype=np.int16)
        cnt  = np.array([t[1] for t in data] )
        bow  = csr_matrix((cnt, (row, col)), shape=(1, 5000))
        bows.append(bow)
    return vstack(bows)

In [6]:
"""Get the track ids. tagged with a certain tag"""
def get_tids_oftag(tag, conn_tag):
    sql  = """SELECT tids.tid FROM tid_tag, tids, tags 
              WHERE tids.ROWID=tid_tag.tid AND tid_tag.tag=tags.ROWID 
              AND tags.tag='{}'""".format(tag)
    res  = conn_tag.execute(sql)
    data = res.fetchall()
    return [t[0] for t in data]

In [7]:
# Get the BOW representation for tracks labelled with 'happy' and 'sad'.
happy_tids = get_tids_oftag('happy', conn_tag)
happy_bows = get_bagofwords(happy_tids, mxm_dict, conn_mxm)
sad_tids   = get_tids_oftag('sad',   conn_tag)
sad_bows   = get_bagofwords(sad_tids, mxm_dict,   conn_mxm)

In [8]:
# Do the term frequency - inverse document frequency transformation.
all_bows = vstack([happy_bows, sad_bows])
tfidf = TfidfTransformer()
all_bows_tfidf = tfidf.fit_transform(all_bows)
all_bows_tfidf.shape

(24716, 5000)

In [9]:
X = all_bows_tfidf.copy().toarray()
y = ['happy']*(happy_bows.shape[0]) + \
    ['sad']*(sad_bows.shape[0])
y = np.array([1 if t == 'happy' else 0 for t in y])

In [10]:
nonzeros = np.sum(X, axis=1) != 0.0
X_nonzero = X[nonzeros]
y_nonzero = y[nonzeros]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X_nonzero, y_nonzero, test_size=0.5)

In [12]:
forest = RandomForestClassifier(
    n_estimators=300, max_features=70, max_depth=None, min_samples_split=2)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=70, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
forest.score(X_train, y_train)

0.984089845577913

In [14]:
forest.score(X_test, y_test)

0.6620399251403618

In [15]:
naive = GaussianNB()
naive.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [16]:
naive.score(X_train, y_train)

0.7622835751052878

In [17]:
naive.score(X_test, y_test)

0.5920149719276356

In [18]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [19]:
logreg.score(X_train, y_train)

0.7719544532834192

In [20]:
logreg.score(X_test, y_test)

0.6781035558328135

In [24]:
# Let's see the top 20 words that indicate happiness
top = np.argsort(logreg.coef_[0])[::-1]
for i in top[:20]: 
    print(mxm_vocab[i])

girl
got
danc
gonna
get
happi
rock
up
about
citi
fine
shine
doo
jump
fun
kiss
hot
sky
check
ya


In [21]:
svc = SVC()
svc.fit(X_train, y_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [22]:
svc.score(X_train, y_train)

0.5484323818437061

In [23]:
svc.score(X_test, y_test)

0.5483468496568933

#### raw lyrics + feature hashing

In [25]:
"""Get the artist and title for tracks with tids"""
def get_artistsandtitles(tids, conn_meta):
    AandTs = []
    for tid in tids:
        sql = "SELECT artist_name, title FROM songs WHERE track_id='{}'".format(tid)
        res = conn_meta.execute(sql)
        AandTs.append(res.fetchall()[0])
    return AandTs

In [26]:
"""Get the raw lyrics of songs with certain artist and title"""
def get_lyrics(AandTs, conn_wiki):
    queryconds = [(t[0].lower().strip(), t[1].lower().strip()) for t in AandTs]
    lyricslist = []
    total_found = 0
    for n, t in enumerate(queryconds):
        res = conn_wiki.execute("""SELECT lyrics FROM songs WHERE artist=? AND title=?""", (t[0], t[1]))
        try:
            lyricslist.append(res.fetchall()[0][0])
            total_found = total_found + 1
        except:
            continue
    return lyricslist

In [27]:
happy_AandTs = get_artistsandtitles(happy_tids, conn_meta)
happy_lyrics = get_lyrics(happy_AandTs, conn_wiki)
sad_AandTs   = get_artistsandtitles(sad_tids,   conn_meta)
sad_lyrics   = get_lyrics(sad_AandTs,   conn_wiki)

In [28]:
def get_nonempty(lyrics):
    nonempty = []
    for l in lyrics:
        if len(l) > 0:
            nonempty.append(l)
    return nonempty

In [29]:
happy_lyrics_nonempty = get_nonempty(happy_lyrics)
sad_lyrics_nonempty   = get_nonempty(sad_lyrics)

In [30]:
vectorizer = HashingVectorizer(n_features=1000)
anotherX   = vectorizer.fit_transform(
    happy_lyrics_nonempty+sad_lyrics_nonempty)

In [31]:
anothery = ['happy']*(len(happy_lyrics_nonempty)) \
    + ['sad']*(len(sad_lyrics_nonempty))

In [32]:
anotherX_train, anotherX_test, anothery_train, anothery_test = train_test_split(
    anotherX, anothery, test_size=0.5)

In [33]:
anotherforest = RandomForestClassifier(
    n_estimators=300, max_features=30, max_depth=None, min_samples_split=2)
anotherforest.fit(anotherX_train, anothery_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=30, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [34]:
anotherforest.score(anotherX_train, anothery_train)

0.9799670044779637

In [35]:
anotherforest.score(anotherX_test, anothery_test)

0.7509131613055261

In [36]:
anothernaive = GaussianNB()
anothernaive.fit(anotherX_train.toarray(), anothery_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [37]:
anothernaive.score(anotherX_train.toarray(), anothery_train)

0.6586141880744756

In [38]:
anothernaive.score(anotherX_test.toarray(), anothery_test)

0.615411806291976

In [39]:
anotherlogreg = LogisticRegression()
anotherlogreg.fit(anotherX_train, anothery_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [40]:
anotherlogreg.score(anotherX_train, anothery_train)

0.7231911383455103

In [41]:
anotherlogreg.score(anotherX_test, anothery_test)

0.6849298927771886

In [42]:
anothersvc = SVC()
anothersvc.fit(anotherX_train, anothery_train)



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [43]:
anothersvc.score(anotherX_train, anothery_train)

0.5136695734150365

In [44]:
anothersvc.score(anotherX_test, anothery_test)

0.5167903852951573

### Part1.2: comparing LSTM with baseline model for emotion recognition

to gain a better classification accuracy, we tried more complex models and representation. Here we compared the performance of word2vec+LSTM against our baseline model. We found that our baseline model out performed this new model in all cases. So we chose our baseline model as our final model for lyrics-based emotion recognition.

#### Training set and Validation set for comparison

In [13]:
import os
import numpy as np
%matplotlib inline
import sqlite3
from sklearn.model_selection import train_test_split
from langdetect              import detect

In [20]:
"""Filter out non-English lyrics"""
def get_english(lyrics):
    english = []
    for l in lyrics:
        try:
            if detect(l) == 'en':
                english.append(l)
        except: continue
    return english

In [21]:
"""Create training set and validation for bi-polar tags."""
def create_train_test(tag1, tag2):
    print('Collecting tids...')
    tag1_tids = get_tids_oftag(tag1, conn_tag)
    tag2_tids = get_tids_oftag(tag2, conn_tag)
    print('Collecting artists and titles...')
    tag1_AandTs = get_artistsandtitles(tag1_tids, conn_meta)
    tag2_AandTs = get_artistsandtitles(tag2_tids, conn_meta)
    print('Collecting lyrics...')
    tag1_lyrics = get_lyrics(tag1_AandTs, conn_wiki)
    tag2_lyrics = get_lyrics(tag2_AandTs, conn_wiki)
    print('{} tracks collected for tag {}, {} tracks collected for tag {}'.format(\
          len(tag1_lyrics), tag1, len(tag2_lyrics), tag2))
    print('Filtering out empty lyrics...')
    tag1_lyrics_nonempty = get_nonempty(tag1_lyrics)
    tag2_lyrics_nonempty = get_nonempty(tag2_lyrics)
    print('Filtering out non-English lyrics...')
    tag1_lyrics_nonempty = get_english(tag1_lyrics_nonempty)
    tag2_lyrics_nonempty = get_english(tag2_lyrics_nonempty)
    print('{} nonempty lyrics for tag {}, {} nonempty lyrics for tag {}'.format(\
          len(tag1_lyrics_nonempty), tag1, len(tag2_lyrics_nonempty), tag2))
    print('Creating predictor set and target set...')
    X = np.array(        tag1_lyrics_nonempty  +         tag2_lyrics_nonempty )
    y = np.array([1]*len(tag1_lyrics_nonempty) + [0]*len(tag2_lyrics_nonempty))
    print('Splitting training set and validation set...')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    print('writing to disk...')
    dirname = tag1+'-'+tag2
    os.mkdir(dirname)
    np.save(dirname + '/X_train.npy', X_train)
    np.save(dirname + '/X_test.npy',  X_test)
    np.save(dirname + '/y_train.npy', y_train)
    np.save(dirname + '/y_test.npy',  y_test)
    print('Finished.')

In [22]:
create_train_test('happy', 'sad')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
8508 tracks collected for tag happy, 9078 tracks collected for tag sad
Filtering out empty lyrics...
Filtering out non-English lyrics...
7724 nonempty lyrics for tag happy, 8300 nonempty lyrics for tag sad
Creating predictor set and target set...
Splitting training set and validation set...
writing to disk...
Finished.


In [23]:
create_train_test('relax', 'Energetic')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
6287 tracks collected for tag relax, 4521 tracks collected for tag Energetic
Filtering out empty lyrics...
Filtering out non-English lyrics...
5213 nonempty lyrics for tag relax, 4068 nonempty lyrics for tag Energetic
Creating predictor set and target set...
Splitting training set and validation set...
writing to disk...
Finished.


In [25]:
create_train_test('cool', 'oldies')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
12457 tracks collected for tag cool, 12769 tracks collected for tag oldies
Filtering out empty lyrics...
Filtering out non-English lyrics...
11064 nonempty lyrics for tag cool, 11970 nonempty lyrics for tag oldies
Creating predictor set and target set...
Splitting training set and validation set...
writing to disk...
Finished.


In [26]:
create_train_test('sweet', 'dark')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
5169 tracks collected for tag sweet, 4768 tracks collected for tag dark
Filtering out empty lyrics...
Filtering out non-English lyrics...
4773 nonempty lyrics for tag sweet, 4228 nonempty lyrics for tag dark
Creating predictor set and target set...
Splitting training set and validation set...
writing to disk...
Finished.


#### Performance of baseline model (feature hashing + random forest) vs new model (word2vec + LSTM)

In [None]:
from keras.models                    import Sequential
from keras.layers                    import Dense, Conv1D, MaxPooling1D, LSTM, Dropout
from keras.layers.embeddings         import Embedding
from keras.preprocessing             import sequence

In [None]:
def train_forest(tag1, tag2):
    dirname = tag1 + '-' + tag2
    vectorizer = HashingVectorizer(n_features=1000)
    
    print('Preprocessing training set...')
    X_train_forest = vectorizer.fit_transform(np.load(dirname + '/X_train.npy'))
    y_train_forest = np.load(dirname + '/y_train.npy')
    print('Preprocessing validation set...')
    X_test_forest  = vectorizer.transform(np.load(dirname + '/X_test.npy'))
    y_test_forest  = np.load(dirname + '/y_test.npy')
    print('Training random forest...')
    forest = RandomForestClassifier(
    n_estimators=300, max_features=30, max_depth=None, min_samples_split=2)
    forest.fit(X_train_forest, y_train_forest)
    print('Finished.')
    return forest, vectorizer, X_train_forest, y_train_forest, X_test_forest, y_test_forest

In [None]:
def show_performance(res):
    print('Training accuracy:', res[0].score(res[2], res[3]))
    print('Validation accuracy:', res[0].score(res[4], res[5]))

In [8]:
def lyrics2intarray(lyrics, tokenizer, stemmer):
    lyrics_tokenized = tokenizer.tokenize(lyrics)
    lyrics_stemmed   = [stemmer.stem(w) for w in lyrics_tokenized]
    L = len(lyrics_stemmed)
    intarray = np.zeros(L, dtype=np.int32)
    for i in range(L):
        intarray[i] = mxm_dict.get(lyrics_stemmed[i], 0)
    return intarray

In [9]:
def train_lstm(tag1, tag2):
    stemmer = SnowballStemmer('english')
    tokenizer = RegexpTokenizer(r'\w+')
    dirname = tag1 + '-' + tag2
    print('Preprocessing training set...')
    X_train_lstm = [lyrics2intarray(l, tokenizer, stemmer) for l in np.load(dirname + '/X_train.npy')]
    y_train_lstm = np.load(dirname + '/y_train.npy')
    print('Preprocessing validation set...')
    X_test_lstm  = [lyrics2intarray(l, tokenizer, stemmer) for l in np.load(dirname + '/X_test.npy')]
    y_test_lstm  = np.load(dirname + '/y_test.npy')
    print('Trimming input...')
    max_lyrics_length = 150
    X_train_lstm = sequence.pad_sequences(X_train_lstm, maxlen=max_lyrics_length)
    X_test_lstm  = sequence.pad_sequences(X_test_lstm,  maxlen=max_lyrics_length)
    print('Training LSTM...')
    embedding_vecor_length = 64
    model = Sequential()
    model.add(Embedding(5001, embedding_vecor_length, input_length=max_lyrics_length))
    model.add(Dropout(0.5))
    model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(LSTM(50))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    model.fit(X_train_lstm, y_train_lstm, validation_data=(X_test_lstm, y_test_lstm), epochs=5, batch_size=64)
    return model, X_train_lstm, y_train_lstm, X_test_lstm, y_test_lstm

In [10]:
show_performance(train_forest('happy', 'sad'))

Preprocessing training set...
Preprocessing validation set...
Training random forest...
Finished.
Training accuracy: 0.9726187690147438
Validation accuracy: 0.7759750390015601


In [11]:
train_lstm('happy', 'sad')

Preprocessing training set...
Preprocessing validation set...
Trimming input...
Training LSTM...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 150, 64)           320064    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150, 64)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 150, 64)           12352     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 75, 64)            0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 75, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                23000     
_____________________________________________

(<keras.engine.sequential.Sequential at 0x1103d89b0>,
 array([[   0,    0,    0, ...,   25,   14, 1112],
        [ 141,    2, 1318, ...,   59,    2,  513],
        [  92,    3,   16, ...,  107,   10, 2921],
        ...,
        [ 134,   35,  169, ...,    3,    4,   29],
        [   0,    0,    0, ...,   13,   27,    7],
        [1001,  245,  503, ...,  119,   13,  154]], dtype=int32),
 array([0, 0, 1, ..., 1, 1, 0]),
 array([[   0,    2,    0, ...,    7,   37,  941],
        [ 880,    3,    3, ...,  545, 1748, 1543],
        [  14,  542,   46, ...,   15,    3,   16],
        ...,
        [   0,    0,    0, ..., 1803,   11,  569],
        [  12,    9,    2, ...,  416,  264,  200],
        [  61,   36,   30, ...,  171, 1222,  125]], dtype=int32),
 array([0, 0, 0, ..., 1, 0, 1]))

In [12]:
show_performance(train_forest('relax', 'Energetic'))

Preprocessing training set...
Preprocessing validation set...
Training random forest...
Finished.
Training accuracy: 0.9888200431034483
Validation accuracy: 0.7926763597199784


In [13]:
train_lstm('relax', 'Energetic')

Preprocessing training set...
Preprocessing validation set...
Trimming input...
Training LSTM...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 150, 64)           320064    
_________________________________________________________________
dropout_4 (Dropout)          (None, 150, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 150, 64)           12352     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 75, 64)            0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 75, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                23000     
_____________________________________________

(<keras.engine.sequential.Sequential at 0x1a319659e8>,
 array([[  83,    1,   61, ...,  135,    3,   37],
        [  28, 3422,    2, ...,  976,  523,  172],
        [   0,    0,    0, ...,  254,   59,  390],
        ...,
        [ 267,    4,  251, ...,  336,    6,   92],
        [  53,  476,   28, ...,   65,  175,  175],
        [1298,    2, 1950, ...,  312,   46,   46]], dtype=int32),
 array([0, 1, 0, ..., 1, 0, 0]),
 array([[   0,    0,    0, ...,  147,  131,    3],
        [  48,  103,  476, ...,  885,    0,  106],
        [ 687, 1109,   13, ...,   15,    3,   84],
        ...,
        [4717,  103,  476, ...,  539,   13, 1685],
        [ 399,    1, 1362, ...,   30,   12,  399],
        [   0,  135,    0, ...,    0,  336, 1672]], dtype=int32),
 array([1, 0, 1, ..., 1, 0, 0]))

In [16]:
show_performance(train_forest('cool', 'oldies'))

Preprocessing training set...
Preprocessing validation set...
Training random forest...
Finished.
Training accuracy: 0.9468171704563955
Validation accuracy: 0.7803342739309747


In [17]:
train_lstm('cool', 'oldies')

Preprocessing training set...
Preprocessing validation set...
Trimming input...
Training LSTM...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 150, 64)           320064    
_________________________________________________________________
dropout_10 (Dropout)         (None, 150, 64)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 150, 64)           12352     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 75, 64)            0         
_________________________________________________________________
dropout_11 (Dropout)         (None, 75, 64)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                23000     
_____________________________________________

(<keras.engine.sequential.Sequential at 0x1a3a6c96a0>,
 array([[ 228,  312,   73, ...,  577,   66,  399],
        [  76,  976,  523, ...,  545,   37,  588],
        [ 427,  373,    7, ...,  525,    4,    7],
        ...,
        [1234,   89,    2, ...,    1,   54,  487],
        [   8,  615,    8, ...,    8,  615,    8],
        [ 326,    2,   85, ...,  686,    4,  125]], dtype=int32),
 array([0, 0, 1, ..., 0, 1, 1]),
 array([[  48, 1120,  211, ...,   94,    4,   66],
        [  46,   46,   46, ...,  989,   80,  434],
        [  88,   72,   22, ...,  105,    0,    0],
        ...,
        [1110,  390,    7, ...,  616,  180,    3],
        [   7,  156,    7, ...,  335,    4,   44],
        [   0,    0,    0, ...,   76,    2,  358]], dtype=int32),
 array([1, 0, 1, ..., 1, 0, 0]))

In [18]:
show_performance(train_forest('sweet', 'dark'))

Preprocessing training set...
Preprocessing validation set...
Training random forest...
Finished.
Training accuracy: 0.9870833333333333
Validation accuracy: 0.7679067184897279


In [19]:
train_lstm('sweet', 'dark')

Preprocessing training set...
Preprocessing validation set...
Trimming input...
Training LSTM...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 150, 64)           320064    
_________________________________________________________________
dropout_13 (Dropout)         (None, 150, 64)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 150, 64)           12352     
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 75, 64)            0         
_________________________________________________________________
dropout_14 (Dropout)         (None, 75, 64)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                23000     
_____________________________________________

(<keras.engine.sequential.Sequential at 0x1a33858b70>,
 array([[ 154,   57,    6, ...,    6,  257,   37],
        [  67,    2, 3514, ...,   59,   14, 1255],
        [  17,    2,  563, ...,   50,  283,   50],
        ...,
        [   0,    0,    0, ...,    6,  279,    0],
        [  25, 3795,   49, ...,   41,   11,   73],
        [ 798,  106,    6, ...,   66,  747,  798]], dtype=int32),
 array([0, 1, 0, ..., 0, 1, 0]),
 array([[  61,   11, 3252, ...,  200,   92, 2211],
        [   3,  110,  392, ...,  392,  110,  392],
        [  74,  762,    1, ...,    4,  131,    1],
        ...,
        [   1,   57, 2539, ...,    0,    0,    0],
        [ 290,    6,  138, ...,   92,    1, 3856],
        [ 476,   22,  257, ..., 1110,   22,  251]], dtype=int32),
 array([0, 0, 1, ..., 1, 0, 0]))

### Part1.3: final model for emotion recognition

We built four pipelines for four dimensions of emotion: happy-sad, relax-energetic, cool-oldies, sweet-dark.

In [None]:
def train_forest(X, y):
    vectorizer = HashingVectorizer(n_features=1000)
  
    print('Preprocessing training set...')
    X_train_forest = vectorizer.fit_transform(X)
    y_train_forest = y
    print('Training random forest...')
    forest = RandomForestClassifier(
        n_estimators=300, max_features=30, max_depth=None, min_samples_split=2)
    forest.fit(X_train_forest, y_train_forest)
    print('Finished.')
    return forest, vectorizer

In [6]:
def save_forest_pipeline(res, tag1, tag2):
    pipeline = Pipeline([('vectorizer', res[1]), ('forest', res[0])])
    dump(pipeline, tag1+'-'+tag2+'pipeline.joblib')

In [8]:
def load_forest_pipeline(tag1, tag2):
    return load(tag1+'-'+tag2+'pipeline.joblib')

In [7]:
def create_pipeline(tag1, tag2):
    X, y = create_train(tag1, tag2)
    res = train_forest(X, y)
    save_forest_pipeline(res, tag1, tag2)

In [9]:
create_pipeline('happy', 'sad')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
8508 tracks collected for tag happy, 9078 tracks collected for tag sad
Filtering out empty lyrics...
Filtering out non-English lyrics...
7721 nonempty lyrics for tag happy, 8302 nonempty lyrics for tag sad
Creating predictor set and target set...
Preprocessing training set...
Training random forest...
Finished.


In [10]:
create_pipeline('relax', 'Energetic')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
6287 tracks collected for tag relax, 4521 tracks collected for tag Energetic
Filtering out empty lyrics...
Filtering out non-English lyrics...
5214 nonempty lyrics for tag relax, 4072 nonempty lyrics for tag Energetic
Creating predictor set and target set...
Preprocessing training set...
Training random forest...
Finished.


In [11]:
create_pipeline('cool', 'oldies')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
12457 tracks collected for tag cool, 12769 tracks collected for tag oldies
Filtering out empty lyrics...
Filtering out non-English lyrics...
11062 nonempty lyrics for tag cool, 11969 nonempty lyrics for tag oldies
Creating predictor set and target set...
Preprocessing training set...
Training random forest...
Finished.


In [12]:
create_pipeline('sweet', 'dark')

Collecting tids...
Collecting artists and titles...
Collecting lyrics...
5169 tracks collected for tag sweet, 4768 tracks collected for tag dark
Filtering out empty lyrics...
Filtering out non-English lyrics...
4774 nonempty lyrics for tag sweet, 4225 nonempty lyrics for tag dark
Creating predictor set and target set...
Preprocessing training set...
Training random forest...
Finished.


### Part1.4: profiling the emotion of tracks

We profiled the tracks in the Million Playlist Dataset using the four piplines.

In [None]:
TRACKURI_PATH = 'databases/track_uri.db'
PROFILES_PATH = 'databases/profiles.db'

conn_trackuri = sqlite3.connect(TRACKURI_PATH)
conn_profiles = sqlite3.connect(PROFILES_PATH)

In [None]:
conn_profiles.execute("""CREATE TABLE profiles (
                         track_uri TEXT,
                         happy_sad REAL,
                         relax_energetic REAL,
                         cool_oldies REAL,
                         sweet_dark REAL)""")
conn_profiles.execute("""CREATE INDEX profile_track_uris ON profiles (track_uri)""")
conn_profiles.commit()

In [5]:
"""
Profile the emotion of a song:
[
1: happy, -1:sad;
1: relax, -1:energetic;
1: cool,  -1:oldies;
1: sweet, -1:dark;
]
"""
def make_profile(lyrics, pipelinelist):
    profiles = np.zeros(shape=(len(lyrics), len(pipelinelist)))
    for i, pipeline in enumerate(pipelinelist):
        profiles[:, i] = (pipeline.predict_proba(lyrics)[:, 1] - 0.5) * 2.0
    return profiles

In [None]:
def build_profiles(urilist, conn_trackuri, conn_wiki, 
                   conn_profiles):
    AandTs = get_artistsandtitles_byuri(urilist, conn_trackuri)
    lyrics = get_lyrics(AandTs, conn_wiki)
    urilist_, lyrics_ = get_nonempty(urilist, lyrics)
    urilist_, lyrics_ = get_english(urilist_, lyrics_)
    assert len(urilist_) == len(set(urilist_))
    profiles = make_profile(lyrics_, 
                            [pipeline1, pipeline2, pipeline3, pipeline4])
    insertvals = [(uri, profile[0], profile[1], profile[2], profile[3]) 
                  for uri, profile in zip(urilist_, profiles)]
    conn_profiles.executemany("""INSERT INTO profiles (track_uri, happy_sad, relax_energetic, cool_oldies, sweet_dark)
                             VALUES (?, ?, ?, ?, ?)""", insertvals)
    conn_profiles.commit()

In [1]:
batch_size = 1000
batch_numb = len(all_uris) // batch_size
for i in range(batch_numb+1):
    batch = all_uris[batch_size*i: batch_size*(i+1)]
    build_profiles(batch, conn_trackuri, conn_wiki, conn_profiles)

### Part1.5: profiling the audio features and genre features of tracks

The profiles for audio features are obtained from Million Songs Dataset and the genre features are extracted from the audio featrues using a random forest model. They too were inserted into a relational database for latter use.

In [2]:
AUDIOPROFILES_PATH = 'databases/audio_profiles.db'
GENREPROFILES_PATH = 'databases/genre_profiles.db'

conn_audioprofiles = sqlite3.connect(AUDIOPROFILES_PATH)
conn_genreprofiles = sqlite3.connect(GENREPROFILES_PATH)

In [None]:
audio_df = pd.read_csv('MPD_audio_full.csv')
# later I'll hard-code the table name 'profile' into the code...
# self-reminder: make this table name a convention.
audio_df.to_sql('profiles', con=conn_audioprofiles, index=False)
# build index to accelerate querying
conn_audioprofiles.execute("""CREATE INDEX profile_track_uris ON profiles (track_uri)""")

In [None]:
genre_df = pd.read_csv('MPD_genre.csv')
genre_df.to_sql('profiles', con=conn_genreprofiles, index=False)
conn_genreprofiles.execute("""CREATE INDEX profile_track_uris ON profiles (track_uri)""")

## Part2: Recommendiations based on profiles

The recommendations were generated by finding tracks that has most similar profiles as the input (seeding) tracks. The similarity is defined by cosine similarity and the we achived efficient retrieval of similarity neighbors by putting all our tracks into a KD-tree (It can be proved that when the profiles are normalized, the tracks that have smallest euclidean distances are the tracks that have the highest cosine similarity). To address the problem that the tracks in an input playlist may not be homogeneous, we first perform a clustering on the input tracks and then give recommendations based on each clustering center.

In [121]:
"""functions to query database for titles and profiles"""
def get_titles(seed_urilist):
    titles   = [get_title_byuri(uri) for uri in seed_urilist]
    return [t for t in titles if t is not None]
def get_profiles(seed_urilist, conn_profiles):
    profiles = [get_profile_byuri(uri, conn_profiles) for uri in seed_urilist]
    return np.array([p for p in profiles if p is not None])
def get_title_byuri(uri):
    sql  = "SELECT track_name FROM track_uri WHERE track_uri='{}'".format(uri)
    res  = conn_trackuri.execute(sql)
    try:
        return res.fetchall()[0][0]
    except:
        return None
def get_profile_byuri(uri, conn_profiles):
    sql  = "SELECT * FROM profiles WHERE track_uri='{}'".format(uri)
    res  = conn_profiles.execute(sql)
    try:
        return res.fetchall()[0][1:]
    except:
        return None
def get_all_profiles(conn_profiles, dropnull=False):
    sql  = """SELECT * from profiles"""
    res  = conn_profiles.execute(sql)
    data = res.fetchall()
    if dropnull:
        for p in data:
            if None in p: data.remove(p)
    return data

In [122]:
"""set up kdtree for a certain kind of profiles"""
def build_kdtree(conn_profiles):
    print('Reading profiles from sqlite...')
    all_profiles = get_all_profiles(conn_profiles, dropnull=True)
    print('Creating dictionary got URI ...')
    dict_ind2uri = {i: t[0] for i, t in enumerate(all_profiles)}
    dict_uri2ind = {v: k    for k, v in dict_ind2uri.items()   }
    print('Building KD-tree...')
    profile_values = np.array([t[1:] for t in all_profiles])
    # profiles are normalized so that their euclidian neighbors are also their cosine-similarity neighbors
    profile_values = normalize(profile_values) 
    kdtree = KDTree(profile_values, leaf_size=2)
    return kdtree, dict_ind2uri, dict_uri2ind

In [123]:
"""make recommendation based on single seed profile"""
def get_recm_byprofile(seed_profile, kdtree, dict_ind2uri, exclude_titles, num_recm):
    search_ratio = 2
    while True:
        _,    ind = kdtree.query([seed_profile], k=num_recm*search_ratio)
        recm_list = []
        for recm_ind in ind[0]:
            recm_uri   = dict_ind2uri[recm_ind]
            recm_title = get_title_byuri(recm_uri)
            if recm_title in exclude_titles: continue
            recm_list.append(recm_uri)
            if len(recm_list) == num_recm:   return recm_list
        if len(recm_list) < num_recm: search_ratio *= 2
    return recm_list
"""make recommendation based on multiple seed profiles"""
def create_recommendations(seed_profiles, kdtree, dict_ind2uri, exclude_titles, num_recm=500, num_cntr=None):
    if len(seed_profiles) == 0: return []
    num_seed = len(seed_profiles)
    if num_cntr is None: num_cntr = int(np.sqrt(num_seed))
    kmeans = KMeans(n_clusters=num_cntr).fit(seed_profiles)
    num_recm_per_cntr = int(num_recm/num_cntr)+1
    lists = []
    for c in kmeans.cluster_centers_:
        lists.append( get_recm_byprofile(c, kdtree, dict_ind2uri, exclude_titles, num_recm_per_cntr) )
    recm_list = [None] * num_cntr * num_recm_per_cntr
    for i in range(num_cntr):
        recm_list[i::num_cntr] = lists[i]
    return recm_list[:num_recm]

In [124]:
def generate_val_Y(val_X, conn_profiles, kdtree, dict_ind2uri, num_recm_per_list, verbose=True):
    val_Y = []
    total = len(val_X); every = total // 100
    for i, seedlist in enumerate(val_X):
        titles   = get_titles(seedlist)
        profiles = get_profiles(seedlist, conn_profiles)
        val_Y.append( create_recommendations(profiles, kdtree, dict_ind2uri, 
                                             exclude_titles=titles, num_recm=num_recm_per_list) )
        if verbose and i%every == 0: print('{0:3d} / 100 finished'.format(i//every))
    return val_Y

In [51]:
genrekdtree, genredict_ind2uri, genredict_uri2ind = build_kdtree(conn_genreprofiles)

Reading profiles from sqlite...
Creating dictionary got URI ...
Building KD-tree...


In [None]:
genreval_Y = generate_val_Y(val_X, conn_genreprofiles, genrekdtree, genredict_ind2uri, 500, verbose=True)
with open('val_Y_genre.json', 'w') as f:
    json.dump(genreval_Y, f)

In [63]:
audiokdtree, audiodict_ind2uri, audiodict_uri2ind = build_kdtree(conn_audioprofiles)

Reading profiles from sqlite...
Creating dictionary got URI ...
Building KD-tree...


In [None]:
audioval_Y = generate_val_Y(val_X, conn_audioprofiles, audiokdtree, audiodict_ind2uri, 500, verbose=True)
with open('val_Y_audio.json', 'w') as f:
    json.dump(audioval_Y, f)

In [66]:
lyrickdtree, lyricdict_ind2uri, lyricdict_uri2ind = build_kdtree(conn_lyricprofiles)

Reading profiles from sqlite...
Creating dictionary got URI ...
Building KD-tree...


In [None]:
lyricval_Y = generate_val_Y(val_X, conn_lyricprofiles, lyrickdtree, lyricdict_ind2uri, 500, verbose=True)
with open('val_Y_lyric.json', 'w') as f:
    json.dump(lyricval_Y, f)

In [None]:
def scoring(val_Y, method='linear'):
    scores = []
    length = len(val_Y[0])
    if method == 'linear':
        for l in val_Y:
            score_dict = {}
            for i, t in enumerate(l):
                score_dict[t] = 1-i/length
            scores.append(score_dict)
    elif method == 'exp':
        for l in val_Y:
            score_dict = {}
            for i, t in enumerate(l):
                score_dict[t] = np.exp(-4*i/length)
            scores.append(score_dict)
    else: raise ValueError('method not found')
    return scores

In [None]:
genreval_Y_score = scoring(genreval_Y)
audioval_Y_score = scoring(audioval_Y)
lyricval_Y_score = scoring(lyricval_Y)