In [1]:
import pandas as pd
import numpy as np

# Load the track table, discard the index column that was written into the
# CSV, and keep only the rows that carry genre information.
# The original row labels are kept (no reset_index).
# .copy() detaches the result from the filtered intermediate so that columns
# added to tracks_df afterwards do not raise SettingWithCopyWarning.
tracks_df = (
    pd.read_csv('../data/tracks_genres_lyrics_en.csv.zip')
    .drop(columns=['Unnamed: 0'])
    .loc[lambda frame: frame['genres'].notna()]
    .copy()
)

In [2]:
# Show the frame's dimensions, then preview its first rows.
display(tracks_df.shape)
tracks_df.head()

(153362, 23)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,lyrics,genres,lang
1,08y9GfoqCWfOGsKdwojr5e,Lady of the Evening,0,163080,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.402,0.158,...,0.039,0.989,0.13,0.311,0.196,103.22,4,,"adult standards,big band,easy listening,lounge...",en
3,1HXdv1z9RlvrcUernyf0MY,The Dear Little Shamrock,0,191613,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.347,0.197,...,0.0504,0.991,0.000133,0.196,0.246,109.338,3,,"adult standards,big band,easy listening,lounge...",en
4,1O9iZyzufN1fUdVO97mmm5,How High the Moon,0,175333,0,"Dick Haymes,Harry James,His Orchestra","3BiJGZsyX9sJchTqcSA7Su,5MpELOfAiq7aIBTij30phD,...",1922-01-01,0.328,0.307,...,0.0484,0.989,0.277,0.21,0.259,117.225,4,,"adult standards,big band,easy listening,lounge...",en
7,2IMcezGQzHgqH8gIvXJM3Q,Till the End of Time,0,182227,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.208,0.234,...,0.0426,0.986,0.656,0.234,0.209,85.288,3,,"adult standards,big band,easy listening,lounge...",en
8,38u3k7zUGTp48G1oA3ky3R,The Girl That I Marry,0,182920,0,Dick Haymes,3BiJGZsyX9sJchTqcSA7Su,1922-01-01,0.185,0.151,...,0.0391,0.985,0.798,0.151,0.135,88.945,3,,"adult standards,big band,easy listening,lounge...",en


In [3]:

# Build one searchable string per track: "<name> <artists> " (the trailing
# space is kept). Vectorised concatenation replaces the row-wise apply();
# bracket access avoids any clash between column names and attributes.
tracks_df['name_cmplx'] = tracks_df['name'] + ' ' + tracks_df['artists'] + ' '
tracks_df.name_cmplx

1                          Lady of the Evening Dick Haymes 
3                     The Dear Little Shamrock Dick Haymes 
4         How High the Moon Dick Haymes,Harry James,His ...
7                         Till the End of Time Dick Haymes 
8                        The Girl That I Marry Dick Haymes 
                                ...                        
170520                       King - Acoustic Years & Years 
170521                     Fix It to Break It Clinton Kane 
170523                  remember the mornings Clinton Kane 
170526                   What They'll Say About Us FINNEAS 
170527            A Day At A Time Gentle Bones,Clara Benin 
Name: name_cmplx, Length: 153362, dtype: object

In [4]:
# Tokenize name_cmplx

from re import compile as rcompile

# Matches every character that is not an ASCII letter, a digit or a space.
rex = rcompile('[^a-zA-Z 0-9]')


def tokenize(x):
    """Normalize a name/artist string for vectorization.

    The text is lower-cased, commas and hyphens are turned into spaces, and
    every remaining character other than a-z, A-Z, 0-9 or space is removed.
    Runs of spaces are left as they are.
    """
    spaced = x.lower().replace(',', ' ').replace('-', ' ')
    return rex.sub('', spaced)

# Normalize every combined name/artist string element by element.
tracks_df['name_cmplx_tokens'] = tracks_df['name_cmplx'].map(tokenize)


In [5]:
# Inspect the normalized name/artist strings.
display(tracks_df.name_cmplx_tokens)

1                          lady of the evening dick haymes 
3                     the dear little shamrock dick haymes 
4         how high the moon dick haymes harry james his ...
7                         till the end of time dick haymes 
8                        the girl that i marry dick haymes 
                                ...                        
170520                        king   acoustic years  years 
170521                     fix it to break it clinton kane 
170523                  remember the mornings clinton kane 
170526                    what theyll say about us finneas 
170527            a day at a time gentle bones clara benin 
Name: name_cmplx_tokens, Length: 153362, dtype: object

In [6]:
# Plain Python list of normalized strings, one entry per track, in frame order.
data = tracks_df['name_cmplx_tokens'].tolist()


In [7]:
# Peek at the first ten documents that are passed to the vectorizer.
display(data[:10])

['lady of the evening dick haymes ',
 'the dear little shamrock dick haymes ',
 'how high the moon dick haymes harry james his orchestra ',
 'till the end of time dick haymes ',
 'the girl that i marry dick haymes ',
 'all or nothing at all dick haymes harry james his orchestra ',
 'i ought to know more about you dick haymes ',
 'soft lights and sweet music dick haymes ',
 'hush a bye wee rose of kilarney dick haymes victor young his orchestra ',
 'serenade of the bells dick haymes ']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# Instantiate vectorizer object: English stop words are dropped and a term
# must occur in at least 7 documents to enter the vocabulary.
tfidf = TfidfVectorizer(stop_words='english',
                        min_df=7,
                       )
# Learn the vocabulary and compute the TF-IDF weights per document (sparse).
tfidf_matrix = tfidf.fit_transform(data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, 'get_feature_names_out'):
    features = tfidf.get_feature_names_out().tolist()
else:
    features = tfidf.get_feature_names()
#display(len(features), features[:50])

# Dense document-term frame with the vocabulary as column headers.
# toarray() yields a plain ndarray (todense() returns np.matrix).
# NOTE(review): this materialises the full documents x vocabulary matrix in
# memory; keep an eye on RAM for large corpora.
dtm = pd.DataFrame(tfidf_matrix.toarray(), columns=features)
print(dtm.shape)
display(dtm.head())



(153362, 12426)


Unnamed: 0,000,008,01,010,015b,02,03,04,049,05,...,zorba,zschech,zubin,zucchero,zucker,zukerman,zulu,zulus,zynthetic,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, Nadam

# Width of the input/output layer: one unit per TF-IDF vocabulary term.
ishape = dtm.shape[1]

# Create Model: a dense autoencoder that squeezes each TF-IDF row through a
# 64-unit bottleneck and reconstructs it.
input_img = Input(shape=(ishape, ))

# Encoder. Dense layers without an activation are purely linear, and a stack
# of linear layers is equivalent to a single linear map; ReLU on the hidden
# layers lets the extra depth contribute.
x = Dense(1024, activation='relu')(input_img)
x = Dense(256, activation='relu')(x)
x = Dense(128, activation='relu')(x)

# Bottleneck stays linear so the embedding is not clipped to one sign.
encoded = Dense(64)(x)

# Decoder mirrors the encoder; the last two layers keep their sigmoid
# activations so the reconstruction stays in (0, 1).
x = Dense(128, activation='relu')(encoded)
x = Dense(256, activation='relu')(x)
x = Dense(1024, activation='sigmoid')(x)
decoded = Dense(ishape, activation='sigmoid')(x)


rmodel = Model(input_img, decoded)
rmodel.compile(loss='mse', optimizer=Adam(learning_rate=0.01))

# Train the network to reproduce its own input.
rmodel.fit(dtm, dtm, batch_size=512, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fcb1826fe80>

In [None]:
# Sub-model sharing the autoencoder's layers but ending at the bottleneck.
encoder = Model(inputs=input_img, outputs=encoded)

# Bottleneck representation of every row of the document-term frame.
encoded_dtm = encoder.predict(x=dtm)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Index the encoded document matrix for 10-nearest-neighbour lookups (KD-tree).
nn = NearestNeighbors(algorithm='kd_tree', n_neighbors=10)
nn.fit(X=encoded_dtm)


In [None]:
def find_song_entries(x, n_neighbors=None):
    """Return the tracks closest to a free-text query in encoded space.

    Parameters
    ----------
    x : str
        Query text (e.g. a song title and/or artist). It is normalized with
        ``tokenize``, vectorized with the fitted ``tfidf`` and projected with
        ``encoder`` before the neighbour search.
    n_neighbors : int, optional
        Number of neighbours to retrieve. ``None`` (default) uses the value
        the ``nn`` index was constructed with.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``tracks_df``, ordered by descending popularity.
    """
    # toarray() yields a plain ndarray; todense() would return the np.matrix
    # type that NumPy no longer recommends.
    vec = tfidf.transform([tokenize(x)]).toarray()
    encoded_vec = encoder.predict(vec)
    # The index was fitted on rows in tracks_df order, so the neighbour
    # indices are positional (iloc), not index labels.
    positions = nn.kneighbors(
        encoded_vec, n_neighbors=n_neighbors, return_distance=False
    )[0].tolist()
    # Translate positions to index labels, most popular first.
    labels = (
        tracks_df.iloc[positions]
        .popularity.sort_values(ascending=False)
        .index.tolist()
    )
    return tracks_df.loc[labels]

In [None]:
def find_song_entry(sugg_str, best_choice=True):
    """Look up a suggested song string among its nearest-neighbour candidates.

    With ``best_choice=False`` every candidate row returned by
    ``find_song_entries`` is handed back. With ``best_choice=True`` only the
    candidate whose tokenized name + artists shares the most tokens with the
    tokenized query is returned; on a tie the earliest candidate wins.
    """
    candidates = find_song_entries(sugg_str)
    labels = candidates.index.tolist()

    if not best_choice:
        return candidates.loc[labels]

    query_tokens = set(tokenize(sugg_str).split())

    def overlap(label):
        # Number of query tokens that also occur in this candidate's text.
        row = candidates.loc[label]
        row_tokens = set(tokenize(row['name'] + ' ' + row.artists).split())
        return len(query_tokens.intersection(row_tokens))

    # max() keeps the first element among equal scores, matching a strict
    # "greater than" scan from the front.
    best_label = max(labels, key=overlap)
    return candidates.loc[best_label]

In [None]:
from joblib import dump

# Persist the artefacts built above: the fitted TF-IDF vectorizer, the
# encoder model, and the encoded document matrix.
# NOTE(review): assumes the ../models directory already exists — confirm.
dump(tfidf, '../models/tfidf.pkl')
encoder.save('../models/encoder.h5')
dump(encoded_dtm, '../models/encoded_dtm.pkl')

In [None]:
# Free-text queries used to eyeball the lookup quality.
test_vecs = [
    "be happy  mcferrin",
    "Fast Chapman",
    "Uptown Funk  Mars ",
    "I'm yours Jason Mraz",
    "Walk like an egyptian bangles",
    "Manic Monday",
    "Last Christmas Wham",
]

# For each query show all candidates first, then the single best match.
for query in test_vecs:
    display(find_song_entry(query, best_choice=False))
    display(find_song_entry(query))

