# Genre selection algorithm
This algorithm is the first half of the recommendation engine. I would venture to say **this is the crucial part of the engine**, since two songs from different genres might have similar characteristics, but will not sound alike.  
The process of selecting relevant genres is presented below on example listening history.  
Cosine similarity between the names of the genres is calculated utilising TF-IDF matrix. An assumption was made that similarity between genres will be reflected in similarity of their names.

In [1]:
import polars as pl
import numpy as np
from IPython.display import display

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
spotify = pl.read_csv('spotify-dataset-combined.csv')
all_genres = spotify[:, 'track_genre'].unique()

# ids.append( spotify.filter(pl.col('track_name') == 'TITLE_HERE')[0,0] )
ids = [
	'6DCZcSspjsKoFjzjrWoCdn', '2csZLnXBMuw6ETZuRxdUZF', '7o2AeQZzfCERsRmOM86EcB',
	'2jJIxT6WAIVWzDo3Ou53Q0', '2aJDlirz6v2a4HREki98cP', '6RpuNjTsUQo7eByN3kWvYx',
	'5x9lyB5lP2lXR0zb7L6RaB', '6nGhGBz7uaE1RngnIFRKep', '7jEwBMtA7gM43NxiuvfF3h',
	'3TjliM0xQ3fkza0RpINRrq', '12Ypr3PCVJ2i7Uwz93q1Vl', '60Pe9j2pCBa4Zp4ztf5nhd'
]

queried_songs = spotify.filter(pl.col('track_id').is_in(ids))
display(queried_songs.select(('track_id', 'artists', 'album_name', 'track_name', 'track_genre')))

queried_genres = queried_songs[:,'track_genre']

track_id,artists,album_name,track_name,track_genre
str,str,str,str,str
"""2aJDlirz6v2a4H...","""Bicep""","""Bicep""","""Glue""","""ambient"""
"""7o2AeQZzfCERsR...","""Aphex Twin""","""Selected Ambie...","""Xtal""","""ambient"""
"""6nGhGBz7uaE1Rn...","""Bicep""","""Meli (II)""","""Meli (II)""","""ambient"""
"""60Pe9j2pCBa4Zp...","""Aphex Twin""","""Windowlicker""","""Windowlicker""","""ambient"""
"""7jEwBMtA7gM43N...","""the peggies""","""Hell like Heav...","""君のせい""","""anime"""
"""12Ypr3PCVJ2i7U...","""Gesaffelstein""","""Conspiracy Pt....","""Opr""","""club"""
"""2jJIxT6WAIVWzD...","""Gesaffelstein""","""Aleph""","""Pursuit""","""club"""
"""5x9lyB5lP2lXR0...","""Woesum;Bladee;...","""Blue Summer""","""Violet Gold""","""club"""
"""3TjliM0xQ3fkza...","""Crystal Castle...","""Crystal Castle...","""Crimewave""","""hardcore"""
"""6RpuNjTsUQo7eB...","""Crystal Castle...","""Crystal Castle...","""Untrust Us""","""hardcore"""


In [3]:
queried_genres = pl.concat((
    queried_genres,
    all_genres.filter(~pl.col('track_genre').is_in(queried_genres['track_genre']))
    # excluding without unique() so as to preserve frequency of queried genres
))
queried_genres

track_genre
str
"""ambient"""
"""ambient"""
"""ambient"""
"""ambient"""
"""anime"""
"""club"""
"""club"""
"""club"""
"""hardcore"""
"""hardcore"""


In [4]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(queried_genres.to_numpy().flatten())
print(tfidf_matrix.shape)

(517, 390)


In [5]:
importance = np.zeros((1, tfidf_matrix.shape[0]))
for i in range(0, queried_songs.shape[0]):
	ans = cosine_similarity(tfidf_matrix[i:i+1], tfidf_matrix)
	importance += ans

importance

array([[4.        , 4.        , 4.        , 4.        , 1.        ,
        3.        , 3.        , 3.        , 2.        , 2.        ,
        1.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [6]:
best_rec_genres = pl.DataFrame({
    'track_genre': queried_genres['track_genre'],
    'importance': importance.T.flatten()
}).unique()

best_rec_genres = best_rec_genres.sort('importance', descending=True)

track_genre,importance
str,f64
"""ambient""",4.0
"""ambient pop""",3.282549
"""club""",3.0
"""ambient house""",2.999045
"""dark ambient""",2.673195
"""ambient folk""",2.673195
"""ambient techno...",2.630618
"""ambient idm""",2.449568
"""compositional ...",2.351546
"""hardcore""",2.0


All genres of non-zero importance are printed below. As one can see, those actually seem on point. Song selection process is presented in the following notebook for I wanted to showcase both parts of the engine in distinct workspaces.

In [10]:
display(best_rec_genres.filter(pl.col('importance') > 0))
display(best_rec_genres[0:56, 'track_genre'].to_numpy().flatten())

track_genre,importance
str,f64
"""ambient""",4.0
"""ambient pop""",3.282549
"""club""",3.0
"""ambient house""",2.999045
"""dark ambient""",2.673195
"""ambient folk""",2.673195
"""ambient techno...",2.630618
"""ambient idm""",2.449568
"""compositional ...",2.351546
"""hardcore""",2.0


array(['ambient', 'ambient pop', 'club', 'ambient house', 'dark ambient',
       'ambient folk', 'ambient techno', 'ambient idm',
       'compositional ambient', 'hardcore', 'deconstructed club',
       'hardcore punk', 'melodic hardcore', 'digital hardcore', 'anime',
       'hip-hop', 'j-dance', 'dance', 'hip hop', 'indie hip hop',
       'dance pop', 'anime rock', 'alternative hip hop',
       'alternative dance', 'polish hip hop', 'australian dance',
       'experimental hip hop', 'uk dance', 'german hip hop',
       'canadian hip hop', 'detroit hip hop', 'underground hip hop',
       'abstract hip hop', 'psychedelic hip hop', 'birmingham hip hop',
       'boston hip hop', 'indonesian hip hop', 'anime score',
       'atl hip hop', 'conscious hip hop', 'miami hip hop',
       'mexican hip hop', 'lgbtq+ hip hop', 'kentucky hip hop',
       'desi hip hop', 'milwaukee hip hop', 'minnesota hip hop',
       'chinese hip hop', 'anime lo-fi', 'uk alternative hip hop',
       'deep german hi