# Music Recommendation using Doc2Vec

Based on song lyrics, you can get recommendations for similar artists, find alternative songs whose lyrics resemble the ones you already like, or see how closely the epochs of music history are related to each other.
Moreover, you can also look into the most significant words of your favorite artist and find a song that suits your mood.

For more accurate results, a separate model should be trained for each language (the dataset contains lyrics in various languages).

Hackathon at Starmind, 29 March 2018
by Alex Flückiger and Yulia Nigmatulina

In [1]:
import pandas as pd
import numpy as np
from gensim.utils import tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

In [2]:
class ArtistRecommendation():
    """Recommend artists, songs, genres and decades from song lyrics.

    Every row of the dataset becomes one ``TaggedDocument`` whose tags are the
    artist, the song title and, when available, the decade and the genre.
    Each tag carries a kind suffix (``_artist``, ``_song``, ``_year``,
    ``_genre``) so that all kinds share one Doc2Vec tag space and can still be
    told apart in similarity results.

    Typical use: construct the object, call ``tag_docs()``, call
    ``train_model()``, then query with the ``get_*_similarity`` methods.
    """

    ARTIST_SUFFIX = '_artist'
    SONG_SUFFIX = '_song'
    YEAR_SUFFIX = '_year'
    GENRE_SUFFIX = '_genre'

    def __init__(self, dataset):
        """Load the dataset; tagging and training are separate, explicit steps.

        Args:
            dataset: path (or anything ``pandas.read_csv`` accepts) of a CSV
                with the columns ``artist``, ``song``, ``text``, ``genre``
                and ``year``.
        """
        self.df = self.read_dataset(dataset)
        self.taggeddocs = None
        self.model = None
        self.stopwords = self._set_stopwords()

    def read_dataset(self, dataset):
        """Read the CSV and return it as a DataFrame with an integer ``year``.

        Missing years are replaced by 0, which the rest of the class treats
        as "year unknown".
        """
        df = pd.read_csv(dataset)
        df['year'] = df['year'].fillna(0.0).astype(int)
        return df

    def _set_stopwords(self):
        """Return the NLTK English stop words as a set."""
        return set(stopwords.words('english'))

    @staticmethod
    def _decade(year):
        """Return the first year of the decade *year* belongs to (1975 -> 1970)."""
        return int(year) // 10 * 10

    @staticmethod
    def _genre_label(genre):
        """Normalise a genre name the way it is stored in a tag.

        Underscores become hyphens and the name is lower-cased
        ('Pop_Rock' -> 'pop-rock').
        """
        return str(genre).replace('_', '-').lower()

    @staticmethod
    def _names_with_suffix(hits, suffix):
        """Keep the ``(tag, score)`` hits of one kind and drop their suffix.

        The suffix is matched at the END of the tag and removed by length, so
        names that themselves contain an underscore are neither truncated nor
        mistaken for another kind of tag.
        """
        return [(tag[:-len(suffix)], score)
                for tag, score in hits
                if tag.endswith(suffix)]

    def tag_docs(self):
        """Build ``self.taggeddocs``: one ``TaggedDocument`` per dataset row.

        Tags are always ``[<artist>_artist, <song>_song]`` in that order
        (``find_song_by_word`` relies on it), followed by ``<decade>_year``
        when the year is known and ``<genre>_genre`` when a genre is given.
        """
        self.taggeddocs = []

        for _, row in self.df.iterrows():
            tags = [
                row['artist'].lower() + self.ARTIST_SUFFIX,
                row['song'].lower() + self.SONG_SUFFIX,
            ]

            year = row['year']
            # 0 is the placeholder read_dataset() stores for a missing year.
            if (isinstance(year, (int, np.integer))
                    and not isinstance(year, bool)
                    and year != 0):
                tags.append(str(self._decade(year)) + self.YEAR_SUFFIX)

            genre = row['genre']
            # Only real strings are tagged; a missing genre is not a str
            # (pandas presumably yields NaN for it).
            if isinstance(genre, str):
                tags.append(self._genre_label(genre) + self.GENRE_SUFFIX)

            tokens = list(tokenize(row['text'], lowercase=True, deacc=False))
            self.taggeddocs.append(TaggedDocument(words=tokens, tags=tags))

    def train_model(self):
        """Train a PV-DM Doc2Vec model on the tagged documents.

        If ``tag_docs()`` has not been called yet it is called first, so the
        model is never built from an empty corpus by accident.
        """
        if self.taggeddocs is None:
            self.tag_docs()

        # For better performance the PV-DBOW and PV-DM models should be
        # combined, as proposed in the original Paragraph Vector paper
        # (Le & Mikolov).
        # PV-DBOW: randomly sampled words are predicted per tag.
        # PV-DM: the tag acts as an additional word in the respective window.
        dmm_model = Doc2Vec(self.taggeddocs, dm=1, dm_mean=1, vector_size=100, window=10, min_count=2, negative=5, hs=0, workers=4, epochs=20)
        self.model = dmm_model
        # Possible extension (not enabled): additionally train
        #   Doc2Vec(self.taggeddocs, dm=0, vector_size=100, window=5, min_count=2, negative=5, workers=4, epochs=20)
        # and combine both with ConcatenatedDoc2Vec([dmm_model, dbow_model]).

    def compute_embedding_per_artist(self):
        """Store each artist's mean document embedding in ``artist_embedding``.

        Reads the per-song vectors from the ``doc_embedding`` column of
        ``self.df`` (the column must already exist; nothing in this class
        creates it), averages them per artist, compared case-insensitively,
        and writes the resulting vector to every row of that artist. Rows
        whose artist is not a string get ``None``.

        Raises:
            KeyError: if ``self.df`` has no ``doc_embedding`` column.
        """
        artist_keys = [name.lower() if isinstance(name, str) else None
                       for name in self.df['artist']]

        vectors_by_artist = {}
        for key, vector in zip(artist_keys, self.df['doc_embedding']):
            if key is not None:
                vectors_by_artist.setdefault(key, []).append(vector)

        mean_by_artist = {key: np.mean(np.stack(vectors), axis=0)
                          for key, vectors in vectors_by_artist.items()}

        # An explicit object Series keeps one whole vector per cell; the
        # assignment goes to self.df itself rather than to a filtered copy.
        self.df['artist_embedding'] = pd.Series(
            [mean_by_artist.get(key) for key in artist_keys],
            index=self.df.index,
            dtype=object,
        )

    def get_artist_similarity(self, artist, genre=None, year=None):
        """Return ``(artist, score)`` pairs for the artists closest to *artist*.

        ``genre`` and ``year`` are accepted but currently unused.

        Raises:
            KeyError: if the artist was not part of the training data.
        """
        tag = artist.lower() + self.ARTIST_SUFFIX
        return self._names_with_suffix(self._get_similarity_by_tag(tag),
                                       self.ARTIST_SUFFIX)

    def get_song_similarity(self, song, genre=None, year=None):
        """Return ``(song, score)`` pairs for the songs closest to *song*.

        ``genre`` and ``year`` are accepted but currently unused. The result
        does not say which artist a song belongs to.

        Raises:
            KeyError: if the song was not part of the training data.
        """
        tag = song.lower() + self.SONG_SUFFIX
        return self._names_with_suffix(self._get_similarity_by_tag(tag),
                                       self.SONG_SUFFIX)

    def get_year_similarity(self, year):
        """Return ``(decade, score)`` pairs for the decades closest to *year*.

        A numeric year is first mapped to its decade, exactly as
        ``tag_docs`` does, so 1995 queries the ``1990_year`` tag. Anything
        that cannot be read as a number is used verbatim.
        """
        try:
            label = str(self._decade(year))
        except (TypeError, ValueError, OverflowError):
            label = str(year)
        return self._names_with_suffix(
            self._get_similarity_by_tag(label + self.YEAR_SUFFIX),
            self.YEAR_SUFFIX)

    def get_genre_similarity(self, genre):
        """Return ``(genre, score)`` pairs for the genres closest to *genre*.

        The name is normalised like in ``tag_docs``, so both 'Pop_Rock' and
        'pop-rock' address the same tag.
        """
        tag = self._genre_label(genre) + self.GENRE_SUFFIX
        return self._names_with_suffix(self._get_similarity_by_tag(tag),
                                       self.GENRE_SUFFIX)

    def get_word_similarity(self, word):
        """Return the model's most similar words to *word* as ``(word, score)``."""
        return self.model.wv.most_similar(word)

    def _get_similarity_by_tag(self, tag, topn=30):
        """Return the *topn* tags of ANY kind that are closest to *tag*.

        Callers filter the result by kind afterwards, so a query may yield
        fewer than *topn* entries of the requested kind.

        NOTE(review): ``docvecs`` is the attribute name this file was written
        against; gensim 4 reportedly renamed it to ``dv`` - confirm against
        the installed version.
        """
        vect = self.model.docvecs[tag]
        return self.model.docvecs.most_similar([vect], topn=topn)

    def get_protoypical_words_per_artist(self, artist):
        """Return the non-stop-words closest to the artist's document vector.

        Up to 20 ``(word, score)`` pairs are looked up; English stop words
        are removed afterwards, so fewer may be returned.
        """
        tag = artist.lower() + self.ARTIST_SUFFIX
        vect = self.model.docvecs[tag]
        # Use this instance's model, not a module-level ``recommender``.
        words = self.model.wv.similar_by_vector(vect, topn=20)
        return [w for w in words if w[0] not in self.stopwords]

    # Correctly spelled alias; the original (misspelled) name stays available.
    get_prototypical_words_per_artist = get_protoypical_words_per_artist

    def find_song_by_word(self, word):
        """Return the five songs in which *word* occurs most often.

        Each song is keyed as ``"<artist tag> - <song tag>"``. Its score is
        the sum of ``word_prob`` over all documents with that key, divided by
        the number of similar words the model returns for *word*.

        Raises:
            KeyError: presumably, if *word* is not in the model vocabulary
                (raised by the similarity lookup).
        """
        word_cloud = self.get_word_similarity(word)
        # Constant scale factor; guarded so an empty result cannot divide by 0.
        scale = len(word_cloud) or 1

        word_distribution = {}
        for words, tags in self.taggeddocs:
            # The same key is used for lookup and storage, so repeated
            # (artist, song) rows accumulate instead of overwriting.
            key = "{} - {}".format(tags[0], tags[1])
            word_distribution[key] = (word_distribution.get(key, 0)
                                      + self.word_prob(word, words))

        scored = {key: prob / scale for key, prob in word_distribution.items()}
        return sorted(scored.items(), key=lambda x: x[1], reverse=True)[:5]

    def word_prob(self, word, song):
        """Return how often *word* occurs in *song*, divided by the corpus size.

        Note that the divisor is the number of tagged documents, not the
        length of the song, so longer songs are not penalised.
        """
        occurrences = sum(1 for token in song if token == word)
        return occurrences / len(self.taggeddocs)

In [3]:
# Build the recommender in three explicit steps: load the CSV, turn every
# row into a tagged document, then train the Doc2Vec model on those documents.
recommender = ArtistRecommendation('dataset.csv')
recommender.tag_docs()
recommender.train_model()

In [4]:
recommender.df.head()

Unnamed: 0,artist,song,text,genre,year
0,ABBA,Ahe's My Kind Of Girl,"Look at her face, it's a wonderful face \nAnd...",,0
1,ABBA,"Andante, Andante","Take it easy with me, please \nTouch me gentl...",,0
2,ABBA,As Good As New,I'll never know why I had to go \nWhy I had t...,,0
3,ABBA,Bang,Making somebody happy is a question of give an...,,0
4,ABBA,Bang-A-Boomerang,Making somebody happy is a question of give an...,Pop_Rock,1975


In [5]:
 recommender.get_artist_similarity('eminem')

[('eminem', 1.000000238418579),
 ('insane clown posse', 0.7843844294548035),
 ('outkast', 0.7550121545791626),
 ('wu-tang clan', 0.7418571710586548),
 ('yelawolf', 0.7282025814056396),
 ('ice cube', 0.727891743183136),
 ('fabolous', 0.7164888381958008),
 ('xzibit', 0.7152565121650696),
 ('notorious b.i.g.', 0.7092170119285583),
 ('nicki minaj', 0.6994189620018005),
 ('x-raided', 0.6969919204711914),
 ('quarashi', 0.6964083909988403),
 ('yg', 0.6911574602127075),
 ('vanilla ice', 0.6703895926475525),
 ('lil wayne', 0.667458176612854),
 ("ultramagnetic mc's", 0.6670665144920349),
 ('puff daddy', 0.6633135676383972),
 ('ying yang twins', 0.6614609360694885),
 ('kanye west', 0.6522127985954285),
 ('snoop dogg', 0.6510710716247559),
 ('drake', 0.6462624669075012),
 ('ll cool j', 0.6428661346435547),
 ('everlast', 0.6346815824508667),
 ('yukmouth', 0.6344373226165771),
 ('wiz khalifa', 0.630338191986084),
 ('kid rock', 0.6290221214294434),
 ('sublime', 0.6253687739372253),
 ('will smith', 0.

In [6]:
recommender.get_song_similarity('bang')

[('bang', 1.0000001192092896),
 ('bang-a-boomerang', 0.484915554523468),
 ('ice ice baby', 0.4771422743797302),
 ('wig master', 0.4459804594516754),
 ('too cold', 0.431679368019104),
 ("we don't care bout ya", 0.3665222227573395),
 ('hippie dream', 0.35457414388656616),
 ('the greatest rapper', 0.3516010642051697),
 ('go no more a-roving', 0.3407108187675476),
 ('delivering the goods', 0.3395070433616638),
 ('the huckle-buck', 0.33772966265678406),
 ('all i have to offer you is love', 0.33746570348739624),
 ('legendary lovers', 0.3335992097854614),
 ("that's when i'll give up", 0.33348798751831055),
 ('give up the funk', 0.3319513499736786),
 ('fist city', 0.33010223507881165),
 ("i'll try something new", 0.32870179414749146),
 ('i should have known better', 0.3265601694583893),
 ('cold war', 0.32617875933647156),
 ("rollin' wit the lench mob", 0.32520508766174316),
 ('love revival', 0.32300570607185364),
 ('blood red skies', 0.32124173641204834),
 ('never', 0.3212059736251831),
 ("cru

In [7]:
recommender.find_song_by_word('love')

[("madonna_artist - it's so cool_song", 0.00021335646140503033),
 ("gucci mane_artist - i think i'm in love_song", 0.00014570685169124024),
 ('stevie wonder_artist - all day sucker_song', 0.00011795316565481353),
 ('alabama_artist - love remains_song', 0.00011621856027753685),
 ('kiss_artist - do you love me_song', 0.00011621856027753685)]

In [8]:
recommender.find_song_by_word('hate')

[('fatboy slim_artist - drop the hate_song', 8.499566348655681e-05),
 ('offspring_artist - cool to hate_song', 4.336513443191674e-05),
 ('misfits_artist - hate breeders_song', 3.816131830008673e-05),
 ('glee_artist - hate on me_song', 3.642671292281006e-05),
 ('kinks_artist - hatred_song', 3.642671292281006e-05)]

In [18]:
recommender.get_word_similarity('joy')

[('gladness', 0.5859478116035461),
 ('sorrow', 0.5761454701423645),
 ('joys', 0.5740044116973877),
 ('sadness', 0.5706560611724854),
 ('happiness', 0.56773841381073),
 ('latigo', 0.49610090255737305),
 ('laughter', 0.48559334874153137),
 ('wedlock', 0.4499773681163788),
 ('cheer', 0.44746074080467224),
 ('springtime', 0.44741344451904297)]

In [10]:
recommender.get_protoypical_words_per_artist('metallica')

[('mutilation', 0.6141511797904968),
 ('geddon', 0.6138846278190613),
 ('inquisitions', 0.6079074144363403),
 ('satanas', 0.5923866629600525),
 ('impian', 0.5912560224533081),
 ('sterne', 0.5862410068511963),
 ('mawarku', 0.5804999470710754),
 ('memahami', 0.5771117210388184),
 ('rage', 0.5733351111412048),
 ('canzone', 0.5730963349342346),
 ('genocide', 0.5720162987709045),
 ('denwa', 0.5709301829338074),
 ('blut', 0.5627501606941223),
 ('maafkan', 0.5607697367668152),
 ('kesem', 0.5593931078910828),
 ('nanti', 0.5567923188209534),
 ('kasteel', 0.5553518533706665),
 ('goshi', 0.5549077987670898),
 ('injector', 0.5546765923500061),
 ('nayyan', 0.5546014904975891)]

In [11]:
recommender.get_year_similarity(1990)

[('1990', 1.0),
 ('1960', 0.5753753781318665),
 ('1970', 0.539879322052002),
 ('1980', 0.48252999782562256)]

In [12]:
recommender.get_genre_similarity('pop-rock')

[('pop-rock', 0.9999997019767761), ('folk', 0.5284911394119263)]

In [13]:
# Show the prototypical artists of a genre: the tags most similar to the
# genre's own vector. The result is not filtered by tag kind.
vect = recommender.model.docvecs['country_genre']
recommender.model.docvecs.most_similar([vect], topn=10)

[('country_genre', 1.0000001192092896),
 ('tim mcgraw_artist', 0.6915132999420166),
 ('alabama_artist', 0.6814668774604797),
 ('george strait_artist', 0.676322877407074),
 ('randy travis_artist', 0.669689953327179),
 ('garth brooks_artist', 0.6577267050743103),
 ('kenny chesney_artist', 0.6451531648635864),
 ('george jones_artist', 0.6439744830131531),
 ('vince gill_artist', 0.6417198777198792),
 ('hank williams jr._artist', 0.6349629759788513)]

In [14]:
# Show the prototypical artists of an epoch in music history: the tags most
# similar to a decade tag. The result is not filtered by tag kind, so genre
# and year tags can appear next to artists.
vect = recommender.model.docvecs['1990_year']
recommender.model.docvecs.most_similar([vect], topn=10)

[('1990_year', 1.0),
 ('pop-rock_genre', 0.6357326507568359),
 ('black sabbath_artist', 0.6074845790863037),
 ('overkill_artist', 0.5767002105712891),
 ('1960_year', 0.5753753781318665),
 ('1970_year', 0.539879322052002),
 ('iron maiden_artist', 0.5263309478759766),
 ('deep purple_artist', 0.517450749874115),
 ('ozzy osbourne_artist', 0.512157678604126),
 ('oasis_artist', 0.5055568814277649)]