In [25]:
from google.colab import drive
drive.mount('/content/drive')
%cd drive/MyDrive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[Errno 2] No such file or directory: 'drive/MyDrive'
/content/drive/MyDrive


In [26]:
file_name = 'songdata.csv'

# **1) Content-based filters**
Recommendations done using content-based recommenders can be seen as a user-specific classification problem. This classifier learns the user's likes and dislikes from the features of the song.

The most straightforward approach is **keyword matching.**

In a few words, the idea behind is to extract meaningful keywords present in a song description a user likes, search for the keywords in other song descriptions to estimate similarities among them, and based on that, recommend those songs to the user.

How is this performed?

In our case, because we are working with text and words, **Term Frequency-Inverse Document Frequency (TF-IDF)** can be used for this matching process.

We'll go through the steps for generating a **content-based** music recommender system.

# Importing required libraries
First, we'll import all the required libraries.

In [27]:
import numpy as np
import pandas as pd
from typing import List, Dict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Dataset
So imagine that we have the following dataset.

This dataset contains name, artist, and lyrics for 57650 songs in English. The data has been acquired from LyricsFreak through scraping.

In [28]:
songs = pd.read_csv(file_name)

In [29]:
songs

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...
...,...,...,...,...
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \nLet the angels fly l...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \nMore power \nPower to...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \nis something i'll believe \nf...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \nam i frightened \nwhere can ...


In [30]:
songs.describe()

Unnamed: 0,artist,song,link,text
count,57650,57650,57650,57650
unique,643,44824,57650,57494
top,Donna Summer,Have Yourself A Merry Little Christmas,/a/abba/ahes+my+kind+of+girl_20598417.html,I just came back from a lovely trip along the ...
freq,191,35,1,6


Because of the dataset being so big, we are going to resample only 5000 random songs.

In [31]:
songs = songs.sample(n=5000).drop('link',axis=1).reset_index(drop=True)

In [32]:
songs

Unnamed: 0,artist,song,text
0,Aerosmith,Bacon Biscuit Blues,"Put your biscuits in the oven \nHoney, put yo..."
1,Underoath,Cries Of The Past,It happened all so fast heavy with sleep my ey...
2,Morrissey,At Last I Am Born,At last I am born \nHistorians note \nI am f...
3,Spandau Ballet,Code Of Love,He put so much into her life \nShe took so mu...
4,Clash,Garageland,Back in the garage with my bullshit detector ...
...,...,...,...
4995,Whitesnake,Medicine Man,"You never leave her alone, \nI can see you ne..."
4996,Carly Simon,Lili Marlene,Outside the barracks by the corner light \nDa...
4997,Tina Turner,Land Of 1000 Dances,One two three! \nOne two three! \n \nGotta ...
4998,Faith Hill,The Hard Way,I hear every word they say \nThey tell me to ...


We can notice also the presence of \n in the text, so we are going to remove it.

In [33]:
songs['text'] = songs['text'].str.replace(r'\n','')

  songs['text'] = songs['text'].str.replace(r'\n','')


In [34]:
songs

Unnamed: 0,artist,song,text
0,Aerosmith,Bacon Biscuit Blues,"Put your biscuits in the oven Honey, put your..."
1,Underoath,Cries Of The Past,It happened all so fast heavy with sleep my ey...
2,Morrissey,At Last I Am Born,At last I am born Historians note I am final...
3,Spandau Ballet,Code Of Love,He put so much into her life She took so much...
4,Clash,Garageland,Back in the garage with my bullshit detector ...
...,...,...,...
4995,Whitesnake,Medicine Man,"You never leave her alone, I can see you neve..."
4996,Carly Simon,Lili Marlene,Outside the barracks by the corner light Darl...
4997,Tina Turner,Land Of 1000 Dances,One two three! One two three! Gotta know h...
4998,Faith Hill,The Hard Way,I hear every word they say They tell me to st...


After that, we use TF-IDF vectorizerthat calculates the TF-IDF score for each song lyric, word-by-word.

Here, we pay particular attention to the arguments we can specify.

In [35]:
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

In [36]:
lyrics_matrix = tfidf.fit_transform(songs['text'])

In [37]:
lyrics_matrix.shape

(5000, 24391)

In [38]:
(tfidf.get_feature_names_out())

array(['00', '000', '007', ..., 'zuiderzee', 'zulu', 'zutto'],
      dtype=object)

We now need to calculate the similarity of one lyric to another. We are going to use cosine similarity.

We want to calculate the cosine similarity of each item with every other item in the dataset. So we just pass the lyrics_matrix as argument.

In [39]:
cosine_similarities = cosine_similarity(lyrics_matrix) 

In [40]:
cosine_similarities.shape

(5000, 5000)

In [41]:
similarities={}

for i in range(len(cosine_similarities)):
  sims = cosine_similarities[i].argsort()[-51:-1]
  similarities[songs['song'][i]] = [(cosine_similarities[i][x], songs['song'][x], songs['artist'][x]) for x in sims][::-1]

In [42]:
songs

Unnamed: 0,artist,song,text
0,Aerosmith,Bacon Biscuit Blues,"Put your biscuits in the oven Honey, put your..."
1,Underoath,Cries Of The Past,It happened all so fast heavy with sleep my ey...
2,Morrissey,At Last I Am Born,At last I am born Historians note I am final...
3,Spandau Ballet,Code Of Love,He put so much into her life She took so much...
4,Clash,Garageland,Back in the garage with my bullshit detector ...
...,...,...,...
4995,Whitesnake,Medicine Man,"You never leave her alone, I can see you neve..."
4996,Carly Simon,Lili Marlene,Outside the barracks by the corner light Darl...
4997,Tina Turner,Land Of 1000 Dances,One two three! One two three! Gotta know h...
4998,Faith Hill,The Hard Way,I hear every word they say They tell me to st...


In [43]:
cosine_similarities[0].argsort()

array([3397,  234, 3101, ..., 4650, 1882,    0])

In [44]:
songs['song'][44]

'Let Me Be The One You Need'

we'll define our Content based recommender class.

In [46]:
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, song, recom_song):
        rec_items = len(recom_song)
        
        print(f'The {rec_items} recommended songs for {song} are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_song[i][1]} by {recom_song[i][2]} with {round(recom_song[i][0], 3)} similarity score") 
            print("--------------------")
        
    def recommend(self, recommendation):
        # Get song to find recommendations for
        song = recommendation['song']
        # Get number of songs to recommend
        number_songs = recommendation['number_songs']
        # Get the number of songs most similars from matrix similarities
        recom_song = self.matrix_similar[song][:number_songs]
        # print each item
        self._print_message(song=song, recom_song=recom_song)

In [47]:
recommedations = ContentBasedRecommender(similarities)

In [48]:
recommendation = {
    "song": songs['song'].iloc[0],
    "number_songs": 4 
}

In [49]:
recommedations.recommend(recommendation)

The 4 recommended songs for Bacon Biscuit Blues are:
Number 1:
Alone Together by Fall Out Boy with 0.388 similarity score
--------------------
Number 2:
Fingers by P!nk with 0.372 similarity score
--------------------
Number 3:
Every Sip by LL Cool J with 0.357 similarity score
--------------------
Number 4:
Ain't No Big Deal by Madonna with 0.356 similarity score
--------------------


In [50]:
recommendation2 = {
    "song": songs['song'].iloc[120],
    "number_songs": 4 
}

In [51]:
recommedations.recommend(recommendation2)

The 4 recommended songs for Don't Step On Mother's Roses are:
Number 1:
Guns And Roses by Lana Del Rey with 0.367 similarity score
--------------------
Number 2:
Are All The Children In by Johnny Cash with 0.334 similarity score
--------------------
Number 3:
It Must Have Been The Roses by Grateful Dead with 0.329 similarity score
--------------------
Number 4:
Oh Mother by Christina Aguilera with 0.301 similarity score
--------------------
