# Recommender System

This notebook contains a artist recommender system.
The system use both content and collaborative filtering for the recommendation.  
I trained the model using the last.fm dataset and a noval dataset I scraped from the last.fm website.  
The data contained musice data starting in the year [] up until [].  
For this reason some recommendation may be outdated.

The structure of this notebook is as following:

1. Functions and definitions.
2. Recommender system.

If you would like to try the recommender system you can skip to the section section.  
[Start of Recommendation System](#Start-of-Recommendation-System)

**The structure of the system**

1. Input your genres of choice from a list of the most popular.
2. Rate these genres.
3. Initial recommendation based on genres, returns popular artists.
4. From the recommendation you can pick artists which you like.
5. Add as many more artists as like that aren't on the list.
6. Rate them, similar to the rating for genres.
7. Content based recommendation from the genres you rated and the list of artists you liked.
8. Collaborative filtering recommendation based on artists you liked.
9. Recommendation based on a combination of the two models.

In [1]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (24, 16)
import ipywidgets as widgets
from IPython.display import display, clear_output

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import time

from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate

In [2]:
top_tags = pd.read_csv("data/processed_data/top_tags.csv")


top_tag_count = pd.DataFrame(
    pd.DataFrame(
    top_tags.drop(columns="artistID"
    ).values.flatten()).value_counts(
    ).head(600)).rename(columns={0:"Count"}
    )
top_tag_count.index = top_tag_count.index.get_level_values(0) 

artist = pd.read_csv("data/artists.dat", sep="\t")

In [3]:
def create_multipleChoice_widget(description, options):
    
    radio_options = [(words, i) for i, words in enumerate(options[:20])]
    alternativ = widgets.SelectMultiple(
        options = radio_options,
        description = '',
        disabled = False
    )
    
    description_out = widgets.Output()
    with description_out:
        print(description)
        
    feedback_out = widgets.Output()

    def check_selection(b):
        genre_index = [val for val in alternativ.index]
        
        with feedback_out:
            clear_output()
            print(f"You selected {len(genre_index)}, continue if correct.")
        return genre_index
    
    check = widgets.Button(description="submit")
    check.on_click(check_selection)
    
    return widgets.VBox([description_out, alternativ, check, feedback_out])

def create_genre_sliders(top_tag_count, Genres):
    return create_sliders(top_tag_count.iloc[list(Genres.children[1].index)].index)

def create_artist_sliders(artist_list):
    return create_sliders(artist_list)

def create_sliders(slider_list):
    sliders = []
    for sliderName in slider_list:
        sliders.append(widgets.SelectionSlider(
        options=['okay', 'good', 'great', 'amazing'],
        value='okay',
        description=f'{sliderName.capitalize()} is',
        disabled=False,
        continuous_update=False,
        orientation='horizontal',
        readout=True
    )
    )
        

    return widgets.HBox([widgets.VBox(sliders)])

In [4]:
def create_initial_recommendation(top_genres, genre_list, n_artists=20):
    user_artists = pd.read_csv("data/user_artists.dat", "\t")
    artist = pd.read_csv("data/artists.dat", sep="\t")
    
    top_artist_for_my_genres = top_genres.melt(id_vars="artistID").set_index(
        "value").loc[[genre.strip() for genre in genre_list[0].split(",")]]["artistID"].unique()

    initial_recommendation_idx = user_artists.set_index("artistID").loc[top_artist_for_my_genres].reset_index(
    ).groupby("artistID").count().sort_values("userID",ascending=False).head(n_artists).index

    initial_recommendation = artist.set_index("id").loc[initial_recommendation_idx]["name"].values
    
    return initial_recommendation

def create_genre_list(Genres_selector, top_genre_count):
    return list(top_genre_count.iloc[list(Genres_selector.children[1].index)].index)

In [5]:
#Cosine
def create_content_based_recommendation(genre_ratings, genre_list, top_tags, new_genre_tags, artist_list):
    
    genre_sentence = create_rating_sentence(genre_ratings, genre_list)
    genre_sentence = genre_sentence + ", " + new_genre_tags
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    
    artist = pd.read_csv("data/artists.dat", sep="\t")
    full_genre_embeddings = pd.read_csv("data/processed_data/genre_embeddings.csv").to_numpy().astype("float32")
    
    cos_sim_data = pd.DataFrame(cosine_similarity(full_genre_embeddings))
    
    most_common_results = []
    for current_index in artist[artist["name"].isin(artist_list)].index:
        index_recomm = cos_sim_data.loc[current_index].sort_values(ascending=False).index.tolist()[1:21]
        for indx in index_recomm:
            most_common_results.append(indx)
        
    top_artist_index = list(pd.DataFrame(
    pd.DataFrame(most_common_results).value_counts(
    ), columns=["Count"]).sort_values(by="Count", ascending=False
    ).index.get_level_values(0))
    
    
    top_artists = artist.loc[top_artist_index]
    top_artists= top_artists[~top_artists['name'].isin(artist_list)]
    return top_artists.head(40).reset_index(drop=True).drop(columns=["pictureURL", "id"])

In [6]:
#Dot Product
def create_content_based_recommendation(genre_ratings, genre_list, top_tags, new_genre_tags, artist_list):
    
    genre_sentence = create_rating_sentence(genre_ratings, genre_list)
    genre_sentence = genre_sentence + ", " + new_genre_tags
    model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    
    artist = pd.read_csv("data/artists.dat", sep="\t")
    full_genre_embeddings = pd.read_csv("data/processed_data/genre_embeddings.csv").to_numpy().astype("float32")
    
    artists_in_top_20_tags = top_tags.melt(id_vars="artistID").set_index(
    "value")["artistID"].unique()
    
    my_embeddings = model.encode(genre_sentence)
    
    genre_score = util.dot_score(full_genre_embeddings, my_embeddings)
    
    top_recommended = pd.DataFrame(genre_score.numpy(), columns=["sim_score"]).sort_values(by="sim_score", ascending=False).index
    
    top_artists = artist.set_index("id").loc[artists_in_top_20_tags[top_recommended]]
    top_artists = top_artists[~top_artists['name'].isin(artist_list)]
    return top_artists.head(20).reset_index(drop=True).drop(columns=["pictureURL"])
    
def create_rating_sentence(genre_ratings, genre_list):
    ratings_list = []
    weight_conversion = {"okay":3, "good":4, "great":5, "amazing":6}
    for c in genre_ratings.children:
        for cc in c.children:
            ratings_list.append(weight_conversion[cc.value])
    sentence = []
    for genre, weight in zip(genre_list, ratings_list):
        sentence.append(", ".join([genre]*weight))
    
    return ", ".join(sentence)

In [7]:
def add_new_artists(artist_list, extra_artists, artist):
    for artistName in extra_artists:
        if len(artist[artist["name"] == artistName]) == 0:
            print(f"It looks like the artist: {artistName} is not in the dataset.")
        elif len(artist[artist["name"] == artistName]) == 1:
            artist_list.append(artistName)
        else:
            print(f"Seems like something went wrong, the artist return this")
            print(artist[artist["name"] == artistName])
    return list(set(artist_list))

In [8]:
def get_top_n(userID, unseen_artists, algo, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = {"artistID":[],"userID":[],"value":[],}
    for iid in unseen_artists:
        top_n["artistID"].append(iid)
        top_n["userID"].append(userID)
        top_n["value"].append(algo.predict(uid=userID, iid=iid).est)

    # Then sort the predictions for each user and retrieve the k highest ones.

    return top_n

In [31]:
def create_collab_model(artist_list, artist_ratings, my_ID = 2101):
    
    df, unseen_artists = create_user_dataset(artist_list, artist_ratings,my_ID)
    reader = Reader(rating_scale=(0, 1))
    data = Dataset.load_from_df(df[['artistID', 'userID', 'value']], reader)
    
    svd = SVD(verbose=False, n_epochs=10)
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=False)
    
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    
    pred = get_top_n(my_ID, unseen_artists, svd)
    pred_ids = pd.DataFrame(pred).sort_values(by="value", ascending=False)["artistID"][:20].values
    return artist[artist["id"].isin(pred_ids)]
    
def get_top_n(userID, unseen_artists, algo):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = {"artistID":[],"userID":[],"value":[],}
    for iid in unseen_artists:
        top_n["artistID"].append(iid)
        top_n["userID"].append(userID)
        top_n["value"].append(algo.predict(uid=userID, iid=iid).est)

    # Then sort the predictions for each user and retrieve the k highest ones.

    return top_n 

In [32]:
def create_user_dataset(artist_list, artist_ratings,my_ID):
    
    artist = pd.read_csv("data/artists.dat", sep="\t")
    my_artists = list(artist[artist["name"].isin(artist_list)]["id"].values)
    
    my_ratings = []
    for current_rating in artist_ratings.children:
        for current_value in current_rating.children:
            my_ratings.append(current_value.value)
    
   
    my_dict = {"artistID": my_artists,
              "userID": [my_ID]*(len(my_ratings)),
               "value": my_ratings,
              }
    
    remove_smallIDs = remove_small_users_artists()
    unseen_artists = remove_smallIDs["artistID"][~remove_smallIDs["artistID"].isin(my_artists)].unique()

    remove_smallIDs = remove_smallIDs.append(pd.DataFrame(my_dict)).reset_index(drop=True)
    matrix = remove_smallIDs.pivot(index ='artistID', columns='userID', values='value').fillna(0)
    test_df =(((matrix - matrix.min()) / (matrix.max() - matrix.min())))
    normalized_df = test_df.melt(ignore_index=False).reset_index().dropna().reset_index(drop=True)
    
    return normalized_df, unseen_artists

def remove_small_users_artists():
    user_artist_w = pd.read_csv("data/user_artists.dat", sep="\t")
    user_artist_w.rename(columns={"weight":"value"},inplace=True)
    remove_smallIDs = user_artist_w[(
        user_artist_w["userID"].isin(
        user_artist_w["userID"].value_counts(
        )[(user_artist_w["userID"].value_counts(
        ) > 10)].index)) &(
        user_artist_w["artistID"].isin(
        user_artist_w["artistID"].value_counts(
        )[(user_artist_w["artistID"].value_counts(
        ) > 10)].index)
        )

    ].reset_index(drop=True)
    
    return remove_smallIDs

## Start of Recommendation System

Run the cell below then pick the genres which you like and listen to.  
To select more than one option hold down ctrl, when done press submit.

In [9]:
Genres_selector = create_multipleChoice_widget('Pick Your Favorite Genres Below \nI Suggest To Pick 3 or More',
                                               list(top_tag_count.index))

In [10]:
display(Genres_selector)

VBox(children=(Output(), SelectMultiple(options=(('rock', 0), ('electronic', 1), ('pop', 2), ('indie', 3), ('f…

Now that you have selected your favourite genres, it is is time to rate them.  
When done rating run the cells below that.

In [13]:
genre_ratings = create_genre_sliders(top_tag_count, Genres_selector)
display(genre_ratings)

HBox(children=(VBox(children=(SelectionSlider(continuous_update=False, description='Hip-hop is', options=('oka…

In [14]:
genre_list = create_genre_list(Genres_selector, top_tag_count)
initial_recommendation = create_initial_recommendation(top_tags, genre_list, n_artists=20)

pd.DataFrame(initial_recommendation, columns=["artist_name"]).T

  user_artists = pd.read_csv("data/user_artists.dat", "\t")


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
artist_name,Black Eyed Peas,Eminem,3OH!3,Justin Timberlake,Kanye West,Gorillaz,Ciara,Nicole Scherzinger,Timbaland,Fergie,Usher,Nicki Minaj,Jay-Z,Destiny's Child,M.I.A.,Chris Brown,Keri Hilson,Taio Cruz,B.o.B,T.I.


This is the initial recommendation based on the genres you like.

Similar to when you had to pick the genres, it is time to pick your favourite artists.

In [15]:
artist_selector = create_multipleChoice_widget('Pick Your Favorite Artists From Below \n',initial_recommendation)

display(artist_selector)

VBox(children=(Output(), SelectMultiple(options=(('Black Eyed Peas', 0), ('Eminem', 1), ('3OH!3', 2), ('Justin…

### Add extra artists

To help the model recommender more accurate artists you can add some extra data in this section.  
One probelm you may face is outdated artists and recommendation, as the recommender model is trained using data from 2010 and earlier.   

In [16]:
artist_list = list(initial_recommendation[list(artist_selector.children[1].index)])

In [18]:
extra_artists = ["The Alchemist", "Nas", "Mac Miller", 
                 "A Tribe Called Quest", "Travis Scott", 
                 "MF DOOM", "Kid Cudi", "Slum Village", 
                 "Common", "Ice Cube"]
artist_list = add_new_artists(artist_list, extra_artists, artist)

It looks like the artist: Travis Scott is not in the dataset.


In [19]:
artist_ratings = create_artist_sliders(artist_list)
display(artist_ratings)

HBox(children=(VBox(children=(SelectionSlider(continuous_update=False, description='Nas is', options=('okay', …

In [20]:
new_genre_tags = ", ".join(list(top_tags.set_index("artistID"
            ).loc[artist.set_index("name"
            ).loc[artist_list]["id"].values].reset_index(
            ).melt(id_vars="artistID")["value"].values))

### Better Recommendations

In the next section I create a better recommendation using both;
1. Content based.
2. Collabrotive filitering

In [21]:
# Content based filtering using the genres which you like and genres realted to artists you like.
create_content_based_recommendation(genre_ratings, genre_list, top_tags, new_genre_tags, artist_list)

Unnamed: 0,name,url
0,ANONTRON,http://www.last.fm/music/ANONTRON
1,Count Bass D,http://www.last.fm/music/Count+Bass+D
2,The Pharcyde,http://www.last.fm/music/The+Pharcyde
3,Lea Michele and Idina Menzel,http://www.last.fm/music/Lea+Michele+and+Idina...
4,Reflection Eternal,http://www.last.fm/music/Reflection+Eternal
5,Black Star,http://www.last.fm/music/Black+Star
6,Nicki Minaj,http://www.last.fm/music/Nicki+Minaj
7,Will Smith,http://www.last.fm/music/Will+Smith
8,Ugly Duckling,http://www.last.fm/music/Ugly+Duckling
9,Diddy - Dirty Money,http://www.last.fm/music/Diddy+-+Dirty+Money


In [33]:
# Collabrotive filtering using the artits that you like and the rating given by you.
create_collab_model(artist_list, artist_ratings, my_ID = 2101)

  remove_smallIDs = remove_smallIDs.append(pd.DataFrame(my_dict)).reset_index(drop=True)
  test_df =(((matrix - matrix.min()) / (matrix.max() - matrix.min())))
  test_df =(((matrix - matrix.min()) / (matrix.max() - matrix.min())))
  test_df =(((matrix - matrix.min()) / (matrix.max() - matrix.min())))
  pred_ids = pd.DataFrame(pred).sort_values(by="value", ascending=False)["artistID"][:20].values


Unnamed: 0,id,name,url,pictureURL
293,299,Jennifer Lopez,http://www.last.fm/music/Jennifer+Lopez,http://userserve-ak.last.fm/serve/252/49596953...
410,416,The National,http://www.last.fm/music/The+National,http://userserve-ak.last.fm/serve/252/51042665...
420,426,Rilo Kiley,http://www.last.fm/music/Rilo+Kiley,http://userserve-ak.last.fm/serve/252/5029.jpg
450,456,Jesse McCartney,http://www.last.fm/music/Jesse+McCartney,http://userserve-ak.last.fm/serve/252/54340985...
457,463,Savage Garden,http://www.last.fm/music/Savage+Garden,http://userserve-ak.last.fm/serve/252/45487441...
687,693,Hey Monday,http://www.last.fm/music/Hey+Monday,http://userserve-ak.last.fm/serve/252/29450373...
722,728,Eric Clapton,http://www.last.fm/music/Eric+Clapton,http://userserve-ak.last.fm/serve/252/2366379.jpg
845,854,Alice in Chains,http://www.last.fm/music/Alice+in+Chains,http://userserve-ak.last.fm/serve/252/3373415.jpg
1028,1037,Nicki Minaj,http://www.last.fm/music/Nicki+Minaj,http://userserve-ak.last.fm/serve/252/61385595...
1178,1187,Dance Gavin Dance,http://www.last.fm/music/Dance+Gavin+Dance,http://userserve-ak.last.fm/serve/252/57828937...
