# Finding Similar Music Using `Implicit` and Alternating Least Squares (ALS) for Matrix Factorization

This follow's Ben Frederickson's [Finding Similar Music using Matrix Factorization](https://www.benfrederickson.com/matrix-factorization/).

## Preliminaries

**Next two lines are for pretty output for all prints in a Pandas cell, not just the last.**

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

**`DataSci` contains generally helpful data science stuff, while `plotHelpers` includes plot functions specifically.**

In [None]:
import sys
sys.path.append('../../../work/Mlib')
from utility import DataSci as util
from utility import ModelTrain as mt
import plotHelpers as ph

### Python imports

In [None]:
import argparse
import codecs
import logging
import time

import numpy as np
import tqdm

from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.lastfm import get_lastfm
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

### Load the datums

* read `em in, then create sparse matrices

In [None]:
data = pd.read_table("usersha1-artmbid-artname-plays.tsv", 
                     usecols=[0, 2, 3], 
                     names=['user', 'artist', 'plays'])

### map each artist and user to a unique numeric value

In [None]:
data['user'] = data['user'].astype("category")
data['artist'] = data['artist'].astype("category")

### create a sparse matrix of all the artist/user/play triples

In [None]:
plays = coo_matrix((data['plays'].astype(float), 
                   (data['artist'].cat.codes, 
                    data['user'].cat.codes)))

In [None]:
# create a dictionary of artist name to the set of their users
artist_sets = dict((artist, set(users)) for artist, users in data.groupby('artist')['user'])


In [1]:
""" An example of using this library to calculate related artists
from the last.fm dataset. More details can be found
at http://www.benfrederickson.com/matrix-factorization/

This code will automically download a HDF5 version of the dataset from
GitHub when it is first run. The original dataset can also be found at
http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html.
"""
import argparse
import codecs
import logging
import time

import numpy as np
import pandas as pd
import tqdm
from scipy.sparse import coo_matrix

from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.datasets.lastfm import get_lastfm
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)

# maps command line model argument to class name
MODELS = {"als":  AlternatingLeastSquares,
          "nmslib_als": NMSLibAlternatingLeastSquares,
          "annoy_als": AnnoyAlternatingLeastSquares,
          "faiss_als": FaissAlternatingLeastSquares,
          "tfidf": TFIDFRecommender,
          "cosine": CosineRecommender,
          "bpr": BayesianPersonalizedRanking,
          "lmf": LogisticMatrixFactorization,
          "bm25": BM25Recommender}


def get_model(model_name):
    print("getting model %s" % model_name)
    model_class = MODELS.get(model_name)
    if not model_class:
        raise ValueError("Unknown Model '%s'" % model_name)

    # some default params
    if issubclass(model_class, AlternatingLeastSquares):
        params = {'factors': 16, 'dtype': np.float32}
    elif model_name == "bm25":
        params = {'K1': 100, 'B': 0.5}
    elif model_name == "bpr":
        params = {'factors': 63}
    elif model_name == "lmf":
        params = {'factors': 30, "iterations": 40, "regularization": 1.5}
    else:
        params = {}

    return model_class(**params)


def calculate_similar_artists(output_filename, model_name="als"):
    """ generates a list of similar artists in lastfm by utilizing the 'similar_items'
    api of the models """
    if smallDataset:
      artists, users, plays = get_lastfm()
    else:
        data = pd.read_table("usersha1-artmbid-artname-plays.tsv", 
                             usecols=[0, 2, 3], 
                             names=['user', 'artist', 'plays'])

        # map each artist and user to a unique numeric value
        data['user'] = data['user'].astype("category")
        data['artist'] = data['artist'].astype("category")
        users = data['user']
        artists = data['artist']
        # create a sparse matrix of all the artist/user/play triples
        plays = coo_matrix([data['plays'].astype(float), 
                            data['artist'].cat.codes, 
                            data['user'].cat.codes])

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_recommend = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # write out similar artists by popularity
    start = time.time()
    logging.debug("calculating top artists")

    user_count = np.ediff1d(plays.indptr)
    print(repr(artists))
    to_generate = sorted(np.arange(len(artists)), key=lambda x: -user_count[x])

    # write out as a TSV of artistid, otherartistid, score
    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(to_generate)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in model.similar_items(artistid, 11):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
                progress.update(1)

    logging.debug("generated similar artists in %0.2fs",  time.time() - start)


def calculate_recommendations(output_filename, model_name="als"):
    """ Generates artist recommendations for each user in the dataset """
    # train the model based off input params
    if smallDataset:
      artists, users, plays = get_lastfm()
    else:
        data = pd.read_table("usersha1-artmbid-artname-plays.tsv", 
                             usecols=[0, 2, 3], 
                             names=['user', 'artist', 'plays'])

        # map each artist and user to a unique numeric value
        data['user'] = data['user'].astype("category")
        data['artist'] = data['artist'].astype("category")
        users = data['user']
        artists = data['artist']
        # create a sparse matrix of all the artist/user/play triples
        plays = coo_matrix([data['plays'].astype(float), 
                            data['artist'].cat.codes, 
                            data['user'].cat.codes])

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_similar_items = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each user and write out to a file
    start = time.time()
    user_plays = plays.T.tocsr()
    with tqdm.tqdm(total=len(users)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for userid, username in enumerate(users):
                for artistid, score in model.recommend(userid, user_plays):
                    o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
                progress.update(1)
    logging.debug("generated recommendations in %0.2fs",  time.time() - start)



In [2]:
# if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Generates similar artists on the last.fm dataset"
                                     " or generates personalized recommendations for each user",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--output', type=str, default='similar-artists.tsv',
                    dest='outputfile', help='output file name')
parser.add_argument('--model', type=str, default='als',
                    dest='model', help='model to calculate (%s)' % "/".join(MODELS.keys()))
parser.add_argument('--recommend',
                    help='Recommend items for each user rather than calculate similar_items',
                    action="store_true")
parser.add_argument('--param', action='append',
                    help="Parameters to pass to the model, formatted as 'KEY=VALUE")

# args = parser.parse_args()
args, unknown = parser.parse_known_args()

logging.basicConfig(level=logging.DEBUG)

if args.recommend:
    calculate_recommendations(args.outputfile, model_name=args.model)
else:
    calculate_similar_artists(args.outputfile, model_name=args.model)

DEBUG:root:weighting matrix by bm25_weight


getting model als


DEBUG:root:training model als
DEBUG:implicit:Calculated transpose in 0.350s
DEBUG:implicit:Initialized factors in 4.252726078033447
DEBUG:implicit:Running 15 ALS iterations


HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))

DEBUG:root:trained model 'als' in 827.72s
DEBUG:root:calculating top artists



0                betty blowtorch
1                      die Ärzte
2              melissa etheridge
3                      elvenking
4           juliette & the licks
                    ...         
17535650              turbostaat
17535651           cuba missouri
17535652         little man tate
17535653               sigur rós
17535654              the smiths
Name: artist, Length: 17535655, dtype: category
Categories (292363, object): [04)], 2, 58725ab=>, 80lİ yillarin tÜrkÇe sÖzlÜ aŞk Şarkilari, ..., ￼beastie boys, ��|, ��疲暎�, �������]


IndexError: index 3 is out of bounds for axis 0 with size 3