<a href="https://colab.research.google.com/github/lovrodukic/music-recommendation/blob/main/notebooks/recommender_als.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing

Preprocess Last.fm dataset to prepare it for building a recommendation system.

In [7]:
!wget -P /content/datasets https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip
!unzip /content/datasets/hetrec2011-lastfm-2k.zip -d /content/datasets
!ls /content/datasets
# Install required libraries
!pip install pandas numpy scikit-learn matplotlib implicit

--2024-11-13 22:21:26--  https://files.grouplens.org/datasets/hetrec2011/hetrec2011-lastfm-2k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2589075 (2.5M) [application/zip]
Saving to: ‘/content/datasets/hetrec2011-lastfm-2k.zip.1’


2024-11-13 22:21:27 (12.1 MB/s) - ‘/content/datasets/hetrec2011-lastfm-2k.zip.1’ saved [2589075/2589075]

Archive:  /content/datasets/hetrec2011-lastfm-2k.zip
replace /content/datasets/user_friends.dat? [y]es, [n]o, [A]ll, [N]one, [r]ename: artists.dat		    readme.txt	      user_friends.dat
hetrec2011-lastfm-2k.zip    tags.dat	      user_taggedartists.dat
hetrec2011-lastfm-2k.zip.1  user_artists.dat  user_taggedartists-timestamps.dat
Collecting implicit
  Using cached implicit-0.7.2-cp310-cp310-manylinux2014_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp310-cp310-manylinux201

In [133]:
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import numpy as np
import pandas as pd
import scipy

In [134]:
def load_user_artists(user_artists_file):
    """
    Return a CSR matrix of user_artistst.dat
    """
    user_artists = pd.read_csv(user_artists_file, sep='\t')
    user_artists.set_index(['userID', 'artistID'], inplace=True)
    coo = scipy.sparse.coo_matrix(
        (
            user_artists.weight.astype(float),
            (
                user_artists.index.get_level_values(0),
                user_artists.index.get_level_values(1),
            ),
        )
    )

    return coo.tocsr()

user_artists = load_user_artists('/content/datasets/user_artists.dat')
print(f"Sparse matrix shape: {user_artists.shape}")

Sparse matrix shape: (2101, 18746)


In [135]:
def load_artists(artists_file):
    """
    Load artists and return in a dataframe format
    """
    artists = pd.read_csv(artists_file, sep='\t')
    artists = artists.set_index('id')

    return artists

artists = load_artists('/content/datasets/artists.dat')
print(f"Dataframe shape: {artists.shape}")

Dataframe shape: (17632, 3)


# Model Training

Training a collaborative filtering model using Alternating Least Squares (ALS)

In [138]:
from implicit import als

model = als.AlternatingLeastSquares(
    factors=200,
    regularization=0.05,
    iterations=50
)

model.fit(user_artists)
print("Training complete.")

  0%|          | 0/50 [00:00<?, ?it/s]

Training complete.


# Model Evaluation

Evaluate the performance of the model using precision and recall

In [139]:
def get_als_recommendations(model, user_id, user_artists, n_recommendations=5):
    """
    Generate top n recommendations for a specific user using the ALS model.
    """
    recommended_items, scores = model.recommend(
        userid=user_id,
        user_items=user_artists[n_recommendations],
        N=n_recommendations
    )

    recommendations = [
        artists.loc[artist_id, 'name'] for artist_id in recommended_items
    ]

    return recommendations, scores

user_id = 2
recommendations, scores = get_als_recommendations(
    model, user_id, user_artists, n_recommendations=5
)

for (artist, score) in zip(recommendations, scores):
    print(f"{artist}: {score}")

Eurythmics: 1.0763423442840576
Simple Minds: 1.0432944297790527
The Human League: 1.0092244148254395
Kylie Minogue: 1.0034329891204834
New Order: 1.003252387046814


In [141]:
def mean_percentile_ranking(model, user_artists, k=10):
    """
    Calculate Mean Percentile Ranking (MPR) for the ALS model
    """
    num_users = user_artists.shape[0]
    mpr_sum = 0
    num_evaluated_users = 0

    user_items = user_artists

    for user_id in range(num_users):
        user_interacted_items = user_items[user_id].indices
        if len(user_interacted_items) == 0:
            continue

        # Get all items ranked by the model for this user
        recommended_items, scores = model.recommend(
            userid=user_id,
            user_items=user_items,
            N=user_items.shape[1],
            filter_already_liked_items=False
        )

        # Calculate the rank of each item the user interacted with
        ranks = np.argsort(np.argsort(-scores))
        user_mpr = np.mean([ranks[item] / len(scores) for item in user_interacted_items if item in recommended_items])

        mpr_sum += user_mpr
        num_evaluated_users += 1

    # Calculate the average MPR across all users
    avg_mpr = mpr_sum / num_evaluated_users if num_evaluated_users > 0 else 0

    return avg_mpr

avg_mpr = mean_percentile_ranking(model, user_artists)
print(f"Mean Percentile Ranking (MPR): {avg_mpr:.4f}")

Mean Percentile Ranking (MPR): 0.1805


In [153]:
def map_at_k(model, user_artists, k=5):
    """
    Calculate Mean Average Precision
    """
    num_users = user_artists.shape[0]
    map_sum = 0
    num_evaluated_users = 0

    user_items = user_artists

    for user_id in range(num_users):
        user_interacted_items = user_items[user_id].indices
        if len(user_interacted_items) == 0:
            continue

        # Generate top-k recommendations for the user
        recommended_items, _ = model.recommend(userid=user_id, user_items=user_items[k], N=k)
        relevant_items_set = set(user_interacted_items)

        # Calculate Average Precision
        hits = 0
        sum_precision = 0
        for i, item in enumerate(recommended_items):
            if item in relevant_items_set:
                hits += 1
                sum_precision += hits / (i + 1)

        if len(relevant_items_set) > 0:
            average_precision = sum_precision / min(len(relevant_items_set), k)
        else:
            average_precision = 0

        map_sum += average_precision
        num_evaluated_users += 1

    avg_map = map_sum / num_evaluated_users if num_evaluated_users > 0 else 0

    return avg_map

avg_map = map_at_k(model, user_artists, k=5)
print(f"Mean Average Precision (MAP): {avg_map:.4f}")

Mean Average Precision (MAP): 0.8630
