#### Przygotowania

Import wymaganych bibliotek

In [191]:
import pandas as pd
import numpy as np
import implicit
from implicit.nearest_neighbours import bm25_weight
from implicit.evaluation import train_test_split
import tqdm
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

Wczytanie danych

In [192]:
train = pd.read_csv('../data/cf/train_rating.csv')
test = pd.read_csv('../data/cf/test_rating.csv')
#data = pd.read_csv('../data/ratings.csv')

#### Przygotowanie macierzy użytkownik-film

In [194]:
def create_sparse_matrix(df, user_col='userId', item_col='movieId', value_col='rating', rating_to_confidence=False, confidence_alpha=40):
    """
    Create a sparse matrix from the DataFrame.
    """
    users=pd.Categorical(df[user_col])
    items=pd.Categorical(df[item_col])

    if rating_to_confidence:
        # Convert ratings to confidence scores
        confidences = 1 + confidence_alpha * df[value_col]
        sparse_matrix = csr_matrix((confidences, (users.codes, items.codes)), shape=(len(users.categories), len(items.categories)))
    else:
        sparse_matrix = csr_matrix((df[value_col], (users.codes, items.codes)), shape=(len(users.categories), len(items.categories)))
    
    return sparse_matrix

train_sparse = create_sparse_matrix(train, rating_to_confidence=True)
#data_sparse = create_sparse_matrix(data)
#train_sparse, test_sparse = train_test_split(data_sparse, train_percentage=0.8)

Wykorzystanie funkcji Okapi BM25 w celu wyważenia wpływu bardzo popularnych filmów, które mogłyby dominować podobieństwa. <br> <br>
[Link do opisu działania Okapi BM25](https://en.wikipedia.org/wiki/Okapi_BM25) <br>

- Im większe K1, tym mniejsze różnice między popularnymi/rzadkimi
- Im większe B, tym silniejsza korekta na długość historii użytkownika

In [195]:
train_sparse = bm25_weight(train_sparse, K1=100, B=0.8)

Przydzielenie każdemu filmowi i użytkownikowi indeksu w tablicy

In [196]:
user_id__to_index = {user_ind: i for i, user_ind in enumerate(pd.Categorical(train.userId).categories)}
item_id__to_index = {item_ind: i for i, item_ind in enumerate(pd.Categorical(train.movieId).categories)}
index__toitem_id = {i: item_ind for i, item_ind in enumerate(pd.Categorical(train.movieId).categories)}
index__touser_id = {i: user_ind for i, user_ind in enumerate(pd.Categorical(train.userId).categories)}

#### Obliczenie podobieństw filmów z zapamiętaniem maksymalnie k najbliższych sąsiadów

In [197]:
k_value = 500
model = implicit.nearest_neighbours.CosineRecommender(K=k_value)
model.fit(train_sparse)

  0%|          | 0/84432 [00:00<?, ?it/s]

In [198]:
def calculate_similarity(sparse_matrix, k=10):
    """
    Calculate the cosine similarity between items up to k clossest neighbours.
    """
    item_similarity = cosine_similarity(sparse_matrix.T)
    np.fill_diagonal(item_similarity, 0)  # Ustawiamy przekątne na 0, aby nie uwzględniać podobieństwa do samego siebie

    top_k_indices = np.argsort(-item_similarity, axis=1)[:, :k]  # Indeksy k najbliższych sąsiadów
    top_k_values = np.take_along_axis(item_similarity, top_k_indices, axis=1)  # Wartości k najbliższych sąsiadów
    item_similarity = np.zeros_like(item_similarity) # Nowa macierz podobieństwa wypełniona zerami
    item_similarity[np.arange(item_similarity.shape[0])[:, None], top_k_indices] = top_k_values  # Wypełniamy macierz podobieństwa wartościami k najbliższych sąsiadów
    return item_similarity

#small_train_sparse = train_sparse[:10, :10]
#item_similarity = calculate_similarity(small_train_sparse, k=10)
#display(item_similarity[:5, :5])  # Wyświetlamy pierwsze 5 wierszy i kolumn macierzy podobieństwa

#### Rekomendacje dla konkretnego użytkownika

In [199]:
# Tworzenie mapowania filmów
movies = pd.read_csv("../data/movies.csv")

Przykład rekomendacji dla użytkownika o wskazanym id

In [200]:
user_id = user_id__to_index[4]
user_items = train_sparse.tocsr()
print("Filmy, które użytkownik polubił:", [index__toitem_id[ind] for ind in user_items[user_id].indices])

recommendations = model.recommend(userid=user_id, user_items=user_items[user_id], N=2, filter_already_liked_items=True)
print("Współczynniki podobieństwa:", recommendations[1])
print("Identyfikatory polecanych filmów:", [index__toitem_id[ind] for ind in recommendations[0]])

Filmy, które użytkownik polubił: [223, 1210, 1272, 1327, 1513, 1833, 2428, 2490, 2600, 2605, 2688, 2699, 2710, 2722, 2723, 2745, 2763, 2770, 2826, 2841, 3053, 3203]
Współczynniki podobieństwa: [804.11540111 791.64667836]
Identyfikatory polecanych filmów: [2683, 2716]


Przykład rekomendacji podobnych filmów na podstawie filmu (top X najwyższych wartości z macierzy podobieństwa w wierszu danego filmu)

In [201]:
ids, scores = model.similar_items(item_id__to_index[1], N=10)
pd.DataFrame({'movieId': [index__toitem_id[i] for i in ids], 'score': scores}).merge(movies, on='movieId')[['title', 'genres', 'score']].sort_values('score', ascending=False)

# Funkcja ta wykonuje poniższy kod, ale jest jedno-linijkowa, dlatego lepiej ją wykorzystać

#sim = model.similarity
#sim_row = sim[item_id__to_index[1]]
#sim_row = sim_row.toarray().flatten()
#top_sim_indices = np.argsort(sim_row)[-10:][::-1] # Wybranie 20 najwyższych podobieństw
#recommended_ids = [index__toitem_id[idx] for idx in top_sim_indices] #Id tych filmów
#pd.DataFrame({'movieId': recommended_ids, 'score': sim_row[top_sim_indices]}).merge(movies, on='movieId')[['title', 'genres', 'score']]

Unnamed: 0,title,genres,score
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0
1,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller,0.38761
2,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,0.385574
3,Forrest Gump (1994),Comedy|Drama|Romance|War,0.365474
4,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,0.357359
5,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,0.352937
6,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi,0.352243
7,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,0.348876
8,Back to the Future (1985),Adventure|Comedy|Sci-Fi,0.346082
9,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,0.344895


Funkcja do obliczenia podstawowych metryk stworzonego modelu

In [202]:
def ranking_metrics_at_k(model, train_user_items, test_user_items, K=10, show_progress=True):
    """
    Calculates ranking metrics (Precision@K, MAP@K, NDCG@K, AUC) for a trained model.

    Parameters:
        model : Trained ALS model (or other Implicit model).
        train_user_items : csr_matrix
            User-item interaction matrix used for training.
        test_user_items : csr_matrix
            User-item interaction matrix for evaluation.
        K : int
            Number of items to evaluate.
        show_progress : bool
            Show a progress bar during evaluation.

    Returns:
        dict : Dictionary with precision, MAP, NDCG, and AUC scores.
    """

    # Ensure matrices are in CSR format
    train_user_items = train_user_items.tocsr()
    test_user_items = test_user_items.tocsr()

    num_users, num_items = test_user_items.shape
    relevant = 0
    total_precision_div = 0
    total_map = 0
    total_ndcg = 0
    total_auc = 0
    total_users = 0

    # Compute cumulative gain for NDCG normalization
    cg = 1.0 / np.log2(np.arange(2, K + 2))  # Discount factor
    cg_sum = np.cumsum(cg)  # Ideal DCG normalization

    # Get users with at least one item in the test set
    users_with_test_data = np.where(np.diff(test_user_items.indptr) > 0)[0]

    # Progress bar
    progress = tqdm.tqdm(total=len(users_with_test_data), disable=not show_progress)

    batch_size = 1000
    start_idx = 0

    while start_idx < len(users_with_test_data):
        batch_users = users_with_test_data[start_idx:start_idx + batch_size]
        recommended_items, _ = model.recommend(batch_users, train_user_items[batch_users], N=K)
        start_idx += batch_size

        for user_idx, user_id in enumerate(batch_users):
            test_items = set(test_user_items.indices[test_user_items.indptr[user_id]:test_user_items.indptr[user_id + 1]])
            
            if not test_items:
                continue  # Skip users without test data

            num_relevant = len(test_items)
            total_precision_div += min(K, num_relevant)

            ap = 0
            hit_count = 0
            auc = 0
            idcg = cg_sum[min(K, num_relevant) - 1]  # Ideal Discounted Cumulative Gain (IDCG)
            num_negative = num_items - num_relevant

            for rank, item in enumerate(recommended_items[user_idx]):
                if item in test_items:
                    relevant += 1
                    hit_count += 1
                    ap += hit_count / (rank + 1)
                    total_ndcg += cg[rank] / idcg
                else:
                    auc += hit_count  # Accumulate hits for AUC calculation

            auc += ((hit_count + num_relevant) / 2.0) * (num_negative - (K - hit_count))
            total_map += ap / min(K, num_relevant)
            total_auc += auc / (num_relevant * num_negative)
            total_users += 1
        
        progress.update(len(batch_users))

    progress.close()

    # Compute final metrics
    precision = relevant / total_precision_div if total_precision_div > 0 else 0
    mean_ap = total_map / total_users if total_users > 0 else 0
    mean_ndcg = total_ndcg / total_users if total_users > 0 else 0
    mean_auc = total_auc / total_users if total_users > 0 else 0

    return {
        "precision": precision,
        "map": mean_ap,
        "ndcg": mean_ndcg,
        "auc": mean_auc
    }


Obliczenie metryk modelu

In [204]:
test_k = 10
test_sparse = create_sparse_matrix(test)
results = ranking_metrics_at_k(model, train_sparse, test_sparse, K=test_k, show_progress=True)

100%|██████████| 200948/200948 [00:43<00:00, 4615.84it/s]


In [205]:
print(f"Precision@{test_k}:", results["precision"])
print(f"MAP@{test_k}:", results["map"])
print(f"NDCG@{test_k}:", results["ndcg"])
print(f"AUC@{test_k}:", results["auc"])

Precision@10: 0.12231150851740738
MAP@10: 0.07284920388246789
NDCG@10: 0.12907832177558154
AUC@10: 0.5401255201625642


#### Przykładowa rekomendacja dla nowego użytkownika nie będącego w systemie

In [206]:
new_user_ratings = {
    98809: 5.0,
    106489: 5.0,
    89745: 5.0,
    79132: 5.0,
    99114: 5.0,
    2116: 5.0,
    4993: 5.0,
    5952: 5.0,
}
pd.DataFrame({'movieId': new_user_ratings.keys()}).merge(movies, on='movieId')[['title', 'genres']]

Unnamed: 0,title,genres
0,"Hobbit: An Unexpected Journey, The (2012)",Adventure|Fantasy|IMAX
1,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy|IMAX
2,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX
3,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
4,Django Unchained (2012),Action|Drama|Western
5,"Lord of the Rings, The (1978)",Adventure|Animation|Children|Fantasy
6,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
7,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy


Wzięcie pod uwagę tylko tych filmów, na których był wytrenowany model

In [207]:
valid_movies_ids = { movie_id: rating 
    for movie_id, rating in new_user_ratings.items()
    if movie_id in item_id__to_index
    }

Stworzenie wektora preferencji uzytkownika

In [208]:
n_movies = len(item_id__to_index)
new_user_verctor = np.zeros(n_movies)

for movie_id, rating in valid_movies_ids.items():
    new_user_verctor[item_id__to_index[movie_id]] = rating

new_user_verctor_sparse = csr_matrix(new_user_verctor).T

Obliczenie dwudziestu rekomendacji na podstawie macierzy podobieństwa modelu

In [209]:
similarity_matrix = model.similarity

predicted_ratings = similarity_matrix.dot(new_user_verctor_sparse)

predicted_ratings[new_user_verctor_sparse.nonzero()] = 0  # Ustawiamy oceny dla już ocenionych filmów na 0

predicted_array = predicted_ratings.toarray().flatten()
top_indices = np.argsort(predicted_array)[-20:][::-1] # Wybranie 20 najwyższych ocen
recommended_ids = [index__toitem_id[idx] for idx in top_indices] #Id tych filmów

pd.DataFrame({'movieId': recommended_ids, 'score': predicted_array[top_indices]}).merge(movies, on='movieId')[['title', 'genres', 'score']]

Unnamed: 0,title,genres,score
0,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,14.696779
1,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,12.965315
2,"Dark Knight Rises, The (2012)",Action|Adventure|Crime|IMAX,12.309172
3,"Matrix, The (1999)",Action|Sci-Fi|Thriller,12.102487
4,Interstellar (2014),Sci-Fi|IMAX,11.841249
5,Guardians of the Galaxy (2014),Action|Adventure|Sci-Fi,11.579821
6,Iron Man (2008),Action|Adventure|Sci-Fi,11.493211
7,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,11.205039
8,Batman Begins (2005),Action|Crime|IMAX,11.072531
9,Fight Club (1999),Action|Crime|Drama|Thriller,11.038804
