## Домашняя работа

### 1.	Подготовка данных

Использовать набор данных с информацией о пользователях, товарах и их оценках.

Изучите структуру данных: идентификаторы пользователей, идентификаторы фильмов, рейтинги и временные метки.

Провести предварительную обработку данных, такую как удаление пропущенных значений и фильтрация неактивных пользователей или товаров.

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix

In [None]:
import kagglehub

# Download the latest version of the Book-Crossing dataset through kagglehub.
# `path` is the local location of the downloaded files; the loading cell
# builds the CSV file names from it.
path = kagglehub.dataset_download("somnambwl/bookcrossing-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/somnambwl/bookcrossing-dataset/versions/1


In [None]:
# Load the books, users and ratings tables; the files use ';' as separator.
# NOTE(review): the cell output echoes the Users.csv line, which looks like a
# pandas warning for that read — confirm and consider passing explicit dtypes.
books = pd.read_csv(f'{path}/Books.csv', sep=";")
users = pd.read_csv(f'{path}/Users.csv', sep=";")
ratings = pd.read_csv(f'{path}/Ratings.csv', sep=";")

  users = pd.read_csv(f'{path}/Users.csv', sep=";")


In [None]:
# Preview the first rows of the books table.
books.head()

Unnamed: 0,ISBN,Title,Author,Year,Publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton & Company


In [None]:
# Preview the first rows of the users table.
users.head()

Unnamed: 0,User-ID,Age
0,1,
1,2,18.0
2,3,
3,4,17.0
4,5,


In [None]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,User-ID,ISBN,Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [None]:
# Normalise the ISBN strings. In order, from both ends of each value:
# trim whitespace, then strip single quotes, backslashes, double quotes,
# backslash/hash characters and opening parentheses.
# The backslash+hash step is written "\\#": the previous spelling "\#" is an
# invalid escape sequence (it warns on recent Python versions) that happens to
# evaluate to the same two characters, so the stripped set is unchanged.
ratings["ISBN"] = ratings["ISBN"].apply(
    lambda x: x.strip().strip("'").strip("\\").strip('"').strip("\\#").strip("(")
)

In [None]:
# Per-user and per-book rating statistics derived from the ratings table.
user_groupby = ratings.groupby("User-ID")  # one group per user
book_groupby = ratings.groupby("ISBN")  # one group per book

# For every user: the mean of the ratings they gave and how many they gave.
# The series are named so they become ready-made columns when joined later.
average_user_rating = user_groupby["Rating"].mean().rename("avg_rating")
number_of_ratings_by_user = user_groupby["Rating"].count().rename("N_ratings")

# For every book: the mean of the ratings it received and how many it received.
average_book_rating = book_groupby["Rating"].mean().rename("avg_rating")
number_of_book_ratings = book_groupby["Rating"].count().rename("N_ratings")

In [None]:
# Attach the rating count and the mean rating to the users and books tables,
# matching on the identifier column of each table.
users = users.join(number_of_ratings_by_user, on="User-ID").join(
    average_user_rating, on="User-ID"
)
books = books.join(number_of_book_ratings, on="ISBN").join(
    average_book_rating, on="ISBN"
)

# Rows without a match have a missing count: store it as 0 and keep the
# counts as 64-bit integers.
users["N_ratings"] = users["N_ratings"].fillna(0).astype("int64")
books["N_ratings"] = books["N_ratings"].fillna(0).astype("int64")

In [None]:
# Report how sparse the data is: how many users gave, and how many books
# received, at least 1 / 2 / 10 ratings, plus the largest count on each side.
# `.gt(m)` is a strict "greater than m", hence gt(0), gt(1), gt(9).
print(f"Из {users.shape[0]} пользователей только {users['N_ratings'].gt(0).sum(axis=0)} оценили хотя бы 1 книгу.")
print(f"Только {users['N_ratings'].gt(1).sum(axis=0)} оценили хотя бы 2 книги.")
print(f"Только {users['N_ratings'].gt(9).sum(axis=0)} оценили хотя бы 10 книг.")
print(f"Самый активный пользователь оценил {users['N_ratings'].max()} книг.")
print()
print(f"Из {books.shape[0]} книг только {books['N_ratings'].gt(0).sum(axis=0)} получили хотя бы 1 оценку.")
print(f"Только {books['N_ratings'].gt(1).sum(axis=0)} получили хотя бы 2 оценки.")
print(f"Только {books['N_ratings'].gt(9).sum(axis=0)} получили хотя бы 10 оценок.")
print(f"Наиболее оцененная книга получила {books['N_ratings'].max()} оценок.")


Из 278859 пользователей только 99053 оценили хотя бы 1 книгу.
Только 43385 оценили хотя бы 2 книги.
Только 12306 оценили хотя бы 10 книг.
Самый активный пользователь оценил 13602 книг.

Из 271379 книг только 270171 получили хотя бы 1 оценку.
Только 124513 получили хотя бы 2 оценки.
Только 17480 получили хотя бы 10 оценок.
Наиболее оцененная книга получила 2502 оценок.


In [None]:
# Summary statistics for the users who rated at least one book.
users[users["N_ratings"].gt(0)].describe()

Unnamed: 0,N_ratings,avg_rating
count,99053.0,99053.0
mean,10.935479,4.393597
std,91.056053,3.411184
min,1.0,0.0
25%,1.0,0.0
50%,1.0,4.6
75%,4.0,7.5
max,13602.0,10.0


Самая популярная книжная серия по данным Goodreads — Гарри Поттер.
Первая книга в серии имеет 6.1 млн оценок, а последняя почти достигла максимального среднего рейтинга (4.62 из 5).

Найдем самые популярные книги и проверим, как выделяется Гарри Поттер

In [None]:
# Select the book(s) with the largest number of ratings in this dataset
# (every row tied for the maximum is returned).
books[books["N_ratings"] == books["N_ratings"].max()]

Unnamed: 0,ISBN,Title,Author,Year,Publisher,N_ratings,avg_rating
26,971880107,Wild Animus,Rich Shapero,2004,Too Far,2502,1.019584


С таким низким средним рейтингом (1 из 10, где 0 — минимум) неудивительно, что мало кто слышал о "Wild Animus" Рича Шаперо.


In [None]:
# Top 10 books by mean rating, restricted to books with more than 20 ratings
# so that a handful of votes cannot dominate the ranking.
books.loc[books["N_ratings"] > 20].sort_values(by="avg_rating", ascending=False).head(10)

Unnamed: 0,ISBN,Title,Author,Year,Publisher,N_ratings,avg_rating
3028,1844262553,Free,Paul Vincent,2003,Upfront Publishing,54,7.962963
27557,8445071416,El Hobbit,J. R. R. Tolkien,1991,Minotauro,23,7.478261
1101,60256672,Where the Sidewalk Ends : Poems and Drawings,Shel Silverstein,1974,HarperCollins,33,7.121212
16191,60248025,Falling Up,Shel Silverstein,1996,HarperCollins,38,6.921053
1763,395177111,The Hobbit (Leatherette Collector's Edition),J. R. R. Tolkien,1973,Houghton Mifflin Company,24,6.791667
10288,553274325,Johnny Got His Gun,Dalton Trumbo,1983,Bantam Books,32,6.75
15179,836221362,It's A Magical World: A Calvin and Hobbes Coll...,Bill Watterson,1996,Andrews McMeel Publishing,28,6.642857
5432,439064864,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,170,6.611765
5431,439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,194,6.541237
21546,1563890119,"Preludes and Nocturnes (Sandman, Book 1)",Neil Gaiman,1993,DC Comics,27,6.481481


In [None]:
# Select every book whose title contains "Harry Potter" and whose author
# contains "Rowling" (all editions and translations match).
# NOTE(review): str.contains yields NaN for missing values; presumably Title
# and Author have no gaps here — confirm, or pass na=False.
books[books["Title"].str.contains("Harry Potter") & books["Author"].str.contains("Rowling")]

Unnamed: 0,ISBN,Title,Author,Year,Publisher,N_ratings,avg_rating
2143,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,J. K. Rowling,1999,Arthur A. Levine Books,571,4.900175
2809,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,168,6.363095
3459,0439064872,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,351,4.729345
3839,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,197,6.467005
5431,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,194,6.541237
...,...,...,...,...,...,...,...
234626,0439294827,Harry Potter and the Sorcerer's Stone: A Delux...,J. K. Rowling,2001,Scholastic,2,9.000000
237878,3895849618,Harry Potter und der Gefangene von Askaban. So...,Joanne K. Rowling,2002,Dhv der HÃ¶rverlag,1,7.000000
251249,847888663X,Harry Potter y El Caliz de Fuego,J. K. Rowling,2001,Lidiun,0,
257280,043955490X,Harry Potter and the Goblet of Fire (Harry Pot...,J. K. Rowling,2003,Arthur A. Levine Books,1,10.000000


In [None]:
# Store ratings as 8-bit integers, then collect every rating row that belongs
# to a book with more than 20 ratings (exact duplicate rows are removed).
ratings["Rating"] = ratings["Rating"].astype("int8")

pd_matrix = pd.merge(
    books.loc[books["N_ratings"] > 20, "ISBN"],
    ratings,
    how="left",
    on="ISBN",
).drop_duplicates()
pd_matrix

Unnamed: 0,ISBN,User-ID,Rating
0,0399135782,8,0
1,0399135782,11676,9
2,0399135782,29526,9
3,0399135782,36836,0
4,0399135782,46398,9
...,...,...,...
377631,0152012397,240700,7
377632,0152012397,241078,8
377633,0152012397,256167,8
377634,0152012397,259286,8


In [None]:
# Reshape into a book x user table: ISBN as the row index, User-ID as the
# column index, ratings as the values. Pairs without a rating become 0 and the
# result is stored as 8-bit integers.
pd_matrix = (
    pd_matrix
    .pivot(index='ISBN', columns='User-ID', values='Rating')
    .fillna(0)
    .astype("int8")
)
pd_matrix

User-ID,8,9,10,14,16,17,19,26,32,39,...,278831,278832,278836,278838,278843,278844,278846,278849,278851,278854
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000649840X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0006547834,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0006550576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0006550789,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0007110928,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8845205118,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8845247414,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884590184X,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8885989403,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Drop users with fewer than 2 ratings before ranking users by activity.
user_counts = ratings['User-ID'].value_counts()
valid_users = user_counts[user_counts > 1].index
ratings = ratings[ratings['User-ID'].isin(valid_users)]

# Keep the 1000 most active users, then the 1000 most-rated books among the
# ratings of those users.
top_users = ratings['User-ID'].value_counts().nlargest(1000).index
ratings = ratings[ratings['User-ID'].isin(top_users)]

top_books = ratings['ISBN'].value_counts().nlargest(1000).index
ratings = ratings[ratings['ISBN'].isin(top_books)]

# The book filter above can leave a user with a single remaining rating, and a
# split stratified by User-ID needs at least 2 rows per user. Re-apply the
# minimum-count rule on the final table; it is a no-op when every remaining
# user still has 2 or more ratings.
remaining_counts = ratings['User-ID'].value_counts()
ratings = ratings[ratings['User-ID'].isin(remaining_counts[remaining_counts > 1].index)]

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 25% of the rating rows for testing. The split is stratified by
# User-ID so each user's rows are divided between both parts, and the seed is
# fixed for reproducibility.
X, y = ratings.copy(), ratings['User-ID']
X_train, X_test = train_test_split(
    X,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

def rmse(y_true, y_pred):
    """Return the root mean squared error between true and predicted values."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def baseline(user_id, book_isbn):
    """Predict the same constant rating for every (user, book) pair.

    Both arguments are ignored; they exist so the function has the same
    signature as the collaborative-filtering predictors.
    """
    constant_rating = 5.0  # the middle value the author uses for a 10-point scale
    return constant_rating

def score(cf_model):
    """Return the RMSE of a predictor on the held-out test rows.

    cf_model is called as cf_model(user_id, isbn) once per row of X_test and
    must return the predicted rating for that pair.
    """
    y_true = np.array(X_test['Rating'])
    y_pred = np.array([
        cf_model(user, book)
        for user, book in zip(X_test['User-ID'], X_test['ISBN'])
    ])
    return rmse(y_true, y_pred)

print("Baseline RMSE:", score(baseline))

Baseline RMSE: 4.667027223609752


### 2.	Реализация User-Item коллаборативной фильтрации
2.1. Построить матрицу взаимодействий (User-Item), где строки — это пользователи, а столбцы — это товары (например, фильмы), значения — оценки или количество взаимодействий.

In [None]:
# User x book rating matrix built from the training rows only: one row per
# User-ID, one column per ISBN. Cells without a training rating are set to 0.
r_matrix = X_train.pivot_table(
    values='Rating',
    index='User-ID',
    columns='ISBN',
).fillna(0)

r_matrix

ISBN,002026478X,002542730X,0060008032,0060085444,0060096195,006016848X,0060175400,0060199652,0060391626,0060392452,...,1558745157,1558745718,1559029838,1565122968,1573225789,1573229326,1573229725,1576737330,1592400876,1878424319
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277427,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
277639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
278188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2.2. Реализовать алгоритм коллаборативной фильтрации на основе сходства пользователей или товаров:

User-based: Находить похожих пользователей на основе их оценок.

In [None]:
# Pairwise cosine similarity between the user rows of the rating matrix,
# labelled with User-ID on both axes.
user_similarity = cosine_similarity(r_matrix)
user_sim_df = pd.DataFrame(
    user_similarity,
    index=r_matrix.index,
    columns=r_matrix.index,
)

def predict_user_based(user_id, book_isbn):
    """Predict a rating as the similarity-weighted mean over all users.

    Returns 5.0 when the user or the book is absent from r_matrix, or when the
    similarity weights sum to zero.

    NOTE(review): the weights cover every user (the target user included) and
    the book column is read as-is from r_matrix, so cells that were filled
    with 0 take part in the average.
    """
    known_user = user_id in r_matrix.index
    known_book = book_isbn in r_matrix.columns
    if not (known_user and known_book):
        return 5.0

    weights = user_sim_df[user_id]
    ratings_for_book = r_matrix[book_isbn]

    total_weight = weights.sum()
    if total_weight == 0:
        return 5.0
    return np.dot(weights, ratings_for_book) / total_weight

print("User-Based CF RMSE:", score(predict_user_based))

User-Based CF RMSE: 3.9714383178918076



Item-based: Находить похожие товары на основе оценок пользователей.

In [None]:
# Pairwise cosine similarity between the book columns of the rating matrix
# (hence the transpose), labelled with ISBN on both axes.
item_similarity = cosine_similarity(r_matrix.T)
item_sim_df = pd.DataFrame(
    item_similarity,
    index=r_matrix.columns,
    columns=r_matrix.columns,
)

def predict_item_based(user_id, book_isbn):
    """Predict a rating as the similarity-weighted mean over all books.

    Returns 5.0 when the user or the book is absent from r_matrix, or when the
    similarity weights sum to zero.

    NOTE(review): the weights cover every book (the target book included) and
    the user's row is read as-is from r_matrix, so cells that were filled
    with 0 take part in the average.
    """
    known_user = user_id in r_matrix.index
    known_book = book_isbn in r_matrix.columns
    if not (known_user and known_book):
        return 5.0

    weights = item_sim_df[book_isbn]
    ratings_by_user = r_matrix.loc[user_id]

    total_weight = weights.sum()
    if total_weight == 0:
        return 5.0
    return np.dot(weights, ratings_by_user) / total_weight

print("Item-Based CF RMSE:", score(predict_item_based))

Item-Based CF RMSE: 3.860448960863674


###  3.	Рекомендации
3.1. Для заданного пользователя предсказать оценки для товаров, которые он ещё не оценил, на основе похожих пользователей или товаров.

3.2. Выдать пользователю список рекомендованных товаров.


In [None]:
def get_recommendations(user_id, model, n=5):
    """Return the top-n unseen books for a user, best prediction first.

    Args:
        user_id: identifier of the user to recommend for.
        model: callable invoked as model(user_id, isbn) that returns the
            predicted rating for that pair.
        n: maximum number of recommendations to return.

    Returns:
        A list of (isbn, predicted_rating) tuples sorted by predicted rating in
        descending order. When the user has already rated every book of
        r_matrix a message is printed and an empty list is returned.
    """
    # Books the user has any rating for in the ratings table.
    read_books = set(ratings.loc[ratings['User-ID'] == user_id, 'ISBN'])

    # Walk the candidates in matrix column order rather than through a set
    # difference: iteration order of a set of strings changes between
    # interpreter runs, which made the order of tied predictions (and so the
    # top-n cut) non-reproducible.
    unseen_books = [book for book in r_matrix.columns if book not in read_books]

    if not unseen_books:
        print(f"Нет непрочитанных книг для пользователя {user_id}.")
        return []

    # Predict a rating for every unseen book.
    predicted_ratings = [(book, model(user_id, book)) for book in unseen_books]

    # sorted() is stable (also with reverse=True), so books with equal
    # predictions keep their column order.
    recommended_books = sorted(predicted_ratings, key=lambda item: item[1], reverse=True)

    return recommended_books[:n]

# Top-5 recommendations for user 254 from the user-based predictor,
# printed as raw (ISBN, predicted rating) pairs.
print("Топ-5 рекомендаций (User-Based) для пользователя 254:")
user_based_recommendations = get_recommendations(254, predict_user_based)
print(user_based_recommendations)

Топ-5 рекомендаций (User-Based) для пользователя 254:
[('059035342X', 1.6087810287680804), ('0345337662', 0.9573041269212664), ('0439139600', 0.7915145942102251), ('0446310786', 0.7184092058607578), ('0140067477', 0.7089272506291101)]


In [None]:
# Top-5 recommendations for user 254 from the item-based predictor,
# printed as raw (ISBN, predicted rating) pairs.
print("Топ-5 рекомендаций (Item-Based) для пользователя 254:")
item_based_recommendations = get_recommendations(254, predict_item_based)
print(item_based_recommendations)

Топ-5 рекомендаций (Item-Based) для пользователя 254:
[('0345404777', 5.0), ('0425165701', 5.0), ('051512463X', 0.5756368127866073), ('038542471X', 0.4331698036242089), ('0671036505', 0.4046766237139051)]


In [None]:
def show_recommendations(user_id, model, n=5):
    """Print the recommended books for a user with titles instead of ISBNs.

    Recommendations come from get_recommendations(user_id, model, n). A
    recommended ISBN that has no row in the books table is skipped; when there
    are no recommendations at all nothing is printed.
    """
    recommendations = get_recommendations(user_id, model, n)
    if not recommendations:
        return

    # Resolve each ISBN to the title of its first matching row in `books`.
    titled = []
    for isbn, rating in recommendations:
        matching_titles = books.loc[books['ISBN'] == isbn, 'Title']
        if matching_titles.empty:
            continue
        titled.append((matching_titles.iloc[0], rating))

    print(f"\nТоп-{n} книг для пользователя {user_id}:")
    for title, rating in titled:
        print(f"{title} - предсказанный рейтинг: {rating:.2f}")

print("User-Based:")
show_recommendations(254, predict_user_based)

print("\nItem-Based:")
show_recommendations(254, predict_item_based)

User-Based:

Топ-5 книг для пользователя 254:
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback)) - предсказанный рейтинг: 1.61
Interview with the Vampire - предсказанный рейтинг: 0.96
Harry Potter and the Goblet of Fire (Book 4) - предсказанный рейтинг: 0.79
To Kill a Mockingbird - предсказанный рейтинг: 0.72
The Tao of Pooh - предсказанный рейтинг: 0.71

Item-Based:

Топ-5 книг для пользователя 254:
No Safe Place - предсказанный рейтинг: 5.00
Ruthless.Com (Tom Clancy's Power Plays (Paperback)) - предсказанный рейтинг: 5.00
The Cat Who Sang for the Birds (Cat Who... (Paperback)) - предсказанный рейтинг: 0.58
The Client - предсказанный рейтинг: 0.43
The Vineyard - предсказанный рейтинг: 0.40


### 4.	Оценка модели
4.1. Разделить данные на обучающую и тестовую выборки.

4.2. Использовать метрики качества для оценки модели, такие как RMSE (Root Mean Squared Error) или MAE (Mean Absolute Error).

In [None]:
from sklearn.model_selection import train_test_split

# Split the rating rows 75% / 25% into train and test parts, stratified by
# User-ID and with a fixed seed, then show the shape of each part.
X_train, X_test = train_test_split(
    ratings,
    test_size=0.25,
    stratify=ratings['User-ID'],
    random_state=42,
)

X_train.shape, X_test.shape

((48234, 3), (16079, 3))

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

def rmse(y_true, y_pred):
    """Return the root mean squared error between true and predicted values."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def mae(y_true, y_pred):
    """Return the mean absolute error between true and predicted values."""
    absolute_error = mean_absolute_error(y_true, y_pred)
    return absolute_error

def eval_model(model):
    """Print the RMSE and MAE of a predictor on the held-out test rows.

    model is called as model(user_id, isbn) once per row of X_test and must
    return the predicted rating for that pair. Nothing is returned.
    """
    # Actual ratings, then one prediction per (user, book) test pair.
    y_true = np.array(X_test['Rating'])
    y_pred = np.array([
        model(user, book)
        for user, book in zip(X_test['User-ID'], X_test['ISBN'])
    ])

    print(f"RMSE: {rmse(y_true, y_pred):.4f}")
    print(f"MAE: {mae(y_true, y_pred):.4f}")

# Evaluate both predictors on the test rows (RMSE and MAE for each).
print("\nОценка User-Based модели CF:")
eval_model(predict_user_based)

print("\nОценка Item-Based модели CF:")
eval_model(predict_item_based)


Оценка User-Based модели CF:
RMSE: 3.9714
MAE: 2.1442

Оценка Item-Based модели CF:
RMSE: 3.8604
MAE: 1.9744



4.3. Проанализировать результаты и предложить улучшения.
Например, варьировать параметры и наблюдать, как изменяются метрики (RMSE, MAE).

In [None]:
def predict_item_based(user_id, book_id, k=10):
    """Predict a rating from the k books most similar to book_id.

    The prediction is the similarity-weighted mean of the user's r_matrix
    entries for those k neighbours. Returns 6.0 when the book is missing from
    item_sim_df, when the user is missing from r_matrix, or when the
    neighbour similarities sum to zero.
    """
    fallback = 6.0  # value returned whenever no prediction can be computed
    if book_id not in item_sim_df.index:
        return fallback
    if user_id not in r_matrix.index:
        return fallback

    # Similarities to every other book (the book itself is removed by label),
    # reduced to the k largest.
    neighbours = item_sim_df[book_id].drop(book_id, errors='ignore').nlargest(k)

    # The user's matrix entries for those neighbours.
    neighbour_ratings = r_matrix.loc[user_id, neighbours.index]

    # Keep only neighbours with a non-missing entry.
    # NOTE(review): r_matrix appears to be zero-filled when it is built, in
    # which case this mask keeps every neighbour — confirm the intent.
    rated = neighbour_ratings.notna()
    weights = neighbours[rated]
    values = neighbour_ratings[rated]

    weight_total = weights.sum()
    if weight_total == 0:
        return fallback
    return np.dot(weights, values) / weight_total

In [None]:
# Evaluate the item-based predictor for several neighbourhood sizes k.
# The lambda is called inside the same iteration, so it sees the current k.
for k in [5, 10, 20]:
    print(f"\nОценка Item-Based CF (k={k}):")
    eval_model(lambda user, book: predict_item_based(user, book, k=k))


Оценка Item-Based CF (k=5):
RMSE: 3.8074
MAE: 1.9147

Оценка Item-Based CF (k=10):
RMSE: 3.8126
MAE: 1.9281

Оценка Item-Based CF (k=20):
RMSE: 3.8209
MAE: 1.9388


In [None]:
def predict_user_based(user_id, book_id, k=10):
    """Predict a rating from the k users most similar to user_id.

    The prediction is the similarity-weighted mean of the neighbours' r_matrix
    entries for book_id. Returns 6.0 when the book is missing from r_matrix,
    when the user is missing from user_sim_df, or when the neighbour
    similarities sum to zero.

    NOTE(review): neighbours' entries are read as-is from r_matrix, so a
    neighbour whose cell for this book is 0 still takes part in the average.
    """
    # Both the book and the user must be known.
    if book_id not in r_matrix.columns:
        return 6.0
    if user_id not in user_sim_df.index:
        return 6.0

    # Similarity of the target user to every user.
    sim_scores = user_sim_df[user_id]

    # Every user's matrix entry for this book.
    book_ratings = r_matrix[book_id]

    # Remove the target user by label before ranking. Skipping "the first of
    # the k + 1 largest" is only right when the user ranks first, which fails
    # on ties (another user with similarity 1.0) and for a user whose row is
    # all zeros (self-similarity 0): the user could then stay in the
    # neighbourhood while a real neighbour was discarded.
    top_k_users = sim_scores.drop(user_id).nlargest(k)

    # Matrix entries of the selected neighbours only.
    top_k_ratings = book_ratings[top_k_users.index]

    # Similarity-weighted mean of the neighbours' entries.
    numerator = np.dot(top_k_users, top_k_ratings.fillna(0))
    denominator = top_k_users.sum()

    return numerator / denominator if denominator != 0 else 6.0

In [None]:
# Evaluate the user-based predictor for several neighbourhood sizes k.
# The lambda is called inside the same iteration, so it sees the current k.
for k in [5, 10, 20]:
    print(f"\nОценка User-Based CF (k={k}):")
    eval_model(lambda user, book: predict_user_based(user, book, k=k))


Оценка User-Based CF (k=5):
RMSE: 4.0216
MAE: 2.1048

Оценка User-Based CF (k=10):
RMSE: 4.0032
MAE: 2.1196

Оценка User-Based CF (k=20):
RMSE: 3.9972
MAE: 2.1359


###  5.	Проанализировать недостатки User-based подхода, такие как холодный старт и проблемы со слишком редкими данными.


**1. Проблема холодного старта**  
- Если у нового пользователя мало (или нет) оценок, модель не может подобрать похожих пользователей.  
- Решение: использовать **гибридную модель**, комбинируя User-Based и Item-Based подходы.

**2. Разреженность данных**  
В реальных данных **пользователи оценивают мало книг**, из-за чего модель не находит схожих пользователей.  
**Решение:**
   - Увеличить `k`, чтобы учитывать больше соседей.  
   - Использовать **SVD (Singular Value Decomposition)** или **Matrix Factorization**.

**3. Масштабируемость**  
Для большого числа пользователей расчет сходства занимает **много времени**.  
Решение:  
   - Использовать **KNN (K-Nearest Neighbors)** вместо полного сравнения.  
   - Кластеризовать пользователей (например, с **K-Means**) перед поиском.

**Вывод:**  
- User-Based CF работает лучше для **активных пользователей**, но плохо справляется с новыми.  
- Item-Based CF устойчивее к разреженности, но требует хорошей кластеризации книг.  
- Лучший вариант — **гибридный подход** + **алгоритмы факторизации матриц**.

### 6.	Перейти к Item-based коллаборативной фильтрации, сравнить результаты.

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# User x book rating matrix from the training rows; cells without a rating
# are set to 0.
r_matrix_train = X_train.pivot_table(
    values='Rating',
    index='User-ID',
    columns='ISBN',
).fillna(0)

# Cosine similarity between books: the matrix is transposed so that each row
# passed to cosine_similarity is one book. Both axes are labelled with ISBN.
item_similarity = cosine_similarity(r_matrix_train.T)
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=r_matrix_train.columns,
    columns=r_matrix_train.columns,
)

item_similarity_df.head()

ISBN,002026478X,002542730X,0060008032,0060085444,0060096195,006016848X,0060175400,0060199652,0060391626,0060392452,...,1558745157,1558745718,1559029838,1565122968,1573225789,1573229326,1573229725,1576737330,1592400876,1878424319
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002026478X,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.122582,0.0
002542730X,0.0,1.0,0.086618,0.0,0.0,0.065891,0.066392,0.0,0.062898,0.025824,...,0.061859,0.0,0.0,0.0,0.052454,0.120836,0.0,0.068864,0.0,0.0
0060008032,0.0,0.086618,1.0,0.145465,0.0,0.315275,0.150478,0.0,0.142558,0.0,...,0.0,0.0,0.0,0.0,0.118886,0.136937,0.0,0.0,0.0,0.0
0060085444,0.0,0.0,0.145465,1.0,0.10366,0.0,0.0,0.089147,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.193267,0.104079,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.10366,1.0,0.0,0.094539,0.0,0.0,0.0,...,0.0,0.0,0.220934,0.236016,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
def predict_item_based(user_id, book_id, k=10):
    """Predict a rating from the k books most similar to book_id.

    The prediction is the similarity-weighted mean of the user's
    r_matrix_train entries for those k neighbours. Returns 6.0 when the book
    is missing from item_similarity_df, when the user is missing from
    r_matrix_train, or when the neighbour similarities sum to zero.
    """
    # Both the book and the user must be known.
    if book_id not in item_similarity_df.index:
        return 6.0
    if user_id not in r_matrix_train.index:
        return 6.0

    # Remove the book itself by label, then take the k most similar books.
    # Skipping the first element of the sorted similarities is only right when
    # the book ranks first, which fails on ties (another book with similarity
    # 1.0) and for a book whose column is all zeros (self-similarity 0).
    similar_books = item_similarity_df[book_id].drop(book_id).nlargest(k)

    # The user's matrix entries for those neighbours.
    user_ratings = r_matrix_train.loc[user_id, similar_books.index]

    # Keep only neighbours with a non-missing entry.
    # NOTE(review): r_matrix_train appears to be zero-filled when it is built,
    # in which case this mask keeps every neighbour — confirm the intent.
    valid_indices = user_ratings.notna()
    top_k_books = similar_books[valid_indices]
    user_ratings = user_ratings[valid_indices]

    # Similarity-weighted mean of the user's entries.
    numerator = np.dot(top_k_books, user_ratings)
    denominator = top_k_books.sum()

    return numerator / denominator if denominator != 0 else 6.0

print(f"Пример предсказанного рейтинга: {predict_item_based(50, '002026478X')}")

Пример предсказанного рейтинга: 6.0


In [None]:
# Final comparison on the test rows: both predictors are called with their
# default neighbourhood size (k=10).
print("\nОценка User-Based CF:")
eval_model(predict_user_based)

print("\nОценка Item-Based CF:")
eval_model(predict_item_based)


Оценка User-Based CF:
RMSE: 4.0032
MAE: 2.1196

Оценка Item-Based CF:
RMSE: 3.8126
MAE: 1.9281
