In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.cluster import KMeans
from sklearn.decomposition import NMF
from sklearn.manifold import spectral_embedding, SpectralEmbedding
from sklearn.metrics import mean_squared_error, mean_absolute_error, ndcg_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [None]:
!curl -L "https://www.dropbox.com/scl/fi/8ba6yivfqember870awen/Video_Games_5.json.gz?rlkey=y8l7biii4mhc71os9eezespxo&st=7jiqtj4n&dl=1" -o data/Video_Games_5.json.gz

# 1. Загрузка и подготовка данных

In [None]:
def load_dataset(path, sep=',', skiprows=0):
    """Load a delimited ratings file as a (user_id, item_id, rating) frame.

    The file is read without a header row: its four columns are named
    ``user_id``, ``item_id``, ``rating`` and ``timestamp``; the timestamp
    column is dropped from the result.

    Parameters
    ----------
    path : str or path-like
        Location of the ratings file.
    sep : str, default ','
        Column separator. Separators longer than one character (e.g. ``'::'``)
        are treated by pandas as regular expressions.
    skiprows : int, default 0
        Number of leading rows to skip.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id``, ``rating``.
    """
    read_kwargs = {}
    if len(sep) > 1 and sep != r'\s+':
        # Multi-character (regex) separators are only supported by the python
        # engine; selecting it explicitly avoids the ParserWarning that pandas
        # emits when it has to fall back from the C engine on its own.
        read_kwargs['engine'] = 'python'
    df = pd.read_csv(
        path,
        sep=sep,
        names=['user_id', 'item_id', 'rating', 'timestamp'],
        skiprows=skiprows,
        **read_kwargs
    )
    return df[['user_id', 'item_id', 'rating']]

In [None]:
def load_dataset_json(path):
    """Load a gzip-compressed JSON-lines review file as a ratings frame.

    Each non-empty line of the archive is parsed as one JSON object. Only the
    ``reviewerID``, ``asin`` and ``overall`` fields are kept and they are
    renamed to ``user_id``, ``item_id`` and ``rating``.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.json.gz`` file.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id``, ``item_id``, ``rating`` with a 0..n-1 index.
        A field missing from a record becomes a missing value; an empty file
        yields an empty frame that still has the three columns.
    """
    import gzip
    import json

    renames = {'reviewerID': 'user_id', 'asin': 'item_id', 'overall': 'rating'}
    records = []
    # The context manager guarantees the archive is closed even when parsing
    # fails part-way through.
    with gzip.open(path, 'rt', encoding='utf-8') as handle:
        for line in handle:
            if not line.strip():
                continue
            review = json.loads(line)
            # Keep only the needed fields so the remaining review payload is
            # not held in memory for the whole file.
            records.append({field: review.get(field) for field in renames})

    return pd.DataFrame(records, columns=list(renames)).rename(columns=renames)

In [None]:
def build_user_item_matrix(df):
    """Pivot long-format ratings into a users x items interaction matrix.

    Rows are ``user_id`` values, columns are ``item_id`` values and cells hold
    the rating (repeated user/item pairs are aggregated by ``pivot_table``'s
    default, the mean). Pairs without a rating are filled with 0.
    """
    return df.pivot_table(
        values='rating',
        index='user_id',
        columns='item_id',
        fill_value=0,
    )

In [None]:
def split_train_test_matrix(R, test_size=0.2, random_state=42):
    """Randomly split the interactions in ``R`` into train and test parts.

    Parameters
    ----------
    R : pandas.DataFrame
        Either a users x items rating matrix (index = user ids, columns = item
        ids, 0 = no rating) or a long-format frame that has ``user_id``,
        ``item_id`` and ``rating`` columns.
    test_size : float, default 0.2
        Share of interactions placed in the test part.
    random_state : int, default 42
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    (R_train, test_df)
        ``R_train`` is the users x items matrix built from the train
        interactions; ``test_df`` is the long-format frame of held-out
        interactions with columns ``user_id``, ``item_id``, ``rating``.
    """
    interaction_cols = ['user_id', 'item_id', 'rating']
    if set(interaction_cols).issubset(R.columns):
        # Already long-format: use the interactions as they are.
        interactions = R[interaction_cols]
    else:
        # Recover the interactions from the matrix argument itself instead of
        # reading (and renaming the columns of) a module-level frame.
        # Zero cells mean "no rating" and are skipped.
        values = R.to_numpy()
        user_pos, item_pos = np.nonzero(values)
        interactions = pd.DataFrame({
            'user_id': R.index.to_numpy()[user_pos],
            'item_id': R.columns.to_numpy()[item_pos],
            'rating': values[user_pos, item_pos],
        })
    train_df, test_df = train_test_split(
        interactions, test_size=test_size, random_state=random_state
    )
    R_train = build_user_item_matrix(train_df)
    return R_train, test_df

# 2. Базовые методы CF

In [None]:
def user_based_cf_predict(R_train):
    """User-based CF: similarity-weighted average of all users' rating rows.

    User-to-user weights are cosine similarities between rows of ``R_train``.
    Returns a frame with the same index and columns as ``R_train``.
    """
    user_sim = cosine_similarity(R_train)
    # Each user's weighted sum is normalised by that user's total absolute weight.
    weight_totals = np.abs(user_sim).sum(axis=1, keepdims=True)
    weighted_sum = user_sim.dot(R_train.values)
    scores = weighted_sum / weight_totals
    return pd.DataFrame(scores, index=R_train.index, columns=R_train.columns)

In [None]:
def item_based_cf_predict(R_train):
    """Item-based CF: similarity-weighted average over item columns.

    Item-to-item weights are cosine similarities between columns of
    ``R_train``. Returns a frame with the same index and columns as ``R_train``.
    """
    item_sim = cosine_similarity(R_train.T)
    # Per-item total absolute weight, laid out as a (1, n_items) row so that it
    # broadcasts across the users axis of the score matrix.
    weight_totals = np.abs(item_sim).sum(axis=1, keepdims=True).T
    weighted_sum = R_train.values.dot(item_sim)
    scores = weighted_sum / weight_totals
    return pd.DataFrame(scores, index=R_train.index, columns=R_train.columns)

# 3. Матричная факторизация (NMF)

In [None]:
def nmf_predict(R_train, n_components=20, random_state=42):
    """Approximate ``R_train`` with NMF and return the full prediction matrix.

    The result is the product of the fitted user and item factor matrices,
    labelled with the index and columns of ``R_train``.
    """
    factorizer = NMF(
        n_components=n_components,
        init='random',
        random_state=random_state,
        max_iter=300,
    )
    user_factors = factorizer.fit_transform(R_train.values)
    item_factors = factorizer.components_
    reconstruction = user_factors.dot(item_factors)
    return pd.DataFrame(reconstruction, index=R_train.index, columns=R_train.columns)

# 4. Спектральная кластеризация

In [None]:
def spectral_clustering_predict(R_train, n_clusters=20, embed_dim=20, random_state=42):
    """Predict ratings as cluster means after spectral embedding + k-means.

    Steps: build the user-user cosine similarity matrix, compute a spectral
    embedding of it, cluster the embedded users with k-means, then give every
    user the per-item mean rating of the users in their cluster (unrated cells,
    stored as 0, take part in that mean).

    Parameters
    ----------
    R_train : pandas.DataFrame
        Users x items rating matrix.
    n_clusters : int, default 20
        Number of k-means clusters.
    embed_dim : int, default 20
        Number of spectral embedding components.
    random_state : int, default 42
        Seed used for both the embedding and k-means.

    Returns
    -------
    pandas.DataFrame
        Predictions with the same index and columns as ``R_train``.
    """
    # user-user similarity matrix, used as the graph adjacency
    A = cosine_similarity(R_train)
    # spectral embeddings of the users
    embedding = spectral_embedding(A, n_components=embed_dim, random_state=random_state)
    # k-means clustering in the embedded space
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(embedding)
    # Every user receives the mean rating row of their cluster. The ratings are
    # read straight from R_train; no copy of the whole matrix is needed.
    ratings = R_train.values
    preds = np.zeros_like(ratings, dtype=float)
    for c in np.unique(labels):
        members = labels == c
        preds[members] = ratings[members].mean(axis=0)
    return pd.DataFrame(preds, index=R_train.index, columns=R_train.columns)

In [None]:
def spectral_clustering_knn_predict(R_train, n_clusters=20, embed_dim=20, n_neighbors=5, random_state=42):
    """Predict ratings as cluster means using a k-NN graph spectral embedding.

    Steps: embed the users with ``SpectralEmbedding`` on a nearest-neighbours
    affinity graph built from their rating rows, cluster the embedded users
    with k-means, then give every user the per-item mean rating of the users in
    their cluster (unrated cells, stored as 0, take part in that mean).

    Parameters
    ----------
    R_train : pandas.DataFrame
        Users x items rating matrix.
    n_clusters : int, default 20
        Number of k-means clusters.
    embed_dim : int, default 20
        Number of spectral embedding components.
    n_neighbors : int, default 5
        Neighbours used to build the affinity graph.
    random_state : int, default 42
        Seed used for both the embedding and k-means.

    Returns
    -------
    pandas.DataFrame
        Predictions with the same index and columns as ``R_train``.
    """
    # random_state is forwarded so the embedding is seeded like the k-means
    # step; previously the argument was accepted but never reached the embedding.
    se = SpectralEmbedding(
        n_components=embed_dim,
        affinity='nearest_neighbors',
        n_neighbors=n_neighbors,
        random_state=random_state)
    # spectral embeddings of the users
    embedding = se.fit_transform(R_train.values)
    # k-means clustering in the embedded space
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(embedding)
    # Every user receives the mean rating row of their cluster. The ratings are
    # read straight from R_train; no copy of the whole matrix is needed.
    ratings = R_train.values
    preds = np.zeros_like(ratings, dtype=float)
    for c in np.unique(labels):
        members = labels == c
        preds[members] = ratings[members].mean(axis=0)
    return pd.DataFrame(preds, index=R_train.index, columns=R_train.columns)

# 5. Гибридный метод

In [None]:
def hybrid_predict(P_spec, P_nmf, alpha=0.5):
    """Blend spectral and NMF predictions: ``alpha`` weights the spectral part,
    ``1 - alpha`` weights the NMF part."""
    spectral_part = alpha * P_spec
    nmf_part = (1 - alpha) * P_nmf
    return spectral_part + nmf_part

# 6. Подбор гиперпараметров

In [None]:
def tune_nmf_components(R_train, R_test, comps):
    """Pick the NMF ``n_components`` value with the lowest RMSE on ``R_test``.

    Candidates are tried in the order given; the first one reaching the lowest
    RMSE wins. Returns ``None`` when no candidate produced a finite-comparable
    improvement (e.g. ``comps`` is empty).
    """
    winner = None
    lowest_rmse = np.inf
    for n_comp in tqdm(comps):
        predictions = nmf_predict(R_train, n_components=n_comp)
        score = evaluate_regression(predictions, R_test)['RMSE']
        if not (score < lowest_rmse):
            continue
        winner = n_comp
        lowest_rmse = score
    return winner

In [None]:
def tune_spectral_params(R_train, R_test, clusters, embeds):
    """Grid-search ``(n_clusters, embed_dim)`` for the cosine-similarity
    spectral model, minimising RMSE on ``R_test``.

    Returns the best ``(n_clusters, embed_dim)`` pair, or ``(None, None)`` when
    no combination improved on the initial infinite RMSE.
    """
    winner = (None, None)
    lowest_rmse = np.inf
    for n_clust in tqdm(clusters):
        for dim in embeds:
            predictions = spectral_clustering_predict(R_train, n_clusters=n_clust, embed_dim=dim)
            score = evaluate_regression(predictions, R_test)['RMSE']
            if not (score < lowest_rmse):
                continue
            winner = (n_clust, dim)
            lowest_rmse = score
    return winner

In [None]:
def tune_spectral_knn_params(R_train, R_test, n_neighbors_lst, n_clusters=20, embed_dim=20):
    """Pick ``n_neighbors`` for the k-NN spectral model by lowest RMSE on
    ``R_test``, with ``n_clusters`` and ``embed_dim`` held fixed.

    Returns the best value, or ``None`` when no candidate improved on the
    initial infinite RMSE.
    """
    winner = None
    lowest_rmse = np.inf
    for k_neighbors in tqdm(n_neighbors_lst):
        predictions = spectral_clustering_knn_predict(
            R_train,
            n_clusters=n_clusters,
            embed_dim=embed_dim,
            n_neighbors=k_neighbors,
        )
        score = evaluate_regression(predictions, R_test)['RMSE']
        if not (score < lowest_rmse):
            continue
        winner = k_neighbors
        lowest_rmse = score
    return winner

In [None]:
def tune_hybrid_alpha(P_spec, P_nmf, R_test, alphas):
    """Pick the hybrid blending weight ``alpha`` with the lowest RMSE on
    ``R_test``.

    Returns the best value, or ``None`` when no candidate improved on the
    initial infinite RMSE.
    """
    winner = None
    lowest_rmse = np.inf
    for weight in tqdm(alphas):
        blended = hybrid_predict(P_spec, P_nmf, alpha=weight)
        score = evaluate_regression(blended, R_test)['RMSE']
        if not (score < lowest_rmse):
            continue
        winner = weight
        lowest_rmse = score
    return winner

# 7. Метрики оценки

In [None]:
def evaluate_regression(P, R_test):
    """RMSE and MAE of ``P`` on the held-out point-wise observations.

    Parameters
    ----------
    P : pandas.DataFrame
        Prediction matrix (index = user ids, columns = item ids). Index and
        columns are expected to be unique.
    R_test : pandas.DataFrame
        Long-format test interactions with ``user_id``, ``item_id`` and
        ``rating`` columns.

    Returns
    -------
    dict
        ``{'RMSE': float, 'MAE': float}`` computed over the test rows whose
        user and item both exist in ``P`` and whose prediction is not NaN.

    Raises
    ------
    ValueError
        If no test row can be evaluated.
    """
    y_true = R_test['rating'].to_numpy(dtype=float)

    # Vectorised label -> position lookup; -1 marks users/items unknown to P.
    user_pos = P.index.get_indexer(R_test['user_id'])
    item_pos = P.columns.get_indexer(R_test['item_id'])
    known = (user_pos >= 0) & (item_pos >= 0)

    y_pred = np.full(len(R_test), np.nan)
    y_pred[known] = P.to_numpy()[user_pos[known], item_pos[known]]

    mask = ~np.isnan(y_pred)
    if not mask.any():
        raise ValueError(
            "evaluate_regression: no test interaction has both its user and "
            "item present in the prediction matrix"
        )
    errors = y_true[mask] - y_pred[mask]
    return {
        'RMSE': float(np.sqrt(np.mean(errors ** 2))),
        'MAE' : float(np.mean(np.abs(errors)))
    }

In [None]:
def evaluate_ranking(P, R_train, R_test, k=10):
    """Precision@k, Recall@k, NDCG@k and catalogue coverage of top-k lists.

    For every user in ``P`` the items already rated in ``R_train`` (value > 0)
    are removed and the ``k`` highest-scored remaining items are recommended.
    A test item is relevant when its rating is >= 4.

    Parameters
    ----------
    P : pandas.DataFrame
        Prediction matrix (index = user ids, columns = item ids); every user
        in ``P`` must also be a row of ``R_train``.
    R_train : pandas.DataFrame
        Users x items training matrix (0 = not rated).
    R_test : pandas.DataFrame
        Long-format test interactions (``user_id``, ``item_id``, ``rating``).
    k : int, default 10
        Length of the recommendation list.

    Returns
    -------
    dict
        Keys ``'Precision@{k}'``, ``'Recall@{k}'``, ``'NDCG@{k}'`` (so
        ``'Precision@10'`` etc. for the default ``k``) and ``'Coverage'``.
        Metrics are averaged over all users of ``P``; users without relevant
        test items contribute zeros.

    Notes
    -----
    NDCG uses binary relevance and the standard ideal DCG, i.e. the DCG of a
    list that places ``min(#relevant, k)`` relevant items first. Normalising
    only against the retrieved list would score any list with a single hit at
    rank 1 as perfect.
    """
    # relevant test items per user (rating >= 4)
    rel = R_test[R_test['rating'] >= 4].groupby('user_id')['item_id'].apply(set).to_dict()
    # discount of rank r (1-based) is 1 / log2(r + 1)
    discounts = 1.0 / np.log2(np.arange(2, k + 2))

    precisions, recalls, ndcgs = [], [], []
    recommended_items = set()
    for u in P.index:
        # exclude items the user already rated in train (single row lookup)
        train_row = R_train.loc[u]
        seen = train_row.index[(train_row > 0).to_numpy()]
        top_k = P.loc[u].drop(labels=seen).nlargest(k).index.tolist()
        recommended_items.update(top_k)

        true_set = rel.get(u, set())
        gains = np.array([item in true_set for item in top_k], dtype=float)
        hits = int(gains.sum())
        precisions.append(hits / k)
        recalls.append(hits / (len(true_set) or 1))

        # NDCG; computed directly so lists shorter than two items are handled
        ideal_hits = min(len(true_set), k)
        if ideal_hits:
            dcg = float(gains @ discounts[:len(gains)])
            idcg = float(discounts[:ideal_hits].sum())
            ndcgs.append(dcg / idcg)
        else:
            ndcgs.append(0.0)

    coverage = len(recommended_items) / R_train.shape[1]
    return {
        f'Precision@{k}': np.mean(precisions),
        f'Recall@{k}'   : np.mean(recalls),
        f'NDCG@{k}'     : np.mean(ndcgs),
        'Coverage'      : coverage
    }

# 8. Эксперимент

In [None]:
# Registry of the datasets used in the experiment: display name -> zero-argument
# loader. Wrapping each call in a lambda defers reading the file until the
# loader is actually invoked.
dataset_loaders = {
    'AmazonArtsCraftsAndSewing': lambda: load_dataset_json('./data/Arts_Crafts_and_Sewing_5.json.gz'),
    'AmazonDigitalMusic': lambda: load_dataset_json('./data/Digital_Music_5.json.gz'),
    'AmazonVideoGames': lambda: load_dataset_json('./data/Video_Games_5.json.gz'),
    'MovieLens1M': lambda: load_dataset(path='./data/MovieLens1M.dat', sep='::'),
}

# Accumulates one metrics row (dict) per evaluated dataset/method pair.
final_results = []

In [None]:
# Main experiment: for every registered dataset, tune the models, fit them and
# append one metrics row per method to `final_results`.
for name, loader in tqdm(dataset_loaders.items()):
    df = loader()
    R_train, R_test = split_train_test_matrix(build_user_item_matrix(df), test_size=0.2)

    # Sparsity check: share of (user, item) cells without a positive rating
    num_users, num_items = R_train.shape
    nonzeros = (R_train > 0).sum().sum()
    sparsity = 1 - nonzeros / (num_users * num_items)
    print(f"Dataset {name}: sparsity = {sparsity:.4f}")

    # Hyper-parameter selection
    # NOTE(review): every tuner scores candidates on R_test, the same split the
    # final metrics below are reported on, so those metrics are optimistic.
    # Consider carving a validation split out of the train interactions.
    best_nmf = tune_nmf_components(R_train, R_test, comps=[10, 20, 50])
    best_spec = tune_spectral_params(R_train, R_test, clusters=[10, 20, 50], embeds=[5, 10, 15, 25, 50])
    # The k-NN variant reuses the (n_clusters, embed_dim) pair chosen for the cosine variant.
    best_n_neighbors = tune_spectral_knn_params(R_train, R_test, n_neighbors_lst=[5, 10, 20], n_clusters=best_spec[0], embed_dim=best_spec[1])

    # Fit the models with the selected hyper-parameters
    P_user     = user_based_cf_predict(R_train)
    P_item     = item_based_cf_predict(R_train)
    P_nmf      = nmf_predict(R_train, n_components=best_nmf)
    P_spec     = spectral_clustering_predict(R_train, n_clusters=best_spec[0], embed_dim=best_spec[1])
    P_spec_knn = spectral_clustering_knn_predict(R_train, n_clusters=best_spec[0], embed_dim=best_spec[1], n_neighbors=best_n_neighbors)

    # Blending weight is tuned separately for each hybrid; `best_alpha` is
    # reused, so after this block it holds the k-NN hybrid's value.
    best_alpha = tune_hybrid_alpha(P_spec, P_nmf, R_test, alphas=[0.3, 0.4, 0.5])
    P_hyb      = hybrid_predict(P_spec, P_nmf, alpha=best_alpha)
    best_alpha = tune_hybrid_alpha(P_spec_knn, P_nmf, R_test, alphas=[0.3, 0.4, 0.5])
    P_hyb_knn  = hybrid_predict(P_spec_knn, P_nmf, alpha=best_alpha)

    # Method name -> prediction matrix to evaluate
    methods = {
        'User-CF'     : P_user,
        'Item-CF'     : P_item,
        'NMF'         : P_nmf,
        'Spectral_cos': P_spec,
        'Spectral_KNN': P_spec_knn,
        'Hybrid_cos'  : P_hyb,
        'Hybrid_KNN'  : P_hyb_knn,
    }

    for method, P_std in tqdm(methods.items()):
        reg = evaluate_regression(P_std, R_test)
        rank = evaluate_ranking(P_std, R_train, R_test)

        # One flat row per (dataset, method) for the final comparison table
        row = {
            'Dataset'     : name,
            'Method'      : method,
            'RMSE'        : reg['RMSE'],
            'MAE'         : reg['MAE'],
            'Precision@10': rank['Precision@10'],
            'Recall@10'   : rank['Recall@10'],
            'NDCG@10'     : rank['NDCG@10'],
            'Coverage'    : rank['Coverage'],
        }
        final_results.append(row)

In [None]:
# Assemble the comparison table (one row per dataset/method), show it and save it.
final_df = pd.DataFrame(final_results).set_index(['Dataset', 'Method'])
print("Final comparison table:")
display(final_df)
# to_csv does not create missing directories, so make sure ./results exists.
Path('./results').mkdir(parents=True, exist_ok=True)
final_df.to_csv('./results/final.csv')

# 9. Анализ зависимости качества от размерности эмбеддингов

In [None]:
# Study how quality depends on the embedding size, using the 'MovieLens1M' dataset.
df = dataset_loaders['MovieLens1M']()
R_train, R_test = split_train_test_matrix(build_user_item_matrix(df), test_size=0.2)

# Rows of this study go into their own list. Appending them to `final_results`
# would mix them with the per-method rows collected earlier (and with rows from
# previous runs of this cell), corrupting the exported table.
embed_dim_results = []

# best quality was observed at n_clusters == 50, so it is kept fixed
for ed in tqdm([2, 5, 7, 10, 15, 25, 40, 50, 75, 100]):
    P_std = spectral_clustering_predict(R_train, n_clusters=50, embed_dim=ed)
    reg = evaluate_regression(P_std, R_test)
    rank = evaluate_ranking(P_std, R_train, R_test)

    row = {
        'Embed_dim'   : ed,
        'RMSE'        : reg['RMSE'],
        'MAE'         : reg['MAE'],
        'Precision@10': rank['Precision@10'],
        'Recall@10'   : rank['Recall@10'],
        'NDCG@10'     : rank['NDCG@10'],
        'Coverage'    : rank['Coverage'],
    }
    embed_dim_results.append(row)

final_df = pd.DataFrame(embed_dim_results).set_index('Embed_dim')
# to_csv does not create missing directories, so make sure ./results exists.
Path('./results').mkdir(parents=True, exist_ok=True)
final_df.to_csv('./results/embed_dim.csv')

# 10. Визуализация кластеров пользователей

In [None]:
# '::' is a multi-character separator, which only the python parsing engine
# supports; selecting it explicitly avoids pandas' ParserWarning fallback.
# NOTE(review): if reading fails with UnicodeDecodeError, the file is probably
# not UTF-8 and an explicit `encoding=` is needed — confirm against the data file.
movies_df = pd.read_csv('./data/movies.dat', sep='::', names=['item_id', 'title', 'genre'], engine='python')
df = dataset_loaders['MovieLens1M']()
R_train, R_test = split_train_test_matrix(build_user_item_matrix(df), test_size=0.2)

# 2-D spectral embedding of the users, clustered with k-means
A = cosine_similarity(R_train.values)
emb = spectral_embedding(A, n_components=2, random_state=42)
labels = KMeans(n_clusters=50, random_state=42).fit_predict(emb)

# item_id -> title lookup, built once instead of on every loop iteration
titles_by_item = movies_df.set_index('item_id')['title']

# For every cluster find its top-3 movies:
cluster_top3 = {}
for c in np.unique(labels):
    users_idx = np.where(labels == c)[0]
    # rating sub-matrix of the users in this cluster
    sub = R_train.iloc[users_idx]
    # mean rating per item, ignoring zeros (= not rated); np.nan is used
    # because the np.NaN alias no longer exists in NumPy 2.0
    mean_ratings = sub.replace(0, np.nan).mean(axis=0).dropna()
    top3_ids = mean_ratings.sort_values(ascending=False).head(3).index
    top3_titles = titles_by_item.loc[top3_ids].tolist()
    cluster_top3[c] = top3_titles

# Cluster centres in the 2-D embedding, in the order of np.unique(labels):
centers = []
for c in np.unique(labels):
    pts = emb[labels == c]
    centers.append(pts.mean(axis=0))
centers = np.vstack(centers)

In [None]:
# Draw the scatter plot of embedded users and annotate every cluster centre
# with the titles of its top-3 movies:
fig = plt.figure(figsize=(30,20))
plt.scatter(emb[:,0], emb[:,1], c=labels, cmap='tab20', s=10, alpha=0.6)
# `centers` was built by iterating np.unique(labels), so row i belongs to the
# i-th distinct label. Pairing by label (not by enumerate position) stays
# correct even when some cluster id is absent from `labels`.
for c, (x,y) in zip(np.unique(labels), centers):
    titles = cluster_top3[c]
    text = "\n".join(titles)
    plt.text(x, y, text, fontsize=8, ha='center', va='center',
             bbox=dict(boxstyle='round, pad=0.3', fc='white', alpha=0.7))
plt.title("Spectral Clustering of Users with Top-3 Movies per Cluster")
plt.xlabel("Spectral Component 1")
plt.ylabel("Spectral Component 2")
# NOTE(review): the axis limits are hard-coded; points outside them are not shown.
plt.xlim([-0.002, 0.0015])
plt.ylim([-0.002, 0.002])
plt.grid(True)
# Save before showing, so the file does not depend on how the active backend
# treats a figure once it has been displayed; create ./results if needed.
Path('./results').mkdir(parents=True, exist_ok=True)
fig.savefig('./results/top-3 movies in each cluster.png')
plt.show()