In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from scipy.sparse import csr_matrix, lil_matrix, vstack, hstack

tqdm.pandas()

In [None]:
# Load data
read = pd.read_pickle('../Pickle/read.pkl')
books = pd.read_pickle('../Pickle/books.pkl')
reviews = pd.read_pickle('../Pickle/reviews.pkl')
user_most_common_genres = pd.read_pickle('../Pickle/user_most_common_genres.pkl')

In [3]:
user_most_common_genres

NameError: name 'user_most_common_genres' is not defined

In [4]:
# Get unique user IDs from all relevant DataFrames
review_user_ids = set(reviews['user_id'].unique())
read_user_ids = set(read['user_id'].unique())
common_user_ids = read_user_ids.intersection(review_user_ids)

In [6]:
user_features

Unnamed: 0_level_0,book_id,most_common_genres
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"[948, 947, 946, 945, 944, 943, 942, 941, 940, ...","[fiction, history, historical fiction, biography]"
1,"[1065, 1064, 1063, 1062, 1061, 1060, 1059, 105...","[fiction, non-fiction, history, historical fic..."
2,"[1227, 1226, 1225, 1224, 613, 898, 862, 858, 9...","[fiction, non-fiction, history, historical fic..."
3,"[1496, 1495, 1494, 1493, 1492, 1491, 1490, 119...","[history, historical fiction, biography, fiction]"
4,"[1633, 1632, 1631, 996, 1630, 1629, 1628, 1386...","[fiction, history, historical fiction, biography]"
...,...,...
876140,"[2407, 14707, 5770, 13114, 19962, 145555, 1595...","[fiction, poetry, fantasy, paranormal]"
876141,"[8449, 1049, 14854, 6144, 17044, 25121, 18035,...","[fiction, fantasy, paranormal, history]"
876142,"[739, 1203, 545, 15474, 810, 1007, 1003, 1065,...","[fiction, history, historical fiction, biography]"
876143,"[1572, 1002, 376, 1473, 16140, 52939, 619, 160...","[fiction, history, historical fiction, biography]"


In [7]:
# Train-test split
train_reviews, test_reviews = train_test_split(reviews[reviews['user_id'].isin(common_user_ids)], test_size=0.2, random_state=42)
train_users = train_reviews['user_id'].unique()
test_users = test_reviews['user_id'].unique()
train_read = read[read['user_id'].isin(train_users)]
test_read = read[read['user_id'].isin(test_users)]

In [8]:
from sklearn.preprocessing import normalize

# Fit MultiLabelBinarizer on the entire dataset
mlb_genres = MultiLabelBinarizer(sparse_output=True)
mlb_genres.fit(user_features['most_common_genres'])
mlb_books = MultiLabelBinarizer(sparse_output=True)
mlb_books.fit(user_features['book_id'])

In [9]:
# Convert data in batches
genre_data = lil_matrix((0, len(mlb_genres.classes_)))
book_data = lil_matrix((0, len(mlb_books.classes_)))
batch_size = 5000

for start in tqdm(range(0, len(user_features), batch_size), desc="Processing Batches"):
    end = min(start + batch_size, len(user_features))
    genre_batch = mlb_genres.transform(user_features['most_common_genres'][start:end])
    book_batch = mlb_books.transform(user_features['book_id'][start:end])
    genre_data = vstack([genre_data, genre_batch])
    book_data = vstack([book_data, book_batch])

# Normalize the genre and book data separately
genre_data_normalized = normalize(genre_data, norm='l2')
book_data_normalized = normalize(book_data, norm='l2')


Processing Batches: 100%|██████████| 176/176 [04:06<00:00,  1.40s/it]


In [10]:
book_data

<876145x2360650 sparse matrix of type '<class 'numpy.float64'>'
	with 228648342 stored elements in COOrdinate format>

In [11]:
genre_data

<876145x17 sparse matrix of type '<class 'numpy.float64'>'
	with 3038094 stored elements in COOrdinate format>

In [None]:
from annoy import AnnoyIndex
import numpy as np
from scipy.sparse import lil_matrix, csr_matrix, vstack
from tqdm import tqdm

# Function to create an Annoy index
def create_annoy_index(data, n_trees=10):
    f = data.shape[1]  # Number of features
    t = AnnoyIndex(f, 'angular')  # Angular distance metric

    for i in range(data.shape[0]):
        t.add_item(i, data[i].toarray()[0])

    t.build(n_trees)
    return t

# Function to compute similarity matrix using Annoy
def compute_annoy_similarity_matrix(data, description):
    n_users = data.shape[0]
    similarity_matrix = lil_matrix((n_users, n_users))

    annoy_index = create_annoy_index(data)

    with tqdm(total=n_users, desc=f"Calculating {description} Similarities") as pbar:
        for i in range(n_users):
            similar_items = annoy_index.get_nns_by_item(i, n_users, include_distances=True)
            for j, sim in zip(similar_items[0], similar_items[1]):
                similarity_matrix[i, j] = 1 / (1 + sim)  # Convert distance to similarity
            pbar.update(1)
    return similarity_matrix

# Function to compute overlap coefficient similarity for sparse data
def overlap_coefficient_sparse(data, other_data=None, epsilon=1e-10):
    if other_data is None:
        other_data = data
    intersections = data.dot(other_data.T)
    min_sums = data.sum(axis=1).A1[:, None] * other_data.sum(axis=1).A1[None, :]
    min_sums[min_sums == 0] = epsilon  # Add a small constant to avoid division by zero
    overlap_sim_matrix = intersections / min_sums
    return overlap_sim_matrix

# Process chunks of data for large matrices to avoid memory issues
def process_chunk(start_idx, end_idx, data, other_data, pbar):
    chunk_result = lil_matrix((end_idx - start_idx, other_data.shape[0]))
    for i in range(start_idx, end_idx):
        similarities = overlap_coefficient_sparse(data[i], other_data)
        chunk_result[i - start_idx, :] = similarities
        pbar.update(1)
    return chunk_result

# Compute similarity matrix in chunks
def compute_similarity_matrix_chunked(data, description):
    n_users = data.shape[0]
    chunk_size = 1000  # Reduce the chunk size to avoid memory issues

    similarity_matrix = lil_matrix((n_users, n_users))

    with tqdm(total=n_users, desc=f"Calculating {description} Similarities") as pbar:
        for start_idx in range(0, n_users, chunk_size):
            end_idx = min(start_idx + chunk_size, n_users)
            chunk_result = process_chunk(start_idx, end_idx, data, data, pbar)
            similarity_matrix[start_idx:end_idx, :] = chunk_result

    return similarity_matrix.tocsr()

# Calculate genre and book similarity matrices
genre_similarity_matrix = compute_similarity_matrix_chunked(genre_data_normalized, "Genre")
book_similarity_matrix = compute_similarity_matrix_chunked(book_data_normalized, "Book")

# Combine the similarity matrices by averaging them
combined_similarity = (genre_similarity_matrix + book_similarity_matrix) / 2

print(combined_similarity[:10, :10].toarray())


Calculating Genre Similarities:   0%|          | 369/876145 [02:47<110:40:53,  2.20it/s]


In [42]:
def recommend_books(user_id, user_similarity, user_ids, books, all_read, num_recommendations, include_read=False):
    if isinstance(user_ids, set):
        user_ids = list(user_ids)
    
    user_index_mapping = {user_id: index for index, user_id in enumerate(user_ids)}
    
    if user_id not in user_index_mapping:
        return pd.DataFrame(columns=['book_id', 'title'])

    user_index = user_index_mapping[user_id]
    user_similarity_sparse = user_similarity.getrow(user_index).toarray().flatten()
    
    sorted_indices = np.argsort(-user_similarity_sparse)[1:num_recommendations + 200]
    similar_user_ids = [user_ids[i] for i in sorted_indices if i < len(user_ids)]
    valid_similar_user_ids = set(similar_user_ids) & set(all_read['user_id'].unique())

    if not valid_similar_user_ids:
        return pd.DataFrame(columns=['book_id', 'title'])

    similar_users_books = all_read[all_read['user_id'].isin(valid_similar_user_ids) & (all_read['is_read'] == 1)]['book_id'].unique()

    if not include_read:
        user_books = all_read[(all_read['user_id'] == user_id) & (all_read['is_read'] == 1)]['book_id'].unique()
        recommended_books = np.setdiff1d(similar_users_books, user_books)
    else:
        recommended_books = similar_users_books

    if not recommended_books.size:
        return pd.DataFrame(columns=['book_id', 'title'])

    recommended_books_df = books[books['book_id'].isin(recommended_books)].head(num_recommendations)
    return recommended_books_df[['book_id', 'title']]


In [None]:
recommend_books(1, combined_similarity, common_user_ids, books, read, num_recommendations=30, include_read=True)

In [None]:
recommend_books(4, combined_similarity, common_user_ids, books, read, num_recommendations=30, include_read=True)

In [None]:
recommend_books(754, combined_similarity, common_user_ids, books, read, num_recommendations=30, include_read=True)

In [None]:
from tqdm import tqdm
import numpy as np

def evaluate_model(test_users, user_similarity, user_ids, books, test_read, k=5):
    precision_scores = []
    recall_scores = []
    ndcg_scores = []
    mrr_scores = []

    for user_id in tqdm(test_users):
        if user_id not in user_ids:
            continue

        actual_books = set(test_read[(test_read['user_id'] == user_id) & (test_read['is_read'] == 1)]['book_id'])

        recommended_books = recommend_books(user_id, user_similarity, user_ids, books, test_read, num_recommendations=k, include_read=True)['book_id'].tolist()

        if len(recommended_books) == 0:
            continue

        def precision_at_k(y_true, y_pred, k):
            y_true_set = set(y_true)
            y_pred_k = y_pred[:k]
            return len(set(y_pred_k) & y_true_set) / k if k > 0 else 0

        def recall_at_k(y_true, y_pred, k):
            y_true_set = set(y_true)
            y_pred_k = y_pred[:k]
            return len(set(y_pred_k) & y_true_set) / len(y_true_set) if len(y_true_set) > 0 else 0

        def ndcg_at_k(y_true, y_pred, k):
            def dcg(relevance_scores):
                return sum((2**rel - 1) / np.log2(idx + 2) for idx, rel in enumerate(relevance_scores))

            y_true_set = set(y_true)
            y_pred_k = y_pred[:k]
            relevance_scores = [1 if item in y_true_set else 0 for item in y_pred_k]
            ideal_relevance_scores = [1] * min(len(y_true), k) + [0] * (k - min(len(y_true), k))
            return dcg(relevance_scores) / dcg(ideal_relevance_scores) if len(ideal_relevance_scores) > 0 else 0

        def mrr_at_k(y_true, y_pred, k):
            y_true_set = set(y_true)
            for i, pred in enumerate(y_pred[:k]):
                if pred in y_true_set:
                    return 1 / (i + 1)
            return 0

        precision_scores.append(precision_at_k(actual_books, recommended_books, k))
        recall_scores.append(recall_at_k(actual_books, recommended_books, k))
        ndcg_scores.append(ndcg_at_k(actual_books, recommended_books, k))
        mrr_scores.append(mrr_at_k(actual_books, recommended_books, k))

    precision_avg = np.mean(precision_scores)
    recall_avg = np.mean(recall_scores)
    ndcg_avg = np.mean(ndcg_scores)
    mrr_avg = np.mean(mrr_scores)

    return precision_avg, recall_avg, ndcg_avg, mrr_avg

precision, recall, ndcg, mrr = evaluate_model(test_users, combined_similarity, common_user_ids, books, test_read, k=10)
print(f"Precision@K: {precision:.4f}, Recall@K: {recall:.4f}, NDCG@K: {ndcg:.4f}, MRR@K: {mrr:.4f}")