In [None]:
import threadpoolctl
threadpoolctl.threadpool_limits(1, "blas")
from surprise import accuracy
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from implicit import als

In [None]:
# Load the `read` DataFrame from a local pickle; later cells use its
# `user_id`, `book_id` and `is_read` columns.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
read = pd.read_pickle('../Pickle/read.pkl')

In [None]:
# Keep only the rows whose is_read flag equals 1.
read = read.loc[read['is_read'].eq(1)]

In [None]:
# Remove the "rating" and "is_reviewed" columns; errors="ignore" makes this a
# no-op for any of them that is not present.
read = read.drop(["rating", "is_reviewed"], axis="columns", errors="ignore")

In [None]:
# Renumber the rows 0..n-1 after the filtering above; drop=True discards the
# old index instead of keeping it as a column.
read = read.reset_index(drop = True)

In [None]:
from sklearn.model_selection import train_test_split

def split_data_with_single_interactions(read, test_size=0.2, random_state=42):
    """Split interactions into train and test sets, keeping one-off rows in train.

    A row is a "single interaction" when its ``user_id`` or its ``book_id``
    occurs exactly once in ``read``. Those rows are never placed in the test
    set; they are appended to the training set after the remaining rows have
    been split with ``train_test_split``.

    Parameters
    ----------
    read : pandas.DataFrame
        Interactions with at least ``user_id`` and ``book_id`` columns.
    test_size : float or int, default 0.2
        Passed through to ``train_test_split``.
    random_state : int or None, default 42
        Passed through to ``train_test_split``.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        ``train_df`` has a fresh 0..n-1 index. ``test_df`` keeps the index
        labels of ``read`` and is an independent copy. When fewer than two rows
        are eligible for splitting, every row goes to ``train_df`` and
        ``test_df`` is empty.
    """
    # Count how often each user and each book occurs in the dataset
    user_counts = read['user_id'].value_counts()
    book_counts = read['book_id'].value_counts()

    # Row-wise flag: the user or the book of this row appears only once
    is_single = (
        read['user_id'].isin(user_counts[user_counts == 1].index)
        | read['book_id'].isin(book_counts[book_counts == 1].index)
    ).to_numpy()

    # Partition by position using the flag itself. Matching on index labels
    # would also throw away unrelated rows whenever a label is repeated.
    single_interactions = read[is_single]
    remaining_interactions = read[~is_single]

    # Nothing sensible can be split off fewer than two rows
    if len(remaining_interactions) < 2:
        return read.reset_index(drop=True), read.iloc[0:0].copy()

    # Split only the rows whose user and book both occur more than once
    train_df, test_df = train_test_split(
        remaining_interactions, test_size=test_size, random_state=random_state
    )

    # The one-off rows always belong to the training set
    train_df = pd.concat([train_df, single_interactions], ignore_index=True)

    # Hand back an independent frame so that callers can add columns to it
    # without writing into a slice of `read`
    test_df = test_df.copy()

    return train_df, test_df

# Build the train/test frames from the filtered `read` frame.
train_df, test_df = split_data_with_single_interactions(read)

In [None]:
# Step 2: Build one numbering for users and one for books, then attach both to
# `read`, `train_df` and `test_df` so that a given user_idx / book_idx means the
# same thing in all three frames.

# Users: the code of each user_id within the categories of the full `read` frame.
user_cat = pd.Categorical(read["user_id"])
# book_cat does not drive book_idx (see below); it stays defined so that any
# other cell reading it keeps working.
book_cat = pd.Categorical(read["book_id"])

# Steps 3-4: books are numbered in order of first appearance across train + test.
# This is the numbering train_df / test_df ended up with before, so existing
# book indices keep their meaning.
all_book_ids = pd.concat([train_df['book_id'], test_df['book_id']]).unique()
book_id_to_index = {book_id: idx for idx, book_id in enumerate(all_book_ids)}

# Step 5: attach the indices. book_idx is assigned exactly once per frame, and
# `read` uses the same lookup as the splits instead of a second, incompatible
# numbering (this relies on train_df and test_df together covering every row of
# `read`, which is how they are built above).
# .assign returns a new frame, so nothing is written into a slice of another
# frame and re-running the cell gives the same result.
read = read.assign(
    user_idx=user_cat.codes,
    book_idx=read["book_id"].map(book_id_to_index),
)
train_df = train_df.assign(
    user_idx=pd.Categorical(train_df["user_id"], categories=user_cat.categories).codes,
    book_idx=train_df["book_id"].map(book_id_to_index),
)
test_df = test_df.assign(
    user_idx=pd.Categorical(test_df["user_id"], categories=user_cat.categories).codes,
    book_idx=test_df["book_id"].map(book_id_to_index),
)

In [None]:
# Step 6: Create the train and test matrices (binary data: 1 for read, 0 for not read)
# Both matrices are given the same explicit shape. Without it, scipy sizes each
# matrix from the largest index it happens to contain, so the train matrix, the
# test matrix and the factor arrays of a model fitted on the train matrix could
# all disagree on the number of users or books.
n_users = int(max(train_df["user_idx"].max(), test_df["user_idx"].max())) + 1
n_books = int(max(train_df["book_idx"].max(), test_df["book_idx"].max())) + 1

train_matrix = csr_matrix(
    (train_df["is_read"], (train_df["user_idx"], train_df["book_idx"])),
    shape=(n_users, n_books),
)
test_matrix = csr_matrix(
    (test_df["is_read"], (test_df["user_idx"], test_df["book_idx"])),
    shape=(n_users, n_books),
)

In [None]:
# Create id -> index lookups for users and books.
# The lookups are read back from the indices already stored on train_df /
# test_df, so row i and column j of the matrix built here refer to the same
# user and book as row i and column j of train_matrix (the matrix the model is
# fitted on). Numbering the ids afresh here would create a second index space,
# and model.recommend / model.explain would then receive rows and columns that
# belong to other users and books.
assigned_indices = pd.concat(
    [
        train_df[['user_id', 'user_idx', 'book_id', 'book_idx']],
        test_df[['user_id', 'user_idx', 'book_id', 'book_idx']],
    ],
    ignore_index=True,
)
user_mapping = (
    assigned_indices.drop_duplicates('user_id').set_index('user_id')['user_idx'].to_dict()
)
book_mapping = (
    assigned_indices.drop_duplicates('book_id').set_index('book_id')['book_idx'].to_dict()
)

# Map the original user_id and book_id to their respective indices
read = read.assign(
    user_idx=read['user_id'].map(user_mapping),
    book_idx=read['book_id'].map(book_mapping),
)
assert read[['user_idx', 'book_idx']].notna().all().all(), (
    "some user_id/book_id in `read` has no index in train_df/test_df"
)

# Create the sparse CSR matrix directly
interaction_matrix_sparse_csr = csr_matrix(
    (read['is_read'], (read['user_idx'], read['book_idx'])),
    shape=(len(user_mapping), len(book_mapping))
)


In [None]:
# Sanity check: the matrix row count, the number of mapped users and the number
# of distinct user indices in `read` are printed side by side for comparison.
num_rows = interaction_matrix_sparse_csr.shape[0]
num_unique_users = len(user_mapping)
unique_user_indices = read['user_idx'].nunique()

print(f"Number of rows in the sparse matrix: {num_rows}")
print(f"Number of unique users: {num_unique_users}")
print(f"Number of unique user indices: {unique_user_indices}")

In [None]:
# Show 10 randomly chosen distinct (user_id, user_idx) pairs.
read.loc[:, ['user_id', 'user_idx']].drop_duplicates().sample(n=10)

In [None]:
# Show 10 randomly chosen distinct (book_id, book_idx) pairs.
read.loc[:, ['book_id', 'book_idx']].drop_duplicates().sample(n=10)

In [None]:
# Initialize the ALS model
model = als.AlternatingLeastSquares(
    factors=100,         # Number of latent factors
    regularization=0.1,  # Regularization strength
    iterations=100,      # Number of iterations
    alpha=40,            # Weight applied to the observed interactions
    random_state=42,     # Fixed seed: without it every run starts from a different
                         # random initialisation and the results cannot be reproduced
)

# Fit the model on the training interactions only
model.fit(train_matrix)

In [None]:
from sklearn.preprocessing import normalize
# Rescale every user and item factor vector (one per row) to unit L2 length and
# store the result back on the model.
model.user_factors = normalize(model.user_factors, norm="l2", axis=1)
model.item_factors = normalize(model.item_factors, norm="l2", axis=1)

In [None]:
# Slice out the single matrix row that holds user 1904's interactions
user_row = interaction_matrix_sparse_csr[1904, :]

# Ask the model for 10 recommendations for that user, passing the row along as
# the user's known interactions
recommendations = model.recommend(1904, user_row, N=10)

In [None]:
# `recommendations` unpacks into two parallel sequences: the recommended items
# and their scores.
# NOTE(review): the values printed as "Book ID" are whatever identifiers
# model.recommend returns - presumably column indices of the matrix the model
# was fitted on rather than original book_id values; confirm before sharing.
recommended_books, scores = recommendations  

# Print one line per recommendation, score shown with six decimals
for book, score in zip(recommended_books, scores):
    print(f"Book ID: {book}, Confidence Score: {score:.6f}")


In [None]:
# Index of the item (book) whose recommendation should be explained; change as needed
item_idx = 46803

# Request an explanation of this item for user 1904, limited to 10 entries
explanation = model.explain(
    1904,
    interaction_matrix_sparse_csr,
    item_idx,
    N=10,
)

In [None]:
# The second element of the explanation is iterated below as (item, weight) pairs.
explanation_items = explanation[1]  # This is the list of (item_id, weight) tuples

# Print a header followed by one line per contributing item
print(f"Explanation for item {item_idx} recommended to user {1904}:")
for item, weight in explanation_items:
    print(f"Item {item} with weight {weight}")

In [None]:
import faiss
from tqdm import tqdm
from sklearn.metrics import roc_auc_score, precision_score, recall_score

def fast_evaluate(model, test_matrix, top_k, sample_size, random_state=None):
    """Estimate Precision@k, Recall@k and AUC with FAISS top-k retrieval.

    User and item factors are L2-normalised, so the inner-product search ranks
    items by cosine similarity. For a random sample of users, the ``top_k``
    highest-scoring items are retrieved and compared with the items stored in
    that user's row of ``test_matrix``.

    Parameters
    ----------
    model : object exposing ``user_factors`` and ``item_factors`` arrays
        Row ``i`` of each array is the factor vector of user / item ``i``.
    test_matrix : scipy.sparse matrix supporting row indexing
        Held-out interactions, users in rows and items in columns, in the same
        index space as the model's factors.
    top_k : int
        Number of items retrieved per user.
    sample_size : float
        Fraction of users to evaluate (1 evaluates all of them).
    random_state : int or None, default None
        Seed for the user sample. ``None`` draws from NumPy's global random
        state.

    Returns
    -------
    (precision_at_k, recall_at_k, auc_score) : tuple of float
        ``recall_at_k`` is NaN when the sampled users have no test interactions.
        ``auc_score`` is computed over the retrieved items only and is NaN when
        they are all hits or all misses.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1 or the sample would contain no user.
    """
    # FAISS needs float32 C-contiguous input and normalize_L2 works in place;
    # working on copies leaves the model's own factor arrays untouched.
    user_factors = np.array(model.user_factors, dtype=np.float32, order="C", copy=True)
    item_factors = np.array(model.item_factors, dtype=np.float32, order="C", copy=True)
    faiss.normalize_L2(user_factors)
    faiss.normalize_L2(item_factors)

    # Only users present in both the test matrix and the model can be scored
    num_users = min(test_matrix.shape[0], user_factors.shape[0])
    n_sampled = int(num_users * sample_size)
    if top_k < 1 or n_sampled < 1:
        raise ValueError("top_k must be >= 1 and the sample must contain at least one user")

    # Sample users for fast evaluation
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    sampled_users = rng.choice(num_users, size=n_sampled, replace=False)

    # Use FAISS for fast top-K retrieval
    # NOTE(review): the search runs over every item, so items a user already has
    # in the training data are not excluded - confirm that this is intended.
    index = faiss.IndexFlatIP(item_factors.shape[1])  # Inner product index
    index.add(item_factors)  # Add item embeddings
    scores, top_k_items = index.search(user_factors[sampled_users], k=top_k)  # Row r <-> sampled_users[r]

    all_predictions = []
    all_true_labels = []

    # Process sampled users; `row` is the user's position in the FAISS result
    progress = tqdm(sampled_users, desc="Evaluating users", total=len(sampled_users), dynamic_ncols=True, leave=True)
    for row, user in enumerate(progress):
        # nonzero() on a 1 x n_items row returns (row_indices, col_indices);
        # the item ids are the column indices, i.e. element [1]
        true_items = test_matrix[user].nonzero()[1]
        hit_flags = np.isin(top_k_items[row], true_items)

        all_true_labels.extend(hit_flags.astype(int).tolist())
        all_predictions.extend(scores[row].tolist())  # Use actual FAISS scores

    # Compute metrics
    total_hits = sum(all_true_labels)
    precision_at_k = total_hits / (len(sampled_users) * top_k)

    n_relevant = test_matrix[sampled_users].nnz
    recall_at_k = total_hits / n_relevant if n_relevant else float("nan")

    # roc_auc_score needs both classes to be present among the labels
    if 0 < total_hits < len(all_true_labels):
        auc_score = roc_auc_score(all_true_labels, all_predictions)
    else:
        auc_score = float("nan")

    print(f'Precision@{top_k}: {precision_at_k:.4f}')
    print(f'Recall@{top_k}: {recall_at_k:.4f}')
    print(f'AUC: {auc_score:.4f}')

    return precision_at_k, recall_at_k, auc_score


In [None]:
# Evaluate on every user (sample_size=1) using the top 5 retrieved items per user.
precision_at_k, recall_at_k, auc_score = fast_evaluate(model, test_matrix, top_k=5, sample_size=1)