In [None]:
import pandas as pd
import numpy as np
import pickle
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
from math import log2

def load_data(file_path):
    """Read a pickled pandas object from ``file_path`` and return it.

    Args:
        file_path: Location of the pickle file, passed straight to
            ``pandas.read_pickle``.

    Returns:
        Whatever object was stored in the pickle file.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    frame = pd.read_pickle(file_path)
    return frame

def preprocess_data(df):
    """Keep only interactions that were read and carry a non-zero rating.

    A separate pass that drops users whose ratings are all zero is not needed:
    every row of such a user has ``rating == 0`` and is therefore already
    removed by the rating filter below. Skipping it avoids a slow Python-level
    ``groupby().apply`` over all users without changing the result.

    Args:
        df: DataFrame with at least the columns ``is_read`` and ``rating``.

    Returns:
        A new DataFrame containing the rows where both ``is_read`` and
        ``rating`` differ from 0, with a fresh 0..n-1 index.
    """
    keep = (df['is_read'] != 0) & (df['rating'] != 0)
    return df[keep].reset_index(drop=True)

def split_data(df, test_size=0.2, random_state=42):
    """Split interactions into train and test frames, stratified by rating.

    Rows whose user or whose book occurs exactly once in ``df`` are never put
    into the test set: they are set aside before the split and appended to the
    training frame afterwards.

    Args:
        df: DataFrame with the columns ``user_id``, ``book_id`` and ``rating``.
        test_size: Fraction of the remaining rows assigned to the test frame.
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        ``(train_df, test_df)``. ``train_df`` gets a fresh 0..n-1 index;
        ``test_df`` keeps the index labels it had in ``df``.
    """
    user_counts = df['user_id'].value_counts()
    book_counts = df['book_id'].value_counts()
    single_users = user_counts[user_counts == 1].index
    single_books = book_counts[book_counts == 1].index

    # One positional mask drives both halves. Selecting the remainder by index
    # label instead would silently drop rows whenever index labels repeat.
    is_single = df['user_id'].isin(single_users) | df['book_id'].isin(single_books)
    single_interactions = df[is_single]
    remaining_interactions = df[~is_single]

    train_df, test_df = train_test_split(
        remaining_interactions,
        test_size=test_size,
        random_state=random_state,
        stratify=remaining_interactions['rating'],
    )
    train_df = pd.concat([train_df, single_interactions], ignore_index=True)
    return train_df, test_df

def upsample_ratings(train_df, random_state=None):
    """Balance rating classes by duplicating rows and jittering their ratings.

    Each rating class is repeated ``majority_count // count`` times. Every row
    of the result (including the un-duplicated majority class) then gets
    uniform noise in [-0.1, 0.1] added to its rating, and the rating is
    clipped to [1, 5].

    Args:
        train_df: DataFrame with a ``rating`` column. It is not modified.
        random_state: Optional seed for the jitter. When ``None`` the global
            ``np.random`` state is used, so ``np.random.seed`` still applies.

    Returns:
        A new DataFrame with a unique 0..n-1 index. If ``train_df`` has no
        rating classes (for example, it is empty) an unchanged copy is returned.
    """
    rating_counts = train_df['rating'].value_counts()
    if rating_counts.empty:
        # Nothing to balance; pd.concat([]) would raise on an empty list.
        return train_df.copy()

    rng = np.random if random_state is None else np.random.default_rng(random_state)
    majority_count = rating_counts.max()

    balanced_parts = []
    for rating, count in rating_counts.items():
        class_df = train_df[train_df['rating'] == rating]
        num_duplicates = int(majority_count // count)
        duplicated_df = pd.concat([class_df] * num_duplicates, ignore_index=True)
        noise = rng.uniform(-0.1, 0.1, size=len(duplicated_df))
        duplicated_df['rating'] = (duplicated_df['rating'] + noise).clip(1, 5)
        balanced_parts.append(duplicated_df)

    # ignore_index avoids repeated 0..n-1 labels from the per-class frames.
    return pd.concat(balanced_parts, ignore_index=True)

def normalize_ratings(train_df, test_df):
    """Apply ``log1p`` to the ``rating`` column of both frames.

    The inputs are left untouched; transformed copies are returned. This keeps
    the caller's frames reusable and avoids writing into a frame that may be a
    slice of another one.

    Args:
        train_df: Training DataFrame with a ``rating`` column.
        test_df: Test DataFrame with a ``rating`` column.

    Returns:
        ``(train_scaled, test_scaled, min_rating)`` where ``min_rating`` is the
        smallest training rating *before* the transform. The transform applies
        no shift, so ``expm1`` alone inverts it.
    """
    min_rating = train_df['rating'].min()

    train_scaled = train_df.copy()
    test_scaled = test_df.copy()
    train_scaled['rating'] = np.log1p(train_scaled['rating'])
    test_scaled['rating'] = np.log1p(test_scaled['rating'])

    return train_scaled, test_scaled, min_rating

def denormalize_rating(log_scaled_ratings, min_rating=0):
    """Map log1p-scaled ratings back to the rating scale.

    Args:
        log_scaled_ratings: Scalar or array-like of log1p-scaled values.
        min_rating: Offset added after ``expm1`` (0 by default).

    Returns:
        ``expm1(log_scaled_ratings) + min_rating`` limited to the range [0, 5].
    """
    values = np.asarray(log_scaled_ratings, dtype=float)
    shifted = np.expm1(values) + min_rating
    return np.clip(shifted, 0, 5)

def save_model(model, filename):
    """Pickle ``model`` into the file at ``filename``.

    Args:
        model: Any picklable object.
        filename: Destination path; the file is opened in binary write mode,
            so existing content is replaced.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(model, sink)

def load_model(filename):
    """Return the object pickled in the file at ``filename``.

    Args:
        filename: Path of a pickle file, opened in binary read mode.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code; only load files you trust.
    """
    with open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

def predict_batches(model, test_df, batch_size=5000):
    """Run ``model.test`` over ``test_df`` in slices of ``batch_size`` rows.

    The frame is loaded into a Surprise ``Dataset`` whose rating scale is the
    min and max of ``test_df['rating']``, then turned into a testset through
    ``build_full_trainset().build_testset()``.

    Args:
        model: Object exposing a ``test(testset)`` method.
        test_df: DataFrame with the columns ``user_id``, ``book_id`` and
            ``rating``.
        batch_size: Number of testset entries handed to ``model.test`` per call.

    Returns:
        A single list holding the results of all ``model.test`` calls, in order.
    """
    rating_col = test_df['rating']
    scale_reader = Reader(rating_scale=(rating_col.min(), rating_col.max()))
    columns = ['user_id', 'book_id', 'rating']
    dataset = Dataset.load_from_df(test_df[columns], scale_reader)
    testset = dataset.build_full_trainset().build_testset()

    results = []
    for start in range(0, len(testset), batch_size):
        chunk = testset[start:start + batch_size]
        results += model.test(chunk)
    return results

def precision_recall_ndcg_at_k(predictions, k=5, threshold=4.5):
    """Compute mean precision@k, recall@k and nDCG@k over all users.

    For each user the predictions are ranked by estimated rating (highest
    first). An item is "relevant" when its true rating is >= ``threshold`` and
    "recommended" when it is in the top ``k`` and its estimate is >=
    ``threshold``. nDCG uses the true rating as the gain (``2**rel - 1``) and
    compares the top-``k`` ranking with the ideal ordering by true rating.

    ``threshold`` must be on the same scale as the ratings in ``predictions``.

    Args:
        predictions: Iterable of 5-tuples ``(uid, iid, true_r, est, details)``.
        k: Number of top-ranked items considered per user.
        threshold: Minimum rating for an item to count as relevant/recommended.

    Returns:
        ``(precision, recall, ndcg)`` averaged over users. Per-user values that
        are undefined (no recommended items, no relevant items, zero ideal
        DCG) count as 0. With no predictions at all, ``(0.0, 0.0, 0.0)`` is
        returned rather than the nan that averaging an empty list would give.
    """
    def dcg_at_k(scores):
        return sum((2**rel - 1) / log2(idx + 2) for idx, rel in enumerate(scores))

    user_est_true = defaultdict(list)
    for uid, _iid, true_r, est, _details in predictions:
        user_est_true[uid].append((est, true_r))

    if not user_est_true:
        return 0.0, 0.0, 0.0

    precisions, recalls, ndcgs = [], [], []
    for user_ratings in user_est_true.values():
        # Stable sort: ties on the estimate keep their original order.
        ranked = sorted(user_ratings, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        n_rel = sum(true_r >= threshold for _, true_r in ranked)
        n_rec_k = sum(est >= threshold for est, _ in top_k)
        n_rel_and_rec_k = sum(
            (true_r >= threshold) and (est >= threshold) for est, true_r in top_k
        )
        precisions.append(n_rel_and_rec_k / n_rec_k if n_rec_k > 0 else 0)
        recalls.append(n_rel_and_rec_k / n_rel if n_rel > 0 else 0)

        actual_dcg = dcg_at_k([true_r for _, true_r in top_k])
        # Computed once per user and reused for both the guard and the ratio.
        ideal_dcg = dcg_at_k(sorted((true_r for _, true_r in ranked), reverse=True)[:k])
        ndcgs.append(actual_dcg / ideal_dcg if ideal_dcg > 0 else 0)

    return np.mean(precisions), np.mean(recalls), np.mean(ndcgs)

def plot_histogram(data, title, xlabel):
    """Show a 20-bin histogram of ``data`` on a new 8x6 figure.

    Args:
        data: Values to bin.
        title: Figure title.
        xlabel: Label for the x axis (the y axis is labelled 'Frequency').
    """
    _, ax = plt.subplots(figsize=(8, 6))
    ax.hist(data, bins=20, color='skyblue', edgecolor='black')
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    plt.show()

def plot_residuals(true_ratings, estimated_ratings):
    """Scatter the residuals (true minus estimated) against the true ratings.

    Args:
        true_ratings: Observed ratings; must support element-wise subtraction
            with ``estimated_ratings``.
        estimated_ratings: Predicted ratings aligned with ``true_ratings``.
    """
    residuals = true_ratings - estimated_ratings

    _, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(true_ratings, residuals, alpha=0.5, edgecolors='k')
    # Dashed reference line marking a perfect prediction (zero residual).
    ax.axhline(0, color='red', linestyle='--')
    ax.set_xlabel("True Ratings")
    ax.set_ylabel("Residuals")
    ax.set_title("Residual Plot")
    ax.grid(True)
    plt.show()


In [None]:
# Load the pickled frame; later cells read its user_id, book_id, is_read and
# rating columns.
df = load_data('../Pickle/read.pkl')

In [None]:
# Keep only rows that were read and have a non-zero rating.
# NOTE: this rebinds `df`, so the unfiltered frame is no longer available
# without re-running the load cell.
df = preprocess_data(df)

In [None]:
# Stratified train/test split; rows whose user or book appears only once are
# always kept in train_df.
train_df, test_df = split_data(df)

In [None]:
# Balance the rating classes of the training frame (duplicates rows and adds
# small random jitter to the ratings).
# NOTE: rebinds `train_df`; re-running this cell upsamples the already
# upsampled frame again.
train_df = upsample_ratings(train_df)

In [None]:
# log1p-transform the ratings of both frames; min_rating is the smallest
# training rating before the transform.
# NOTE: re-running this cell applies log1p a second time to the rebound frames.
train_df, test_df, min_rating = normalize_ratings(train_df, test_df)

In [None]:
from scipy.sparse import coo_matrix
from surprise import Dataset, Reader

# Sparse matrix with user_id as row index, book_id as column index and the
# rating as value, built from the full preprocessed `df` (not from train_df).
# Assumes user_id and book_id are non-negative integers — TODO confirm.
# NOTE(review): the recorded run of this cell ended in a MemoryError, and
# `ratings` is not referenced by any later cell shown here — confirm whether
# this cell is still needed.
ratings = coo_matrix((df['rating'], (df['user_id'], df['book_id'])))

MemoryError: 

In [None]:
# Build the Surprise training set from the prepared training frame only.
# Using the full `df` here would leak the held-out test rows into the model and
# would bypass the upsampling and log1p scaling applied to `train_df`.
# The rating scale is the original 1-5 range expressed on the log1p scale, which
# matches the transformed ratings in train_df and test_df.
reader = Reader(rating_scale=(np.log1p(1), np.log1p(5)))
data = Dataset.load_from_df(train_df[['user_id', 'book_id', 'rating']], reader)
trainset = data.build_full_trainset()


In [None]:
# SVD matrix-factorisation model: 200 latent factors, 60 training epochs,
# learning rate 0.01 and regularisation 0.001 applied to all parameters.
model = SVD(n_factors=200, n_epochs=60, lr_all=0.01, reg_all=0.001)

NameError: name 'data' is not defined

In [None]:
# A single fit() call already runs every epoch configured on the model
# (n_epochs). Calling fit() inside a loop does not continue training: each call
# starts over from a fresh initialisation, so a 60-iteration loop would repeat
# the complete training 60 times.
model.fit(trainset)
print("Training completed.")

In [None]:
# Persist the fitted model with pickle so it can be restored via load_model.
save_model(model, '../Pickle/svd_model.pkl')

In [None]:
# Score every row of test_df with the model (in batches of the default size).
# The true ratings carried in `predictions` are the log1p-scaled test ratings.
predictions = predict_batches(model, test_df)

In [None]:
# The true ratings in `predictions` come from test_df, which normalize_ratings
# transformed with log1p. The relevance threshold therefore has to be given on
# that same scale: the raw default of 4.5 is larger than log1p(5) (about 1.79),
# so with it no item would ever count as relevant.
precision, recall, ndcg = precision_recall_ndcg_at_k(predictions, threshold=np.log1p(4.5))
print(f'Precision@5: {precision:.4f}, Recall@5: {recall:.4f}, nDCG@5: {ndcg:.4f}')