In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
from tqdm import tqdm
import dask.array as da
from dask_ml.decomposition import PCA
import faiss
import os
# Hook tqdm into pandas so progress-bar variants of apply/map are available.
tqdm.pandas()
# Show up to 200 characters per cell when rendering DataFrames.
pd.set_option('display.max_colwidth', 200)

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load data
# `read` and `books` are restored from local pickle files.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
read = pd.read_pickle('Pickle/read.pkl')
books = pd.read_pickle('Pickle/books.pkl')

In [4]:
# Keep a reproducible random subset of 60,000 books (fixed seed).
books = books.sample(n=60000, random_state=42)

In [5]:
# Book IDs that occur in both `read` and the sampled `books` frame.
valid_book_ids = set(read['book_id']) & set(books['book_id'])

# Restrict each frame to the shared IDs.
# NOTE(review): `books_filtered` / `read_filtered` are not referenced by any
# later cell visible in this section — confirm they are used further down.
books_filtered = books.loc[books['book_id'].isin(valid_book_ids)]
read_filtered = read.loc[read['book_id'].isin(valid_book_ids)]


In [6]:
# Replace the index of both frames with a fresh 0..n-1 RangeIndex. For `books`
# this matters below: rows of `embedding_matrix` are stacked in `books` row
# order and are later selected with `train_books.index` / `test_books.index`,
# so index labels must coincide with row positions.
# NOTE(review): the unfiltered `books` / `read` are carried forward here, not
# `books_filtered` / `read_filtered` from the previous cell — confirm that is
# intended.
books = books.reset_index(drop=True)
read = read.reset_index(drop=True)

In [7]:
# Stack the per-book embedding vectors into one 2-D array, one row per book, in
# the same row order as `books` (assumes every entry of books['embeddings'] is
# a 1-D vector of the same length — TODO confirm).
embedding_matrix = np.vstack(books['embeddings'].values)
# Wrap it as a dask array in 1000-row chunks. The column chunk size is taken
# from the matrix itself rather than hard-coded to 300, so every chunk spans
# complete rows whatever the embedding dimensionality is.
embedding_matrix_dask = da.from_array(
    embedding_matrix, chunks=(1000, embedding_matrix.shape[1])
)

In [8]:
# reduce dimensionality
# NOTE: `PCA` is imported twice in the import cell (sklearn.decomposition, then
# dask_ml.decomposition); the later import wins, so this is dask_ml's PCA.
# Fit on the dask-wrapped embeddings and project them onto 50 components.
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(embedding_matrix_dask)

In [9]:
# Materialise the lazy PCA output in memory. The guard keeps this cell safe to
# re-run: after the first execution `reduced_embeddings` is already a concrete
# array without a `.compute()` method.
if hasattr(reduced_embeddings, 'compute'):
    reduced_embeddings = reduced_embeddings.compute()

In [10]:
# Rescale each row (book) to unit L2 norm so that inner products between rows
# act as cosine similarities.
normalized_embeddings = normalize(reduced_embeddings, norm='l2', axis=1)

In [11]:
# Exact (brute-force) inner-product index over the normalised vectors.
# Faiss operates on C-contiguous float32 matrices, so convert explicitly instead
# of relying on whatever dtype/layout PCA + normalize happened to produce.
# A separate name is used so `normalized_embeddings` keeps its original dtype.
faiss_vectors = np.ascontiguousarray(normalized_embeddings, dtype=np.float32)
index = faiss.IndexFlatIP(faiss_vectors.shape[1])
index.add(faiss_vectors)

In [12]:
# Pairwise cosine similarity of every book against every other book.
# With a single argument, cosine_similarity compares the rows with themselves.
# NOTE: the result is a dense (n_books x n_books) matrix; for the 60,000-book
# sample taken above that is 3.6e9 entries, so this cell is memory-hungry.
cosine_sim = cosine_similarity(normalized_embeddings)

In [13]:
# 80/20 train/test split with a fixed seed. The resulting frames keep the index
# labels they had in `books`; the following cells use those labels to pick the
# matching rows of `embedding_matrix`.
train_books, test_books = train_test_split(books, test_size=0.2, random_state=42)

In [14]:
# Rows of the full (un-reduced) embedding matrix belonging to the training
# books, selected via the index labels the split kept from `books`.
train_embedding_matrix = embedding_matrix[train_books.index.to_numpy()]

In [15]:
# Rows of the full (un-reduced) embedding matrix belonging to the held-out
# books, selected via the index labels the split kept from `books`.
test_embedding_matrix = embedding_matrix[test_books.index.to_numpy()]

In [16]:
# Dense train-vs-train cosine similarity; row/column i corresponds to the i-th
# row of `train_books` (positional order, not index label).
cosine_sim_train = cosine_similarity(train_embedding_matrix)

In [17]:
# Dense test-vs-test cosine similarity; row/column i corresponds to the i-th
# row of `test_books` (positional order, not index label).
cosine_sim_test = cosine_similarity(test_embedding_matrix)

In [18]:
# Function to get recommendations for a given book_id based on the training set
def get_recommendations(book_id, train_books_df, train_cosine_sim_matrix, top_n=5):
    """Return the ``top_n`` books most similar to ``book_id``, best match first.

    Parameters
    ----------
    book_id
        Value to look up in ``train_books_df['book_id']``. If it occurs more
        than once, the first occurrence is used as the query row.
    train_books_df : pandas.DataFrame
        Frame with at least 'title', 'authors' and 'book_id' columns. Its rows
        must be in the same order as the rows/columns of
        ``train_cosine_sim_matrix``; its index labels are ignored.
    train_cosine_sim_matrix
        Square similarity matrix where entry ``[i, j]`` is the similarity of
        the i-th and j-th rows of ``train_books_df``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'title', 'authors', 'book_id', ordered by descending
        similarity and excluding the query book itself. Empty (after printing
        a message) when ``book_id`` is not present.
    """
    output_columns = ['title', 'authors', 'book_id']

    # Positional matches: the similarity matrix is addressed by row position,
    # so the frame's index labels (shuffled after a train/test split) must not
    # be used here.
    matches = np.flatnonzero(train_books_df['book_id'].to_numpy() == book_id)
    if matches.size == 0:
        print(f"Book ID {book_id} not found in the training books dataframe.")
        return pd.DataFrame(columns=output_columns)
    book_pos = int(matches[0])

    scores = np.asarray(train_cosine_sim_matrix[book_pos], dtype=float)
    # Stable descending sort keeps ties in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the query book explicitly rather than assuming it sorts first, then
    # keep the best `top_n` remaining positions.
    ranked = ranked[ranked != book_pos][:max(int(top_n), 0)]

    # Positional selection preserves the similarity ranking in the output.
    return train_books_df.iloc[ranked][output_columns]