In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
from tqdm import tqdm
import dask.array as da
from dask_ml.decomposition import PCA
import faiss
import os
tqdm.pandas()
pd.options.display.max_colwidth = 200

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load data
read = pd.read_pickle('Pickle/read.pkl')
books = pd.read_pickle('Pickle/books.pkl')

In [3]:
books = books.sample(50000, random_state=42)

In [4]:
# Function to drop empty rows
def drop_empty_rows(df, column_name):
    df_cleaned = df.dropna(subset=[column_name])
    return df_cleaned

books = drop_empty_rows(books, 'description')
books = books.reset_index(drop=True)

In [5]:
# Identify valid book IDs present in both dataframes
valid_book_ids = set(read['book_id']).intersection(set(books['book_id']))

# Filter the books dataframe
books_filtered = books[books['book_id'].isin(valid_book_ids)]

# Filter the read dataframe
read_filtered = read[read['book_id'].isin(valid_book_ids)]


In [6]:
books['combined_features'] = books.apply(
    lambda row: f"{row['title']} by {row['authors']}, " +
                f"Description: {row['description']}, " +
                f"Shelves: {row['expanded_shelves']}",
    axis=1
)

In [7]:
books = books.reset_index(drop=True)
read = read.reset_index(drop=True)

In [8]:
model = SentenceTransformer('all-MiniLM-L6-v2')



only have to compute an embedding once

In [9]:

# Function to periodically save the embeddings to a separate file
def save_embeddings_incrementally(books_df, model, interval=100):
    embeddings_file = 'Pickle/embeddings.pkl'
    
    if os.path.exists(embeddings_file):
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['index', 'embeddings'])
    
    for i in tqdm(range(len(books_df))):
        if i in embeddings_df['index'].values:
            continue  # Skip if already processed
        
        embedding = model.encode(books_df.at[i, 'combined_features'])
        new_row = pd.DataFrame({'index': [i], 'embeddings': [embedding]})
        embeddings_df = pd.concat([embeddings_df, new_row], ignore_index=True)
        
        if i % interval == 0:
            embeddings_df.to_pickle(embeddings_file)
    
    # Save the final version
    embeddings_df.to_pickle(embeddings_file)

# Save embeddings incrementally
save_embeddings_incrementally(books, model, interval=100)


100%|██████████| 50000/50000 [01:22<00:00, 608.73it/s]


In [11]:
# Load the incremental embeddings
embeddings_df = pd.read_pickle('Pickle/embeddings.pkl')

# Merge embeddings back into the original DataFrame
books['embeddings'] = None
for i in tqdm(range(len(books))):
    if i in embeddings_df['index'].values:
        books.at[i, 'embeddings'] = embeddings_df[embeddings_df['index'] == i]['embeddings'].values[0]

100%|██████████| 50000/50000 [05:59<00:00, 139.10it/s]


In [55]:
embedding_matrix = np.vstack(books['embeddings'].values)
embedding_matrix_dask = da.from_array(embedding_matrix, chunks=(1000, 300))

In [56]:
# reduce dimensionality
pca = PCA(n_components=50)
reduced_embeddings = pca.fit_transform(embedding_matrix_dask)

In [57]:
reduced_embeddings = reduced_embeddings.compute()

In [58]:
normalized_embeddings = normalize(reduced_embeddings, axis=1)

In [59]:
index = faiss.IndexFlatIP(normalized_embeddings.shape[1])
index.add(normalized_embeddings)

In [60]:
cosine_sim = cosine_similarity(normalized_embeddings, normalized_embeddings)

In [61]:
train_books, test_books = train_test_split(books, test_size=0.2, random_state=42)

In [62]:
train_embedding_matrix = embedding_matrix[train_books.index]

In [80]:
test_embedding_matrix = embedding_matrix[test_books.index]

In [63]:
cosine_sim_train = cosine_similarity(train_embedding_matrix, train_embedding_matrix)

In [83]:
cosine_sim_test = cosine_similarity(test_embedding_matrix, test_embedding_matrix)

In [84]:
# Function to get recommendations for a given book_id based on the training set
def get_recommendations(book_id, train_books_df, train_cosine_sim_matrix, top_n=5):
    if book_id not in train_books_df['book_id'].values:
        print(f"Book ID {book_id} not found in the training books dataframe.")
        return pd.DataFrame(columns=['title', 'authors', 'book_id'])

    book_idx = train_books_df[train_books_df['book_id'] == book_id].index[0]
    sim_scores = list(enumerate(train_cosine_sim_matrix[book_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_book_ids = [train_books_df['book_id'].iloc[i[0]] for i in sim_scores[1:top_n+1]]
    top_books = train_books_df[train_books_df['book_id'].isin(top_book_ids)]

    return top_books[['title', 'authors', 'book_id']]

In [85]:
books['book_id']

0           57080
1        35905367
2        21798963
3        12337584
4          531064
           ...   
49995    29430435
49996      937779
49997      767765
49998      406581
49999    29858198
Name: book_id, Length: 50000, dtype: int64

In [86]:
get_recommendations(231, books, cosine_sim)

Unnamed: 0,title,authors,book_id
23801,"Child's Mind: Mindfulness Practices to Help Our Children Be More Focused, Calm, and Relaxed",[Christopher Willard],8932852
32678,America's War for the Greater Middle East,[Andrew J. Bacevich],27994395
36400,Pomegranate Heart,[Miriam Calleja],25531013
38062,"Stone of Farewell (Memory, Sorrow, and Thorn, #2)",[Tad Williams],1788163
41033,Japan 1941: Countdown to Infamy,[Eri Hotta],17345183


In [87]:
get_recommendations(421, books, cosine_sim)

Unnamed: 0,title,authors,book_id
19146,Edge of Dark Water,[Joe R. Lansdale],11641612
19147,Breathers: A Zombie's Lament,[S.G. Browne],6568158
37952,Jennifer Crusie Bundle: Welcome to Temptation/ Fast Women/ Faking It,[Jennifer Crusie],5096123
42438,The Rainbow Dragon: A Narnia Story,[Hiawyn Oram],34558432
48925,The Way Back To Us,[Kay Langdale],35382450
