In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import os
tqdm.pandas()
pd.options.display.max_colwidth = 200

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load data
read = pd.read_pickle('Pickle/read.pkl')
books = pd.read_pickle('Pickle/books.pkl')

In [3]:
# Function to drop empty rows
def drop_empty_rows(df, column_name):
    df_cleaned = df.dropna(subset=[column_name])
    return df_cleaned

books = drop_empty_rows(books, 'description')
books = books.reset_index(drop=True)

In [4]:
# Identify valid book IDs present in both dataframes
valid_book_ids = set(read['book_id']).intersection(set(books['book_id']))

# Filter the books dataframe
books_filtered = books[books['book_id'].isin(valid_book_ids)]

# Filter the read dataframe
read_filtered = read[read['book_id'].isin(valid_book_ids)]


In [5]:
books['combined_features'] = books.apply(lambda row: f"{row['title']} by {row['authors']}, Description: {row['description']}, Shelves: {row['expanded_shelves']}", axis=1)

In [6]:
books = books.reset_index(drop=True)
read = read.reset_index(drop=True)

In [7]:
model = SentenceTransformer('all-MiniLM-L6-v2')




In [13]:

# Function to periodically save the embeddings to a separate file
def save_embeddings_incrementally(books_df, model, interval=100):
    embeddings_file = 'Pickle/embeddings.pkl'
    
    if os.path.exists(embeddings_file):
        embeddings_df = pd.read_pickle(embeddings_file)
    else:
        embeddings_df = pd.DataFrame(columns=['index', 'embeddings'])
    
    for i in tqdm(range(len(books_df))):
        if i in embeddings_df['index'].values:
            continue  # Skip if already processed
        
        embedding = model.encode(books_df.at[i, 'combined_features'])
        new_row = pd.DataFrame({'index': [i], 'embeddings': [embedding]})
        embeddings_df = pd.concat([embeddings_df, new_row], ignore_index=True)
        
        if i % interval == 0:
            embeddings_df.to_pickle(embeddings_file)
    
    # Save the final version
    embeddings_df.to_pickle(embeddings_file)

# Save embeddings incrementally
save_embeddings_incrementally(books, model, interval=100)


100%|██████████| 60298/60298 [57:39<00:00, 17.43it/s]  


In [14]:
# Load the incremental embeddings
embeddings_df = pd.read_pickle('Pickle/embeddings.pkl')

# Merge embeddings back into the original DataFrame
books['embeddings'] = None
for i in tqdm(range(len(books))):
    if i in embeddings_df['index'].values:
        books.at[i, 'embeddings'] = embeddings_df[embeddings_df['index'] == i]['embeddings'].values[0]

# Save the final DataFrame with embeddings
books.to_pickle('Pickle/books.pkl')


100%|██████████| 60298/60298 [03:30<00:00, 286.85it/s]


In [15]:
embedding_matrix = np.vstack(books['embeddings'].values)

In [16]:
# Reduce dimensionality
pca = PCA(n_components=50)
embedding_matrix = pca.fit_transform(embedding_matrix)
cosine_sim = cosine_similarity(embedding_matrix, embedding_matrix)

In [17]:
train_books, test_books = train_test_split(books, test_size=0.2, random_state=42)

In [18]:
train_embedding_matrix = embedding_matrix[train_books.index]

In [19]:
cosine_sim_train = cosine_similarity(train_embedding_matrix, train_embedding_matrix)

In [20]:
# Function to get recommendations for a given book_id based on the training set
def get_recommendations(book_id, train_books_df, train_cosine_sim_matrix, top_n=5):
    if book_id not in train_books_df['book_id'].values:
        print(f"Book ID {book_id} not found in the training books dataframe.")
        return pd.DataFrame(columns=['title', 'authors', 'book_id'])

    book_idx = train_books_df[train_books_df['book_id'] == book_id].index[0]
    sim_scores = list(enumerate(train_cosine_sim_matrix[book_idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    top_book_ids = [train_books_df['book_id'].iloc[i[0]] for i in sim_scores[1:top_n+1]]
    top_books = train_books_df[train_books_df['book_id'].isin(top_book_ids)]

    return top_books[['title', 'authors', 'book_id']]

In [21]:
books['book_id']

0             231
1             421
2             422
3             423
4             426
           ...   
60293    36470448
60294    36471270
60295    36478574
60296    36484555
60297    36488099
Name: book_id, Length: 60298, dtype: int64

In [22]:
get_recommendations(231, books, cosine_sim)

Unnamed: 0,title,authors,book_id
10654,Pretty in Plaid,[Jen Lancaster],5093760
12567,Bertha Fights Back,[Fran Lewis],6798045
33802,American Blonde,[Jennifer Niven],19731977
45117,Waveland,[Simone Zelitch],25518836
52552,Piecing Me Together,[Renee Watson],30038963


In [23]:
get_recommendations(421, books, cosine_sim)

Unnamed: 0,title,authors,book_id
2244,The Death of Rhythm and Blues,[Nelson George],265062
6084,The Motown Album: The Sound of Young America,"[Ben Fong-Torres, Elvis Mitchell, Berry Gordy, Dave Marsh]",1145265
7765,One Nation Under A Groove: Motown & American Culture,[Gerald Early],1891906
13838,Hard Bop: Jazz & black music 1955-1965,[David H. Rosenthal],7802688
39690,It's a Black-White Thing,[Donna Bryson],23110590


In [24]:
get_recommendations(422, books, cosine_sim)

Unnamed: 0,title,authors,book_id
3,Where I Was From,[Joan Didion],423
938,Alias Grace,[Margaret Atwood],72579
5963,Joan of Arc,[Marina Warner],1117661
26059,On the Government of God,"[Salvian, Eva Sanford]",17181811
35522,The Book of Unknown Americans,[Cristina Henriquez],20946053


In [25]:
get_recommendations(423, books, cosine_sim)

Unnamed: 0,title,authors,book_id
751,Eudora: A Writer's Life,[Ann Waldron],52953
4694,"The Limits of Racial Domination: Plebeian Society in Colonial Mexico City, 1660-1720",[R. Douglas Cope],814566
10206,Flawed Liberation: Socialism & Feminism (Contributions in Women's Studies 19),[Sally M. Miller],4006181
15445,Letters from America,"[Alexis de Tocqueville, Frederick Brown]",9296280
32842,A Woman's Place,[Lynn Austin],18954955


In [32]:
from tqdm import tqdm

def evaluate_recommendations_on_test(test_books_df, train_books_df, train_cosine_sim_matrix, top_n=5):
    total_precision = 0
    total_recall = 0
    hit_count = 0
    total_books = 0  # Track the number of evaluated books

    # Use tqdm for progress monitoring
    for book_id in tqdm(test_books_df['book_id'].unique(), desc="Evaluating", unit="book"):
        if book_id not in train_books_df['book_id'].values:
            continue  # Skip if book_id not in training set

        recs = get_recommendations(book_id, train_books_df, train_cosine_sim_matrix, top_n)
        if recs.empty:  # Skip if no recommendations
            continue

        recommended_items = recs['book_id'].tolist()
        relevant_items = test_books_df['book_id'].tolist()

        relevant_recommendations = [rec for rec in recommended_items if rec in relevant_items]
        precision = len(relevant_recommendations) / len(recommended_items) if recommended_items else 0
        recall = len(relevant_recommendations) / len(relevant_items) if relevant_items else 0

        total_precision += precision
        total_recall += recall
        hit_count += 1 if any(rec in relevant_items for rec in recommended_items) else 0
        total_books += 1  # Increment the count of evaluated books

    precision = total_precision / total_books if total_books > 0 else 0
    recall = total_recall / total_books if total_books > 0 else 0
    hit_rate = hit_count / total_books if total_books > 0 else 0

    return precision, recall, hit_rate



In [33]:
evaluate_recommendations_on_test(test_books, books, cosine_sim)

Evaluating: 100%|██████████| 12060/12060 [08:04<00:00, 24.91book/s]


(0.20155887230514904, 8.356503826913759e-05, 0.6748756218905473)