In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
# Register tqdm's progress-bar helpers on pandas objects.
# NOTE(review): no progress_* call appears in the visible cells — confirm this is still needed.
tqdm.pandas()
from surprise import Reader, Dataset, SVD
# Default-parameter SVD instance.
# NOTE(review): the visible cells tune and use `best_svd` and never read `svd` —
# confirm whether this is still needed.
svd = SVD()

In [None]:
# Load the four pickled source tables, in the order the names are listed.
read, books, reviews, interactions = map(
    pd.read_pickle,
    (
        'Pickle/read.pkl',
        'Pickle/books.pkl',
        'Pickle/reviews.pkl',
        'Pickle/interactions.pkl',
    ),
)

In [None]:
# Keep only the interactions whose `is_read` flag is set.
interactions = interactions.loc[interactions['is_read'] == True]

In [None]:
# Drop every review that is missing its text or its rating.
reviews = reviews.dropna(
    subset=['review_text', 'rating'],
    how='any',
)

In [None]:
# Work on a fixed-seed random subset of 10,000 reviews.
# NOTE: Pickle/review_embeddings.pkl is keyed by row position within this
# sample, so changing the size or the seed requires deleting that cache file.
reviews = reviews.sample(n=10000, random_state=42)

In [None]:
import os

# Sentence-embedding model that is passed to save_embeddings_incrementally
# to encode each review's text.
model = SentenceTransformer('all-MiniLM-L6-v2')

def save_embeddings_incrementally(reviews_df, model, interval=100):
    """Encode each review's text and checkpoint the embeddings to disk.

    Rows of ``reviews_df`` with a missing ``review_text`` are dropped and the
    remaining rows are renumbered 0..n-1. That row position is the key stored
    in the ``index`` column of ``Pickle/review_embeddings.pkl``. Positions
    already present in that file are skipped, so an interrupted run resumes
    where it stopped instead of starting over.

    NOTE: the cache is keyed by row position only, so it is valid only for the
    same rows in the same order. Delete the pickle file if the input changes.

    Args:
        reviews_df: DataFrame with a ``review_text`` column.
        model: Object exposing ``encode(text)``; whatever it returns is stored
            unchanged in the ``embeddings`` column.
        interval: A checkpoint is written after encoding any row whose
            position is a multiple of ``interval``.

    Returns:
        None. The result is the pickle file: a DataFrame with the columns
        ``index`` and ``embeddings``.
    """
    embeddings_file = 'Pickle/review_embeddings.pkl'

    if os.path.exists(embeddings_file):
        cached = pd.read_pickle(embeddings_file)
        positions = cached['index'].tolist()
        vectors = cached['embeddings'].tolist()
    else:
        positions, vectors = [], []
        # Make sure the first checkpoint has somewhere to go.
        os.makedirs(os.path.dirname(embeddings_file), exist_ok=True)

    # Set membership is O(1); scanning the column on every row made the loop
    # quadratic in the number of reviews.
    already_done = set(positions)

    def _checkpoint():
        """Write the rows collected so far, replacing the cache file atomically."""
        snapshot = pd.DataFrame({
            'index': positions,
            # dtype=object keeps one embedding per cell.
            'embeddings': pd.Series(vectors, dtype=object),
        })
        # Write to a sibling file first so an interruption mid-write cannot
        # corrupt the existing cache.
        partial_file = embeddings_file + '.tmp'
        snapshot.to_pickle(partial_file)
        os.replace(partial_file, embeddings_file)

    reviews_df = reviews_df.dropna(subset=['review_text']).reset_index(drop=True)
    texts = reviews_df['review_text']

    for i in tqdm(range(len(reviews_df))):
        if i in already_done:
            continue  # Skip if already processed

        # Encode before appending so a failed encode leaves both lists aligned.
        embedding = model.encode(texts.iloc[i])
        positions.append(i)
        vectors.append(embedding)

        if i % interval == 0:
            _checkpoint()

    # Save the final version
    _checkpoint()

# Encode every sampled review and cache the result in
# Pickle/review_embeddings.pkl; row positions already in the cache are skipped.
save_embeddings_incrementally(reviews, model, interval=100)


In [None]:
# Load the incremental embeddings
embeddings_df = pd.read_pickle('Pickle/review_embeddings.pkl')

# Renumber the reviews 0..n-1 so the row position matches the cache's `index` key.
reviews = reviews.reset_index(drop=True)

# Build a position -> embedding lookup in one pass. The previous loop rescanned
# the whole cache for every review, which is quadratic in the number of rows.
# setdefault keeps the first entry if a position appears more than once.
embedding_by_position = {}
for position, embedding in zip(embeddings_df['index'], embeddings_df['embeddings']):
    embedding_by_position.setdefault(position, embedding)

# Merge embeddings back into the reviews DataFrame; positions missing from the
# cache are left as None. dtype=object keeps one embedding per cell.
reviews['embeddings'] = pd.Series(
    [embedding_by_position.get(position) for position in tqdm(range(len(reviews)))],
    index=reviews.index,
    dtype=object,
)

# Ensure all embeddings are numpy arrays
def convert_to_array(x):
    """Return ``x`` as a numpy array when it is a list; otherwise return ``x`` unchanged."""
    return np.array(x) if isinstance(x, list) else x

# Run convert_to_array over every row: list values become numpy arrays,
# anything else (existing arrays, None) is passed through unchanged.
reviews['embeddings'] = reviews['embeddings'].apply(convert_to_array)

In [None]:
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

# Wrap the (user, book, rating) triples in a Surprise dataset.
# NOTE(review): Reader() uses Surprise's default rating scale of (1, 5) —
# confirm that matches the values in the `rating` column.
reader = Reader()
data = Dataset.load_from_df(interactions[['user_id', 'book_id', 'rating']], reader)

# Hold out 20% of the ratings for the final evaluation. The split is seeded
# (same seed as the review sample) so the held-out metrics are reproducible.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Hyper-parameter grid for SVD. `random_state` is pinned to a single value so
# the tuned estimator initialises its factors identically on every run.
param_grid = {
    'n_epochs': [5, 7],
    'lr_all': [0.007, 0.005],
    'reg_all': [0.005, 0.01],
    'n_factors': [5, 7],
    'random_state': [42],
}

# 5-fold grid search over the SVD hyper-parameters.
# NOTE(review): the search runs on the full `data`, which includes the rows in
# `testset`, so the held-out metrics are mildly optimistic — confirm this is
# acceptable or search on the training portion only.
gs = GridSearchCV(SVD, param_grid, measures=['RMSE', 'MAE', 'MSE', 'FCP'], cv=5)
gs.fit(data)

In [None]:
# Take the SVD configuration the grid search ranked best on RMSE.
best_svd = gs.best_estimator['rmse']

In [None]:
# 5-fold cross-validation of the tuned configuration on the full dataset,
# with verbose reporting switched on.
cross_validate(
    best_svd,
    data,
    measures=['RMSE', 'MAE', 'MSE', 'FCP'],
    cv=5,
    verbose=True,
)

In [None]:
# Train the tuned model on the training split only.
best_svd.fit(trainset)

In [None]:
# Predict a rating for every entry of the held-out test split.
predictions = best_svd.test(testset)

In [50]:
from surprise import accuracy
# Held-out metrics for the tuned model. Each call also prints its value, so
# the statement order here is the order of the printed lines.
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
fcp = accuracy.fcp(predictions)
mse = accuracy.mse(predictions)

RMSE: 1.1631
MAE:  0.8478
FCP:  0.7746
MSE: 1.3529


In [None]:
# Display the (user_id, book_id) pairs of the read interactions.
# NOTE(review): presumably used to pick ids for the spot-check prediction — confirm.
interactions[['user_id', 'book_id']]

In [51]:
# Spot-check: estimated rating for user id 0 and book id 21.
# NOTE(review): the ids must have the same type as interactions['user_id'] /
# interactions['book_id'] to be recognised as known — confirm they are integers.
best_svd.predict(0,21).est

4.170020802688159