In [None]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Folder that holds the Goodreads input files loaded by the cells below.
DIR = "Data"

In [2]:
# Raw interactions table in CSV form. The path is built from DIR, like the
# JSON loaders in this notebook, so relocating the data folder is a one-line
# change. header=0: the first line of the file supplies the column names.
interactions_csv = pd.read_csv(os.path.join(DIR, 'goodreads_interactions.csv'), header=0)

In [3]:
# Load only the head of the books dump: at most num_chunks chunks of
# chunk_size JSON lines each.
file_path = os.path.join(DIR, 'goodreads_books.json.gz')

chunk_size = 1000
num_chunks = 100

# The chunked reader is a context manager; the `with` block closes the
# underlying file handle even though reading stops before the end of the file.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() against range(num_chunks) stops after num_chunks chunks without
    # parsing one more chunk just to throw it away.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

books = pd.concat(df_list, ignore_index=True)

In [4]:
# Load only the head of the book-genres dump: at most num_chunks chunks of
# chunk_size JSON lines each.
file_path = os.path.join(DIR, 'goodreads_book_genres_initial.json.gz')

chunk_size = 1000
num_chunks = 100

# The chunked reader is a context manager; the `with` block closes the
# underlying file handle even though reading stops before the end of the file.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() against range(num_chunks) stops after num_chunks chunks without
    # parsing one more chunk just to throw it away.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

genres = pd.concat(df_list, ignore_index=True)

In [5]:
# Load only the head of the reviews dump: at most num_chunks chunks of
# chunk_size JSON lines each.
file_path = os.path.join(DIR, 'goodreads_reviews_dedup.json.gz')

chunk_size = 1000
num_chunks = 100

# The chunked reader is a context manager; the `with` block closes the
# underlying file handle even though reading stops before the end of the file.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() against range(num_chunks) stops after num_chunks chunks without
    # parsing one more chunk just to throw it away.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

reviews = pd.concat(df_list, ignore_index=True)

In [6]:
# Load only the head of the JSON interactions dump: at most num_chunks chunks
# of chunk_size JSON lines each.
file_path = os.path.join(DIR, 'goodreads_interactions_dedup.json.gz')

chunk_size = 1000
num_chunks = 100

# The chunked reader is a context manager; the `with` block closes the
# underlying file handle even though reading stops before the end of the file.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() against range(num_chunks) stops after num_chunks chunks without
    # parsing one more chunk just to throw it away.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

interactions = pd.concat(df_list, ignore_index=True)

In [7]:
def remove_blank_rows(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` holds non-blank text.

    A row is dropped when its value is missing (``None``/``NaN``) or is empty
    once surrounding whitespace has been stripped.

    Args:
        df: DataFrame to filter; it is not modified.
        column_name: Name of the column to inspect.

    Returns:
        The matching rows of ``df``, keeping their original index labels.
    """
    column = df[column_name]
    # Missing values need an explicit test: `NaN != ''` evaluates to True, so
    # a plain string comparison would keep them as if they were non-blank.
    has_value = column.notna()
    # astype(str) keeps the check usable on columns that are not pure strings.
    has_text = column.astype(str).str.strip() != ''
    return df[has_value & has_text]

# Drop the books whose 'description' is blank.
books = remove_blank_rows(df=books, column_name='description')

In [8]:
def extract_genres(genre_dict):
    """Return the keys of ``genre_dict`` as a list, in the mapping's own order."""
    return [genre_name for genre_name in genre_dict.keys()]

# Derive, for every row, the list of keys found in its 'genres' mapping.
genre_dicts = genres['genres']
genres['genre_names'] = genre_dicts.apply(extract_genres)

In [9]:
# Order both tables by ascending book_id.
books = books.sort_values('book_id')
genres = genres.sort_values('book_id')

In [10]:
# Narrow books to four columns, then attach the genres columns by book_id.
# how='left' keeps every book row, including those with no match in genres.
book_columns = ['description', 'title', 'authors', 'book_id']
books = books.copy()[book_columns].merge(genres, how='left', on='book_id')

In [11]:
def combine_description_and_genres(row):
    """Build one text string from a book's description and its genre names.

    Args:
        row: Mapping-like record (for example a DataFrame row) with a
            ``'description'`` entry and a ``'genre_names'`` entry.

    Returns:
        The description, a single space, then the genre names joined by
        spaces. A missing ``genre_names`` value contributes no genres.
    """
    genre_names = row['genre_names']
    # The left merge that attaches genres to books leaves NaN (a float) in
    # 'genre_names' for books without a genres row. Joining that would raise
    # TypeError, so treat a missing value like an empty genre list.
    if genre_names is None or isinstance(genre_names, float):
        genre_names = []
    genres_str = ' '.join(genre_names)
    return f"{row['description']} {genres_str}"

# Apply the combiner to each row (axis='columns' is the same as axis=1).
books['combined_text'] = books.apply(combine_description_and_genres, axis='columns')

In [12]:
# Renumber the rows of both frames 0..n-1 in place, discarding the old index.
for frame in (books, interactions_csv):
    frame.reset_index(drop=True, inplace=True)

In [13]:
# Join interactions with the book table on book_id, then join the result with
# reviews on the (user_id, book_id) pair. Both joins use merge's default `how`.
data = (
    interactions
    .merge(books, on='book_id')
    .merge(reviews, on=['user_id', 'book_id'])
)

In [None]:
# Display the merged frame's column names; the Surprise cell later in this
# notebook selects 'user_id', 'book_id' and 'rating_x' from it.
# NOTE(review): presumably the '_x' suffix comes from the merges renaming a
# 'rating' column present on both sides -- confirm against this output.
data.columns

In [15]:
# Wrap the (user, book, rating) columns in a Surprise dataset.
# NOTE(review): the reader declares a 1-5 rating scale; confirm 'rating_x'
# holds no values outside that range.
rating_triples = data[['user_id', 'book_id', 'rating_x']]
reader = Reader(rating_scale=(1, 5))
data_surprise = Dataset.load_from_df(rating_triples, reader)

In [16]:
# Split data into train and test sets (20% held out for testing).
# random_state pins the split so a fresh Restart & Run All reproduces the
# same partition.
trainset, testset = train_test_split(data_surprise, test_size=0.2, random_state=42)

In [None]:
# Build and train the model.
# random_state pins SVD's random initialisation so repeated runs on the same
# trainset produce the same fitted model.
svd = SVD(random_state=42)
svd.fit(trainset)

In [18]:
# Function to recommend books
def recommend_books_surprise(user_id, num_recommendations=5):
    """Recommend titles of books that ``user_id`` has no row for in ``data``.

    Every book id in the module-level ``data`` frame that does not appear on
    one of the user's rows is scored with the module-level ``svd`` model. The
    highest estimates are turned into titles via the module-level ``books``
    frame.

    Args:
        user_id: User identifier, compared against ``data['user_id']``.
        num_recommendations: Maximum number of titles to return.

    Returns:
        List of titles ordered from highest to lowest estimated rating. Ids
        without a row in ``books`` are skipped.
    """
    # A set gives constant-time membership tests; scanning an array once per
    # candidate book would be quadratic.
    seen_books = set(data.loc[data['user_id'] == user_id, 'book_id'])
    books_to_predict = [book for book in data['book_id'].unique() if book not in seen_books]

    predictions = [svd.predict(user_id, book_id) for book_id in books_to_predict]
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    recommended_book_ids = [pred.iid for pred in predictions[:num_recommendations]]

    # Resolve titles id by id so the ranking is kept: filtering `books` with
    # isin() alone returns rows in frame order and repeats any book_id that
    # occurs on several rows.
    matches = books.loc[books['book_id'].isin(recommended_book_ids), ['book_id', 'title']]
    title_by_id = dict(zip(matches['book_id'], matches['title']))
    return [title_by_id[book_id] for book_id in recommended_book_ids if book_id in title_by_id]

In [None]:
# Example run for one user: compute the recommendations, then print them.
# NOTE(review): confirm that 1234 is a value that occurs in data['user_id'].
user_id = 1234
surprise_recommendations = recommend_books_surprise(user_id=user_id)
print(surprise_recommendations)
