In [1]:
import pandas as pd
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import faiss
from sklearn.decomposition import PCA


import nltk
# NOTE(review): 'all' requests the complete NLTK data collection (the captured
# cell output lists package after package). No NLTK API is called in the
# visible cells -- confirm whether this download is needed at all, or narrow
# it to the specific packages that are actually used.
nltk.download('all')

# Raise pandas' display limits to very large values so that printed frames
# are not truncated by column count, total width, row count or cell width.
DISPLAY_OPTIONS = {
    'display.max_columns': 1000000,
    'display.width': 200000,
    'display.max_rows': 2000000,
    'max_colwidth': 1000000,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# Directory (relative to the notebook) that holds the Goodreads data files.
DIR = 'Data'


  from tqdm.autonotebook import tqdm, trange
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     C:\Users\maddy\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     

In [None]:
# Read the interactions CSV (first line is the header row). The path is built
# from DIR like every other loader in this notebook instead of repeating the
# directory name as a literal.
# NOTE(review): in the visible cells this frame is only re-indexed and never
# analysed -- confirm it is still needed.
interactions_csv = pd.read_csv(os.path.join(DIR, 'goodreads_interactions.csv'), header=0)

In [3]:
# Load the first `num_chunks` chunks (of `chunk_size` records each) from the
# gzipped JSON-lines books file into a single DataFrame.
file_path = os.path.join(DIR, 'goodreads_books.json.gz')

chunk_size = 1000  # records per chunk
num_chunks = 100   # number of chunks to keep

# Using the reader as a context manager closes the underlying gzip handle even
# though iteration stops before the file is exhausted.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() stops as soon as range() is exhausted, so no chunk beyond the
    # limit is read or parsed.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

books = pd.concat(df_list, ignore_index=True)

In [4]:
# Load the first `num_chunks` chunks (of `chunk_size` records each) from the
# gzipped JSON-lines book-genres file into a single DataFrame.
file_path = os.path.join(DIR, 'goodreads_book_genres_initial.json.gz')

chunk_size = 1000  # records per chunk
num_chunks = 100   # number of chunks to keep

# Using the reader as a context manager closes the underlying gzip handle even
# though iteration stops before the file is exhausted.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() stops as soon as range() is exhausted, so no chunk beyond the
    # limit is read or parsed.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

genres = pd.concat(df_list, ignore_index=True)

In [5]:
# Load the first `num_chunks` chunks (of `chunk_size` records each) from the
# gzipped JSON-lines reviews file into a single DataFrame.
file_path = os.path.join(DIR, 'goodreads_reviews_dedup.json.gz')

chunk_size = 1000  # records per chunk
num_chunks = 100   # number of chunks to keep

# Using the reader as a context manager closes the underlying gzip handle even
# though iteration stops before the file is exhausted.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() stops as soon as range() is exhausted, so no chunk beyond the
    # limit is read or parsed.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

reviews = pd.concat(df_list, ignore_index=True)

In [6]:
# Load the first `num_chunks` chunks (of `chunk_size` records each) from the
# gzipped JSON-lines interactions file into a single DataFrame.
file_path = os.path.join(DIR, 'goodreads_interactions_dedup.json.gz')

chunk_size = 1000  # records per chunk
num_chunks = 100   # number of chunks to keep

# Using the reader as a context manager closes the underlying gzip handle even
# though iteration stops before the file is exhausted.
with pd.read_json(file_path, lines=True, chunksize=chunk_size, compression='gzip') as chunks:
    # zip() stops as soon as range() is exhausted, so no chunk beyond the
    # limit is read or parsed.
    df_list = [chunk for _, chunk in zip(range(num_chunks), chunks)]

interactions = pd.concat(df_list, ignore_index=True)

In [7]:
def remove_blank_rows(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value is not blank.

    A value counts as blank when it is missing (``None``/``NaN``) or when it
    is a string that is empty or contains only whitespace. Missing values are
    checked explicitly because ``NaN != ''`` evaluates to True and would
    otherwise let them through the filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Name of the column to inspect.

    Returns
    -------
    pandas.DataFrame
        A copy containing only the non-blank rows, with the original index
        labels retained.
    """
    column = df[column_name]
    # astype(str) lets the whitespace check run on any dtype (including an
    # all-missing float column); missing entries are caught by isna() first.
    is_blank = column.isna() | column.astype(str).str.strip().eq('')
    return df[~is_blank].copy()

# Drop books whose description is blank.
books = books.pipe(remove_blank_rows, 'description')

In [8]:
def extract_genres(genre_dict):
    """Return the keys of ``genre_dict`` as a list, in the mapping's own order."""
    return [genre_name for genre_name in genre_dict.keys()]

# Store each book's genre names (the keys of its `genres` mapping) as a list.
genres['genre_names'] = genres['genres'].map(extract_genres)

In [9]:
# Order both frames by book_id.
genres, books = (frame.sort_values(by='book_id') for frame in (genres, books))

In [10]:
# Keep only the book columns used downstream, then attach the genre columns.
# how='left' keeps every book, including those with no matching row in
# `genres` (their genre columns are left missing).
book_columns = ['description', 'title', 'authors', 'book_id']
books = books.copy()[book_columns].merge(genres, how='left', on='book_id')

In [11]:
# Give both frames a fresh 0..n-1 index, discarding the old index labels.
books = books.reset_index(drop=True)
interactions_csv = interactions_csv.reset_index(drop=True)

In [12]:
def combine_description_and_genres(row):
    """Build one text blob from a book's description and its genre names.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'description'`` and ``'genre_names'``.
        ``genre_names`` is expected to be an iterable of strings.

    Returns
    -------
    str
        ``"<description> <genre names joined by spaces>"``. The single
        separating space is always present, so an empty genre list yields a
        trailing space.

    Notes
    -----
    A missing value (``None``/``NaN``) in either field is treated as empty.
    Books without a matching genre row after a left merge carry ``NaN`` in
    ``genre_names``, which ``str.join`` cannot iterate; a missing description
    would otherwise be rendered as the literal text ``"nan"``/``"None"``.
    """
    genre_names = row['genre_names']
    # is_scalar() guards pd.isna(), which is element-wise on list-likes.
    if pd.api.types.is_scalar(genre_names) and pd.isna(genre_names):
        genre_names = []

    description = row['description']
    if pd.api.types.is_scalar(description) and pd.isna(description):
        description = ''

    genres_str = ' '.join(genre_names)
    return f"{description} {genres_str}"

# Add the per-book text blob (description followed by genre names).
books = books.assign(
    combined_text=books.apply(combine_description_and_genres, axis=1)
)

In [13]:
# Inner-join interactions with book metadata on book_id, then with reviews on
# the (user_id, book_id) pair. Columns present on both sides of a join other
# than the keys receive pandas' default _x/_y suffixes.
data = (
    interactions
    .merge(books, on='book_id')
    .merge(reviews, on=['user_id', 'book_id'])
)

In [None]:
# Display the column labels of the merged frame.
data.columns

In [None]:
# Sentence-embedding model used to embed review text.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One document per user: all of that user's review texts joined by spaces.
# NOTE(review): the model presumably truncates inputs at its maximum sequence
# length, so only the start of a long joined text would be embedded -- confirm
# this is acceptable.
user_reviews = data.groupby('user_id')['review_text'].apply(lambda x: ' '.join(x)).reset_index()

# Encode every user's text in one batched call instead of one model.encode()
# call per row. The rows are wrapped back into a Series (one vector per user,
# aligned with user_reviews) so the result keeps its original type.
user_embeddings = pd.Series(
    list(model.encode(user_reviews['review_text'].tolist())),
    index=user_reviews.index,
)

In [15]:
# Build one embedding per user from their joined review texts and reduce its
# dimensionality with PCA; user-user similarity is computed from the result.
# Only review text is used here (ratings do not enter the embedding).
user_reviews = data.groupby('user_id')['review_text'].apply(lambda x: ' '.join(x)).reset_index()
user_embeddings = np.asarray(model.encode(user_reviews['review_text'].tolist()))

# PCA cannot keep more components than min(n_samples, n_features), so the
# target of 50 is capped by the shape of the embedding matrix.
n_components = min(50, *user_embeddings.shape)
# random_state keeps the projection reproducible if a randomized solver is used.
pca = PCA(n_components=n_components, random_state=0)
# fit_transform must be called on the estimator instance: it learns the
# components and projects the data in one step. Calling transform on the PCA
# class itself (or on an unfitted instance) fails.
user_embeddings = pca.fit_transform(user_embeddings)

In [None]:
# Pairwise cosine similarity between the embedding rows of all users.
embedding_rows = [row for row in user_embeddings]
similarity_matrix = cosine_similarity(embedding_rows)

In [None]:
def find_similar_users(user_id, num_neighbors=5):
    """Return the ids of the users most similar to ``user_id``.

    Reads the module-level ``user_reviews`` frame (one row per user, column
    ``'user_id'``) and ``similarity_matrix`` (row/column ``i`` corresponds to
    row position ``i`` of ``user_reviews``).

    Parameters
    ----------
    user_id : hashable
        Id of the user to find neighbours for.
    num_neighbors : int, default 5
        Maximum number of neighbours to return.

    Returns
    -------
    list
        Up to ``num_neighbors`` user ids, most similar first. The queried user
        is never included; ties keep the row order of ``user_reviews``.

    Raises
    ------
    IndexError
        If ``user_id`` does not occur in ``user_reviews``.
    """
    # Use the row *position* (not the index label) because similarity_matrix
    # is addressed positionally.
    matches = np.flatnonzero((user_reviews['user_id'] == user_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"user_id {user_id!r} not found in user_reviews")
    user_index = int(matches[0])

    scores = np.asarray(similarity_matrix[user_index], dtype=float)
    # Stable descending sort; negating keeps tied users in their row order.
    order = np.argsort(-scores, kind='stable')
    # Remove the user explicitly instead of assuming they sort first: another
    # user with an identical embedding ties at the top score and could
    # otherwise push the queried user into the returned neighbours.
    order = order[order != user_index][:max(num_neighbors, 0)]
    return user_reviews['user_id'].iloc[order].tolist()

In [16]:
def recommend_books(user_id, num_recommendations=5):
    """Recommend book titles taken from the users most similar to ``user_id``.

    Looks up ``num_recommendations`` neighbours with ``find_similar_users`` and
    collects the distinct titles those neighbours have in the module-level
    ``data`` frame (columns ``'user_id'`` and ``'title'``), skipping any title
    the target user already has in ``data``.

    Parameters
    ----------
    user_id : hashable
        Id of the user to recommend for.
    num_recommendations : int, default 5
        Maximum number of titles to return; also used as the neighbour count.

    Returns
    -------
    array-like
        Up to ``num_recommendations`` distinct titles, in the order they first
        appear in ``data``.
    """
    similar_user_ids = find_similar_users(user_id, num_neighbors=num_recommendations)

    # Titles the target user already has must not be recommended back to them.
    already_seen = data.loc[data['user_id'] == user_id, 'title']
    from_neighbors = data['user_id'].isin(similar_user_ids)
    unseen = ~data['title'].isin(already_seen)

    recommendations = data.loc[from_neighbors & unseen, 'title'].unique()
    return recommendations[:num_recommendations]

In [None]:
# Example: print the recommended titles for one specific user id.
target_user_id = '8842281e1d1347389f2ab93d60773d4d'
recommendations = recommend_books(user_id=target_user_id)
print(recommendations)