In [None]:
# import libraries
import webbrowser
from pathlib import Path  # Filesystem paths and file:// URIs

import networkx as nx # Network analysis
import pandas as pd # Data Frame
from pyvis.network import Network  # Interactive network visualization
from tqdm.notebook import tqdm # Progress bar


In [None]:
# Load the ratings, users and books tables.
# All three files are semicolon-separated and decoded as Latin-1.
ratings_df = pd.read_csv(
    'data/BX-Book-Ratings.csv',
    sep=';',
    encoding='latin-1',
)
users_df = pd.read_csv(
    'data/BX-Users.csv',
    sep=';',
    encoding='latin-1',
)
# Rows of the books file that the parser cannot handle are skipped instead of raising.
books_df = pd.read_csv(
    'data/BX-Books.csv',
    sep=';',
    encoding='latin-1',
    on_bad_lines='skip',
)

In [None]:
# Normalise the book column labels: lower-case them, then turn hyphens into underscores.
books_df.columns = (
    books_df.columns
    .str.lower()
    .str.replace('-', '_')
)
books_df

In [None]:
# Normalise the user column labels: lower-case them, then turn hyphens into underscores.
users_df.columns = (
    users_df.columns
    .str.lower()
    .str.replace('-', '_')
)
users_df

In [None]:
# Normalise the rating column labels: lower-case them, then turn hyphens into underscores.
ratings_df.columns = (
    ratings_df.columns
    .str.lower()
    .str.replace('-', '_')
)
ratings_df

In [None]:
# Number of rating rows per user_id
user_counts = ratings_df['user_id'].value_counts()

# A user is "high frequency" when they have more than this many rating rows.
HIGH_FREQUENCY_THRESHOLD = 50000
high_frequency_users = user_counts[user_counts > HIGH_FREQUENCY_THRESHOLD].index

# Work on a deep copy so ratings_df itself is never modified by later filtering.
filtered_df = ratings_df.copy(deep=True)
# Optional filters, currently disabled:
#   drop the high-frequency users identified above
# filtered_df = filtered_df[~filtered_df['user_id'].isin(high_frequency_users)]
#   keep only ratings strictly greater than 8
# filtered_df = filtered_df[filtered_df['book_rating'] > 8]

In [None]:
# Alias only: df is bound to the same DataFrame object as filtered_df (no copy is made).
df = filtered_df

In [None]:
def getColor(num):
    """Return the hex colour string for ``num``; the four colours repeat every four values."""
    palette = ("#611C35", "#2E5077", "#FFA630", "#FFD700")
    # Keyed by slot number so the lookup is a mapping access on num % 4.
    colour_by_slot = dict(enumerate(palette))
    return colour_by_slot[num % 4]

def getSize(num):
    """Return the node size for ``num``; the four sizes repeat every four values."""
    sizes = (600, 400, 200, 100)
    # Keyed by slot number so the lookup is a mapping access on num % 4.
    size_by_slot = dict(enumerate(sizes))
    return size_by_slot[num % 4]

In [None]:
def processing(items:set,next_layer_df:pd.DataFrame,current_layer_column_name:str, next_layer_column_name:str,layer_count:int,G:nx.Graph,next_item_df:pd.DataFrame,top_n:int=5):
    """Expand one layer of the graph and return the values that form the next layer.

    For every value in ``items`` the rows of ``next_layer_df`` whose
    ``current_layer_column_name`` equals that value are taken in descending
    ``book_rating`` order, and the first ``top_n`` distinct
    ``next_layer_column_name`` values become its neighbours. Each neighbour
    that has a row in ``next_item_df`` is added to ``G`` as a node named
    ``"{layer_count}_{next_layer_column_name}:{value}"`` and linked to its
    parent node ``"{layer_count-1}_{current_layer_column_name}:{item}"``.

    Side effect: when ``layer_count == 3`` the ``top_n`` most frequent ``isbn``
    values among the rows whose ``user_id`` is in ``items`` are appended to
    ``list.txt`` as ``isbn:count`` lines.

    Parameters:
        items: values of the current layer.
        next_layer_df: frame holding ``current_layer_column_name``,
            ``next_layer_column_name`` and ``book_rating`` columns.
        current_layer_column_name: column matched against ``items``.
        next_layer_column_name: column supplying the neighbours.
        layer_count: number of the layer being created.
        G: graph that receives the new nodes and edges.
        next_item_df: frame used to look up the details shown in node titles.
        top_n: maximum number of neighbours kept per item.

    Returns:
        The set of neighbour values that were found in ``next_item_df``.
    """
    accepted = set()

    if layer_count == 3:
        # Count how often each isbn occurs among the rows of the given users.
        counts = {}
        rated = next_layer_df[next_layer_df['user_id'].isin(items)]
        for isbn in rated['isbn']:
            counts[isbn] = counts.get(isbn, 0) + 1
        # sorted() is stable, so equal counts keep their first-seen order.
        ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:top_n]
        if ranked:
            # One append-mode handle for all lines; closed even if a write fails.
            with open('list.txt', 'a') as report:
                report.writelines(f"{isbn}:{count}\n" for isbn, count in ranked)

    if not items:
        # Nothing to expand, so skip the full-frame sort below.
        return accepted

    # The ordering does not depend on the item, so sort once instead of once per item.
    ranked_rows = next_layer_df.sort_values(by='book_rating', ascending=False)
    parent_keys = ranked_rows[current_layer_column_name]

    for item in tqdm(items, desc=f"Processing current item, layer {layer_count}"):
        # First top_n distinct neighbours of this item, best rated first.
        neighbours = ranked_rows[parent_keys == item][next_layer_column_name].unique()[:top_n]
        for next_layer_item in neighbours:
            accepted.add(next_layer_item)
            try:
                # .iloc[0] raises IndexError when next_item_df has no matching row.
                next_item = next_item_df[next_item_df[next_layer_column_name]==next_layer_item].iloc[0]
                base_name = f"{layer_count}_{next_layer_column_name}:{next_item[next_layer_column_name]}"
                if next_layer_column_name == 'isbn':
                    title = f"layer:{layer_count}\nisbn:{next_item['isbn']}\nbook_title:{next_item['book_title']}\nbook_author:{next_item['book_author']}"
                else:
                    title = f"layer:{layer_count}\nuser_id:{next_item['user_id']}\nlocation:{next_item['location']}\nage:{next_item['age']}"
                G.add_node(base_name,title=title, color=getColor(layer_count), size=getSize(layer_count))
                G.add_edge(f"{layer_count-1}_{current_layer_column_name}:{item}",base_name)
            except IndexError:
                # No detail row for this neighbour: report it and leave it out of the next layer.
                print(f"IndexError: {next_layer_item}")
                accepted.discard(next_layer_item)
    return accepted

In [None]:
def buildLayers(_target_user_id, _layer_count, _df, _G:nx.Graph,top_n:int=5):
    """Populate ``_G`` with alternating book and user layers around one user.

    The root node ``"0_user_id:{_target_user_id}"`` is created from the matching
    row of the module-level ``users_df``. Layers 1.._layer_count are then added
    through ``processing``: odd layers expand user ids into isbns (details from
    the module-level ``books_df``), even layers expand isbns into user ids
    (details from ``users_df``).

    Parameters:
        _target_user_id: user id of the root node.
        _layer_count: number of layers to add below the root.
        _df: ratings frame passed to ``processing`` as ``next_layer_df``.
        _G: graph that receives the nodes and edges.
        top_n: maximum number of neighbours kept per node.

    Raises:
        IndexError: if ``_target_user_id`` has no row in ``users_df``.
    """
    root_rows = users_df[users_df['user_id']==_target_user_id]
    if root_rows.empty:
        # Same exception type .iloc[0] would raise, but naming the missing user.
        raise IndexError(f"user_id {_target_user_id} not found in users_df")
    root_user = root_rows.iloc[0]
    title = f"0_user_id:{root_user['user_id']}\nlocation:{root_user['location']}\nage:{root_user['age']}"
    # The root is layer 0, so it takes the layer-0 colour and size like every other node.
    _G.add_node(f"0_user_id:{_target_user_id}", label=str(_target_user_id), color=getColor(0),size = getSize(0),title=title)
    # Building the tree: the frontier alternates between user ids and isbns.
    frontier = {_target_user_id}
    for layer in tqdm(range(1, _layer_count + 1), desc="Building the layer"):
        if layer % 2 == 1:  # Odd layers: find books
            frontier = processing(frontier, _df, 'user_id', 'isbn', layer, _G, books_df,top_n)
        else:  # Even layers: find users
            frontier = processing(frontier, _df, 'isbn', 'user_id', layer, _G, users_df,top_n)

In [None]:
# def buildLayers(_target_user_id, _layer_count, _df, _G:nx.Graph):
#     _G.add_node(f"0_{_target_user_id}", label=str(_target_user_id), color="#611C35",size = getSize(0),title=f"User ID: {_target_user_id}")
#     # Building the tree
#     current_layer_users = {_target_user_id}
#     for layer in tqdm(range(1, _layer_count + 1), desc="Building the layer"):
#         if layer % 2 == 1:  # Odd layers: find books
#             books = _df[_df['user_id'].isin(current_layer_users)]['isbn'].unique()
#             for book in tqdm(books, desc=f"Building the book layer {layer}"):
#                 try:
#                     book_row = books_df[books_df['isbn'] == book].iloc[0]
#                     book_title = f"title:{book_row['book_title']};\nisbn:{book_row['isbn']}"
#                     _G.add_node(f"{layer}_{book}", color=getColor(layer),size = getSize(layer), title=book_title)
#                     for user in current_layer_users:
#                     # Check if the user has rated this book
#                         if not _df[(_df['user_id'] == user) & (_df['isbn'] == book)].empty:
#                             _G.add_edge(f"{layer-1}_{user}", f"{layer}_{book}", color=getColor(layer))
#                 except IndexError:
#                     books.discard(book)
#             current_layer_users = set(books)
#         else:  # Even layers: find users
#             users = _df[_df['isbn'].isin(current_layer_users)]['user_id'].unique()
#             for user in tqdm(users, desc=f"Building the user layer {layer}"):
#                 try:
#                     user_title = f"{users_df[users_df['user_id'] == user].iloc[0]['user_id']}"
#                     _G.add_node(f"{layer}_{user}", color=getColor(layer),size = getSize(layer), title=f"User ID: {user_title}")
#                     for book in current_layer_users:
#                         # Check if the user has rated this book
#                         if not _df[(_df['user_id'] == user) & (_df['isbn'] == book)].empty:
#                             _G.add_edge(f"{layer-1}_{book}", f"{layer}_{user}", color=getColor(layer))
#                 except IndexError:
#                     users.discard(user)
                
#             current_layer_users = set(users)

In [None]:
# Initialize Pyvis Network
net = Network(notebook=True,select_menu=True)
# net.show_buttons(filter_=['physics'])
net.barnes_hut()
G = nx.Graph()

# Initialize variables
target_user_id = 236198
layer_count = 3
top_n=10000

# Build the graph, then report its size
buildLayers(target_user_id,layer_count,df,G,top_n=top_n)
print(f"Number of nodes: {len(G.nodes)}")
print(f"Number of edges: {len(G.edges)}")
net.from_nx(G)

# Write the network to an HTML file
filename = f'user_book_network_{target_user_id}.html'
net.write_html(filename)

# Patch the generated page so the network container fills the viewport height.
# Path.read_text/write_text open and close the file themselves, so no handle is left open.
html_path = Path(filename)
content = html_path.read_text()
content = content.replace('id="mynetwork"', 'id="mynetwork" style="height: 100dvh;"')
html_path.write_text(content)

# Open an absolute file:// URI: passing a bare relative filename to
# webbrowser.open is not portable across platforms.
webbrowser.open(html_path.resolve().as_uri())