In [200]:
# import libraries
from pyvis.network import Network  # Interactive network visualization
import pandas as pd # Data Frame
import networkx as nx # Network analysis
from tqdm.notebook import tqdm # Progress bar
from itertools import combinations
import matplotlib.colors as mcolors
import community as community_louvain



In [201]:
# Read data
ratings_df = pd.read_csv('data/BX-Book-Ratings.csv', sep=';', encoding='latin-1')
users_df = pd.read_csv('data/BX-Users.csv', sep=';', encoding='latin-1')
books_df = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')

  books_df = pd.read_csv('data/BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')


In [202]:
books_df.columns = books_df.columns.str.lower()
books_df.columns = books_df.columns.str.replace('-','_')
books_df

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,image_url_s,image_url_m,image_url_l
0,0195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,0002005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,0060973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,0393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...
...,...,...,...,...,...,...,...,...
271355,0440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm),http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...,http://images.amazon.com/images/P/0440400988.0...
271356,0525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...,http://images.amazon.com/images/P/0525447644.0...
271357,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...,http://images.amazon.com/images/P/006008667X.0...
271358,0192126040,Republic (World's Classics),Plato,1996,Oxford University Press,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...,http://images.amazon.com/images/P/0192126040.0...


In [203]:
users_df.columns = users_df.columns.str.lower()
users_df.columns = users_df.columns.str.replace('-','_')
users_df

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [204]:
ratings_df.columns = ratings_df.columns.str.lower()
ratings_df.columns = ratings_df.columns.str.replace('-','_')
ratings_df

Unnamed: 0,user_id,isbn,book_rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [205]:
# Use smaller sample of the data
sample_fraction = 0.005
df_filtered = ratings_df.sample(frac=sample_fraction, random_state=42)

#remove records with only one rating
counts = df_filtered.groupby('user_id')['user_id'].transform('count')
df_filtered = df_filtered[counts > 1]
df_filtered

Unnamed: 0,user_id,isbn,book_rating
178554,38781,0373259131,0
533905,128835,0811805905,8
1091374,261829,037324486X,0
785993,189973,0515129283,0
405917,98391,006000181X,8
...,...,...,...
396114,95359,0440940001,9
246532,56447,0140386645,0
1039363,248718,0140443606,9
945070,228998,0399900438,0


In [206]:
# Step 0: Define a function to convert ratings to distances
def rating_to_distance(rating):
    temp = 0
    if type(rating) == int or type(rating) == float:
        temp = rating
    else:
        temp = int(rating)
    # Mapping for the 0 to 10 scale to the specified distances
    rating_distance_map = {
        0: -16,
        1: -8,
        2: -4,
        3: -2,
        4: -1,
        5: 0,
        6: 1,
        7: 2,
        8: 4,
        9: 8,
        10: 16
    }
    return rating_distance_map.get(temp, 0)  # Default to 0 if rating is not in the map


In [207]:
# Step 1: Sum ratings for each pair of books rated by the same user
pairwise_sums = {}
with tqdm(total=len(df_filtered.groupby('user_id'))) as pbar:
    for user_id, group in df_filtered.groupby('user_id'):
        for combo in combinations(group.values, 2):
            book1, rating1 = combo[0][1], combo[0][2]
            book2, rating2 = combo[1][1], combo[1][2]
            pair = tuple(sorted([book1, book2]))
            distance = rating_to_distance(rating1) + rating_to_distance(rating2)
            pairwise_sums.setdefault(pair, []).append(distance)
        pbar.update(1)

  0%|          | 0/827 [00:00<?, ?it/s]

In [208]:
# Step 2: Calculate mean distance for each pair of books
pairwise_distances = {pair: sum(sums) / len(sums) for pair, sums in pairwise_sums.items()}


In [209]:
def distance_to_color(distance, min_distance=-32, max_distance=32):
    # Normalize the distance to a range between 0 and 1
    norm = (distance - min_distance) / (max_distance - min_distance)
    
    # Define colors for the gradient (red, white, green)
    colors = ["red", "white", "green"]
    cmap = mcolors.LinearSegmentedColormap.from_list("distance_cmap", colors)
    
    # Get the color corresponding to the normalized distance
    return mcolors.to_hex(cmap(norm))

In [210]:
G = nx.Graph() 

# Add nodes (books)
for book in tqdm(set(df_filtered['isbn']), desc='Adding nodes'):
    try:
        book_title = books_df[books_df['isbn'] == book].iloc[0]['book_title']
    except IndexError:
        book_title = 'Unknown'
    G.add_node(book, label=f'Book {book}', title=f'{book}:{book_title}', size=10)

Adding nodes:   0%|          | 0/2975 [00:00<?, ?it/s]

In [211]:
# Add edges (calculated distances)
for (book1, book2), distance in tqdm(pairwise_distances.items(), desc='Adding edges'):
    if(distance>0):
        color = distance_to_color(distance)
        G.nodes.get(book1)['size'] +=10
        G.nodes.get(book2)['size'] +=10
        G.add_edge(book1, book2, value=distance,title=f'{book1} - {book2}: {distance}',color=color)

Adding edges:   0%|          | 0/11094 [00:00<?, ?it/s]

In [212]:
nodes_to_keep = [node for node, attr in G.nodes(data=True) if attr.get('size') != 10]
G = G.subgraph(nodes_to_keep)

In [213]:
partition = community_louvain.best_partition(G)
clusters = {}
for node, comm_id in partition.items():
    clusters.setdefault(comm_id, []).append(node)

In [214]:
net = Network(notebook=True,select_menu=True)
net.barnes_hut()
net.from_nx(G)



In [215]:
# def transform_cluster_to_linear_loop(pyvis_network, cluster_nodes):
#     # Clear existing edges within the cluster in the Pyvis network
#     pyvis_network.edges = [e for e in pyvis_network.edges if not (e['from'] in cluster_nodes and e['to'] in cluster_nodes)]

#     # Add new edges to form a linear loop in the cluster
#     for i in range(len(cluster_nodes)):
#         next_node = cluster_nodes[(i + 1) % len(cluster_nodes)]
#         pyvis_network.add_edge(cluster_nodes[i], next_node, value=len(cluster_nodes), color='black')

# # Assuming 'clusters' is a dictionary with cluster IDs as keys and lists of node IDs as values
# for cluster_nodes in clusters.values():
#     transform_cluster_to_linear_loop(net, cluster_nodes)  # 'net' should be your Pyvis network

In [216]:
# Generate network graph
net.write_html('book_network.html',open_browser=True)