## Bookwise Chord Diagram Data Preparation

This notebook takes the cleaned Bookwise data and builds the processed datasets shown in the Profile tab of the Streamlit application. It measures the connection strength between genres based on the books that people have rated: two genres are connected by the number of users who have reviewed books in both.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Google Colab only: mount Google Drive at /content/drive so the data folder is reachable.
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
# Work from the project data folder so the relative file names used for reading and writing resolve there.
os.chdir('/content/drive/MyDrive/DATASCI210/data')

In [4]:
# Load the cleaned Bookwise dataset.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
books_df = pd.read_pickle('books_data_clean.pkl')


In [5]:
# Sanity check: list the available columns and the (rows, columns) shape of the loaded data.
print(books_df.columns)
print(books_df.shape)


Index(['user_id', 'book_id', 'title', 'author', 'publish_year', 'description',
       'preview_link', 'normalized_popularity', 'genre_general',
       'genre_specific', 'genre_combined', 'genre_consolidated',
       'review_helpfulness', 'review_score', 'review_time', 'review_summary',
       'review_text'],
      dtype='object')
(321301, 17)


In [6]:
# Count rows per consolidated genre and list them, largest count first.
genre_counts = books_df["genre_consolidated"].value_counts()
genre_counts = genre_counts.sort_values(ascending=False)
for genre, count in zip(genre_counts.index, genre_counts.tolist()):
    print(f"{genre}: {count}")


Fiction / World Literature: 32212
Fiction / Mystery & Detective: 18430
Fiction / Literary: 14741
Religion / General: 11058
Fiction / Thrillers: 10589
Fiction / General: 10422
Business & Economics / General: 10347
Biography & Autobiography / Personal Memoirs: 10258
Juvenile Fiction / General: 10148
Fiction / Sea Stories: 7650
Biography & Autobiography / General: 7431
Fiction / Science Fiction: 7064
Fiction / Fairy Tales, Folk Tales, Legends & Mythology: 6898
History / General: 5813
Fiction / Romance: 5767
Social Science / General: 5110
Computers / General: 4390
Political Science / General: 4389
Fiction / Occult & Supernatural: 4217
History / Wars & Conflicts: 4066
Body, Mind & Spirit / General: 3874
Science / General: 3755
Juvenile Nonfiction / General: 3583
Philosophy / General: 3435
Fiction / Short Stories: 3137
Psychology / General: 3091
Young Adult Fiction / General: 3016
Juvenile Fiction / Nursery Rhymes: 3000
Sports & Recreation / General: 2970
History / Social History: 2952
Ficti

In [14]:
from collections import Counter
from itertools import combinations

import matplotlib.colors as mcolors
import numpy as np
import pandas as pd


def prepare_genre_metadata(books_df, top_n=40, min_segment_width=0.015, max_segment_width=0.15):
    """
    Create Table 1: Genre Metadata with organized structure for the chord diagram

    Genres are ranked by how many rows of ``books_df`` carry them, optionally
    limited to the highest-ranked ones, ordered by parent category, and given an
    evenly spaced angular position, a proportional segment width, and a color
    shared by all genres of the same parent category.

    Args:
        books_df: DataFrame containing book and review data; must have a
            "genre_consolidated" column
        top_n: Keep genres whose rank is <= top_n. Ranks are computed with
            method='min', so genres tied at the cutoff are all kept and the
            result can hold more than top_n rows. A falsy value (None or 0)
            keeps every genre.
        min_segment_width: Intended minimum width for any segment.
            NOTE(review): accepted but currently NOT applied - "Segment Width"
            is the raw share of the count. Confirm the desired clamping
            behavior before enforcing it, since it changes the exported data.
        max_segment_width: Intended maximum width for any segment.
            NOTE(review): accepted but currently NOT applied (see above).

    Returns:
        DataFrame with columns "Genre ID", "Genre Name", "Parent Category",
        "Color Code", "Position", "Segment Width", "Genre Rank" and "Count",
        sorted by parent category and then by descending count. The index is
        the popularity order from value_counts and is not reset.
    """

    # Count occurrences of each genre (value_counts sorts by descending count)
    genre_counts = books_df["genre_consolidated"].value_counts().reset_index()
    genre_counts.columns = ["Genre Name", "Count"]

    # Add Genre Rank (1 = most popular); ties share the smallest rank
    genre_counts["Genre Rank"] = genre_counts["Count"].rank(ascending=False, method='min').astype(int)

    # Extract parent category from genre: the text before the first " / "
    genre_counts["Parent Category"] = genre_counts["Genre Name"].apply(
        lambda x: x.split(" / ")[0] if isinstance(x, str) and " / " in x else x
    )

    # Filter to only include the top N genres by count if specified.
    # .copy() makes the result an independent frame, so the column assignments
    # below do not write into a slice (avoids SettingWithCopyWarning).
    if top_n:
        genre_counts = genre_counts[genre_counts["Genre Rank"] <= top_n].copy()

    # Create genre ID (sequential starting from 1, in popularity order)
    genre_counts["Genre ID"] = range(1, len(genre_counts) + 1)

    # Preferred display order of parent categories around the circle
    main_categories = [
        "Fiction", "Juvenile Fiction", "Young Adult Fiction",  # Fiction categories
        "Biography & Autobiography", "History", "Science",     # Main non-fiction
        "Business & Economics", "Philosophy", "Psychology",    # Academic non-fiction
        "Poetry", "Drama", "Literary Criticism",               # Literary forms
        "Art", "Music", "Photography",                         # Arts
        "Technology & Engineering", "Computers", "Reference"   # Technical
    ]

    # Categories not in the main list follow it, in order of first appearance
    all_categories = genre_counts["Parent Category"].unique()
    remaining_categories = [c for c in all_categories if c not in main_categories]
    ordered_categories = main_categories + remaining_categories

    # Create a category order mapping
    category_order = {cat: i for i, cat in enumerate(ordered_categories)}

    # Order by parent category, then by descending count within each category
    genre_counts["Category Order"] = genre_counts["Parent Category"].map(category_order)
    genre_counts = genre_counts.sort_values(["Category Order", "Count"], ascending=[True, False])

    # Generate evenly spaced position values around the circle (0-360 degrees).
    # The division sits inside the comprehension so an empty table never
    # evaluates 360 / 0.
    total_genres = len(genre_counts)
    genre_counts["Position"] = [i * (360 / total_genres) for i in range(total_genres)]

    # Calculate segment width (share of the count among the selected genres)
    total_count = genre_counts["Count"].sum()
    genre_counts["Segment Width"] = genre_counts["Count"] / total_count

    # Assign one color per parent category; the palette is reused cyclically
    # if there are more categories than colors
    color_palette = list(mcolors.TABLEAU_COLORS) + list(mcolors.CSS4_COLORS)[:20]
    unique_categories = genre_counts["Parent Category"].unique()
    category_colors = {cat: color_palette[i % len(color_palette)] for i, cat in enumerate(unique_categories)}
    genre_counts["Color Code"] = genre_counts["Parent Category"].map(category_colors)

    # Select and reorder columns for the final metadata table
    metadata = genre_counts[[
        "Genre ID", "Genre Name", "Parent Category",
        "Color Code", "Position", "Segment Width",
        "Genre Rank", "Count"
    ]]

    return metadata


def create_genre_connection_matrix(books_df, genre_metadata, min_connection_threshold=5):
    """
    Create Table 2: Genre Connections Matrix showing strength of connections between genres
    Only includes connections between genres in the provided metadata

    Two genres are connected once for every user who has rows in both of them.

    Args:
        books_df: DataFrame containing book and review data; must have
            "user_id" and "genre_consolidated" columns
        genre_metadata: DataFrame with genre metadata (from prepare_genre_metadata);
            its "Genre Name" and "Genre ID" columns are used
        min_connection_threshold: Minimum number of users to consider a connection valid

    Returns:
        DataFrame in long format with one row per genre pair that meets the
        threshold and the columns "Source Genre ID", "Target Genre ID",
        "User Count", "Connection Strength" (User Count / largest User Count),
        "Normalized Strength" (min-max scaled User Count) and
        "Is Top Connection" (strength at or above the 80th percentile).
        Within a pair, the source is the genre whose name sorts first.
        When no pair meets the threshold, an empty DataFrame with these
        columns is returned.
    """
    output_columns = [
        "Source Genre ID", "Target Genre ID", "User Count",
        "Connection Strength", "Normalized Strength", "Is Top Connection",
    ]

    # Get the list of genres we care about (those in metadata)
    valid_genres = set(genre_metadata["Genre Name"])

    # Create mapping from genre name to ID; its key order drives the row order
    genre_id_map = dict(zip(genre_metadata["Genre Name"], genre_metadata["Genre ID"]))
    genres = list(genre_id_map)

    # Keep only rows whose genre is in the metadata; missing genres can never
    # form a pair, so they are dropped up front
    in_scope = books_df["genre_consolidated"].isin(valid_genres)
    books_filtered = books_df.loc[in_scope, ["user_id", "genre_consolidated"]].dropna(
        subset=["genre_consolidated"]
    )

    # Distinct genres per user
    genres_per_user = books_filtered.groupby("user_id")["genre_consolidated"].unique()

    # Count each unordered genre pair once per user. Sorting the names first
    # stores every pair under a single (smaller, larger) key, replacing the
    # symmetric matrix that was updated cell by cell.
    pair_counts = Counter()
    for user_genres in genres_per_user:
        if len(user_genres) > 1:  # User must have read at least 2 genres
            pair_counts.update(combinations(sorted(user_genres), 2))

    # Convert the counts to a long format for the chord diagram
    connections = []
    for g1 in genres:
        for g2 in genres:
            if g1 < g2:  # Visit each unordered pair exactly once
                count = pair_counts[(g1, g2)]
                if count >= min_connection_threshold:  # Apply threshold
                    connections.append({
                        "Source Genre ID": genre_id_map[g1],
                        "Target Genre ID": genre_id_map[g2],
                        "User Count": int(count),
                    })

    # Keep the schema stable even when nothing passes the threshold
    if not connections:
        return pd.DataFrame(columns=output_columns)

    connections_df = pd.DataFrame(connections)

    # Calculate connection strength metrics
    user_count = connections_df["User Count"]
    lowest, highest = user_count.min(), user_count.max()
    connections_df["Connection Strength"] = user_count / highest
    # The small epsilon avoids a zero denominator when all counts are equal
    connections_df["Normalized Strength"] = (user_count - lowest) / (highest - lowest + 1e-10)

    # Mark top connections (top 20%)
    threshold = connections_df["Connection Strength"].quantile(0.8)
    connections_df["Is Top Connection"] = connections_df["Connection Strength"] >= threshold

    return connections_df


def process_data_for_chord_diagram(books_df, top_n=40):
    """
    Build both chord diagram tables from the book and review data

    Args:
        books_df: DataFrame containing book and review data
        top_n: Number of top genres to include

    Returns:
        tuple of (genre_metadata, genre_connections)
    """
    # Rows without a consolidated genre are excluded from both tables
    books_with_genre = books_df.dropna(subset=["genre_consolidated"])

    # Table 1: metadata for the top N genres
    metadata_table = prepare_genre_metadata(books_with_genre, top_n=top_n)

    # Table 2: connections between the genres listed in Table 1
    connections_table = create_genre_connection_matrix(books_with_genre, metadata_table)

    return metadata_table, connections_table


In [15]:
# Run the full pipeline with the default top_n to produce both chord diagram tables.
genre_metadata, genre_connections = process_data_for_chord_diagram(books_df)

In [22]:
# Preview the first rows of the connections table.
genre_connections.head()

Unnamed: 0,Source Genre ID,Target Genre ID,User Count,Connection Strength,Normalized Strength,Is Top Connection
0,1,9,2987,0.589152,0.579532,True
1,1,28,1106,0.218146,0.199839,True
2,1,40,1046,0.206312,0.187727,True
3,1,27,1703,0.335897,0.320347,True
4,1,14,2019,0.398225,0.384134,True


In [17]:
# Report the (rows, columns) shape of each output table.
print(genre_metadata.shape)
print(genre_connections.shape)

(40, 8)
(780, 6)


In [18]:
# Display up to the first 50 rows of the genre metadata table.
genre_metadata.head(50)

Unnamed: 0,Genre ID,Genre Name,Parent Category,Color Code,Position,Segment Width,Genre Rank,Count
0,1,Fiction / World Literature,Fiction,tab:blue,0.0,0.130407,1,32212
1,2,Fiction / Mystery & Detective,Fiction,tab:blue,9.0,0.074612,2,18430
2,3,Fiction / Literary,Fiction,tab:blue,18.0,0.059678,3,14741
4,5,Fiction / Thrillers,Fiction,tab:blue,27.0,0.042869,5,10589
5,6,Fiction / General,Fiction,tab:blue,36.0,0.042192,6,10422
9,10,Fiction / Sea Stories,Fiction,tab:blue,45.0,0.03097,10,7650
11,12,Fiction / Science Fiction,Fiction,tab:blue,54.0,0.028598,12,7064
12,13,"Fiction / Fairy Tales, Folk Tales, Legends & M...",Fiction,tab:blue,63.0,0.027926,13,6898
14,15,Fiction / Romance,Fiction,tab:blue,72.0,0.023347,15,5767
18,19,Fiction / Occult & Supernatural,Fiction,tab:blue,81.0,0.017072,19,4217


In [23]:
# Persist both tables in the current working directory: as CSV (without the index column) ...
genre_metadata.to_csv("genre_metadata.csv", index=False)
genre_connections.to_csv("genre_connections.csv", index=False)

# ... and as pickle files.
genre_metadata.to_pickle("genre_metadata.pkl")
genre_connections.to_pickle("genre_connections.pkl")
