In [5]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots 
import plotly.io as pio
from datetime import datetime
import plotly.express as px
import plotly.colors

import plotly.io as pio
from viz_utilities import (plot_rating_shifts, 

                           rating_histogram, 
                           rating_comparison, 
                           genre_barchart, 
                           format_barchart, 
                           group_nonfiction_genres,
                           group_fiction_genres, 
                           authors_by_books,
                           authors_by_pages)

# Shared palette: the first and last entries of plotly's sequential Viridis
# scale, plus a fixed blue/red pair used as the two stops of a continuous scale.
color_scale = plotly.colors.sequential.Viridis
COLOR_MIN, COLOR_MAX = color_scale[0], color_scale[-1]
JET_BLUE = 'rgb(0, 0, 131)'
JET_RED = 'rgb(165, 0, 0)'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Data source: a local CSV file. The commented-out URL is the published
# Google Sheets CSV export that can be used in its place.
# csv_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vT6eNcmziFeeUr186W7gMhlsOtCI-sJofslOPbh61gMVYOesPt9o0RVyVxWov9IAp2NrfnhiFRyd_z_/pub?gid=268068742&single=true&output=csv'
csv_url = "data/new_books.csv"

# assign() evaluates its entries in order, so 'Year' sees the already-parsed
# 'Finish Date'. Unparseable dates/page counts become NaT/NaN (errors='coerce').
books_df = pd.read_csv(csv_url).assign(**{
    'Finish Date': lambda frame: pd.to_datetime(frame['Finish Date'], errors='coerce'),
    'Page': lambda frame: pd.to_numeric(frame['Page'], errors='coerce'),
    # Goodreads rating doubled (presumably to match a 10-point scale — TODO confirm).
    'adj_gr_rating': lambda frame: 2 * frame.gr_rating,
    'Year': lambda frame: frame['Finish Date'].dt.year,
})


## Goodreads Data

In [None]:
# Build the figure with viz_utilities.rating_comparison, show it inline, then
# save it as an HTML fragment for embedding.
fig = rating_comparison(books_df)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/rating_comparison.html", "w", encoding="utf-8") as f:
    f.write(div_output)



In [None]:
# Build the figure with viz_utilities.rating_histogram, show it inline, then
# save it as an HTML fragment for embedding.
fig = rating_histogram(books_df)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/gr_histogram.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [None]:
# Build the figure with viz_utilities.plot_rating_shifts (n=10), show it
# inline, then save it as an HTML fragment for embedding.
fig = plot_rating_shifts(books_df, n=10)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/rating_differentials.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [None]:
# Violin plot of how each book's Goodreads reviews split across 1-5 stars.
# NOTE: this cell adds `total_reviews` and the `gr_N_pct` columns to books_df;
# the quartile-table cell further down reads them.

# 1. Per-book share (%) of reviews at each star level.
star_cols = ['gr_1', 'gr_2', 'gr_3', 'gr_4', 'gr_5']
books_df['total_reviews'] = books_df[star_cols].sum(axis=1)

for col in star_cols:
    # A book with zero total reviews yields NaN here (0 / 0).
    books_df[f'{col}_pct'] = (books_df[col] / books_df['total_reviews']) * 100

# 2. Reshape to long form (one row per book and star level) for a single Plotly chart.
pct_cols = [f'{c}_pct' for c in star_cols]
df_melted = books_df.melt(
    id_vars=['Title'],
    value_vars=pct_cols,
    var_name='Rating',
    value_name='Percent'
)

# 3. Reduce 'gr_4_pct' -> '4' for the x-axis. regex=False makes this a plain
#    substring replacement regardless of the pandas version's default.
df_melted['Rating'] = (
    df_melted['Rating']
    .str.replace('gr_', '', regex=False)
    .str.replace('_pct', '', regex=False)
)

fig = px.violin(
    df_melted,
    x='Rating',
    y='Percent',
    color='Rating',
    box=True,
    hover_data=['Title'],
    category_orders={"Rating": ["1", "2", "3", "4", "5"]},
    title="Rating Distribution - Goodreads",
    labels={'Percent': '% of Total Reviews'},
    color_discrete_sequence=px.colors.sequential.Viridis,
)

fig.update_layout(
    showlegend=False,
    width=800,
    margin=dict(l=10, r=10, t=60, b=60),
)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/rating_distribution.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [176]:

# Quartiles of the per-book star shares.
# Relies on `star_cols` and the `gr_N_pct` columns created by the violin-plot cell.

# Combined share of 4-star and 5-star reviews.
books_df['4_or_5_pct'] = books_df['gr_4_pct'].add(books_df['gr_5_pct'])
pct_columns = [*(f'{col}_pct' for col in star_cols), '4_or_5_pct']

# describe() yields the percentiles as rows labelled '25%', '50%', '75%'.
stats_table = books_df[pct_columns].describe(percentiles=[.25, .5, .75])

# One row per metric, one column per quartile.
quartile_rows = stats_table.loc[['25%', '50%', '75%']]
print(quartile_rows.T)

                  25%        50%        75%
gr_1_pct     0.929367   1.395917   2.123120
gr_2_pct     2.831653   4.090410   5.930107
gr_3_pct    13.639887  17.930227  23.025897
gr_4_pct    34.218843  37.369469  40.307572
gr_5_pct    29.784543  37.277463  46.443027
4_or_5_pct  68.984759  76.663137  81.858644


## Reading Habits

### Reading Over Time

In [None]:
# Build the figure with viz_utilities.format_barchart, show it inline, then
# save it as an HTML fragment for embedding.
fig = format_barchart(books_df)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/pages_by_format.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [None]:
# Build the figure with viz_utilities.genre_barchart, show it inline, then
# save it as an HTML fragment for embedding.
fig = genre_barchart(books_df)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/pages_by_genre.html", "w", encoding="utf-8") as f:
    f.write(div_output)

### Reading Genres


In [None]:
# Build the figure with viz_utilities.group_nonfiction_genres (second
# argument 10), show it inline, then save it as an HTML fragment for embedding.
fig = group_nonfiction_genres(books_df, 10)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/nonfic_genre.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [None]:
# Build the figure with viz_utilities.group_fiction_genres (second argument
# 10), show it inline, then save it as an HTML fragment for embedding.
fig = group_fiction_genres(books_df, 10)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/fic_genre.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [None]:
# Build the figure with viz_utilities.authors_by_books, show it inline, then
# save it as an HTML fragment for embedding.
fig = authors_by_books(books_df)
fig.show()

# full_html=False emits only the figure <div>; plotly.js is referenced from the CDN.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})
# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/authors_book_count.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [None]:
# Build the figure with viz_utilities.authors_by_pages, show it inline, then
# save it as an HTML fragment for embedding.
fig = authors_by_pages(books_df)
fig.show()

# The original call had an unterminated 'cdn string and a duplicated config=
# argument, so the cell could not be parsed. Arguments now match the other
# export cells.
div_output = pio.to_html(fig, full_html=False, include_plotlyjs='cdn', config={'responsive': True})

# Explicit UTF-8 so the exported file does not depend on the platform's locale encoding.
with open("plots/authors_by_page.html", "w", encoding="utf-8") as f:
    f.write(div_output)

In [88]:
# Histogram of the `Page` column in 25-page bins.
# The trace name, axis title and chart title previously said "Rating" /
# "Goodreads" although the plotted data is page counts.
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=books_df['Page'],
    name='Pages',
    opacity=0.75,
    marker_color=COLOR_MIN,
    xbins=dict(size=25)  # bin width, in pages
))

fig.update_layout(
    xaxis_title="Pages",
    yaxis_title="Number of Books",
    title="Page Count Distribution",
    width=800,
    margin=dict(l=10, r=10, t=60, b=60),
)

fig.show()

In [94]:

# Goodreads page count vs. Goodreads rating, faceted by primary genre, with an
# OLS trend line per facet and the Pearson correlation printed per genre.
genres_of_interest = ['Fiction', 'Non-Fiction']

# 1. Prepare the data.
#    Coerce to numeric BEFORE dropping missing rows, so values that fail to
#    parse (and become NaN) are removed as well instead of slipping through.
df_corr = (
    books_df
    .assign(
        gr_page_count=lambda frame: pd.to_numeric(frame['gr_page_count'], errors='coerce'),
        gr_rating=lambda frame: pd.to_numeric(frame['gr_rating'], errors='coerce'),
    )
    .dropna(subset=['Genre', 'gr_page_count', 'gr_rating'])
    # Normalise whitespace/casing so e.g. ' non-fiction' matches 'Non-Fiction'.
    .assign(Genre=lambda frame: frame['Genre'].astype(str).str.strip().str.title())
    .loc[lambda frame: frame['Genre'].isin(genres_of_interest)]
)

# 2. Create the faceted scatter plot (trendline="ols" requires statsmodels).
fig = px.scatter(
    df_corr,
    x='gr_page_count',
    y='gr_rating',
    color='gr_rating',
    facet_col='Genre',
    color_continuous_scale=[[0, JET_BLUE], [1, JET_RED]],
    trendline="ols",
    trendline_color_override="black",
    hover_name='Title',
    labels={'gr_page_count': 'Page Count', 'gr_rating': 'Goodreads Rating'},
    title='Rating vs. Page Count: Fiction vs. Non-Fiction',
    opacity=0.6
)

# 3. Refine the layout.
fig.update_layout(
    width=1000,
    height=500,
    margin=dict(l=20, r=20, t=80, b=20)
)

# 4. Print the per-genre correlation; skipped when a genre has fewer than two books.
for g in genres_of_interest:
    subset = df_corr[df_corr['Genre'] == g]
    if len(subset) > 1:
        corr = subset['gr_page_count'].corr(subset['gr_rating'])
        print(f"{g} Correlation (Pages vs Rating): {corr:.2f}")

fig.show()

Fiction Correlation (Pages vs Rating): 0.26
Non-Fiction Correlation (Pages vs Rating): 0.17


### What Drives High Ratings

In [182]:
# The 75th percentile of the `Rating` column is the cut-off for a "highly
# rated" book.
threshold = books_df['Rating'].quantile(0.75)
# The comparison below is inclusive (>=), so the message says "or higher"
# rather than "above".
print(f"A 'Highly Rated' book in your library is anything rated {threshold:.2f} or higher")

# Flag books at or above the cut-off; rows with a missing Rating compare as False.
books_df['is_high_performer'] = books_df['Rating'] >= threshold

A 'Highly Rated' book in your library is anything above: 8.50


In [183]:
# Summary tables: mean Rating by genre and by length bucket, plus the mean of
# each identity column. Adds a `length_cat` column to books_df.

# 1. Mean rating per genre, best first.
genre_stats = books_df.groupby('Genre')['Rating'].mean().sort_values(ascending=False)

# 2. Mean rating per length bucket.
#    Buckets are right-inclusive: (0, 200], (200, 400], (400, 600], (600, inf).
#    The open-ended top edge keeps books over 2000 pages in 'Epic' instead of
#    leaving them unbucketed (NaN).
books_df['length_cat'] = pd.cut(books_df['gr_page_count'],
                          bins=[0, 200, 400, 600, np.inf],
                          labels=['Short', 'Medium', 'Long', 'Epic'])
# observed=False keeps empty buckets in the result and avoids the pandas
# FutureWarning raised when grouping on a categorical without this argument.
length_stats = books_df.groupby('length_cat', observed=False)['Rating'].mean()

# 3. Mean of each identity column.
#    NOTE(review): if these are 0/1 flags this is the share of books per group,
#    not a rating comparison — confirm the column encoding.
identity_stats = books_df[['White/Male', 'Non-White', 'Non-Male']].mean()





In [203]:
import pandas as pd

# Find the (Genre, sub_genre) pair with the highest mean Rating.

# The first comma-separated entry of `gr_genres` serves as the sub-genre.
books_df['sub_genre'] = books_df['gr_genres'].str.split(',').str[0].str.strip()

# Mean rating and book count for every pair, ordered best-first. Rows missing
# any of the three fields are excluded.
complete_rows = books_df.dropna(subset=['Rating', 'Genre', 'sub_genre'])
nested_stats = (
    complete_rows
    .groupby(['Genre', 'sub_genre'])['Rating']
    .agg(['mean', 'count'])
    .sort_values(by='mean', ascending=False)
)

# Only pairs backed by more than one book qualify, so a single well-rated
# book cannot win on its own.
has_repeat = nested_stats['count'] > 1
top_combo = nested_stats[has_repeat].head(1)

print("--- THE DOUBLE-MATCH WINNER ---")
if top_combo.empty:
    print("No recurring combinations found; every book is a unique genre pair.")
else:
    # The index entry of the top row is the (Genre, sub_genre) tuple.
    genre_win, sub_win = top_combo.index[0]
    print(f"Winning Duo: {genre_win} -> {sub_win} (Avg Rating: {top_combo['mean'].values[0]:.2f})")


--- THE DOUBLE-MATCH WINNER ---
Winning Duo: Fiction -> poetry (Avg Rating: 9.25)


In [202]:
import pandas as pd
import plotly.express as px

# Sunburst charts of sub-genres within Fiction and within Non-Fiction.

# Data preparation: the first comma-separated entry of `gr_genres` is the sub-genre.
first_tag = books_df['gr_genres'].str.split(',').str.get(0)
books_df['sub_genre'] = first_tag.str.strip()

# Keep only rows that have every field listed here.
df_clean = books_df.dropna(subset=['Rating', 'Author', 'Genre', 'sub_genre']).copy()

# Settings shared by both charts: Genre -> sub_genre hierarchy, with Rating
# driving both the wedge values and the colour.
sunburst_kwargs = dict(
    path=['Genre', 'sub_genre'],
    values='Rating',
    color='Rating',
    color_continuous_scale='Viridis',
)

# Fiction
fiction_rows = df_clean.loc[df_clean['Genre'].eq('Fiction')]
fig_fic = px.sunburst(fiction_rows, title='Fiction: Sub-Genre Performance', **sunburst_kwargs)
fig_fic.show()

# Non-Fiction
nonfiction_rows = df_clean.loc[df_clean['Genre'].eq('Non-Fiction')]
fig_nonfic = px.sunburst(nonfiction_rows, title='Non-Fiction: Sub-Genre Performance', **sunburst_kwargs)
fig_nonfic.show()