In [1]:
import numpy as np
import pandas as pd
import altair as alt
from altair import datum
import ast
from datetime import datetime


# Handle large data sets without embedding them in the notebook.
# NOTE(review): the 'data_server' transformer presumably relies on the separate
# altair_data_server package being installed -- confirm the environment provides it.
alt.data_transformers.enable('data_server')
# Currently disabled: uncomment the next line to include an image for each plot,
# since Gradescope only supports displaying plots as images.
# alt.renderers.enable('mimetype')

DataTransformerRegistry.enable('data_server')

In [2]:
# Columns that read_csv is asked to parse as datetimes.
date_columns = [
    'amazon_year',
    'conlit_pubdate',
    'nyt_published_date',
    'goodreads_publish_date',
    'goodreads_first_publish_date',
]

# Books table, read straight from the course dataset repository on GitHub.
url = "https://github.com/kemiolamudzengi/dsci-320-datasets/blob/main/amazon_conlit_goodreads_nyt.csv?raw=true"
books = pd.read_csv(url, parse_dates=date_columns)

## Task 2

In [3]:
# Publication year as its own column so the dropdown selection can filter on it.
books['conlit_pubyear'] = books['conlit_pubdate'].dt.year

# Distinct publication years (missing values excluded), newest first.
dropdown_options = sorted(books['conlit_pubyear'].dropna().unique(), reverse=True)

# Single-value selection on conlit_pubyear, driven by a select widget labelled "Year: ".
year_selector = alt.binding_select(name='Year: ', options=dropdown_options)
year_dropdown = alt.selection_single(bind=year_selector, fields=['conlit_pubyear'])

# Line chart: number of books per publication year, one line per genre.
# Rows without a genre are dropped before plotting.
genre_line = alt.Chart(
    books.dropna(subset=['conlit_genre'])
).mark_line().encode(
    x=alt.X('year(conlit_pubdate):O'),
    y=alt.Y('count()'),
    color=alt.Color('conlit_genre'),
).properties(height=250, width=600)

# Bar chart: number of books per genre for the year picked in the dropdown,
# with the longest bar first (sort='-x').
genre_bar = alt.Chart(
    books.dropna(subset=['conlit_genre'])
).mark_bar().encode(
    x=alt.X('count()', axis=alt.Axis(tickMinStep=1)),
    y=alt.Y('conlit_genre', sort='-x'),
).add_selection(
    year_dropdown
).transform_filter(
    year_dropdown
).properties(height=250, width=600)

In [4]:
# Display the Task 2 view: the line chart stacked on top of the bar chart
# (& is Altair's vertical-concatenation operator).
genre_line & genre_bar

## Task 6

In [26]:
# Rating columns to histogram; the repeat below produces one panel per column.
rating = ['amazon_rating', 'goodreads_rating']

# Histogram template: the x field is a placeholder filled in by .repeat().
bar = alt.Chart(books).mark_bar().encode(
    x=alt.X(alt.repeat("repeat"), type='quantitative', bin=alt.BinParams(maxbins=30)),
    y=alt.Y('count()'),
).properties(
    title="Number of books per rating",
    height=200,
).repeat(repeat=rating, columns=1)

# Scatter of Amazon review counts against Goodreads rating counts.
scatter = alt.Chart(books).mark_circle().encode(
    x=alt.X('amazon_num_reviews:Q', axis=alt.Axis(title="Amazon Reviews")),
    y=alt.Y('goodreads_num_ratings:Q', axis=alt.Axis(title="Goodreads Reviews")),
).properties(
    title="Goodreads Reviews vs Amazon Reviews",
    width=400,
    height=350,
)

# Interval brush drawn on the scatter; the histograms only count the brushed books.
brush = alt.selection_interval(resolve='global')

mini_dashboard = scatter.add_selection(brush) | bar.transform_filter(brush)
mini_dashboard

## Task 3: Heatmap - What are the top book genres for the 5 publishers that occur most often in the data?

## Task 5: Points - How do the prices differ between Goodreads and Amazon platforms per book listed in XYZ genre(s)?

In [6]:
column_subset = ['amazon_price', 'goodreads_price']

# Keep only books that have a price on both platforms, then reduce to one row per
# title. After the descending sort on (title, amazon_price), the first row of each
# title -- the one drop_duplicates keeps -- is the one with the highest Amazon price.
books_prices = (
    books
    .dropna(subset=column_subset, how='any')
    .sort_values(by=["title", "amazon_price"], ascending=False)
    .drop_duplicates(subset=["title"])
)


def _price_points(price_column, point_color):
    """Point chart of `price_column` per title from books_prices, in one fixed colour."""
    return alt.Chart(books_prices).mark_point().encode(
        x=alt.X("title"),
        y=alt.Y(price_column),
        color=alt.value(point_color),
    )


amzn_chart = _price_points("amazon_price", '#ff9900')
gr_chart = _price_points("goodreads_price", '#75420e')

# Overlay both platforms' prices on shared axes.
alt.layer(amzn_chart, gr_chart)

In [7]:
# Summarise the deduplicated price table: row count plus per-column non-null counts and dtypes.
books_prices.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 172 entries, 113 to 0
Data columns (total 28 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   title                         172 non-null    object        
 1   amazon_author                 172 non-null    object        
 2   amazon_rating                 172 non-null    float64       
 3   amazon_num_reviews            172 non-null    int64         
 4   amazon_price                  172 non-null    int64         
 5   amazon_year                   172 non-null    datetime64[ns]
 6   amazon_genre                  172 non-null    object        
 7   conlit_genre                  62 non-null     object        
 8   conlit_pubdate                62 non-null     datetime64[ns]
 9   conlit_author_gender          62 non-null     object        
 10  conlit_author_nationality     32 non-null     object        
 11  conlit_total_ratings          62

In [8]:
# Show the Goodreads price points on their own, without the Amazon layer.
gr_chart

## Task 4: Multi-view (Bar chart + Scatter) - Are fiction books more popular than non-fiction books and do they stay on the best-selling list longer?

In [39]:
# "Total stars" per platform: rating multiplied by the number of reviews/ratings.
books["total_star_amazon"] = books["amazon_rating"].mul(books["amazon_num_reviews"])
books["total_star_goodreads"] = books["goodreads_rating"].mul(books["goodreads_num_ratings"])

# Horizontal interval brush, drawn on the histogram.
brush = alt.selection_interval(encodings=['x'])
# Multi-select on genre, toggled by clicking legend entries.
click = alt.selection_multi(bind='legend', fields=['amazon_genre'])

# Overlaid (unstacked) histograms of weeks on the NYT list, one per genre.
# Genres not selected in the legend are faded out.
genre_week = alt.Chart(books).mark_bar(binSpacing=0, opacity=0.6).encode(
    x=alt.X('nyt_weeks_on_list:Q', bin=alt.BinParams(maxbins=30), axis=alt.Axis(title="Weeks on NYT")),
    y=alt.Y('count()', stack=None),
    color=alt.Color('amazon_genre'),
    opacity=alt.condition(click, alt.value(0.75), alt.value(0.1)),
).add_selection(
    brush,
    click,
).properties(
    title="Number of weeks fiction & non-fiction books on best-selling list"
)

# Total stars on Amazon vs Goodreads; books outside the brushed week range are faded out.
genre_pop = alt.Chart(books).mark_circle(opacity=0.5, size=30).encode(
    x=alt.X('total_star_goodreads:Q', axis=alt.Axis(title="Total stars Goodreads")),
    y=alt.Y('total_star_amazon:Q', axis=alt.Axis(title="Total stars Amazon")),
    color=alt.Color('amazon_genre'),
    opacity=alt.condition(brush, alt.value(0.90), alt.value(0.03)),
).add_selection(
    click
).properties(
    title="Popularity of fiction & non-fiction books on Amazon vs Goodreads"
)

alt.hconcat(genre_week, genre_pop, spacing=5)