# Updated LuxLab Summer Sample Code — Bokeh Plots, Time Series, and Merge

In [1]:
import pandas as pd

In [2]:
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import ColumnDataSource, NumeralTickFormatter
from bokeh.io import output_notebook, show
from bokeh.palettes import Turbo256

In [3]:
# Send Bokeh output to the notebook so that show() renders plots inline.
output_notebook()

In [4]:
# Load the demographic metadata CSV for the Goodreads "Classics" titles.
goodreads_demo = pd.read_csv('Goodreads-Classics-Demographic-Info.csv')

In [5]:
# Preview the first rows to see which columns are available for plotting.
goodreads_demo.head()

Unnamed: 0,title,author,original_language,gender,race/ethnicity,publication_date,author_nationality,AmazonClassics_edition,AmazonClassics_url,AmazonClassics_date
0,The Odyssey,Homer,Greek,M,White,700,Greek,yes,https://www.amazon.com/Odyssey-AmazonClassics-...,6/27/17
1,The Iliad,Homer,Greek,M,White,750,Greek,yes,https://www.amazon.com/Iliad-AmazonClassics-Ho...,7/25/17
2,Beowulf,Unknown,Old English,M,White,975,Unknown,yes,https://www.amazon.com/Beowulf-AmazonClassics-...,6/27/17
3,The Canterbury Tales,Geoffrey Chaucer,English,M,White,1390,British,no,,
4,The Prince,Niccolò Machiavelli,Italian,M,White,1513,Italian,no,,


# Bokeh Plots

I found this helpful Bokeh tutorial: https://programminghistorian.org/en/lessons/visualizing-with-bokeh

## Basic Plot

So I learned that you can put a Pandas dataframe straight into `ColumnDataSource`. Then you can simply plot by the columns in the dataframe, e.g. `x='yearFirstPublished'` or `y='numRatings'`. There are cases when it might be helpful to do more `groupby()`s and more manipulation, like I showed you guys earlier, but using the whole dataframe is one super easy route.

### Add Dataframe to ColumnDataSource

In [8]:
# Wrap the dataframe in a ColumnDataSource so glyphs can refer to its columns by name.
source = ColumnDataSource(goodreads_demo)

In [12]:
# Create the figure with a title, axis labels, and an initial x-range of 1500 to 2050.
bokeh_plot = figure( title="The Goodreads 'Classics'", x_axis_label = 'Publication Date',
                    y_axis_label ='Number of Goodreads Ratings',
           x_range=[1500, 2050])

# Draw purple circles from `source`, positioned on the x-axis by publication_date.
# NOTE(review): no `y=` column is passed, although the y-axis is labelled
# 'Number of Goodreads Ratings' and the later merged-data plot uses
# y='numRatings'. The demographic dataframe previewed above shows no ratings
# column, so this plot presumably needs the merged dataframe as its source
# plus y='numRatings' -- confirm and fix.
bokeh_plot.circle(x='publication_date', 
         size = 30,
         source=source,
         color='purple')

# Format big y-axis numbers with abbreviations
bokeh_plot.yaxis.formatter = NumeralTickFormatter(format='0.0a')

# Render the plot in the notebook output.
show(bokeh_plot) 

## Add Colors From Bokeh Color Palette

https://docs.bokeh.org/en/latest/docs/user_guide/categorical.html#colors

## Add Custom Colors

In [18]:
# Preview the dataframe again to pick a categorical column to color by.
goodreads_demo.head()

Unnamed: 0,title,author,original_language,gender,race/ethnicity,publication_date,author_nationality,AmazonClassics_edition,AmazonClassics_url,AmazonClassics_date
0,The Odyssey,Homer,Greek,M,White,700,Greek,yes,https://www.amazon.com/Odyssey-AmazonClassics-...,6/27/17
1,The Iliad,Homer,Greek,M,White,750,Greek,yes,https://www.amazon.com/Iliad-AmazonClassics-Ho...,7/25/17
2,Beowulf,Unknown,Old English,M,White,975,Unknown,yes,https://www.amazon.com/Beowulf-AmazonClassics-...,6/27/17
3,The Canterbury Tales,Geoffrey Chaucer,English,M,White,1390,British,no,,
4,The Prince,Niccolò Machiavelli,Italian,M,White,1513,Italian,no,,


In [19]:
from bokeh.transform import factor_cmap
# NOTE(review): Spectral3 is imported but not referenced in this cell.
from bokeh.palettes import Spectral3

# Hover tooltip rows: (label shown, "@column" looked up in the data source).
TOOLTIPS = [
    ("Title", "@title"),
    ("Author", "@author"),
    ("Publication Date", "@publication_date"),
    ("AmazonClassics Edition", "@AmazonClassics_edition")
    
]

# Same figure as the basic plot, now with hover tooltips enabled.
bokeh_plot = figure( title="The Goodreads 'Classics'", x_axis_label = 'Publication Date', y_axis_label ='Number of Goodreads Ratings',
           x_range=[1500, 2050],
               tooltips=TOOLTIPS)

# Custom palette. NOTE(review): three colors are listed but only two factors
# ('yes', 'no') are mapped below, so 'purple' is presumably never used.
colors = ['black', 'pink', 'purple']

# Add color based on the "AmazonClassics_edition" column
color_map = factor_cmap(field_name='AmazonClassics_edition',
                    palette=colors, factors=['yes', 'no'])

# NOTE(review): no `y=` column is passed, although the y-axis is labelled
# 'Number of Goodreads Ratings' and the later merged-data plot uses
# y='numRatings'. The demographic dataframe previewed above shows no ratings
# column -- confirm the intended source and add the y column.
bokeh_plot.circle(x='publication_date',
                
         size = 30,
         source=source,
         color=color_map)

# Format big y-axis numbers with abbreviations
bokeh_plot.yaxis.formatter = NumeralTickFormatter(format='0.0a')

# Uncomment to also direct the plot to a standalone HTML file.
#output_file('Goodreads-Classics.html')

# Render the plot in the notebook output.
show(bokeh_plot) 

# Pandas Merge

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

Let's merge the Goodreads ratings/reviews CSV file with the Goodreads demographic CSV file.

In [24]:
# Load the Goodreads ratings/reviews CSV (the "Appendix" file).
goodreads_df = pd.read_csv('Goodreads-Classics-Appendix.csv')

In [25]:
# Load the demographic CSV again under a merge-specific name.
# NOTE(review): this is the same file already loaded as `goodreads_demo` above.
demographic_df = pd.read_csv('Goodreads-Classics-Demographic-Info.csv')

In [26]:
# Show two randomly chosen rows of the demographic data.
demographic_df.sample(2)

Unnamed: 0,title,author,original_language,gender,race/ethnicity,publication_date,author_nationality,AmazonClassics_edition,AmazonClassics_url,AmazonClassics_date
3,The Canterbury Tales,Geoffrey Chaucer,English,M,White,1390,British,no,,
138,Like Water for Chocolate,Laura Esquivel,Spanish,W,Mexican,1989,Mexican,no,,


In [27]:
# (rows, columns) of the demographic data, to compare against the merge result.
demographic_df.shape

(143, 10)

In [28]:
# Show two randomly chosen rows of the ratings/reviews data.
goodreads_df.sample(2)

Unnamed: 0,author,title,yearFirstPublished,mostReadRank,mostPopularRank,numRatings,numRatingsAbbrev,numReviews,numReviewsAbbrev,whichListCategory,AP_English_Recommended,Open_Syllabus_200_Text,Open_Syllabus_200_Author
91,John Steinbeck,The Grapes of Wrath,1939,92,39,676785,677k,15411,15k,Most Shelved and Most Read,,,
10,Miguel de Cervantes Saavedra,Don Quixote,1605,FALSE,69,187407,187k,7041,7k,Most Shelved,,Top College Text,Top College Author


In [29]:
# (rows, columns) of the ratings/reviews data.
goodreads_df.shape

(145, 13)

If you don't tell pandas which column(s) to merge on, it defaults to joining on every column name the two dataframes share (here, `title` and `author`).

In [34]:
# Merge without `on=`: pandas joins on the column names both frames share.
# Judging by the previews above, those are `title` and `author`.
merged_df = demographic_df.merge(goodreads_df)

In [35]:
# Rebind `source` to the merged data so later plots can use columns from both CSVs.
source = ColumnDataSource(merged_df)

In [39]:
# List the merged frame's columns to confirm both sets of fields are present.
merged_df.columns

Index(['title', 'author', 'original_language', 'gender', 'race/ethnicity',
       'publication_date', 'author_nationality', 'AmazonClassics_edition',
       'AmazonClassics_url', 'AmazonClassics_date', 'yearFirstPublished',
       'mostReadRank', 'mostPopularRank', 'numRatings', 'numRatingsAbbrev',
       'numReviews', 'numReviewsAbbrev', 'whichListCategory',
       'AP_English_Recommended', 'Open_Syllabus_200_Text',
       'Open_Syllabus_200_Author'],
      dtype='object')

In [45]:
# Distinct author nationalities; used below as the categories for color mapping.
nationalities = merged_df['author_nationality'].unique()

In [47]:
# Number of distinct nationalities, i.e. how many colors the palette needs.
len(nationalities)

23

In [55]:
import random

In [64]:
# Build one color per nationality for the nationality-colored plot.
# - random.sample draws without replacement, so no two nationalities share a
#   color (random.choices could repeat colors).
# - k follows len(nationalities) instead of a hard-coded 23, so the palette
#   stays in sync with the data. sample() raises ValueError if there are more
#   nationalities than the 256 colors available.
# - A dedicated, seeded Random instance keeps the colors the same on every
#   "Restart & Run All" without touching the global random state.
PALETTE_SEED = 42
color_palette = random.Random(PALETTE_SEED).sample(Turbo256, k=len(nationalities))

In [65]:
from bokeh.transform import factor_cmap
from bokeh.palettes import Turbo256

# Hover tooltip rows: (label shown, "@column" looked up in the data source).
TOOLTIPS = [
    ("Title", "@title"),
    ("Author", "@author"),
    ("Publication Date", "@publication_date"),
    ("AmazonClassics Edition", "@AmazonClassics_edition"),
    ("Nationality", "@author_nationality"),
]

bokeh_plot = figure(
    title="The Goodreads 'Classics'",
    x_axis_label='Publication Date',
    y_axis_label='Number of Goodreads Ratings',
    x_range=[1500, 2050],  # initial view only; earlier titles start off-screen
    tooltips=TOOLTIPS,
)

# Add color based on the "author_nationality" column: each nationality in
# `nationalities` is paired with a color from `color_palette`.
# list(...) hands Bokeh a plain sequence of factors rather than a numpy array.
color_map = factor_cmap(
    field_name='author_nationality',
    palette=color_palette,
    factors=list(nationalities),
)

# scatter() draws circle markers by default. It replaces circle(size=...),
# which recent Bokeh releases deprecate in favour of scatter(size=...).
bokeh_plot.scatter(
    x='publication_date',
    y='numRatings',
    size=30,
    source=source,
    color=color_map,
)

# Format big y-axis numbers with abbreviations
bokeh_plot.yaxis.formatter = NumeralTickFormatter(format='0.0a')

# Uncomment to also direct the plot to a standalone HTML file.
#output_file('Goodreads-Classics.html')

# Render the plot in the notebook output.
show(bokeh_plot)