In [95]:
from bokeh.io import show, output_notebook
output_notebook()

In [92]:
import pandas as pd
from bokeh.io import show
from bokeh.models import FactorRange, ColumnDataSource, FactorRange, BoxAnnotation, Whisker
from bokeh.plotting import figure
from bokeh.transform import dodge, factor_cmap
from bokeh.palettes import Spectral6
from bokeh.layouts import column
import numpy as np

def calculate_box_data(df, by_group):

    # Group by by_group and calculate Q1, Q2 and Q3
    grouped = df.groupby(by_group)
    q1 = grouped.quantile(q=0.25)
    q2 = grouped.quantile(q=0.5)
    q3 = grouped.quantile(q=0.75)
    iqr = q3 - q1
    upper_bound = q3 + 1.5*iqr
    lower_bound = q1 - 1.5*iqr

    # Concatenate data into a single DataFrame
    df = pd.concat([q1, q2, q3, upper_bound, lower_bound], axis=1)
    df.columns = ['q1', 'q2', 'q3', 'upper', 'lower']
    return df.reset_index()

# Load data
df = pd.read_csv("data/movies.csv")

# Drop NAs
df = df.dropna(subset=["Major Genre", "MPAA Rating", "Production Budget"])

# Calculate box data
box_data = calculate_box_data(df, by_group=["Major Genre", "MPAA Rating"])

# List of all factors for x-axis
factors = list(df['Major Genre'].unique())

# Use ColumnDataSource to pass in data for plotting
source = ColumnDataSource(box_data)

# Create a figure with FactorRange as x_range
p = figure(x_range=FactorRange(factors=factors), tools="")

# Create boxes for each group
for i, rating in enumerate(df["MPAA Rating"].unique()):
    boxes = BoxAnnotation(top='q3', bottom='q1', left=dodge('Major Genre', -0.2+i*0.2, range=p.x_range), right=dodge('Major Genre', 0+i*0.2, range=p.x_range), fill_color=Spectral6[i], fill_alpha=0.5)
    p.add_layout(boxes)
    p.rect(x=dodge('Major Genre', -0.1+i*0.2, range=p.x_range), y='q2', height='q2', width=0.2, angle=0, color='black', source=source)
    upper = Whisker(base=dodge('Major Genre', -0.1+i*0.2, range=p.x_range), upper='upper', lower='q3', line_color='black', source=source)
    lower = Whisker(base=dodge('Major Genre', -0.1+i*0.2, range=p.x_range), upper='q1', lower='lower', line_color='black', source=source)
    p.add_layout(upper)
    p.add_layout(lower)

p.y_range.start = 0

show(p)

  q1 = grouped.quantile(q=0.25)
  q2 = grouped.quantile(q=0.5)
  q3 = grouped.quantile(q=0.75)


ValueError: Length mismatch: Expected axis has 40 elements, new values have 5 elements

In [87]:
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.transform import linear_cmap
from bokeh.palettes import Viridis256

df = pd.read_csv("data/movies.csv", parse_dates=["Release Date"])
df["Release Date"] = df["Release Date"].dt.year
df2 = df.groupby("Release Date")["Worldwide Gross"].sum().reset_index()

df2['y'] = 1
source = ColumnDataSource(df2)

p = figure(title='Yearly Worldwide Gross', x_axis_label='Release Date', y_axis_label='y')

p.vbar(x='Release Date', top='y', 
       width=0.5, 
       color=linear_cmap('Worldwide Gross', Viridis256, df2['Worldwide Gross'].min(), df2['Worldwide Gross'].max()), 
       source=source)

show(p)

In [11]:
import pandas as pd
from bokeh.plotting import figure

df = pd.read_csv("data/movies.csv")

df["Major Genre"] = df["Major Genre"].fillna("Unknown")

grouped = df.groupby("Major Genre")["Production Budget"]
categories = list(grouped.groups.keys())

q1 = grouped.quantile(q=0.25)
q2 = grouped.quantile(q=0.5)
q3 = grouped.quantile(q=0.75)
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

p = figure(x_range=df["Major Genre"].unique())
p.segment(categories, upper, categories, q3, line_color="black")
p.segment(categories, lower, categories, q1, line_color="black")
p.vbar(x=categories, width=0.7, top=q3, bottom=q2, line_color="black")
p.vbar(x=categories, width=0.7, top=q2, bottom=q1, line_color="black")


output_notebook()
show(p)


In [93]:
import pandas as pd
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.layouts import gridplot
from bokeh.palettes import Category10

df = pd.read_csv("data/movies.csv")

group_column = "MPAA Rating"
facet_column = "Major Genre"
value_column = "Production Budget"

df[group_column] = df[group_column].fillna("Unknown")
df[facet_column] = df[facet_column].fillna("Unknown")
y_min = df[value_column].min()
y_max = df[value_column].max()

group_values = df[group_column].unique()
facet_values = df[facet_column].unique()
color_mapper = factor_cmap(group_column, palette=Category10[8], 
    factors=group_values)

plots = []

for value in facet_values:
    df_facet = df[df[facet_column] == value]
    grouped = df_facet.groupby(group_column)[value_column]
    categories = list(grouped.groups.keys())

    q1 = grouped.quantile(q=0.25)
    q2 = grouped.quantile(q=0.5)
    q3 = grouped.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    source = ColumnDataSource(
        pd.DataFrame(
            dict(cat=categories, q1=q1, q2=q2, q3=q3, upper=upper, lower=lower)
        )
    )

    p = figure(
        background_fill_color="#efefef", x_range=group_values, y_range=[y_min, y_max]
    )
    p.title.text = value

    p.segment(categories, upper, categories, q3)
    p.segment(categories, lower, categories, q1)

    p.vbar(
        x="cat",
        width=0.7,
        top="q3",
        bottom="q2",
        source=source,
        fill_color=color_mapper,
        line_color="black",
    )
    p.vbar(
        x="cat",
        width=0.7,
        top="q2",
        bottom="q1",
        source=source,
        fill_color=color_mapper,
        line_color="black",
    )

    plots.append(p)

# Arrange the plots in a grid and display them
grid = gridplot(plots, ncols=5, width=200, height=200)
output_notebook()
show(grid)


In [94]:
import pandas as pd
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.layouts import gridplot
from bokeh.palettes import Category10

df = pd.read_csv("data/movies.csv")

group_column = 'MPAA Rating'
facet_column = 'Major Genre'
value_column = 'Production Budget'

df[group_column] = df[group_column].fillna('Unknown')
df[facet_column] = df[facet_column].fillna('Unknown')
y_min = df[value_column].min()
y_max = df[value_column].max()

group_values = df[group_column].unique()
facet_values = df[facet_column].unique()
color_mapper = factor_cmap(group_column, palette=Category10[8], factors=group_values)

plots = []

for value in facet_values:    
    df_facet = df[df[facet_column] == value]
    grouped = df_facet.groupby([group_column, facet_column])[value_column]
    categories = list(grouped.groups.keys())
    
    q1 = grouped.quantile(q=0.25)
    q2 = grouped.quantile(q=0.5)
    q3 = grouped.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5*iqr
    lower = q1 - 1.5*iqr

    # Create a new DataFrame to store the calculated values for each category
    source = ColumnDataSource(pd.DataFrame({
        'x_label': [f"{cat[0]}_{cat[1]}" for cat in grouped.groups.keys()],
        'group': [cat[0] for cat in grouped.groups.keys()],
        'color_group': [cat[1] for cat in grouped.groups.keys()],
        'q1':q1, 'q2':q2, 'q3':q3, 'upper':upper, 'lower' : lower
    }))

    p = figure(background_fill_color="#efefef", x_range=group_values, y_range=[y_min, y_max])
    p.title.text = value

    p.segment(categories, upper, categories, q3)
    p.segment(categories, lower, categories, q1)

    p.vbar(x='cat', width=0.7, top='q3', bottom='q2', source=source, fill_color=color_mapper, line_color="black")
    p.vbar(x='cat', width=0.7, top='q2', bottom='q1', source=source, fill_color=color_mapper, line_color="black")

    plots.append(p)

# Arrange the plots in a grid and display them
grid = gridplot(plots, ncols=5, width=200, height=200)
output_notebook()
show(grid)


In [17]:
import pandas as pd
from bokeh.io import show, output_notebook
from bokeh.models import ColumnDataSource, FactorRange
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.layouts import gridplot
from bokeh.palettes import Category10


df = pd.read_csv("data/movies.csv")

group_column = "MPAA Rating"
facet_column = "Major Genre"
value_column = "Production Budget"

df[group_column] = df[group_column].fillna("Unknown")
df[facet_column] = df[facet_column].fillna("Unknown")
y_min = df[value_column].min()
y_max = df[value_column].max()

group_values = df[group_column].unique()
facet_values = df[facet_column].unique()
color_mapper = factor_cmap(group_column, palette=Category10[10], factors=group_values)

grouped = df.groupby(group_column)[value_column]

q1 = grouped.quantile(q=0.25)
q2 = grouped.quantile(q=0.5)
q3 = grouped.quantile(q=0.75)
iqr = q3 - q1
upper = q3 + 1.5 * iqr
lower = q1 - 1.5 * iqr

source = ColumnDataSource(
    pd.DataFrame(dict(cat=group_values, q1=q1, q2=q2, q3=q3, upper=upper, lower=lower))
)

p = figure(x_range=group_values, y_range=[y_min, y_max])
p.segment(group_values, upper, group_values, q3, line_color="black")
p.segment(group_values, lower, group_values, q1, line_color="black")

p.vbar(
    x="cat",
    width=0.7,
    top="q3",
    bottom="q2",
    source=source,
    fill_color=color_mapper,
    line_color="black",
)
p.vbar(
    x="cat",
    width=0.7,
    top="q2",
    bottom="q1",
    source=source,
    fill_color=color_mapper,
    line_color="black",
)
output_notebook()
show(p)


In [54]:
import pandas as pd
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
from bokeh.io import show
from bokeh.palettes import viridis
import numpy as np

df = pd.read_csv("data/movies.csv")
df['MPAA Rating'] = df['MPAA Rating'].fillna('NaN')

categories = df['MPAA Rating'].unique()

# Define common bin edges
bin_num = 50
bin_edges = np.linspace(df['Production Budget'].min(), df['Production Budget'].max(), bin_num)

# Initialize a new DataFrame for the restructured data
df2 = pd.DataFrame({'bins': bin_edges[:-1]})
for i, category in enumerate(categories):
    hist, _ = np.histogram(df[df['MPAA Rating'] == category]['Production Budget'].dropna(), bins = bin_edges)
    df2[category] = hist

# Create a ColumnDataSource from df2
source = ColumnDataSource(df2)

# Define the palette
palette = viridis(len(categories))

p = figure(height = 600, width = 600, 
           title ='Histogram of Production Budget',
           x_axis_label = 'Production Budget',
           y_axis_label = 'Count')

# Draw the stacked bars
p.vbar_stack(stackers=categories.tolist(), x='bins', source=source,
             width=np.diff(bin_edges)[0], color=palette, alpha=0.7, legend_label=categories.tolist())


show(p)


In [63]:
import pandas as pd
from bokeh.io import show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource

df = pd.read_csv("data/movies.csv")

hist, edges = np.histogram(df['Production Budget'].dropna())

p = figure()
p.line(edges[:-1], hist)

show(p)

In [103]:
import pandas as pd
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.transform import dodge
from bokeh.palettes import Category10

df = pd.read_csv("data/movies.csv")


df["MPAA Rating"] = df["MPAA Rating"].fillna("Unknown")
df["Major Genre"] = df["Major Genre"].fillna("Unknown")
y_min = df["Production Budget"].min()
y_max = df["Production Budget"].max()

group_values = df["MPAA Rating"].unique()
width = 0.8 / len(group_values)
color_mapper = factor_cmap("MPAA Rating", palette=Category10[8], factors=group_values)

plots = []

p = figure(x_range=df["Major Genre"].unique(), y_range=[y_min, y_max])
for i, value in enumerate(df["MPAA Rating"].unique()):
    df_facet = df[df["MPAA Rating"] == value]
    grouped = df_facet.groupby("Major Genre")["Production Budget"]
    categories = list(grouped.groups.keys())

    q1 = grouped.quantile(q=0.25)
    q2 = grouped.quantile(q=0.5)
    q3 = grouped.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    source = ColumnDataSource(
        pd.DataFrame(
            dict(cat=categories, q1=q1, q2=q2, q3=q3, upper=upper, lower=lower)
        )
    )
    dodger = dodge( "Major Genre", width * i - 0.5, range=p.x_range )


    p.segment(dodger, "upper", dodger, "q3", source=source,
        line_color="black")
    p.segment(dodger, "lower", dodger, "q1", source=source,
        line_color="black")

    p.vbar(
        x=dodger,
        width=width,
        top="q3",
        bottom="q2",
        source=source,
        fill_color=Category10[8][i],
        line_color="black",
    )
    p.vbar(
        x=dodger,
        width=width,
        top="q2",
        bottom="q1",
        source=source,
        fill_color=Category10[8][i],
        line_color="black",
    )

show(p)

In [132]:
import pandas as pd
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.transform import dodge
from bokeh.palettes import Category10

df = pd.read_csv("data/movies.csv")


df["MPAA Rating"] = df["MPAA Rating"].fillna("Unknown")
df["Major Genre"] = df["Major Genre"].fillna("Unknown")
y_min = df["Production Budget"].min()
y_max = df["Production Budget"].max()

group_values = df["MPAA Rating"].unique()
width = 0.8 / len(group_values)
color_mapper = factor_cmap("MPAA Rating", palette=Category10[8], factors=group_values)

plots = []

p = figure(x_range=df["Major Genre"].unique(), y_range=[y_min, y_max])
for i, (label, df_facet) in enumerate(df.groupby("MPAA Rating")):
    grouped = df_facet.groupby("Major Genre")["Production Budget"]
    categories = list(grouped.groups.keys())

    q1 = grouped.quantile(q=0.25)
    q2 = grouped.quantile(q=0.5)
    q3 = grouped.quantile(q=0.75)
    iqr = q3 - q1
    upper = q3 + 1.5 * iqr
    lower = q1 - 1.5 * iqr

    source = pd.DataFrame(
        dict(cat=categories, q1=q1, q2=q2, q3=q3, upper=upper, lower=lower)
    )
    dodger = dodge("Major Genre", width * i - 0.5, range=p.x_range)

    p.segment(dodger, "upper", dodger, "q3", source=source, line_color="black")
    p.segment(dodger, "lower", dodger, "q1", source=source, line_color="black")

    p.vbar(
        x=dodger,
        width=width,
        top="q3",
        bottom="q2",
        source=source,
        fill_color=Category10[8][i],
        line_color="black",
    )
    p.vbar(
        x=dodger,
        width=width,
        top="q2",
        bottom="q1",
        source=source,
        fill_color=Category10[8][i],
        line_color="black",
    )
show(p)

In [140]:
from bokeh.palettes import Category10_10

In [141]:
Category10_10[8]

'#bcbd22'

: 