# Bokeh Visualizations Topics
This notebook is for exploratory visual analysis of the topic models using Bokeh.

In [1]:
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource, DataTable, TableColumn, Legend
from bokeh.models.tools import HoverTool
from bokeh.transform import factor_cmap, factor_mark, linear_cmap, jitter
from bokeh.io import output_notebook
import colorcet as cc
import numpy as np

from bokeh.palettes import viridis

import csv
from pathlib import Path

## Set Up Spaces and Load Data
This cell sets the working folder and names the topic file and the key file. It also configures how Bokeh displays its output; in this case, plots are rendered inline in this notebook. This run of the topic modeling created 50 topics from the *techne* keyword search.

In [2]:
# Folder (under the user's home directory) that holds the topic-model exports.
working_folder = Path.home().joinpath(
    "syncthing",
    "Dissertation",
    "dissertation_data",
    "dissertation_data_working_folder",
    "topic_modeling",
)

# Input files: per-document topic weights, and the topic key table.
topic_file = working_folder / 'techne_by_year.csv'
key_file = working_folder / 'techne_keys.csv'

# Alternative output target (standalone HTML instead of inline display):
#output_file("invention_by_year.html",title="Bokeh 50 Topics Plot by Year")

# Render Bokeh output inline in this notebook.
output_notebook()

topics = pd.read_csv(topic_file)
keys = pd.read_csv(key_file)

# Count the "filename" entries per year; the result has columns
# "year" and "count".
inv_mentions = (
    topics
    .groupby("year")["filename"]
    .count()
    .reset_index(name="count")
)
#keys_list = keys['topic'].tolist()

## Documents Contained per year
This code block sets up a bar chart showing, for each year, the number of documents that contain the keyword (in this case, *techne*).

In [3]:
# Wrap the per-year counts so the glyphs can refer to columns by name.
table_source = ColumnDataSource(inv_mentions)

# Ten viridis shades, mapped linearly onto the "year" field over 1990-1999.
colors = viridis(10)
color_map = linear_cmap(
    field_name='year',
    palette=colors,
    low=1990,
    high=1999,
)

table = figure(width=720, x_minor_ticks=2)
table.vbar(
    x='year',
    top='count',
    width=0.70,
    color=color_map,
    source=table_source,
)
#table.title.text = 'Article Containing "techne" per year'

# Axis labelling and tick density.
table.xaxis.axis_label = 'Year'
table.yaxis.axis_label = 'Article Count'
table.xaxis[0].ticker.desired_num_ticks = 10

# Tooltip showing the year and the document count of the hovered bar.
hover = HoverTool(tooltips=[
    ('Year', '@year'),
    ('Count', '@count'),
])
table.add_tools(hover)

show(table)

## Setup Data for Snapshot Visualization
This code block creates a list of all of the topics for *techne*. It then masks (replaces with NaN) every value in the selected columns that is less than 0.05.

- `column_list` is a variable that stores the topic key numbers (as strings) for the highlighted topics

In [4]:
# Names of the topic weight columns: the strings "0" through "49".
column_list = [str(topic_number) for topic_number in range(50)]

# Replace every topic weight below 0.05 with NaN so that weak matches are
# ignored by the later min/max/mean aggregations.
# DataFrame.mask builds a new frame instead of writing through the array
# returned by `.values`, which is read-only when pandas Copy-on-Write is
# active; it also upcasts integer columns instead of failing on NaN.
topic_weights = topics[column_list]
topics[column_list] = topic_weights.mask(topic_weights < 0.05)

## Calculate Minimum, Maximum, and Mean Weights
This next code block calculates the minimum, maximum, and mean weights for each topic per year for the dataset. It then saves these calculated weight values into an Excel spreadsheet for cross-referencing.

In [5]:
# Select the topic columns plus "year" without mutating `column_list`.
# If this cell fails part-way, `column_list` must not be left containing
# 'year', because the plotting cell calls int() on every entry.
columns_with_year = column_list + ['year']
topics_by_year = topics[columns_with_year].groupby(['year'])

# Per-year minimum, maximum and mean of every topic column.
topics_min = topics_by_year.min()
topics_max = topics_by_year.max()
topics_mean = topics_by_year.mean()

output_excel_file = working_folder / 'techne_filtered_topics.xlsx'

# The context manager saves and closes the workbook on exit.
# ExcelWriter.save() was removed in pandas 2.0, so it is not called here.
with pd.ExcelWriter(output_excel_file, engine='xlsxwriter') as writer:
    topics_mean.to_excel(writer, sheet_name="Mean")
    topics_min.to_excel(writer, sheet_name="Min")
    topics_max.to_excel(writer, sheet_name="Max")

## Create cross-reference datatable
This code block prepares the data for the cross-referencing table of keys, topics, and interpretations. It also prepares the data for visualization with Bokeh using the variable `data_table`.

In [6]:
# Collapse the rows into one row of mean values per year, with "year"
# restored as an ordinary column.
# numeric_only=True skips non-numeric columns (presumably "filename" holds
# strings - confirm); without it pandas >= 2.0 raises TypeError when asked
# to average such a column.
topics = topics.groupby(['year']).mean(numeric_only=True).reset_index()
source = ColumnDataSource(topics)

# Data source backing the cross-reference table.
keys_source = ColumnDataSource(keys)

# Columns shown in the table: field name in `keys` -> displayed header.
columns = [
    TableColumn(field="key", title="Key"),
    TableColumn(field="interpretation", title="Subjective Reading"),
    TableColumn(field="topic", title="Topics")
    ]
data_table = DataTable(source=keys_source, index_position=None, columns=columns, autosize_mode='fit_viewport')

## Visualizations
This next block uses Bokeh to create the interactive visualizations. It prepares the data and labels for the graph, sets up the hover tooltips, outputs HTML files, and displays the visualizations inline.

In [7]:
# WebGL-backed figure; one scatter + line pair is drawn per topic column.
p = figure(output_backend="webgl", width=800, height=1000)

color = cc.glasbey

for topic_key in column_list:
    topic_color = color[int(topic_key)]
    x_jittered = jitter('year', 0.5)
    # Options shared by both glyphs. `name` carries the topic column name,
    # which the hover tooltip reads back through $name.
    shared_style = dict(
        source=source,
        color=topic_color,
        name=topic_key,
        muted_color=topic_color,
        muted_alpha=0.2,
        legend_label=topic_key,
    )
    p.scatter(x=x_jittered, y=topic_key, size=10, **shared_style)
    p.line(x=x_jittered, y=topic_key, line_width=2, **shared_style)

# Title, axis labels and tick layout.
p.title = "Techne Topic Means per Year > .05"
p.xaxis.axis_label = "Year Published"
p.yaxis.axis_label = "Topic Mean"
p.xaxis[0].ticker.desired_num_ticks = 10
p.xaxis[0].ticker.num_minor_ticks = 0

# Clicking a legend entry mutes the corresponding glyphs.
p.legend.location = "right"
p.legend.click_policy = "mute"

# Tooltip: year, topic key, and that topic's value formatted to four decimals.
hover = HoverTool(tooltips=[
    ('Year', '@year'),
    ('Topic Key', '$name'),
    ('Topic Match', '@$name{0.0000}'),
])
p.add_tools(hover)

# Place the legend in the layout area to the right of the plot.
legend = p.legend[0]
p.add_layout(legend, 'right')

# Register the HTML target for the graph, then show it.
output_graph = working_folder / 'techne_visualization.html'
output_file(output_graph)
show(p)

# Register the HTML target for the cross-reference table, then show it.
output_table = working_folder / 'techne_datatable.html'
output_file(output_table)
show(data_table)