In [25]:
import pandas as pd
import numpy as np
from bokeh.plotting import figure
from bokeh.io import show, output_notebook,output_file
from bokeh.models import ColumnDataSource,FactorRange,HoverTool
from bokeh.palettes import Spectral6
from bokeh.transform import factor_cmap

In [11]:
# Named-entity table; later cells read its "NE", "Type" and "Count" columns.
df = pd.read_csv("results_final.csv")

In [12]:
# Bar colour (hex) for every tag that can appear in the "Type" column.
# "O" carries the same hex value as "LOCATION".
colors = dict(zip(
    ("ORGANIZATION", "LOCATION", "PERSON", "O"),
    ("#e2af91", "#83ef00", "#ffb800", "#83ef00"),
))

# One colour per row of df, in row order; an unknown tag raises KeyError.
ps = list(map(colors.__getitem__, df["Type"].values))


## First 100 Named Entities

In [38]:
# Save the chart as a standalone HTML page (inline notebook output is enabled further down).
output_file("100_bars.html", title = "Top100")

limit = 100

# Slice the frame once so names, counts, colours and legend labels all cover the same rows.
# NOTE(review): the slice starts at row 1, so row 0 is left out and 99 bars are drawn
# even though the title says "Top 100" -- confirm that skipping row 0 is intentional.
top_rows = df.iloc[1:limit]
top_types = top_rows["Type"].values

ners = top_rows["NE"].values
counts = top_rows["Count"].values
color = [colors[tag] for tag in top_types]
# Rows tagged "O" are listed under "LOCATION" in the legend.
label = ["LOCATION" if tag == "O" else tag for tag in top_types]

source = ColumnDataSource(data=dict(ners=ners, counts=counts, color=color, label=label))

# Hover fields: row position in the source, the entity text and its count.
TOOLTIPS = [
    ("index","$index"),
    ("Word","@ners"),
    ("Count","@counts")
]

# Categorical x axis with one factor per entity.
p = figure(x_range=ners, plot_height=450, plot_width = 750,title="Top 100 Named Entities",
           toolbar_location=None,tooltips = TOOLTIPS)

p.vbar(x="ners", top="counts", width=0.9, source=source, color="color", legend_group="label")

p.xgrid.grid_line_color = None
p.y_range.start = 0
# A 0pt font hides the x tick labels; the entity name is available through the hover tool.
p.xaxis.major_label_text_font_size = '0pt'
output_notebook()
show(p)

### Last 500

In [37]:
# Save the chart as a standalone HTML page (inline notebook output is enabled further down).
output_file("last_bars.html",title = "last500")

# Number of rows taken from the end of df.
val = 500

# Slice the frame once so names, counts, colours and legend labels all cover the same rows.
last_rows = df.iloc[-val:]
last_types = last_rows["Type"].values

ners = last_rows["NE"].values
counts = last_rows["Count"].values
# The loop variable is deliberately not called `val`, so the slice size above is never shadowed.
color = [colors[tag] for tag in last_types]
# Rows tagged "O" are listed under "LOCATION" in the legend.
label = ["LOCATION" if tag == "O" else tag for tag in last_types]

source = ColumnDataSource(data=dict(ners=ners, counts=counts, color=color, label=label))

# Hover fields: row position in the source, cursor coordinates and the entity text.
TOOLTIPS = [
    ("index","$index"),
    ("(x,y)", "($x,$y)"),
    ("desc","@ners")
]

# Categorical x axis with one factor per entity.
p = figure(x_range=ners, plot_height=450, plot_width = 750,title="Last 500 named entities",
           toolbar_location=None,tooltips = TOOLTIPS)

p.vbar(x="ners", top="counts", width=0.9, source=source, color="color", legend_group="label")

p.xgrid.grid_line_color = None
p.y_range.start = 0
# A 0pt font hides the x tick labels; the entity name is available through the hover tool.
p.xaxis.major_label_text_font_size = '0pt'
output_notebook()

show(p)

### Total Distribution

In [36]:
output_file("overall_bars.html", title = "Overall Distribution")

# Discard rows tagged "O", then count the non-null values of every column per tag.
df_o = df.loc[df["Type"] != "O"]
new_df = df_o.groupby("Type").count()

# The group keys act both as x-axis factors and as legend entries.
ners = new_df.index.values
legend = new_df.index.values
counts = new_df["Count"].values
color = list(map(colors.__getitem__, ners))

source = ColumnDataSource(data={
    "ners": ners,
    "counts": counts,
    "color": color,
    "legend": legend,
})

# Hover fields: row position, cursor coordinates, the tag and its count.
TOOLTIPS = [
    ("index", "$index"),
    ("(x,y)", "($x,$y)"),
    ("desc", "@ners"),
    ("count", "@counts"),
]

p = figure(
    x_range=ners,
    plot_height=250,
    plot_width=700,
    title="NER counts",
    toolbar_location=None,
    tooltips=TOOLTIPS,
)

p.vbar(
    x="ners",
    top="counts",
    width=0.6,
    source=source,
    color="color",
    legend_group="legend",
)

p.y_range.start = 0
p.xgrid.grid_line_color = None

output_notebook()

show(p)

### Classification Outputs for Targeted Named Entities

In [46]:
# Two result dumps. The merge cell joins them on "word" and takes an "accuracy"
# column from each; results_df also supplies the "mat" column used for plotting.
results_df = pd.read_csv("1_myDump_final.csv")
cls_results = pd.read_csv("new_myDump_final.csv")

In [47]:
# Accuracy assigned to words that have no row in cls_results.
# NOTE(review): the notebook does not show where this value comes from -- confirm its origin.
CLS_ACCURACY_FALLBACK = 0.8960546105263157

# Left join keeps every row of results_df. Both inputs have an "accuracy" column, so the
# result carries "accuracy_x" (from results_df) and "accuracy_y" (from cls_results).
check_df = pd.merge(results_df, cls_results[['word', 'accuracy']], on='word', how='left')

# Assign the filled column back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form is deprecated and has no effect under copy-on-write.
check_df["accuracy_y"] = check_df["accuracy_y"].fillna(CLS_ACCURACY_FALLBACK)

In [49]:
output_file("line_comp.html", title = "Classification output")

# One row per row of check_df. "A" and "B" are the two accuracy series; "dates" holds
# the row index of check_df (not calendar dates) and is the x coordinate of both lines.
df_plot = pd.DataFrame(
    {
        'A': check_df.accuracy_x.values,
        'B': check_df.accuracy_y.values,
        'dates': check_df.index.values,
        'word': check_df.word.values,
        'mat': check_df.mat.values,
    },
    index=check_df.index.values,
)
source = ColumnDataSource(df_plot)

p = figure(plot_width=800, plot_height=400,title="Accuracy of Named Entities")
# Each line has a single fixed legend entry, so legend_label is used directly rather than
# storing a constant text column in the data source for legend_group.
p.line('dates', 'A', source=source, color='#79D151', line_width=3, legend_label="layer11")
p.line('dates', 'B', source=source, color='#404387', line_width=3, legend_label="CLS")
# Hover shows the word together with both accuracy values for that row.
p.add_tools(HoverTool(tooltips=[("Word", "@word"),("Layer11", "@A"), ("CLS11", "@B")]))

show(p)

### Tagged output average

In [34]:
output_file("comparison.html",title = "average pages")

# Attach an entity tag to every word by matching "word" against the "NE" column of df.
check_df_new = check_df.merge(df[['NE','Type']], left_on='word', right_on="NE", how='left')

# Leave out rows tagged "O", then average both accuracy columns per tag.
check_df_new_1 = check_df_new.loc[check_df_new["Type"].ne("O")]
new_check_df_new = check_df_new_1.groupby("Type")[["accuracy_x", "accuracy_y"]].mean()

ners = new_check_df_new.index.values
types = ['layer_11', 'cls']

data = dict(
    ners=ners,
    layer_11=new_check_df_new["accuracy_x"].values,
    cls=new_check_df_new["accuracy_y"].values,
)

# Nested categorical factors, one (tag, series) pair per bar:
# [(tag_1, "layer_11"), (tag_1, "cls"), (tag_2, "layer_11"), ...]
x = [(ner, series) for ner in ners for series in types]

# Interleave the two series so the values line up with the factor order in x.
counts = tuple(
    value
    for pair in zip(data['layer_11'], data['cls'])
    for value in pair
)

source = ColumnDataSource(data=dict(x=x, counts=counts))

p = figure(
    x_range=FactorRange(*x),
    plot_height=250,
    title="Average Accuracies",
    toolbar_location=None,
    tools="",
)

# start=1, end=2 makes the colour mapper look only at the second element of each
# factor, i.e. the series name, so both series keep one colour across all tags.
series_fill = factor_cmap('x', palette=['#3288bd', '#99d594'], factors=types, start=1, end=2)
p.vbar(x='x', top='counts', width=0.9, source=source, fill_color=series_fill)

p.xgrid.grid_line_color = None
p.xaxis.major_label_orientation = 1
p.x_range.range_padding = 0.1
p.y_range.start = 0

show(p)

In [33]:
# Display the version of the installed Bokeh package.
import bokeh
bokeh.__version__

'1.4.0'