In [47]:
# Run as a Bokeh server app from a shell:  bokeh serve --show TextAnalysis.ipynb
import pandas as pd
import numpy as np
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
from bokeh.io import curdoc,curstate
from bokeh.layouts import widgetbox
import time

from bokeh.layouts import layout,column,row
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource,Slider
from bokeh.palettes import Spectral6

from bokeh.models.widgets import Panel, Tabs
from bokeh.models.callbacks import CustomJS

In [30]:
# Notebook-mode helpers: output_notebook() makes Bokeh's show() render inline
# in the cell output, and push_notebook() (called by the interactive update()
# function later in this notebook) pushes data changes to a displayed plot.
from bokeh.io import output_notebook, push_notebook
output_notebook()

In [4]:
# Read the complete review dump, then keep just the two free-text columns
# that the rest of the notebook analyses.
reviews = pd.read_csv('./Reviews.csv')
text_df = reviews.loc[:, ['Summary', 'Text']]
# Trailing expression: the notebook shows the first rows as the cell output.
text_df.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


In [26]:
# Materialise both text columns as plain Python lists; the cells below slice
# these lists to limit how many documents they process.
summary, text = (text_df[column].tolist() for column in ('Summary', 'Text'))
print("Num records:",len(summary))

def tokenize(sentence):
    """Split ``sentence`` into whitespace-delimited tokens.

    Parameters
    ----------
    sentence : object
        Normally a ``str``. Any other non-missing value is converted with
        ``str()`` before splitting. Missing values -- ``None`` or a float
        NaN (what pandas yields for an empty CSV cell) -- are treated as
        empty text rather than as the literal words "None" / "nan".

    Returns
    -------
    list of str
        The tokens in their original order; ``[]`` for empty or missing input.
    """
    # NaN is the only float value that is not equal to itself, so this
    # detects it without needing an extra import.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return []
    return str(sentence).split()

# Only the first 50,000 reviews are tokenised here (presumably to keep the
# cell quick to run -- the full dataset is much larger).
summary_tokens = [tokenize(entry) for entry in summary[:50000]]
text_tokens = [tokenize(entry) for entry in text[:50000]]
print("Num records tokens:",len(summary_tokens),len(text_tokens))

# Length of a document = its number of whitespace-separated tokens.
summary_tokens_len = [len(tokens) for tokens in summary_tokens]
text_tokens_len = [len(tokens) for tokens in text_tokens]

Num records: 568454
Num records tokens: 50000 50000


In [27]:

def _length_histogram(lengths, title, bins=50):
    """Build a density histogram figure of document lengths.

    Parameters
    ----------
    lengths : sequence of int
        Number of tokens in each document.
    title : str
        Title shown above the figure.
    bins : int, optional
        Number of equal-width histogram bins (default 50).

    Returns
    -------
    The Bokeh figure containing one quad glyph per bin.
    """
    fig = figure(title=title, tools="save,hover,pan", width=400, height=400,
                 background_fill_color="#E8DDCB")

    # density=True normalises the bars so that the histogram integrates to 1,
    # hence the y axis is a density, not a raw count.
    hist, edges = np.histogram(lengths, density=True, bins=bins)
    fig.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
             fill_color="#036564", line_color="#033649")

    # No legend styling here: the quads carry no legend label, so the figure
    # has no legend for such settings to apply to.
    fig.xaxis.axis_label = 'Tokens per document'
    fig.yaxis.axis_label = 'Density'
    return fig


# Side-by-side length distributions for the review bodies and their summaries.
plot = _length_histogram(text_tokens_len, "TextAnalysis")
plot2 = _length_histogram(summary_tokens_len, "SummaryAnalysis")

show(row(plot,plot2))


In [61]:
# Slider widget that controls how many records are used for the length counts.
# (A HoloViews DynamicMap would probably be a cleaner way to do this.)
def get_text_counts(n_docs=10000):
    """Tally document lengths over the first ``n_docs`` reviews.

    Reads the module-level ``summary`` and ``text`` lists, tokenises the first
    ``n_docs`` entries of each with ``tokenize`` and counts how many documents
    have each token length.

    Returns a dict with four entries: ``text_idx`` / ``summary_idx`` are the
    distinct lengths (``Counter`` key views) and ``text_counts`` /
    ``summary_counts`` the matching frequencies (``Counter`` value views),
    each pair in first-seen order.
    """
    summary_lengths = Counter(len(tokenize(doc)) for doc in summary[:n_docs])
    text_lengths = Counter(len(tokenize(doc)) for doc in text[:n_docs])

    return {
        'text_idx': text_lengths.keys(),
        'text_counts': text_lengths.values(),
        'summary_idx': summary_lengths.keys(),
        'summary_counts': summary_lengths.values(),
    }

# Placeholders for the widgets read by the update callbacks; the real Slider
# and Tabs objects are assigned to these names further down in this cell.
slider=None
tabs=None

def update_plot(attr,old,new):
    """Slider ``on_change`` handler: recompute the counts and refresh ``source``.

    ``attr``, ``old`` and ``new`` are the arguments Bokeh passes to an
    ``on_change`` callback; none of them is used. The number of documents is
    read from ``slider.value`` and the selected tab from ``tabs.active``.

    The generic ``idx``/``counts`` columns receive the text series when tab 0
    is active and the summary series otherwise; both full series are also
    stored under their own ``text_*`` / ``summary_*`` columns.
    """
    active_tab = tabs.active
    print("Active= ",active_tab)

    tallies = get_text_counts(n_docs=slider.value)
    shown = 'text' if active_tab == 0 else 'summary'

    # get_text_counts returns dict views; each column is materialised as its
    # own list before being handed to the data source.
    source.data = dict(
        idx=list(tallies[shown + '_idx']),
        counts=list(tallies[shown + '_counts']),
        summary_idx=list(tallies['summary_idx']),
        summary_counts=list(tallies['summary_counts']),
        text_idx=list(tallies['text_idx']),
        text_counts=list(tallies['text_counts']),
    )
        
        
        
# Shared data source for both tab plots. ``idx``/``counts`` are the columns
# the bar glyphs draw; ``text_*`` and ``summary_*`` hold both candidate series
# so the tab callback can switch between them on the client side.
# The column names must match the ones the update functions write and the
# JavaScript callback reads, i.e. ``text_idx``/``text_counts``.
source=ColumnDataSource(data=dict(text_idx=[],text_counts=[],\
                                   summary_idx=[],summary_counts=[],\
                                   idx=[],counts=[]))
    
# Client-side handler attached to the Tabs widget: when the active tab
# changes, copy the series belonging to that tab into the ``idx``/``counts``
# columns, which are the ones the bar glyphs are bound to.
callback = CustomJS(args=dict(source=source), code="""
        var data = source.data;
        // cb_obj is the Tabs widget; the selected tab index is `active`.
        var active = cb_obj.active;
        var prefix = (active == 0) ? 'text' : 'summary';

        // Assign into `data` itself so the change reaches the glyphs; fall
        // back to empty arrays if the columns are not populated yet.
        data['idx'] = data[prefix + '_idx'] || [];
        data['counts'] = data[prefix + '_counts'] || [];
        source.change.emit();
    """)
        


def _counts_tab(plot_title, tab_title, outline):
    """Create one bar-chart figure bound to ``source`` and wrap it in a Panel.

    The bars always read the generic ``idx``/``counts`` columns of the shared
    ``source``. Returns ``(figure, panel)``.
    """
    fig = figure(plot_height=350, toolbar_location='above', title=plot_title)
    fig.vbar(x='idx', top='counts', width=0.9, source=source, legend="Counts",
             line_color=outline)

    fig.xgrid.grid_line_color = None
    fig.legend.orientation = "horizontal"
    fig.legend.location = "top_left"
    return fig, Panel(child=fig, title=tab_title)


# Slider choosing how many documents feed the counts; every value change
# re-runs update_plot on the Python side.
slider = Slider(start=1000, end=100000, value=1000, step=1000,
                title="Number of docs", orientation='horizontal')
slider.on_change('value', update_plot)

# One tab per series; the two figures differ only in titles and bar outline.
p1, tab1 = _counts_tab("Text Counts", "Text", 'red')
p2, tab2 = _counts_tab("Summary Counts", "Summary", 'blue')

# The CustomJS callback is attached to the tab widget.
tabs = Tabs(tabs=[tab1, tab2], callback=callback)

# Final layout: tabs on the left, slider on the right. It is registered with
# the current document and also bound to ``plot`` for the show() call below.
plot = row(tabs, widgetbox(slider))
curdoc().add_root(plot)




In [62]:
from ipywidgets import interact

def update(s):
    """``interact`` handler: rebuild the counts for ``s`` documents.

    Mirrors the slider callback used in server mode, but takes the document
    count as the argument ``s`` and finishes with ``push_notebook()`` so the
    plot already displayed in the notebook is refreshed.
    """
    active_tab = tabs.active
    print("Active= ",active_tab)

    tallies = get_text_counts(n_docs=s)

    # Tab 0 shows the text series; any other tab shows the summary series.
    if active_tab == 0:
        idx_key, counts_key = 'text_idx', 'text_counts'
    else:
        idx_key, counts_key = 'summary_idx', 'summary_counts'

    refreshed = {'idx': list(tallies[idx_key]), 'counts': list(tallies[counts_key])}
    for column in ('summary_idx', 'summary_counts', 'text_idx', 'text_counts'):
        refreshed[column] = list(tallies[column])
    source.data = refreshed

    push_notebook()

# Display the tabs + slider layout inline, requesting a notebook handle so
# that push_notebook() inside update() can refresh the displayed plot.
show(plot,notebook_handle=True)    
# ipywidgets slider for s (1000 to 100000, step 1000) that calls update(s).
# As the last expression, interact's return value becomes the cell output.
interact(update,s=(1000,100000,1000))

<function __main__.update>