In [1]:
import pandas as pd
import numpy as np
from collections import OrderedDict

from bokeh.plotting import figure,show
from bokeh.layouts import row,column,layout
from ipywidgets import interact 
from bokeh.io import curdoc,curstate
from bokeh.layouts import widgetbox

import warnings
warnings.filterwarnings("ignore")

from bokeh.models.formatters import CategoricalTickFormatter,NumeralTickFormatter
from bokeh.models import (
    HoverTool,
    LinearColorMapper,
    PrintfTickFormatter,
    ColumnDataSource,
    Slider
)
import bokeh.palettes as pt
import time
from bokeh.models.transforms import LinearInterpolator
from  bokeh.models.mappers import CategoricalColorMapper

from bokeh.events import Tap,ButtonClick
from bokeh.models import Button
from bokeh.models.widgets import Div,Paragraph
from bokeh.client import push_session


In [2]:
# Send Bokeh output to the notebook. push_notebook is imported here for
# interact_update, which calls it after replacing the scatter plot's data.
from bokeh.io import output_notebook, push_notebook
output_notebook()

In [3]:
# Load the raw review table from the working directory and preview it.
REVIEWS_CSV='./Reviews.csv'
reviews=pd.read_csv(REVIEWS_CSV)
reviews.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
# Group the reviews by product so a single product's rows can be fetched on demand.
products = reviews.groupby('ProductId')
print(products.get_group('B001E4KFG0'))

# Number of reviews per product, kept both as a Series and as a plain dict
# (the dict is what the per-product helper functions look counts up in).
product_dist = reviews['ProductId'].value_counts()
prod_dist_dict = product_dist.to_dict()

print(product_dist.head())
print("Number of unique products: ", len(prod_dist_dict))

   Id   ProductId          UserId ProfileName  HelpfulnessNumerator  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW  delmartian                     1   

   HelpfulnessDenominator  Score        Time                Summary  \
0                       1      5  1303862400  Good Quality Dog Food   

                                                Text  
0  I have bought several of the Vitality canned d...  
B007JFMH8M    913
B002QWP89S    632
B002QWP8H0    632
B0026RQTGE    632
B002QWHJOU    632
Name: ProductId, dtype: int64
Number of unique products:  74258


In [5]:
# How many products have strictly more than 500 / 100 / 50 reviews.
print("Number of products with over 500 reviews: ", len(product_dist[product_dist > 500]))
print("Number of products with over 100 reviews: ", len(product_dist[product_dist > 100]))
print("Number of products with over 50 reviews: ", len(product_dist[product_dist > 50]))

def get_products_num_reviews(num_reviews):
    """Return the ids of all products with strictly more than ``num_reviews`` reviews.

    Reads the module-level ``product_dist`` Series (product id -> review count)
    and returns the matching ids as a list, in the Series' own order.
    """
    above_threshold = product_dist > num_reviews
    return product_dist.loc[above_threshold].index.tolist()

def get_num_reviews_product(product_id):
    """Return the review count stored for ``product_id`` in ``prod_dist_dict``.

    Raises KeyError when the product id is not a key of the dict.
    """
    review_count = prod_dist_dict[product_id]
    return review_count

def get_scores(product_id):
    """Summarise the review scores of one product.

    Uses the module-level ``prod_dist_dict`` for the review count and the
    ``products`` groupby for the product's rows.

    Returns a tuple ``(num_reviews, mean, median, min, max, histogram)`` where
    ``histogram`` maps each score value that occurs to how often it occurs.
    """
    review_count = prod_dist_dict[product_id]
    ratings = products.get_group(product_id)['Score']

    # Count per distinct score, ordered from the highest score down.
    histogram = ratings.value_counts().sort_index(ascending=False).to_dict()

    return (
        review_count,
        ratings.mean(),
        ratings.median(),
        ratings.min(),
        ratings.max(),
        histogram,
    )


Number of products with over 500 reviews:  29
Number of products with over 100 reviews:  862
Number of products with over 50 reviews:  1799


In [None]:
# One output column per star rating: Score1 .. Score5.
score_cols=["Score"+str(i) for i in range(1,6)]
summary_cols=['ProductId','NumReviews','Mean_Scores','Median_Scores',
              'Min_Scores','Max_Scores']+score_cols

start=time.time()

# Collect one plain dict per product and build the DataFrame once at the end.
# Concatenating inside the loop re-copies the growing frame on every iteration,
# which is quadratic in the number of products.
records=[]
for product_id in prod_dist_dict.keys():
    (num_reviews,mean_scores,median_scores,min_scores,max_scores,scores)=get_scores(product_id)

    record=dict(ProductId=product_id,NumReviews=num_reviews,Mean_Scores=mean_scores,
                Median_Scores=median_scores,Min_Scores=min_scores,
                Max_Scores=max_scores)
    # `scores` only has entries for ratings that actually occur, so ratings
    # nobody gave default to a count of 0.
    for star,col in enumerate(score_cols,start=1):
        record[col]=scores.get(star,0)
    records.append(record)

# Rows get the default 0..n-1 index, in the iteration order of prod_dist_dict.
pdict=pd.DataFrame(records,columns=summary_cols)

print("Time to build df:",time.time()-start)
print(pdict.head())

In [None]:
# Data sources for the two charts, created empty and filled in by the callbacks.
# source1 feeds the score-distribution bar chart (one bar per score column).
source1 = ColumnDataSource(data={'x': [], 'y': []})
# source2 feeds the scatter plot: position, product label, and the values
# that drive each marker's colour, size and alpha.
source2 = ColumnDataSource(
    data={name: [] for name in ('x', 'y', 'labels', 'colors', 'sizes', 'alphas')}
)

# The bar chart figure; stays None until get_barchart builds it.
bplot = None

def _lookup_score_counts(x):
    """Resolve a scatter-plot x position to the product shown there.

    ``x`` is the x coordinate of a point in ``source2`` (the tap callback
    passes the raw, fractional data coordinate). Returns
    ``(product_id, score_column_names, score_counts)`` where ``score_counts``
    is the product's row of Score1..Score5 values from ``reviews_df``.

    Raises KeyError when ``x`` does not land on a row of ``source2`` or the
    product has no row in ``reviews_df``.
    """
    display_df=source2.to_df()
    # Round to the nearest point instead of truncating: a click slightly to
    # the left of point k has x just under k and must still select k.
    # NOTE(review): assumes to_df() yields the default 0..n-1 index, so the
    # label lookup below matches the row position - confirm.
    position=int(round(float(x)))
    product_id=display_df.loc[position,'labels']
    print("product_id:",product_id)

    # Explicit column labels with .loc replace the old `.ix[mask, 1:]`;
    # `.ix` has been removed from pandas.
    score_cols=['Score1','Score2','Score3','Score4','Score5']
    selected=reviews_df.loc[reviews_df['ProductId']==product_id,score_cols]
    selected=selected.reset_index(drop=True)

    x_range_=selected.keys().tolist()
    y_range_=selected.loc[0]
    print(x_range_,y_range_)
    return product_id,x_range_,y_range_


def get_barchart(x):
    """Build the score-distribution bar chart for the product at position ``x``.

    Fills ``source1`` with the product's Score1..Score5 counts, creates a new
    figure, stores it in the module-level ``bplot`` and returns it.
    """
    global bplot
    product_id,x_range_,y_range_=_lookup_score_counts(x)

    p=pt.viridis(5)
    mapper=CategoricalColorMapper(palette=p,factors=x_range_)
    # Replace both columns in one assignment so the source never holds
    # columns of different lengths between two separate updates.
    source1.data=dict(x=x_range_,y=y_range_.tolist())
    print(source1.data)

    Tools="hover,tap"
    bplot=figure(x_range=x_range_, plot_height=350,plot_width=300,tools=Tools, toolbar_location='right',
                 title="Score distribution: "+product_id)
    bplot.vbar(x='x',top='y',bottom=0,width=0.5,source=source1,legend='Scores',
               fill_color={'field':'x','transform':mapper})

    bplot.legend.orientation = "horizontal"
    bplot.legend.location = "top_center"

    return bplot


def update_barchart(x):
    """Point the existing bar chart at the product at position ``x``.

    Expects ``bplot`` to have been created by get_barchart already; only its
    title and the contents of ``source1`` are changed.
    """
    product_id,x_range_,y_range_=_lookup_score_counts(x)

    bplot.title.text="Score distribution: "+ product_id
    # Single assignment keeps the x and y columns consistent with each other.
    source1.data=dict(x=x_range_,y=y_range_.tolist())
    
#get_barchart('B002QWP89S')

In [None]:
# Scatter plot driven by a slider widget.
# Two data holders are involved: reviews_df keeps the full per-product summary,
# while source2 only holds the points that are currently rendered.
# pdict is already a DataFrame, so it is used directly (same object, not a copy).
reviews_df=pdict
# Root layout; assigned once the plot and slider have been put together below.
p1=None

def update(attr,old,new):
    """Slider ``on_change`` callback: show only products above the slider value.

    Bokeh invokes property-change callbacks positionally as
    ``(attr, old, new)``; the parameters are named in that order so the debug
    print is labelled correctly. The threshold itself is read from
    ``slider.value``.
    """
    print(attr,old,new)
    nr=slider.value
    product_ids=get_products_num_reviews(nr)
    df=reviews_df[reviews_df['ProductId'].isin(product_ids)]
    # Swap in all columns with one assignment: updating them one at a time
    # leaves the source with columns of different lengths in between.
    source2.data=dict(
        x=list(range(len(df))),
        y=df['NumReviews'],
        labels=df['ProductId'],
        colors=df['Mean_Scores'],
        sizes=df['Max_Scores'],
        alphas=df['Min_Scores'],
    )
 

# Figure geometry and the 20-colour viridis palette used for the mean score.
PLOT_OPTS={'height':400,'width':600,'toolbar_location':"above"}
colors=pt.viridis(20)


def _column_range(name):
    """Return ``[min, max]`` of one column of reviews_df."""
    column_values=reviews_df[name]
    return [column_values.min(),column_values.max()]


# Marker colour follows the mean score, marker size the maximum score
# (5 to 10) and marker opacity the minimum score (0.2 to 1.0).
_mean_low,_mean_high=_column_range('Mean_Scores')
mapper=LinearColorMapper(palette=colors,low=_mean_low,high=_mean_high)
size_mapper=LinearInterpolator(x=_column_range('Max_Scores'),y=[5,10])
alpha_mapper=LinearInterpolator(x=_column_range('Min_Scores'),y=[0.2,1.0])

TOOLS="save,pan,box_select,reset,wheel_zoom"
hover=HoverTool(
    tooltips=[
        ('ProductID:',"@labels"),
        ('Mean_Score:',"@colors"),
        ('Max_Score:',"@sizes"),
        ('Min_Score:',"@alphas"),
    ],
    show_arrow=False,
)

# Scatter plot over source2; every visual channel is read from a source column.
plot=figure(**PLOT_OPTS,tools=TOOLS)
plot.circle(
    'x','y',source=source2,
    fill_color={'field':'colors','transform':mapper},
    size={'field':'sizes','transform':size_mapper},
    fill_alpha={'field':'alphas','transform':alpha_mapper},
)
plot.add_tools(hover)

plot.title.text='Amazon Food Reviews'
plot.xaxis.axis_label='ProductNumber'
plot.xaxis.major_label_orientation=np.pi/4
plot.yaxis.axis_label='Num Reviews'

# Layout containers and the info paragraph; filled in further down.
row_plots=column_plots=div=None


def add_score_distribution(event):
    """Tap handler: show the score distribution of the tapped product.

    The first tap builds the bar chart and appends it, together with a
    paragraph describing the event, as a new column of ``row_plots``.
    Later taps update that same chart and paragraph in place.
    """
    global bplot,div

    cls_name=event.__class__.__name__
    attrs=', '.join('{attr}={val}'.format(attr=attr,val=getattr(event,attr))
                    for attr in ('x','y','sx','sy'))
    b='{cls_name}({attrs})'.format(cls_name=cls_name,attrs=attrs)
    print(b)

    # Reuse the paragraph that is already part of the layout. Creating a new
    # one per event would update an object that is never displayed.
    if div is None:
        div=Paragraph(text=b)
    else:
        div.text=b

    if bplot is None:
        bplot=get_barchart(event.x)
        cplot=column(children=[bplot,widgetbox(div)],name='sub_p2')
        row_plots.children.append(cplot)
    else:
        print(event.x)
        update_barchart(event.x)

plot.on_event(Tap,add_score_distribution)


def interact_update(nr):
    """ipywidgets callback: show only products with more than ``nr`` reviews.

    Rebuilds the contents of ``source2`` from ``reviews_df`` and pushes the
    change to the plot displayed in the notebook.
    """
    product_ids=get_products_num_reviews(nr)
    df=reviews_df[reviews_df['ProductId'].isin(product_ids)]
    # Swap in all columns with one assignment: updating them one at a time
    # leaves the source with columns of different lengths in between.
    source2.data=dict(
        x=list(range(len(df))),
        y=df['NumReviews'],
        labels=df['ProductId'],
        colors=df['Mean_Scores'],
        sizes=df['Max_Scores'],
        alphas=df['Min_Scores'],
    )
    print(len(source2.data['x']),len(source2.data['y']))
    push_notebook()
    

# Slider choosing the review-count threshold; `update` re-filters the scatter
# data whenever its value changes.
slider = Slider(start=0, end=600, value=500, step=100, title="Above NumReviews")
slider.on_change('value', update)

# Left column holds the scatter plot with the slider underneath. The tap
# handler later appends the bar-chart column to row_plots.children.
column_plots=column(children=[plot,widgetbox(slider)],name='sub_p1')
row_plots=row(children=[column_plots],name='p1')

# Register the whole row as a root of the current Bokeh document.
p1=row_plots
curdoc().add_root(p1)

# Short alias for the scatter figure, used by the show() call in the next cell.
p=plot


In [None]:
# Display the scatter plot and keep the notebook handle, then create an
# ipywidgets control for `nr` (0 to 600 in steps of 100) that calls
# interact_update with the chosen threshold.
t=show(p,notebook_handle=True)
interact(interact_update,nr=(0,600,100))

In [None]:
def get_mean_scores(product_id):
    """Return central-tendency statistics for one product's review scores.

    Uses the module-level ``prod_dist_dict`` for the review count and the
    ``products`` groupby for the product's rows.

    Returns ``(num_reviews, mean, median, mode, scores)``; ``mode`` is the
    Series produced by ``Series.mode()`` and ``scores`` is the product's
    ``Score`` column itself.
    """
    review_count = prod_dist_dict[product_id]
    ratings = products.get_group(product_id)['Score']
    return (
        review_count,
        ratings.mean(),
        ratings.median(),
        ratings.mode(),
        ratings,
    )

# Print the headline statistics of every product with more than 500 reviews.
heavily_reviewed = get_products_num_reviews(500)
for product_id in heavily_reviewed:
    print(product_id)
    num_reviews, mean_scores, median_scores, mode_scores, scores = get_mean_scores(product_id)
    print("Num: ", num_reviews, "Mean: ", mean_scores, "Median: ", median_scores, "Mode: ", mode_scores)
# To also list the per-score counts:
# print("Scores",scores.value_counts().sort_index(ascending=False))