In [1]:
# Inject a small HTML/JavaScript snippet that lets the reader hide or show the
# notebook's input cells (every element matching `div.input`).
# `code_toggle` runs once when the document is ready, so the inputs start out
# hidden; the submit button rendered by the <form> calls it again on each click.
from IPython.display import HTML
# The HTML object is the last expression of the cell, which is what makes
# Jupyter render it as the cell output.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [2]:
#!jupyter nbextension enable --py --sys-prefix widgetsnbextension
#from IPython import get_ipython
#get_ipython().magic('reset -sf') 
# The previous `try: del wmdModules.py / except: import wmdModules` idiom never
# unloaded anything: the `del` raised every time (NameError on a fresh kernel,
# AttributeError afterwards unless the module defines an attribute named `py`)
# and the bare `except` then simply performed the import, while also hiding
# any unrelated error. A plain import has the same effect; use
# importlib.reload(wmdModules) to pick up edits made to the module.
import wmdModules
    
#import ipython_memory_usage.ipython_memory_usage as imu

import ipywidgets as widgets
from IPython.display import display
import numpy as np
import pandas as pd
import re, string
import matplotlib.pyplot as plt
#from termcolor import colored
from IPython.display import HTML
from IPython.display import clear_output
from ipywidgets import Layout
#%matplotlib notebook
plt.style.use('ggplot')
import matplotlib.gridspec as gridspec

#import matplotlib.animation as animation
import warnings
warnings.filterwarnings('ignore')

import wmdModules

In [3]:
def createDataFrame():
    """Load the search results and the query, and measure their word overlap.

    Reads ``results/solution.csv`` (its ``tokenSent`` column is parsed with
    ``tknz``, one value per row) and ``results/query.csv`` (the ``token``
    value of its first row is parsed with ``tknz`` and used as the query).

    Two columns are added to the results frame:

    * ``nShared`` -- how many query tokens occur in the row's token list
    * ``percent`` -- ``nShared`` divided by the number of query tokens

    Returns:
        tuple: ``(df, dfTarget, nQuery)`` where ``df`` is the augmented
        results frame, ``dfTarget`` the query frame as read from disk and
        ``nQuery`` the number of tokens in the query.
    """
    df       = pd.read_csv("results/solution.csv")
    dfTarget = pd.read_csv("results/query.csv")
    query    = tknz(dfTarget.iloc[0].token)
    nQuery   = len(query)

    count = []
    for tokens in df.tokenSent:
        # A set gives constant-time membership tests; the count is still taken
        # over the query tokens, so repeated query words count repeatedly.
        vocabulary = set(tknz(tokens))
        count.append(sum(w in vocabulary for w in query))

    df["nShared"] = count
    df["percent"] = [n / nQuery for n in count]
    return df, dfTarget, nQuery

In [4]:
# Preprocess tokenized sentences.
def tknz(sentence):
    """Split a stringified token list into a list of token strings.

    The input is expected to look like the text form of a Python list of
    quoted words (e.g. ``"['a', 'b']"``). Processing steps:

    1. trim any of ``[``, ``,``, ``'``, ``]`` from both ends;
    2. delete every "space followed by one punctuation character" pair;
    3. trim commas from the ends and split on single quotes;
    4. trim commas from the ends of each resulting piece.

    Returns:
        list[str]: the pieces, in order (an empty input yields ``['']``).
    """
    trimmed = sentence.strip("[,']")
    pattern = ' [%s]' % re.escape(string.punctuation)
    cleaned = re.sub(pattern, '', trimmed)
    return [piece.strip(",") for piece in cleaned.strip(",").split("'")]

def createTable(df,nQuery):
    """Tabulate how many rows of ``df`` share 0, 1, ..., ``nQuery`` query words.

    Args:
        df: frame with an ``nShared`` column.
        nQuery: largest overlap value to report.

    Returns:
        pandas.DataFrame with columns ``nr. words`` (the overlap value),
        ``percent`` (``freq`` divided by the sum of all reported
        frequencies, rounded to 3 decimals) and ``freq`` (row count).
    """
    overlaps = list(range(nQuery + 1))
    freqs = [df.loc[df.nShared == k, 'nShared'].count() for k in overlaps]
    total = sum(freqs)
    shares = [np.round(f / total, 3) for f in freqs]
    table = pd.DataFrame({"nr. words": overlaps, "percent": shares, "freq": freqs})
    return table.reindex(columns=['nr. words', 'percent', 'freq'])
  
def createTableYear(df,periods):
    """Count the rows of ``df`` falling in each year of ``periods``.

    Args:
        df: frame with a ``year`` column.
        periods: iterable of year values to report, in output order.

    Returns:
        pandas.DataFrame with columns ``year`` and ``freq``.
    """
    perYear = [df.loc[df.year == yr, 'year'].count() for yr in periods]
    summary = pd.DataFrame({"year": periods, "freq": perYear})
    return summary.reindex(columns=['year', 'freq'])
  
def createTableWords(df,nInTable):
    """Build a table of the most frequent tokens across all rows of ``df``.

    Every row's ``tokenSent`` value is parsed with ``tknz`` and each token
    occurrence is counted (a token repeated inside one row counts each time).

    Args:
        df: frame with a ``tokenSent`` column.
        nInTable: maximum number of tokens to keep.

    Returns:
        pandas.DataFrame with columns ``word``, ``percent`` and ``freq``,
        ordered by decreasing ``freq``; ties keep first-seen order.

    Note:
        ``percent`` is ``freq`` divided by the current value of the
        module-level ``nTop`` widget, not by the number of rows in ``df``.
    """
    tokenLists = [tknz(df.iloc[row].tokenSent) for row in range(len(df))]

    # Plain dict so that insertion order (first appearance) is what breaks
    # ties in the stable sort below.
    tally = {}
    for tokens in tokenLists:
        for tok in tokens:
            tally[tok] = tally.get(tok, 0) + 1

    ranked = sorted(tally, key=tally.get, reverse=True)[:nInTable]
    counts = [tally[tok] for tok in ranked]
    shares = [np.round(c / nTop.value, 3) for c in counts]
    dfWords = pd.DataFrame({"word": ranked, "freq": counts, "percent": shares})
    return dfWords.reindex(columns=['word', 'percent', 'freq'])
  
def createPicture(df, dfWords):
    """Draw the three summary charts side by side and show the figure.

    Panels, left to right:

    1. histogram of ``df.nShared`` with one bin per overlap value
       ``0 .. nQuery`` (``nQuery`` is read from the module namespace);
    2. histogram of ``df.year`` with one bin per year 1796-1800;
    3. bar chart of ``dfWords.freq`` against ``dfWords.word``.

    Any current figure is closed first. Returns nothing.
    """
    plt.close()
    plt.figure(figsize=(20,6))
    grid = gridspec.GridSpec(1,3)
    barStyle = dict(edgecolor="white", linewidth=1.5, alpha=0.9)

    # Panel 1: overlap values. One extra edge so that the last value gets its
    # own left-aligned bin.
    overlapEdges = list(range(nQuery+2))
    axOverlap = plt.subplot(grid[0,0])
    axOverlap.hist(df.nShared, bins=overlapEdges, align="left", **barStyle)
    axOverlap.set_xticks(overlapEdges[:-1])
    axOverlap.set_title("Distribution of Sentences over Overlap Values")

    # Panel 2: years, binned the same way.
    periods = np.arange(1796,1801)
    yearEdges = np.arange(np.min(periods), np.max(periods)+2)
    axYear = plt.subplot(grid[0,1])
    axYear.hist(df.year, bins=yearEdges, align="left", **barStyle)
    axYear.set_xticks(yearEdges[:-1])
    axYear.set_title("Distribution of Top Sentences over Time Period")

    # Panel 3: word frequencies, with slanted labels so words do not overlap.
    axWords = plt.subplot(grid[0,2])
    axWords.bar(dfWords.word, dfWords.freq, **barStyle)
    for label in axWords.get_xticklabels():
        label.set_rotation(60)
    axWords.set_title("Distribution of Words among Top Sentences")
    plt.show()


In [5]:
def createDialogAnalysis():
    """Create the widgets of the "Analysis" parameter panel.

    Nothing is returned; the widgets are published as module globals:
    ``alpha`` (overlap slider, 0 .. ``nQuery``), ``buttonRunAnalysis``,
    ``ff`` (progress bar, 0 .. 10) and ``paramAnalysis`` (the container
    holding the caption, the slider/button row and the progress bar).
    ``nQuery`` is read from the module namespace.
    """
    global alpha
    global buttonRunAnalysis
    global paramAnalysis
    global ff

    heading = widgets.Label(
        value='Select Parameters Value for Query Analysis',
        textsize=20,
        layout=Layout(width='50%', height='25px'),
    )

    # Slider and run button share one row.
    alpha = widgets.IntSlider(description=r'Overlap', value=0, min=0, max=nQuery, step = 1)
    buttonRunAnalysis = widgets.Button(description="Run Analysis")
    controlsRow = widgets.HBox([alpha, buttonRunAnalysis], layout=Layout(height="70px"))

    # Progress bar.
    ff = widgets.FloatProgress(
        min=0,
        max=10,
        description="\t Progress ",
        layout=Layout(width='95%', height='20px'),
    )

    paramAnalysis = widgets.VBox([heading, controlsRow, ff], layout=Layout(height="150px"))

def createDialogStatistics():
    """Create the controls of the "Charts and Tables" tab.

    Publishes ``nInTable`` (slider for the number of words, 1 .. 20,
    initially 10) and ``buttonRunStats`` as module globals.

    Returns:
        A ``VBox`` containing the slider above the button.
    """
    global buttonRunStats
    global nInTable

    nInTable = widgets.IntSlider(description=r'Nr. Words', value=10, min=1, max=20, step = 1)
    buttonRunStats = widgets.Button(description="Run Statistics")
    return widgets.VBox([nInTable, buttonRunStats])

def createDialogLoading():
    """Create the period-selection panel of the "Load Corpus" tab.

    Publishes ``checkboxes`` (one per year, from ``createCheckBoxesYears``)
    as a module global. ``buttonRunLoad`` and ``ft`` are declared global
    but are not assigned here: the load button and its progress bar are
    currently not created.

    Returns:
        A ``VBox`` with a caption above a row of the year checkboxes.
    """
    global checkboxes
    global buttonRunLoad
    global ft

    checkboxes = createCheckBoxesYears()
    heading = widgets.Label(
        value='Select Periods for Query',
        layout=Layout(width='90%', height='25px',right="True"),
    )
    yearsRow = widgets.HBox(list(checkboxes))
    return widgets.VBox([heading, yearsRow], layout=Layout(height="150px"))

def createDialogQuery():
    """Create the parameter panel of the "Query Search" tab.

    Publishes as module globals: ``target`` (query text box), ``nTop``
    (number of sentences in the top list, initially 1000), ``forbidden``
    (text box of forbidden words), ``fs`` (progress bar, 0 .. 10) and
    ``buttonRunSearch``. ``nPerYear`` and ``penalty`` are declared global
    but are not assigned here. The outer container uses the module-level
    ``box_layout``.

    Returns:
        A ``VBox`` with the caption above the stacked controls.
    """
    global buttonRunSearch
    global target
    global nTop
    global nPerYear
    global forbidden
    global penalty
    global fs

    wide = '90%'

    heading = widgets.Label(
        value='Select Parameters Value for Query Search ',
        textsize=20,
        layout=Layout(width='50%', height='25px'),
    )

    # Free-text query.
    target = widgets.Text(value='', description='Query:', layout=Layout(width=wide), disabled=False)

    # Size of the result list, with its own caption to the left.
    nTop = widgets.IntText(value=1000, layout=Layout(width='30%'))
    topCaption = widgets.Label('Nr. Sentences in Top List :', layout=Layout(width='50%'))
    topRow = widgets.HBox([topCaption, nTop])
    numericBox = widgets.HBox([widgets.VBox([topRow])])

    # Words to exclude.
    forbidden = widgets.Text(value='', description='Forbid:', layout=Layout(width=wide), disabled=False)

    # Progress bar and run button.
    fs = widgets.FloatProgress(
        min=0,
        max=10,
        description="\t Progress ",
        layout=Layout(width=wide, height='20px'),
    )
    buttonRunSearch = widgets.Button(description="Run Query Search", layout=Layout(align_self="center"))

    controls = widgets.VBox([target, numericBox, forbidden, fs, buttonRunSearch])
    return widgets.VBox([heading, controls], layout=box_layout)


In [6]:
def initialize(nQuery, df):
    """Return the overlap step size and the shared bordered box layout.

    Args:
        nQuery: number of query tokens; the step size is ``1.0 / nQuery``.
        df: accepted but not used.

    Returns:
        tuple: ``(stepSize, box_layout)`` where ``box_layout`` is a column
        flex ``Layout`` with a solid border spanning the full width.
    """
    layoutOptions = {
        'display': 'flex',
        'flex_flow': 'column',
        'align_items': 'stretch',
        'border': 'solid',
        'width': '100%',
    }
    return 1.0/nQuery, Layout(**layoutOptions)

def formattingOld(sent,i):
    """Render one result row as an HTML list item plus a grey detail line.

    Variant reading the attributes ``previous``, ``sentence``, ``next``,
    ``id``, ``score``, ``percent`` and ``tokenized`` of ``sent``.

    Args:
        sent: row-like object exposing the attributes listed above.
        i: zero-based position of the row; shown one-based.

    Returns:
        str: the HTML fragment.
    """
    trim = "[] '"
    context = [
        sent.previous.strip(trim),
        "<b>" + sent.sentence.strip(trim) + "</b>",
        sent.next.strip(trim),
    ]
    details = [
        "<font color='#999999' size='1'>(",
        str(i+1), "-Sentence id = ", str(sent.id),
        "; score = ", str(np.round(sent.score,3)),
        "; overlap = ", str(np.round(sent.percent,2)),
        "; keywords = ", sent.tokenized, ") </font>",
    ]
    return "<li>" + " ".join(context) + "</li>" + "".join(details)
def formatting(sent,i):
    """Render one result row as an HTML list item plus a grey detail line.

    The list item holds the previous sentence, the matched sentence in
    bold and the next sentence, each trimmed of brackets, quotes and
    spaces at the ends. The detail line shows the one-based position, an
    id built as ``<year>.<idnr>``, the score (3 decimals), the overlap
    (2 decimals) and the raw ``tokenSent`` string.

    Args:
        sent: row-like object with attributes ``prevSent``, ``sent``,
            ``nextSent``, ``year``, ``idnr``, ``score``, ``percent`` and
            ``tokenSent``.
        i: zero-based position of the row.

    Returns:
        str: the HTML fragment.
    """
    trim = "[] '"
    context = [
        sent.prevSent.strip(trim),
        "<b>" + sent.sent.strip(trim) + "</b>",
        sent.nextSent.strip(trim),
    ]
    details = [
        "<font color='#999999' size='1'>(",
        str(i+1), "-Sentence id = ", str(sent.year), ".", str(sent.idnr),
        "; score = ", str(np.round(sent.score,3)),
        "; overlap = ", str(np.round(sent.percent,2)),
        "; keywords = ", sent.tokenSent, ") </font>",
    ]
    return "<li>" + " ".join(context) + "</li>" + "".join(details)

def createHTMLBlocks(ddf,ff):
    """Turn every row of ``ddf`` into an ``HTML`` widget, advancing ``ff``.

    Args:
        ddf: frame whose rows are rendered with ``formatting``.
        ff: progress widget; its ``value`` is increased by
            ``ff.max / len(ddf)`` before each row is rendered.

    Returns:
        list: one ``widgets.HTML`` per row, in row order; an empty list
        (with ``ff`` untouched) when ``ddf`` has no rows.
    """
    nRows = len(ddf)
    if nRows == 0:
        return []

    increment = ff.max/nRows
    blocks = []
    for pos in range(nRows):
        ff.value += increment
        blocks.append(widgets.HTML(value=formatting(ddf.iloc[pos], pos)))
    return blocks

def createCheckBoxesYears():
    """Return one enabled, pre-ticked checkbox per year from 1796 to 1800.

    Each checkbox's description is the year as a string.
    """
    return [
        widgets.Checkbox(value=True, description=str(year), disabled=False)
        for year in np.arange(1796,1801)
    ]

def getCheckBoxesPeriods():
    """Return the years of the ticked checkboxes as integers.

    Reads the module-level ``checkboxes`` list and converts the
    description of every box whose value equals ``True``.
    """
    return [int(box.description) for box in checkboxes if box.value == True]
   
    
def setTitle(target):
    """Build the centred HTML heading for a query.

    Args:
        target: object with ``query`` (shown in bold) and ``token``
            (parsed with ``tknz``; the resulting list is shown in italics).

    Returns:
        str: the HTML fragment.
    """
    heading = "<center><h4>Query : <b>" + target.query + "</b> </h4>"
    tokenLine = "<i>" + str(tknz(target.token)) + "</i></center>"
    return heading + tokenLine



In [7]:
def on_buttonRunAnalysis_clicked(b):
    """Click handler: list the sentences whose overlap equals the slider value.

    Reloads the results with ``createDataFrame``, rewrites the title area
    (``outTitle``) and fills the list area (``outList``) with a summary
    header followed by one HTML block per matching row. The progress bar
    ``ff`` is reset first and set to its maximum before the list is shown.

    Args:
        b: the clicked button (not used).
    """
    df, dfTarget, nQuery = createDataFrame()

    with outTitle:
        clear_output()
        display(widgets.HTML(value=setTitle(dfTarget.iloc[0])))

    with outList:
        ff.value = ff.min
        clear_output()

        # Keep only the rows sharing exactly `alpha.value` words with the query.
        selected = df[df.nShared == alpha.value]
        htmlBlocks = createHTMLBlocks(selected, ff)

        overlap = str(np.round(alpha.value/nQuery,2))
        header = (
            "<h4><b>[" + overlap + " ] Found " + str(len(selected))
            + "/" + str(nTop.value) + " sentences : </b></h4>"
        )
        display(widgets.HTML(value=header))

        sentenceList = widgets.VBox(list(htmlBlocks))
        ff.value = ff.max
        display(sentenceList)

def on_buttonRunStats_clicked(b):
    """Click handler: refresh the charts and the three summary tables.

    Reloads the results with ``createDataFrame``, builds the overlap,
    per-year (1796-1800) and word tables, redraws the charts inside
    ``outChart`` and shows the tables side by side inside ``outTable``.

    Args:
        b: the clicked button (not used).
    """
    df, dfTarget, nQuery = createDataFrame()
    years        = np.arange(1796,1801)
    overlapTable = createTable(df, nQuery)
    yearTable    = createTableYear(df, years)
    wordTable    = createTableWords(df, nInTable.value)

    with outChart:
        clear_output()
        createPicture(df, wordTable)

    # One output pane per table, each taking a third of the width.
    captioned = [
        ("<u>Frequency of Overlaps</u>", overlapTable),
        ("<u>Frequency Per Year</u>", yearTable),
        ("<u>Words Distribution</u>", wordTable),
    ]
    panes = [
        widgets.Output(layout=Layout(width='33%',align_self="center"))
        for _ in captioned
    ]
    for pane, (caption, frame) in zip(panes, captioned):
        with pane:
            display(HTML(caption))
            display(frame)

    with outTable:
        clear_output()
        display(widgets.HBox(panes))


def on_buttonRunSearch_clicked(b):
    """Click handler: run the query search for the ticked periods.

    Clears the algorithm output area (``outAlgo``) and calls
    ``wmdModules.eccoWMD`` inside it, so whatever that call prints or
    displays is captured by that area.

    Args:
        b: the clicked button (not used).
    """
    with outAlgo:
        clear_output()
        periods = getCheckBoxesPeriods()
        # NOTE(review): `fs` and `forbidden` are handed over as widgets while
        # `target` and `nTop` are passed by value -- confirm that eccoWMD
        # expects the `forbidden` widget rather than `forbidden.value`.
        wmdModules.eccoWMD(periods, target.value, nTop.value, fs, forbidden)
        
def on_buttonRunLoad_clicked(b):
    """Click handler: load the doc2vec corpus and model for the ticked periods.

    Stores the five values returned by ``doc2VecModules.loadData`` in the
    module globals ``modelWord2Vec``, ``docsD2V``, ``corpusD2V``,
    ``totSents`` and ``idsD2V``. Messages are shown inside ``outLoading``.

    NOTE(review): this handler is not registered on any button at the
    moment, and it reads a global ``ft`` that ``createDialogLoading`` does
    not create; calling it as-is raises ``NameError`` at the ``loadData``
    call.

    Args:
        b: the clicked button (not used).
    """
    global modelWord2Vec, docsD2V, corpusD2V, totSents, idsD2V
    with outTab0:
        with outLoading:
            clear_output()
            display("Starting Search ...")
            # The earlier `try: del doc2VecModules.py / except: import ...`
            # never released anything: the `del` always raised and the bare
            # `except` just imported the module. Import it directly so that a
            # genuine import error is not masked.
            import doc2VecModules
            clear_output()
            periods = getCheckBoxesPeriods()
            modelWord2Vec, docsD2V, corpusD2V, totSents, idsD2V = doc2VecModules.loadData(periods, ft)
            display("Done with loading.")

  


In [8]:
#imu.start_watching_memory()

import importlib
#importlib.reload(doc2VecModules)
# Re-execute wmdModules so that edits made to the module file since it was
# first imported are picked up without restarting the kernel.
importlib.reload(wmdModules)

# Initial load of the result/query CSV files. `nQuery` and `box_layout` are
# read as globals by the dialog builders (overlap slider maximum and the
# layout of the query parameter container), so they must exist before the
# tabs are built.
df, dfTarget, nQuery = createDataFrame()
stepSize, box_layout = initialize(nQuery, df)
# Placeholders for three of the globals that on_buttonRunLoad_clicked assigns.
modelWord2Vec = None
docsD2V = None
corpusD2V = None

# TABS
# Titles are applied to the tabs by position when the Tab widget is assembled.
tabTitles = ['Load Corpus', 'Query Search', 'Analysis', 'Charts and Tables']

# Each tab is an Output widget wrapping a VBox of smaller Output areas, so
# that handlers can clear and redraw one area without touching the others.
# TAB 0 : Loading
outTab0    = widgets.Output()  # overall tab
outButton  = widgets.Output()  # period checkboxes
outLoading = widgets.Output()  # messages written by on_buttonRunLoad_clicked
with outLoading:
    clear_output()
with outButton:
    clear_output()
    # Also creates the global `checkboxes` read by getCheckBoxesPeriods.
    paramLoad = createDialogLoading()
    display(paramLoad)
with outTab0:
    tab0 = widgets.VBox([outButton,outLoading])
    display(tab0)

# TAB 1 : Search
outTab1  = widgets.Output()  # overall tab
outQuery = widgets.Output()  # query parameters
outAlgo  = widgets.Output()  # output of the search, filled by on_buttonRunSearch_clicked
with outQuery:
    clear_output()
    # Also creates the globals target, nTop, forbidden, fs and buttonRunSearch.
    paramQuery = createDialogQuery()
    # The parameters sit in a single-section accordion.
    accordionQuery = widgets.Accordion(children=[paramQuery])
    accordionQuery.set_title(0, 'Parameters')
    #accordionQuery.selected_index = None
    display(accordionQuery)
with outAlgo:
    clear_output()
with outTab1:
    tab1 = widgets.VBox([outQuery,outAlgo])
    display(tab1)
# END TAB 1 ===========================================================

# TAB 2 : Analysis
outTab2     = widgets.Output() # overall tab
outTitle    = widgets.Output() # title area
outAnalysis = widgets.Output() # area with parameters setting
outList     = widgets.Output() # area with list of sentences
with outAnalysis:
    # Returns nothing: it publishes alpha, buttonRunAnalysis, ff and
    # paramAnalysis as globals, which is why paramAnalysis is usable below.
    createDialogAnalysis()
    accordionAnalysis = widgets.Accordion(children=[paramAnalysis])
    accordionAnalysis.set_title(0, 'Parameters Query Analysis')
    display(accordionAnalysis)
with outTitle:
    clear_output()
with outList:
    clear_output()
with outTab2:
    tab2 = widgets.VBox([outAnalysis,outTitle,outList])
    display(tab2)
# END TAB 2 ===========================================================
 
# TAB 3 : Chart and Table with summary
outTab3   = widgets.Output()  # overall tab
outHeader = widgets.Output()  # word-count slider and "Run Statistics" button
outTable  = widgets.Output()  # the three summary tables
outChart  = widgets.Output()  # the matplotlib figure
with outHeader:
    # Also creates the globals nInTable and buttonRunStats.
    paramStats = createDialogStatistics()
    display(paramStats)
with outTable:
    clear_output()
with outChart:
    clear_output()

# Order inside the tab: controls, then tables, then charts.
with outTab3:
    tab3   = widgets.VBox([outHeader,outTable,outChart])
    display(tab3)
# END TAB 3 ===========================================================

# buttons management
# The buttons are globals created by the createDialog* functions called
# earlier in this cell. The load button is not created at the moment, so its
# registration stays disabled.
#buttonRunLoad.on_click(on_buttonRunLoad_clicked)
buttonRunAnalysis.on_click(on_buttonRunAnalysis_clicked)
buttonRunSearch.on_click(on_buttonRunSearch_clicked)
buttonRunStats.on_click(on_buttonRunStats_clicked)

# the tabs
# One child per tab, in the same order as tabTitles.
children = [outTab0, outTab1,outTab2, outTab3]

tabs = widgets.Tab()
tabs.children = children
for i in range(len(children)):
    tabs.set_title(i, tabTitles[i])

allWidgets = widgets.VBox([tabs])
display(allWidgets)

# Open on the second tab ('Query Search').
tabs.selected_index = 1 

VBox(children=(Tab(children=(Output(), Output(), Output(), Output()), _titles={'3': 'Charts and Tables', '1': …