[View in Colaboratory](https://colab.research.google.com/github/marcocaserta/ecco/blob/master/eccoModules.ipynb)

In [0]:
# We'll start by creating a directory in which we'll define our new
# module to be imported.
!mkdir -p local_modules/ecco_modules

In [0]:
!cd local_modules

In [15]:
!ls local_modules/ecco_modules/

ecco.py  __init__.py


In [18]:
%%writefile local_modules/ecco_modules/ecco.py
# Helper functions for the ecco module: parsing tokenized sentences, counting
# word overlap with a query, formatting results, and plotting summary charts.

from __future__ import print_function
import matplotlib.pyplot as plt
import itertools
from google.colab import files
import numpy as np
import pandas as pd
import io, os
import re, string
import textwrap
import html
#from IPython.display import display, HTML

def createDataFrame(df, dfQuery):
    """Annotate every row of ``df`` with its word overlap against the query.

    The query's ``tokenized`` field is replaced by the token list produced by
    ``tknz``. Each row's ``tokenized`` string is parsed the same way, and the
    number of query tokens found in it is recorded.

    Two columns are written onto ``df`` in place:
      * ``nShared`` -- how many query tokens occur in the row's tokens;
      * ``percent`` -- that count divided by the number of query tokens.

    Returns the tuple ``(df, dfQuery, lenQuery)`` where ``lenQuery`` is the
    number of query tokens.
    """
    dfQuery.tokenized = tknz(dfQuery.tokenized)
    lenQuery = len(dfQuery.tokenized)

    sharedCounts = []
    sharedRatios = []
    for rowPos in range(len(df)):
        sentenceTokens = tknz(df.iloc[rowPos].tokenized)
        # Every query token is tested, so a repeated query word counts each time.
        hits = sum(token in sentenceTokens for token in dfQuery.tokenized)
        sharedCounts.append(hits)
        sharedRatios.append(hits / lenQuery)

    df["nShared"] = sharedCounts
    df["percent"] = sharedRatios
    return df, dfQuery, lenQuery

# Preprocess tokenized sentences.
def tknz(sentence):
    """Split the string form of a token list into a list of token strings.

    Presumably the input looks like ``"['w1', 'w2']"`` (the ``str()`` of a
    Python list) -- confirm against the data source. The steps are:

      1. trim any of ``[``, ``,``, ``'``, ``]`` from both ends;
      2. delete every "space followed by a punctuation character" pair;
      3. trim commas from the ends, split on single quotes, and trim commas
         from each piece.

    An input with no tokens (e.g. ``"[]"``) yields ``[""]``, not ``[]``.
    """
    trimmed = sentence.strip("[,']")
    compact = re.sub(' [%s]' % re.escape(string.punctuation), '', trimmed)
    return [piece.strip(",") for piece in compact.strip(",").split("'")]

def setTitle(dfQuery):
    """Return the centred HTML heading for a query.

    The heading shows ``dfQuery.sentence`` (with surrounding brackets, quotes
    and spaces trimmed) in bold, followed by ``str(dfQuery.tokenized)`` in
    italics.
    """
    pieces = [
        "<center><h4>Query : <b>",
        dfQuery.sentence.strip("[] '"),
        "</b> </h4>",
        "<i>",
        str(dfQuery.tokenized),
        "</i></center>",
    ]
    return "".join(pieces)

def setTitle2(dfQuery):
    """Return the query heading as plain text wrapped in ``color`` markers.

    The first line is the trimmed ``dfQuery.sentence`` between ``color.BOLD``
    and ``color.END``; the second is ``str(dfQuery.tokenized)`` between
    ``color.ITALIC`` and ``color.END``.
    """
    # NOTE(review): `color` is not defined or imported in the visible part of
    # this file -- confirm where it comes from, otherwise this raises NameError.
    pieces = [
        "Query : ",
        color.BOLD,
        dfQuery.sentence.strip("[] '"),
        color.END,
        "\n",
        color.ITALIC,
        str(dfQuery.tokenized),
        color.END,
    ]
    return "".join(pieces)
  
def formatting(sent, i):
    """Render one result row as an HTML list item plus a small grey caption.

    ``sent`` supplies ``previous``, ``sentence`` and ``next`` (each trimmed of
    surrounding brackets, quotes and spaces; ``sentence`` is shown in bold),
    and the caption fields ``id``, ``score`` (rounded to 3 decimals),
    ``percent`` (rounded to 2 decimals) and ``tokenized`` (used as-is, so it
    must be a string). ``i`` is the zero-based rank; it is displayed as
    ``i + 1``.
    """
    pieces = [
        "<li>", sent.previous.strip("[] '"), " ",
        "<b>", sent.sentence.strip("[] '"), "</b>", " ",
        sent.next.strip("[] '"), "</li>",
        "<font color='#999999' size='1'>(", str(i + 1),
        "-Sentence id = ", str(sent.id),
        "; score = ", str(np.round(sent.score, 3)),
        "; overlap = ", str(np.round(sent.percent, 2)),
        "; keywords = ", sent.tokenized, ") </font>",
    ]
    return "".join(pieces)

def formatting2(sent, i):
    """Render one result row as plain text wrapped in ``color`` markers.

    Text counterpart of ``formatting``: ``previous``, ``sentence`` and
    ``next`` are trimmed of surrounding brackets, quotes and spaces and
    passed through ``html.unescape``. A dimmed caption follows with the rank
    (``i + 1``), ``id``, ``score`` (3 decimals), ``percent`` (2 decimals) and
    ``tokenized`` (used as-is, so it must be a string).
    """
    # NOTE(review): `color` is not defined or imported in the visible part of
    # this file -- confirm where it comes from, otherwise this raises NameError.
    pieces = [
        color.BULLET, " ", html.unescape(sent.previous.strip("[] '")), " ",
        color.BOLD, " ", html.unescape(sent.sentence.strip("[] '")), color.END, " ",
        html.unescape(sent.next.strip("[] '")), "\n",
        color.DIM, "\n(", str(i + 1), "-Sentence id = ", str(sent.id),
        "; score = ", str(np.round(sent.score, 3)),
        "; overlap = ", str(np.round(sent.percent, 2)),
        "; keywords = ", sent.tokenized, ")", color.END,
    ]
    return "".join(pieces)

def createBlocks(ddf):
    """Return one HTML block per row of ``ddf``, in row order.

    Each row is rendered by ``formatting`` with its zero-based position.
    An empty input gives an empty list.
    """
    return [formatting(ddf.iloc[pos], pos) for pos in range(len(ddf))]


def createTable(df, nQuery):
    """Tabulate how many rows of ``df`` share 0, 1, ..., ``nQuery`` query words.

    Returns a DataFrame with columns ``nr. words`` (the overlap value),
    ``percent`` (that value divided by ``nQuery``, rounded to 3 decimals) and
    ``freq`` (the number of rows whose ``nShared`` equals the value).
    """
    overlapLevels = list(range(nQuery + 1))
    rowCounts = [
        df.loc[df.nShared == level, 'nShared'].count() for level in overlapLevels
    ]
    shares = [np.round(level / nQuery, 3) for level in overlapLevels]
    summary = pd.DataFrame({
        "nr. words": overlapLevels,
        "percent": shares,
        "freq": rowCounts,
    })
    return summary.reindex(['nr. words', 'percent', 'freq'], axis=1)
  
def createTableYear(df, periods):
    """Count the rows of ``df`` falling in each year listed in ``periods``.

    Returns a DataFrame with columns ``year`` (the entries of ``periods``, in
    the given order) and ``freq`` (the number of rows whose ``year`` equals
    that entry).
    """
    rowCounts = [df.loc[df.year == yr, 'year'].count() for yr in periods]
    summary = pd.DataFrame({"year": periods, "freq": rowCounts})
    return summary.reindex(['year', 'freq'], axis=1)
  
def createTableWords(df, nTop, nWords=10):
    """Tabulate the most frequent tokens among the first ``nTop`` rows of ``df``.

    Parameters:
      df     -- DataFrame whose ``tokenized`` column holds token-list strings
                that ``tknz`` can parse.
      nTop   -- number of leading rows to examine. If it exceeds ``len(df)``,
                only the rows available are used; this case previously raised
                ``IndexError``.
      nWords -- maximum number of words to report (default 10).

    Returns a DataFrame with columns ``word``, ``percent`` and ``freq``,
    ordered by decreasing frequency. Ties keep first-occurrence order.
    ``freq`` is the total number of occurrences of the word across the
    examined rows. ``percent`` is ``freq`` divided by the number of examined
    rows, rounded to 3 decimals.
    """
    # Clamp so that asking for more rows than exist is not an error.
    nUsed = max(0, min(nTop, len(df)))

    # Only the examined rows are tokenized. Dict insertion order records the
    # first occurrence of each word, which the stable sort below relies on.
    counts = {}
    for i in range(nUsed):
        for w in tknz(df.iloc[i].tokenized):
            counts[w] = counts.get(w, 0) + 1

    words = sorted(counts, key=counts.get, reverse=True)[:nWords]
    vals = [counts[w] for w in words]
    # `vals` is empty whenever nUsed == 0, so this never divides by zero.
    percent = [np.round(v / nUsed, 3) for v in vals]

    dfWords = pd.DataFrame({"word": words, "freq": vals, "percent": percent})
    dfWords = dfWords.reindex(['word', 'percent', 'freq'], axis=1)
    return dfWords
  
def createChart1(df, lenQuery):
    """Plot a histogram of ``df.nShared`` with one bar per overlap value.

    Bars are drawn for the overlap values 0, 1, ..., ``lenQuery`` and the
    figure is shown immediately.
    """
    overlapValues = list(range(lenQuery + 1))
    # One edge more than there are values: matplotlib's last bin is closed on
    # both sides, so with edges 0..lenQuery the rows with nShared == lenQuery
    # would be merged into the bar for lenQuery - 1.
    edges = list(range(lenQuery + 2))

    plt.figure(figsize=(4, 4))
    ax = plt.gca()
    # align="left" puts each bar on the left edge of its bin, i.e. on the
    # integer overlap value itself.
    ax.hist(df.nShared, bins=edges, edgecolor="white", linewidth=1.5, alpha=0.9, align="left")
    ax.set_xticks(overlapValues)
    ax.set_title("Distribution of Sentences over Overlap Values")
    plt.show()

def createChart2(df, lenQuery):
    """Plot a histogram of ``df.year`` with one bar per year and show it.

    ``lenQuery`` is accepted for signature parity with the other chart
    helpers but is not used here.
    """
    plt.figure(figsize=(4, 4))
    ax = plt.gca()
    # Edges run one past the last year so that the final year gets its own bin.
    bins = np.arange(np.min(df.year), np.max(df.year) + 2)
    ax.hist(df.year, bins=bins, align="left", edgecolor="white", linewidth=1.5, alpha=0.9)
    ax.set_xticks(bins[:-1])
    ax.set_title("Distribution of Top Sentences over Time Period")
    # Show the figure here, as createChart1 and createChart3 do, instead of
    # leaving it pending until some later plt.show() call.
    plt.show()

    
def createChart3(dfWords, lenQuery):
    """Plot a bar chart of ``dfWords.freq`` against ``dfWords.word`` and show it.

    The x tick labels are rotated by 60 degrees. ``lenQuery`` is accepted but
    not used here.
    """
    barStyle = dict(edgecolor="white", linewidth=1.5, alpha=0.9)

    plt.figure(figsize=(4, 4))
    axes = plt.gca()
    axes.bar(dfWords.word, dfWords.freq, **barStyle)
    for label in axes.get_xticklabels():
        label.set_rotation(60)
    axes.set_title("Distribution of Words among Top Sentences")
    plt.show()


Overwriting local_modules/ecco_modules/ecco.py
