# Protocols analysis
## Instructions:
1. In the left-side panel, switch to the Files tab
2. Upload a protocol file
3. Right-click the uploaded file -> Copy path
4. Enter (paste) its path into the 'Read doc' form below
5. In the top menu, choose Runtime / Run all

In [0]:
# Branch of the analyser repository that the "pull code" cell clones.
_git_branch = 'charters-subjects-2'

# Release of the document-parser distribution that gets downloaded and unpacked;
# available releases: https://github.com/nemoware/document-parser/releases
lib_version = '1.1.18'


# INIT

## pull code

In [0]:
import json
import subprocess
import sys
import time

import numpy as np
from IPython.core.display import display, HTML
from google.colab import files

!pip install overrides
!pip install pyjarowinkler

Ничто = None



def exec(x):
  """Run the shell command `x` and print what it wrote to stdout.

  The command is executed through the shell (`shell=True`). A non-zero exit
  status makes `subprocess.check_output` raise `CalledProcessError`.

  NOTE: this definition shadows the builtin `exec` for the rest of the notebook.
  """
  raw = subprocess.check_output(x, shell=True)
  # 'unicode-escape' expands backslash escape sequences found in the output;
  # encoding the result as latin1 turns it back into bytes, which are then
  # decoded as UTF-8 text.
  unescaped = raw.decode('unicode-escape')
  as_bytes = unescaped.encode('latin1')
  print(as_bytes.decode('utf8'))


print(f"fetching code from GitHub.....{_git_branch}")
# Remove the checkout left by a previous run so that `git clone` gets a free
# target directory. `rm` exits with a non-zero status when the directory is
# missing (first run); only that failure is ignored, so interrupts and
# unrelated errors still surface.
try:
  exec('rm -r nlp_tools')
except subprocess.CalledProcessError:
  pass
exec(f'git clone --single-branch --branch {_git_branch} https://github.com/nemoware/analyser.git nlp_tools')

# Print the commit count, the current branch and the last three commit messages.
print('🦊 GIT revision:')
exec('cd nlp_tools\ngit rev-list --reverse HEAD | awk "{ print NR }" | tail -n 1\ngit branch\ngit log -3 --pretty=%B')

# Put the fresh checkout first on the import path for the imports that follow.
sys.path.insert(0, 'nlp_tools')

print('❤️importing Code from GitHub ... DONE')


#----
import matplotlib as mpl
from analyser.documents import TextMap
from analyser.legal_docs import DocumentJson
from colab_support.renderer import HtmlRenderer

 

class DemoRenderer(HtmlRenderer):
  """Renderer that shows weighted, colored token text as HTML in the cell output."""

  def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Return the HTML produced by the inherited `_to_color_text`.

    The matplotlib module (`mpl`) is handed to the base implementation along
    with `colormap`, `_range` and `separator`. `print_debug` is accepted but
    not forwarded.
    """
    return super()._to_color_text(
      tokens,
      weights,
      mpl,
      colormap=colormap,
      _range=_range,
      separator=separator,
    )

  def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Build the HTML with `to_color_text` and display it via IPython."""
    markup = self.to_color_text(tokens, weights, colormap, print_debug, _range, separator=separator)
    display(HTML(markup))

   
# Renderer instance used by the rendering cells (renderer_.render_color_text(...)).
renderer_ = DemoRenderer()

def print_json_summary(cd: 'DocumentJson'):
  """Print the file name of a serialized document followed by each of its tags.

  Only `cd.filename` and the iterable `cd.tags` are read. The word map and the
  per-tag span/map lookups that used to be computed here were never used, so
  they are gone; a document lacking those maps can now be summarized as well.

  Args:
    cd: document wrapper exposing `filename` and `tags`. The annotation is a
      forward reference, so the name is not evaluated when the function is
      defined.
  """
  print(f'read file {cd.filename}')

  for tag in cd.tags:
    print(tag)
 
  

In [0]:
!pip install pyjarowinkler

### Init document-parser lib

In [0]:

import os
# Download the distribution archive unless it is already in the working directory.
# In the `!` shell lines IPython substitutes $lib_version with the Python variable.
if not os.path.isfile(f'document-parser-{lib_version}-distribution.zip'):
  !wget https://github.com/nemoware/document-parser/releases/download/$lib_version/document-parser-$lib_version-distribution.zip
# Unpack the archive unless the target directory already exists.
if not os.path.isdir(f'document-parser-{lib_version}'):
  !unzip document-parser-$lib_version-distribution.zip

 
# The variable is set before the parser wrapper is imported and instantiated.
# NOTE(review): presumably WordDocParser locates the unpacked distribution through
# `documentparser`; the /content prefix assumes the Colab working directory — confirm.
os.environ ['documentparser']=f'/content/document-parser-{lib_version}'
from integration.word_document_parser import WordDocParser, join_paragraphs
wp = WordDocParser()

### imports

In [0]:
import os
import pickle
import unittest

import numpy as np

from analyser.contract_parser import ContractAnlysingContext, ContractDocument
from analyser.contract_patterns import ContractPatternFactory
from analyser.legal_docs import LegalDocument
 
from analyser.ml_tools import *

# from headers_detector import doc_features, load_model, make_headline_attention_vector
from analyser.hyperparams import HyperParameters
from analyser.protocol_parser import protocol_votes_re
 

## 💅 Init Embedder(s)

In [0]:
# from protocol_parser import  ProtocolPatternFactory
from tf_support.embedder_elmo import ElmoEmbedder
# from contract_patterns import ContractPatternFactory
# Embedder created with the constructor defaults.
elmo_embedder = ElmoEmbedder()
# Second embedder created with layer_name="default".
elmo_embedder_default = ElmoEmbedder(layer_name="default")

# protocols_factory = ProtocolPatternFactory(elmo_embedder)
# contracts_factory = ContractPatternFactory(elmo_embedder)

In [0]:
# from analyser.contract_parser import find_value_sign_currency_attention
from analyser.legal_docs import tokenize_doc_into_sentences_map, ContractValue
from analyser.ml_tools import *
from analyser.parsing import ParsingContext
from analyser.patterns import *
from analyser.protocol_parser import ProtocolDocument, find_confident_spans, protocol_votes_re, ProtocolPatternFactory
from analyser.protocol_parser import  find_org_structural_level, find_protocol_org, ProtocolParser
from analyser.text_tools import *

# legal_docs.py
from tf_support.embedder_elmo import ElmoEmbedder
 



In [0]:
# Protocol parser that receives both embedders (regular one first, layer_name="default" one second).
protocol_analyser = ProtocolParser(elmo_embedder, elmo_embedder_default)

# Read doc

In [0]:
#@title Enter uploaded file path

filename = '/content/6. \u041F\u0440\u043E\u0442\u043E\u043A\u043E\u043B_\u041D\u0435\u0434\u0432\u0438\u0436\u0438\u043C\u043E\u0441\u0442\u044C.docx' #@param {type:"string"}


# Parse the uploaded file; the parsed documents are listed under 'documents'.
results = wp.read_doc(filename)

# Only the first parsed document is looked at (XXX: any further ones are ignored).
# A separate loop variable keeps `doc` unset until a protocol has really been found.
doc = None
for _candidate in results['documents'][:1]:
  if 'PROTOCOL' == _candidate['documentType']:
    doc = join_paragraphs(_candidate, 'no_id')

# Stop here with a clear message instead of failing further down with a
# NameError (nothing parsed) or an AttributeError (first document is not a protocol).
if doc is None:
  raise ValueError(f'no PROTOCOL document found in {filename!r}')


# Print the header of every paragraph of the protocol.
for p in doc.paragraphs:
  print ('☢️', p.header.value.strip())

# 🧠 Analyse PHASE 0

In [0]:
# =====================================
# Phase 0: call find_org_date_number on the document with a fresh AuditContext.
# The same `actx` object is passed to find_attributes in the phase 1 cell.
from analyser.parsing import AuditContext
actx = AuditContext()
protocol_analyser.find_org_date_number(doc, actx)

### render PHASE 0 results

In [0]:
# Print every tag returned by doc.get_tags() ...
for t in doc.get_tags():
  print(t)
# ... then render the document tokens weighted by doc.get_tags_attention().
renderer_.render_color_text(doc.tokens,  doc.get_tags_attention() )



# 🧠Analyse PHASE 1 
(requires phase 0)

In [0]:
# Phase 1: uses `doc` and the AuditContext `actx` created in the phase 0 cell.
protocol_analyser.find_attributes(doc, actx)

### render PHASE 1 results

In [0]:
# Print every tag returned by doc.get_tags() after phase 1 ...
for t in doc.get_tags():
  print(t)
# ... then render the document tokens weighted by doc.get_tags_attention().
renderer_.render_color_text(doc.tokens,  doc.get_tags_attention() )

# save to JSON

In [0]:

# The JSON file is written next to the input: "<filename>.json".
fn =  f'{filename}.json'
print('saving JSON to', fn)

# Serialize before opening the file, so that a failure inside DocumentJson
# does not leave an empty or truncated file behind.
jjj = DocumentJson(doc)
_json_text = jjj.dumps()

# Explicit UTF-8 keeps the output independent of the platform default encoding.
with open(fn, 'w', encoding='utf-8') as file:
  file.write(_json_text)
  

In [0]:
# Last expression of the cell: its value (the 'attributes' entry) is shown as the cell output.
doc.to_json_obj()['attributes']

In [0]:
# Deliberate stop for "Runtime / Run all": the cells after this one are the Debug section.
# A plain string cannot be raised (that fails with an unrelated TypeError),
# so a real exception carries the message.
raise RuntimeError('STOP HERE')

# Debug

## Sections attention

In [0]:


# Section-edge attention computed from the per-sentence pattern distances;
# later cells reuse `protocol_sections_edges`.
protocol_sections_edges = protocol_analyser.find_protocol_sections_edges(doc.distances_per_sentence_pattern_dict)
# Rendered over the sentence-map tokens with '¶<br>' as separator and the color range fixed to [0, 1].
renderer_.render_color_text(doc.sentence_map.tokens,  protocol_sections_edges , _range=[0,1], separator='¶<br>')


### sections spans

In [0]:
from analyser.documents import sentences_attention_to_words
from analyser.dates import   document_number_c
from analyser.contract_agents import complete_re as agents_re
from analyser.transaction_values import complete_re as values_re


### AV

In [0]:

# Deal-approval sentences: per-sentence vector for the patterns prefixed 'deal_approval_' ...
v_deal_approval = max_exclusive_pattern_by_prefix(doc.distances_per_sentence_pattern_dict, 'deal_approval_')
# ... converted from the sentence map to the word-token map; the returned spans are not used in this cell.
_spans, deal_approval_av = sentences_attention_to_words(v_deal_approval, doc.sentence_map,
                                                                        doc.tokens_map)
# NOTE(review): best_above(…, 0.5) presumably keeps only values above 0.5 — confirm.
deal_approval_relu_av = best_above(deal_approval_av, 0.5)

# Votes: regex attention over the word tokens
votes_av = doc.tokens_map.regex_attention(protocol_votes_re)
# Document numbers: regex attention over the word tokens
numbers_av = doc.tokens_map.regex_attention(document_number_c)
# Agents (organizations): regex attention over the word tokens
agents_av = doc.tokens_map.regex_attention(agents_re)

# Margin values: regex attention for values, multiplied in place
# by the value attention vector obtained from the parser.
margin_values_av = protocol_analyser._get_value_attention_vector(doc)
margin_values_v = doc.tokens_map.regex_attention(values_re)
margin_values_v*=margin_values_av


# Only the deal-approval vector is rendered here; the other vectors are combined in the next cell.
renderer_.render_color_text(doc.tokens,  deal_approval_relu_av , _range=[0,1])

## Combined attention

In [0]:
# Merge the attention vectors with sum_probabilities; the agents, votes and
# numbers vectors are divided by 2 before they go in.
combined_av = sum_probabilities([deal_approval_relu_av,
                                 margin_values_v, 
                                 agents_av/2,
                                 votes_av/2, 
                                 numbers_av/2])


# The chained assignment rebinds combined_av too: both names refer to the best_above result.
# NOTE(review): best_above(…, 0.2) presumably drops values that are not above 0.2 — confirm.
combined_av_norm = combined_av = best_above(combined_av, 0.2) 
renderer_.render_color_text(doc.tokens,  combined_av_norm , _range=[0,1])

In [0]:
# v_sections_attention = find_protocol_sections_edges(protocol_analyser, doc.distances_per_sentence_pattern_dict)

# Spans derived from the section-edge attention (sentence-level, judging by the remap below — TODO confirm) ...
_question_spans_sent = spans_between_non_zero_attention(protocol_sections_edges)
# ... remapped from the sentence map to the word-token map.
question_spans_words = doc.sentence_map.remap_slices(_question_spans_sent, doc.tokens_map)
# find_confident_spans gets the word spans, the combined attention, the kind 'agenda_item' and 0.5.
# NOTE(review): 0.5 is presumably a confidence threshold — confirm.
agenda_questions = list(find_confident_spans(question_spans_words, combined_av_norm, 'agenda_item', 0.5))

# Print every found agenda item followed by the document text it covers.
for x in agenda_questions:
  print("="*100)
  print(x)
  print(doc.substr(x))

# Render each question span on its own, weighted by the matching slice of the combined attention.
for span in question_spans_words:
  print("="*100)
  sl=slice(span[0],span[1])
  renderer_.render_color_text(doc[sl].tokens,  combined_av_norm[sl] , _range=[0,1])

In [0]:
# Deliberate stop so that "Run all" does not continue into the remaining debug cells.
# A plain string cannot be raised (that fails with an unrelated TypeError),
# so a real exception carries the message.
raise RuntimeError('stop here please for now')

## Debug votes finder

In [0]:
from analyser.protocol_parser import ProtocolAV
# Render the vector stored in doc.distances_per_pattern_dict under the name of ProtocolAV.bin_votes_attention.
renderer_.render_color_text(doc.tokens,  doc.distances_per_pattern_dict[ProtocolAV.bin_votes_attention.name] , _range=[0,1])


In [0]:

# Render the sum of the document-number and vote regex attentions; the color range is set to [0, 2].
renderer_.render_color_text(doc.tokens, numbers_av+votes_av, _range=[0,2])


### debug protocol_votes_re

In [0]:
# First match of the votes regex in the document text.
x = protocol_votes_re.search(doc.text)

# `search` returns None when nothing matches; report that instead of failing on `x.span()`.
if x is None:
  match = None
  print('protocol_votes_re: no match')
else:
  # group(0) is the matched substring, i.e. doc.text sliced by x.span().
  match = x.group(0)
  print(f'[{match}]')

### debug spans_having_votes_words

In [0]:
# Recompute the section-edge attention and the spans derived from it,
# then remap the spans from the sentence map to the word-token map.
v_sections_attention = protocol_analyser.find_protocol_sections_edges(doc.distances_per_sentence_pattern_dict)
question_spans_sent = spans_between_non_zero_attention(v_sections_attention)
question_spans_words = doc.sentence_map.remap_slices(question_spans_sent, doc.tokens_map)

# Print every word-level span together with the text range it maps to.
for c in question_spans_words:
  print('-'*80)
  print (c, doc.tokens_map.text_range(c))