# CHARTERS analysis
## Instructions:
1. On the left-side panel, switch to the Files tab
2. Upload a charter file
3. Right-click the uploaded file -> Copy path
4. Enter (paste) its path/name into the 'Read doc' form below
5. Top menu: Runtime / Run all

Requires the document-parser library: https://github.com/nemoware/document-parser/releases

In [0]:
# Branch of https://github.com/nemoware/analyser.git that the "pull code" cell clones.
_git_branch = 'charters-limitations'


# Release of the document-parser distribution that is downloaded and unzipped
# in the "Init document-parser lib" cell.
lib_version = '1.1.15' # document-parser release, see https://github.com/nemoware/document-parser/releases

# INIT

## pull code

In [0]:
import json
import subprocess
import sys
import time

import numpy as np
from IPython.core.display import display, HTML
from google.colab import files

!pip install overrides

# "Ничто" is Russian for "nothing"; this binds that name as an alias for None.
Ничто = None



def exec(x):
  """Run the shell command line *x* and print its captured standard output.

  NOTE: this deliberately keeps the notebook's name ``exec`` and therefore
  shadows the built-in of the same name for the rest of the session.

  Raises ``subprocess.CalledProcessError`` when the command exits with a
  non-zero status (behaviour of ``subprocess.check_output``).
  """
  raw_output = subprocess.check_output(x, shell=True)
  # Step 1: interpret backslash escape sequences found in the raw bytes.
  unescaped = raw_output.decode('unicode-escape')
  # Step 2: map the code points back to single bytes and read them as UTF-8.
  text = unescaped.encode('latin1').decode('utf8')
  print(text)


# Fetch the analyser sources from GitHub into ./nlp_tools and make them importable.
print(f"fetching code from GitHub.....{_git_branch}")
try:
  # Drop a checkout left over from a previous run so `git clone` gets a free target.
  exec('rm -r nlp_tools')
except subprocess.CalledProcessError:
  # `rm` exited non-zero (e.g. nlp_tools does not exist yet on a first run):
  # nothing to clean up. Any other exception is no longer silently swallowed.
  pass
exec(f'git clone --single-branch --branch {_git_branch} https://github.com/nemoware/analyser.git nlp_tools')

print('🦊 GIT revision:')
# One shell script, three commands: commit count, current branch, last 3 commit messages.
exec('cd nlp_tools\ngit rev-list --reverse HEAD | awk "{ print NR }" | tail -n 1\ngit branch\ngit log -3 --pretty=%B')

# Put the fresh checkout first on the import path so its packages are picked up.
sys.path.insert(0, 'nlp_tools')

print('❤️importing Code from GitHub ... DONE')


#----
import matplotlib as mpl
from analyser.documents import TextMap
from analyser.legal_docs import DocumentJson
from colab_support.renderer import HtmlRenderer

 

class DemoRenderer(HtmlRenderer):
  """HtmlRenderer subclass that shows coloured token text in the notebook output."""

  def render_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Build the HTML for *tokens* via ``to_color_text`` and display it inline."""
    markup = self.to_color_text(
      tokens,
      weights,
      colormap=colormap,
      print_debug=print_debug,
      _range=_range,
      separator=separator,
    )
    display(HTML(markup))

  def to_color_text(self, tokens, weights, colormap='coolwarm', print_debug=False, _range=None, separator=' '):
    """Return what the parent class's ``_to_color_text`` produces for *tokens*.

    The matplotlib module (``mpl``) is handed to the parent as its third
    positional argument. ``print_debug`` is accepted for signature
    compatibility but is not forwarded.
    """
    forwarded = dict(colormap=colormap, _range=_range, separator=separator)
    return super()._to_color_text(tokens, weights, mpl, **forwarded)

   
# Module-level DemoRenderer instance; the rendering cells call renderer_.render_color_text(...).
renderer_ = DemoRenderer()

def print_json_summary(cd: 'DocumentJson'):
  """Print the source file name of *cd* followed by each of its tags.

  Args:
    cd: object exposing ``filename`` and an iterable ``tags``.

  The previous version also built a ``TextMap`` and looked up a tokenization
  map for every tag, but never used the results; that dead work is removed.
  """
  print(f'read file {cd.filename}')

  for tag in cd.tags:
    print(tag)
 
  

In [0]:
!pip install pyjarowinkler

### Init document-parser lib

In [0]:

import os
# Download the release archive only if it is not already in the working directory.
if not os.path.isfile(f'document-parser-{lib_version}-distribution.zip'):
  !wget https://github.com/nemoware/document-parser/releases/download/$lib_version/document-parser-$lib_version-distribution.zip
# Unpack the archive only if the target directory does not exist yet.
if not os.path.isdir(f'document-parser-{lib_version}'):
  !unzip document-parser-$lib_version-distribution.zip


# Export the location of the unpacked distribution (absolute path under /content).
# NOTE(review): presumably WordDocParser reads this variable - verify in integration.word_document_parser.
os.environ ['documentparser']=f'/content/document-parser-{lib_version}'
from integration.word_document_parser import WordDocParser, join_paragraphs
# Parser instance used by the "Read doc" cell (wp.read_doc).
wp = WordDocParser()

### imports

In [0]:
import os
import pickle
import unittest

import numpy as np

from analyser.charter_parser import CharterParser 
from analyser.parsing import AuditContext

from analyser.legal_docs import LegalDocument
from analyser.ml_tools import *
from analyser.text_tools import *

from tf_support.embedder_elmo import ElmoEmbedder
 

In [0]:
from tf_support.embedder_elmo import ElmoEmbedder
# Two embedder instances: one built with the constructor defaults, one with
# layer_name='default'. Both are passed to CharterParser in the next cell.
elmo_embedder = ElmoEmbedder()
elmo_embedder_default = ElmoEmbedder(layer_name='default')

## 💅 Init Embedder(s)

In [0]:
# Charter parser shared by the analysis and debug cells; built from both embedders.
charter_analyser = CharterParser(elmo_embedder, elmo_embedder_default)

# Read doc

In [0]:
#@title Enter uploaded file path

filename = '/content/\u0423\u0441\u0442\u0430\u0432 - \u0413\u041F\u041D-\u0422\u0440\u0430\u043D\u0441\u043F\u043E\u0440\u0442_\u0413\u041E\u0421\u0410-2018.docx' #@param {type:"string"}
subsidiary_name = '--\u041E\u043F\u0446\u0438\u043E\u043D\u0430\u043B\u044C\u043D\u043E--' #@param {type:"string"}


# Parse the uploaded file with the document-parser wrapper.
results = wp.read_doc(filename)

# Only the first parsed document is examined (XXX: limitation kept from the
# original cell). `doc` is bound only when that document is a charter, so a
# non-charter or empty result is reported explicitly instead of failing later
# on `doc.paragraphs` with an unrelated AttributeError/NameError.
doc = None
for _parsed in results['documents'][:1]:
  if 'CHARTER' == _parsed['documentType']:
    doc = join_paragraphs(_parsed, 'no_id')

if doc is None:
  raise ValueError(f'No CHARTER document found as the first parsed document of {filename!r}')

# List the stripped header of every paragraph as a quick sanity check.
for p in doc.paragraphs:
  print('☢️', p.header.value.strip())

# 🧠 Analyse PHASE 0

In [0]:
# =====================================
from analyser.parsing import AuditContext
# Fresh audit context, reused by the PHASE 1 cell as well.
actx = AuditContext()
# PHASE 0 of the analysis; the tags it produces are printed in the next cell via doc.get_tags().
charter_analyser.find_org_date_number(doc, actx)

### render PHASE 0 results 
(note: the document is trimmed to the first 300 tokens)

In [0]:
# Print every tag currently attached to the document.
for t in doc.get_tags():
  print(t)
print('\n\n')

# Render only the first 300 tokens, coloured by the matching slice of the tags-attention vector.
renderer_.render_color_text(doc.tokens[0:300],  doc.get_tags_attention()[0:300] )



# 🧠 Analyse PHASE 1 
(requires phase 0)

In [0]:
# PHASE 1 of the analysis; reuses `doc` and `actx` from the PHASE 0 cells.
charter_analyser.find_attributes(doc, actx)

### render PHASE 1 results

#### Charter limitations

In [0]:
# ANSI escape sequence: bold red. NOTE(review): it is never reset, so the
# colour may carry over to later terminal-style output.
_P = '\033[1;31m'
# Attention vector over the whole document; also reused by the following cells.
ta = doc.get_tags_attention()
for t in doc.constraint_tags + doc.charity_tags:
  print('>'*100)
  print(t.get_key())
  print(f'☢{_P} {t.confidence:1.4f}', t.value.display_string.upper())

  len_ = t.span[1]-t.span[0]
  # Spans of 300 tokens or more are not rendered.
  if len_ < 300:
    # Index with the slice object itself. The previous `ta[[t.as_slice()]]`
    # wrapped the slice in a list, which NumPy >= 1.23 rejects for ndarrays
    # (and which plain lists never accepted); this matches the cell below.
    # The weights are the attention scaled by 0.05 plus the tag confidence.
    weights = ta[t.as_slice()]*0.05 + [t.confidence]*len_
    renderer_.render_color_text(doc.tokens[t.as_slice()], weights, _range=(0.66, 1))

  print('^'*100)

In [0]:

# Print every tag and, for spans shorter than 300 tokens, render the tagged
# tokens coloured by the attention vector `ta` computed in the previous cell.
for t in doc.get_tags():
  print('☢️','>'*100)
  print(t)
  # print(doc.substr(t))
  if t.span[1]-t.span[0] < 300:
    renderer_.render_color_text(doc.tokens[t.as_slice()],  ta[t.as_slice()], _range=(0.1,5) )
  
  print('^'*100)
print('\n\n')


In [0]:
# Render the whole document, coloured by the attention vector `ta` from the "Charter limitations" cell.
renderer_.render_color_text(doc.tokens,  ta )

# save to JSON

In [0]:
# The JSON is written next to the source document: "<filename>.json".
fn = f'{filename}.json'
print('saving JSON to', fn)

# Serialise before opening the target so that a failure in DocumentJson or
# dumps() does not leave an empty/truncated file behind.
jjj = DocumentJson(doc)
json_text = jjj.dumps()

# Explicit UTF-8: the default text encoding is platform dependent and the
# document text is not guaranteed to be ASCII.
with open(fn, 'w', encoding='utf-8') as file:
  file.write(json_text)
  

In [0]:
# Display the 'warnings' entry of the document's JSON object as the cell result.
doc.to_json_obj()['warnings']

In [0]:
# Display the 'attributes' entry of the document's JSON object as the cell result.
doc.to_json_obj()['attributes']

# Debug

In [0]:
from analyser.structures import OrgStructuralLevel, CharterSubject
from analyser.contract_parser import _find_most_relevant_paragraph, find_value_sign_currency_attention
from analyser.charter_parser import get_charter_subj_attentions, collect_subjects_spans
# Every member obtained by iterating OrgStructuralLevel.
org_levels = list(OrgStructuralLevel)

# Pick a tag whose value is one of the org levels. There is intentionally no
# `break`: every match is printed and the LAST matching tag is kept.
sample_org_level_tag = None
for t in doc.get_tags():
  if t.value in org_levels:
    print(t.value)
    sample_org_level_tag = t

# Fail with a clear message instead of an AttributeError on None.as_slice().
if sample_org_level_tag is None:
  raise ValueError('No tag with an OrgStructuralLevel value found; run the analysis phases first')

# Restrict the document to the span covered by the selected tag.
subdoc = doc[sample_org_level_tag.as_slice()]
subject_attentions_map = get_charter_subj_attentions(subdoc, charter_analyser.subj_patterns_embeddings)

subject_spans = collect_subjects_spans(subdoc, subject_attentions_map)


In [0]:
# Attribute the subject spans collected above for the sampled org-level tag.
# The result is unpacked into constraint_tags and rebinds subject_attentions_map.
constraint_tags, subject_attentions_map = charter_analyser.attribute_spans_to_subjects(subject_spans, subdoc, sample_org_level_tag)
# NOTE(review): a bare expression that is not the last statement of a cell has no visible effect.
constraint_tags

# Print each resulting tag together with the text it covers.
# NOTE(review): the text is taken from `doc`, not `subdoc` - presumably tag spans
# are in full-document coordinates; verify.
for t in constraint_tags:
  print('⚡️')
  print(t)
  print(doc.substr(t))

In [0]:
from analyser.legal_docs import ContractValue, LegalDocumentExt
from analyser.charter_parser import collect_sentences_having_constraint_values


# Find values in the sub-document; None is passed as the second argument.
# NOTE(review): `[ContractValue]` is a list literal used as an annotation, presumably meaning "list of ContractValue".
contract_values: [ContractValue] = find_value_sign_currency_attention(subdoc, None)
# Collect sentence spans for those values, asking the helper to merge spans (merge_spans=True).
valued_sentence_spans = collect_sentences_having_constraint_values(subdoc, contract_values, merge_spans=True)
# Last expression of the cell: displayed as the cell result.
valued_sentence_spans 

In [0]:
# Print the sub-document text covered by each collected span, separated by a dashed rule.
for sp in valued_sentence_spans:
  print('-'*100)
  print(subdoc.tokens_map.text_range(sp ))


In [0]:
# Pool the value-bearing sentence spans with the subject spans (in that order)
# and pass them through merge_colliding_spans (star-imported helper; presumably
# it merges overlapping spans - verify). Unpacking replaces the two manual
# append loops that only copied the iterables.
united_spans = merge_colliding_spans([*valued_sentence_spans, *subject_spans])

# Re-run the attribution on the pooled spans for the same sampled org-level tag.
constraint_tags, subject_attentions_map = charter_analyser.attribute_spans_to_subjects(united_spans, subdoc, sample_org_level_tag)

# Print each resulting tag together with the text it covers in the full document.
for t in constraint_tags:
  print('⚡️')
  print(t)
  print(doc.substr(t))