# Notebook setup

In [1]:
import nltk
from nltk import sent_tokenize, word_tokenize
import os
import string
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the NLTK resources this notebook relies on: the 'punkt' tokenizer
# models and the VADER sentiment lexicon.
for _resource in ('punkt', 'vader_lexicon'):
    nltk.download(_resource)

[nltk_data] Downloading package punkt to /Users/zkovacs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zkovacs/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Debug switch checked by the per-feature loops in the cells below: they only
# print their results when this is True. Left False to reduce execution time.
# Note that those loops still read every file either way.
output = False

In [3]:
# Folder holding the OCR'd text files, relative to the working directory.
outPath = "../training_set/ocr_output/"

# Only the top level of the tree is needed. next() with a default keeps
# `filenames` bound (to an empty list) when the folder does not exist, instead
# of leaving the name undefined and failing further down with a NameError.
dirpath, dirnames, filenames = next(os.walk(outPath), (outPath, [], []))

# Drop the macOS Finder metadata file and sort the names so that every run
# processes the documents in the same order, whatever order the file system
# lists them in.
filenames = sorted(name for name in filenames if name != '.DS_Store')

# Features

## Useful functions

In [4]:
def readFile(filename):
    """Read one OCR output file and unwrap its hard line breaks.

    Args:
        filename: File name, appended as-is to the module-level ``outPath``.

    Returns:
        The decoded content in which every blank-line break (two consecutive
        newlines) has become a single newline and every other newline has
        become a space.
    """
    # The files are decoded as cp1252. The context manager guarantees the
    # handle is closed, including when reading or decoding raises.
    with open(outPath + filename, 'r', encoding="cp1252") as f:
        rawText = f.read()
    # Shield paragraph breaks behind a placeholder, flatten the remaining line
    # breaks into spaces, then restore each paragraph break as one newline.
    return rawText.replace("\n\n", "%EOL%").replace("\n", " ").replace("%EOL%", "\n")


def removePunctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return "".join(char for char in text if char not in string.punctuation)


def findWithKeywords(text, anyKeywords=None, allKeywords=None, excludedKeywords=None):
    """Select the sentences of ``text`` that satisfy the keyword filters.

    Matching is a case-insensitive substring test on each sentence.

    Args:
        text: Document text; it is split into sentences with ``sent_tokenize``.
        anyKeywords: If non-empty, a sentence must contain at least one of these.
        allKeywords: If non-empty, a sentence must contain every one of these.
        excludedKeywords: A sentence containing any of these is rejected.

    Returns:
        The selected sentences, in document order, joined by blank lines
        (``"\\n\\n"``); an empty string when nothing matches.
    """
    # Sentences are compared in lower case, so the keywords must be lowered
    # too; otherwise a keyword containing a capital letter could never match.
    anyKeywords = [keyword.lower() for keyword in (anyKeywords or [])]
    allKeywords = [keyword.lower() for keyword in (allKeywords or [])]
    excludedKeywords = [keyword.lower() for keyword in (excludedKeywords or [])]

    # Collapse single line breaks so wrapped lines do not confuse the sentence
    # tokenizer; paragraph breaks survive as single newlines.
    text = text.replace("\n\n", "%EOL%").replace("\n", " ").replace("%EOL%", "\n")

    matched = []
    for sentence in sent_tokenize(text):
        lowered = sentence.lower()  # lower-case once per sentence
        if anyKeywords and not any(keyword in lowered for keyword in anyKeywords):
            continue
        if allKeywords and not all(keyword in lowered for keyword in allKeywords):
            continue
        if any(keyword in lowered for keyword in excludedKeywords):
            continue
        matched.append(sentence)

    return "\n\n".join(matched)


def findWithKeywordsSentenceWindow(text, anyKeywords=None, allKeywords=None, excludedKeywords=None, windowSize=1):
    """Select runs of consecutive sentences that together satisfy the filters.

    Each window starts at one sentence and also takes the ``windowSize``
    sentences that follow it, so the keywords may be spread over neighbouring
    sentences. Matching is a case-insensitive substring test on the window.

    Args:
        text: Document text; it is split into sentences with ``sent_tokenize``.
        anyKeywords: If non-empty, a window must contain at least one of these.
        allKeywords: If non-empty, a window must contain every one of these.
        excludedKeywords: A window containing any of these is rejected.
        windowSize: Number of following sentences added to each starting
            sentence. The default of 1 yields overlapping pairs; 2 yields
            overlapping triples. Negative values are treated as 0.

    Returns:
        The matching windows joined by blank lines (``"\\n\\n"``). The sentences
        inside a window are also separated by blank lines, and overlapping
        windows repeat their shared sentences. Empty string when nothing matches.
    """
    # Windows are compared in lower case, so lower the keywords as well.
    anyKeywords = [keyword.lower() for keyword in (anyKeywords or [])]
    allKeywords = [keyword.lower() for keyword in (allKeywords or [])]
    excludedKeywords = [keyword.lower() for keyword in (excludedKeywords or [])]
    windowSize = max(windowSize, 0)

    text = text.replace("\n\n", "%EOL%").replace("\n", " ").replace("%EOL%", "\n")
    sentences = sent_tokenize(text)

    # One window per start position that still has `windowSize` followers. A
    # text too short for a full window is examined once as a whole rather than
    # being skipped entirely.
    windowCount = max(len(sentences) - windowSize, 1) if sentences else 0

    matched = []
    for index in range(windowCount):
        # Honour windowSize here; a fixed `index + 1` would ignore it.
        window = '\n\n'.join(sentences[index:index + windowSize + 1])
        lowered = window.lower()
        if anyKeywords and not any(keyword in lowered for keyword in anyKeywords):
            continue
        if allKeywords and not all(keyword in lowered for keyword in allKeywords):
            continue
        if any(keyword in lowered for keyword in excludedKeywords):
            continue
        matched.append(window)

    return "\n\n".join(matched)


def findSentencesWithAnyKeywords(text, keywords, excludedKeywords=[]):
    """Return the sentences of ``text`` containing at least one of ``keywords``.

    Thin wrapper around ``findWithKeywords``: ``keywords`` is forwarded as
    ``anyKeywords`` and ``excludedKeywords`` is forwarded unchanged.
    """
    criteria = {"anyKeywords": keywords, "excludedKeywords": excludedKeywords}
    return findWithKeywords(text, **criteria)


def findSentencesWithAllKeywords(text, keywords, excludedKeywords=[]):
    """Return the sentences of ``text`` containing every one of ``keywords``.

    Thin wrapper around ``findWithKeywords``: ``keywords`` is forwarded as
    ``allKeywords`` and ``excludedKeywords`` is forwarded unchanged.
    """
    criteria = {"allKeywords": keywords, "excludedKeywords": excludedKeywords}
    return findWithKeywords(text, **criteria)

def findDirectorNumberText(text):
    """Return the sentences that state the number of directors.

    A sentence qualifies when it contains "number of directors" and none of
    the excluded terms listed below.
    """
    required = ["number of directors"]
    excluded = ["chair", "vacancy", "vacancies", "quorum"]
    return findSentencesWithAllKeywords(text, required, excluded)

def findFirstNumberAfterWord(text, paramWord=""):
    """Find the first number at or after the first occurrence of ``paramWord``.

    Args:
        text: Text to scan; it is split into tokens with ``word_tokenize``.
        paramWord: Anchor word, matched against whole tokens ignoring case.
            With the default empty string the scan starts at the first token.

    Returns:
        * an ``int`` when the number is spelled out ("zero" to "twenty"),
        * a ``str`` of digits when it is written in figures,
        * ``""`` when the anchor is not a token of ``text`` or when no number
          follows it.
        NOTE(review): the int/str mix is kept as-is for compatibility with
        existing callers.
    """
    numWords = [
        "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen", "twenty"]
    listWords = word_tokenize(text)

    start = 0
    if paramWord:
        # list.index() would raise ValueError when the anchor only occurs
        # inside another token (e.g. "less" in "unless") or with different
        # capitalisation, so look it up explicitly and bail out gracefully.
        anchor = paramWord.lower()
        start = next(
            (position for position, token in enumerate(listWords) if token.lower() == anchor),
            None)
        if start is None:
            return ""

    for token in listWords[start:]:
        word = removePunctuation(token)
        lowered = word.lower()  # accept "Three" as well as "three"
        if lowered in numWords:
            return numWords.index(lowered)
        if word.isdigit():
            return word
    return ""

## Is the company empowered to borrow?

In [5]:
def findCanBorrowText(text):
    """Collect the sentences used to decide whether the company may borrow.

    Two searches are run and their results joined with a single space:
    sentences containing any of "any business", "issue debt" or
    "indebtedness", followed by sentences containing "money" together with
    "borrow" or "raise".
    """
    generalPowers = findSentencesWithAnyKeywords(text, ["any business", "issue debt", "indebtedness"])
    moneyRaising = findWithKeywords(text, anyKeywords=["borrow", "raise"], allKeywords=["money"])
    return " ".join([generalPowers, moneyRaising])


def canBorrow(text):
    """Answer "yes" or "no": is the company empowered to borrow?

    Returns "no" when ``findCanBorrowText`` finds nothing but whitespace;
    otherwise returns the verdict of ``getSentiment`` on the sentences found.
    """
    evidence = findCanBorrowText(text)
    return getSentiment(evidence) if evidence.strip() else "no"


# Shared analyzer instance, created on first use by _getSentimentAnalyzer().
_sentimentAnalyzer = None


def _getSentimentAnalyzer():
    """Return the shared SentimentIntensityAnalyzer, building it on first use."""
    global _sentimentAnalyzer
    if _sentimentAnalyzer is None:
        _sentimentAnalyzer = SentimentIntensityAnalyzer()
    return _sentimentAnalyzer


def getSentiment(text):
    """Turn the VADER sentiment of ``text`` into a "yes"/"no" answer.

    Args:
        text: The sentences to score.

    Returns:
        ``""`` when ``text`` is empty or whitespace only; otherwise "yes" if
        the "compound" polarity score is strictly positive and "no" if it is
        zero or negative.
    """
    if text.strip() == "":
        return ""
    # Reuse one analyzer across calls instead of constructing a new one for
    # every document; the scores themselves are unaffected.
    scores = _getSentimentAnalyzer().polarity_scores(text)
    return "yes" if scores["compound"] > 0 else "no"


# Debug dump: every file is read, and its borrowing verdict is printed only
# while `output` is switched on.
for filename in filenames:
    text = readFile(filename)
    if not output:
        continue
    print(filename)
    print(canBorrow(text))
    print("\n")

## What is the size of the board of directors? Minimum and maximum.

In [6]:
def findMinDirectors(fullText):
    """Extract the minimum board size from a document.

    Args:
        fullText: Complete document text.

    Returns:
        * "noMin" when the director sentences say "no minimum",
        * otherwise whatever ``findFirstNumberAfterWord`` finds after the word
          "minimum", or failing that after "less" (for "not less than" and
          "shall not be less than"),
        * "1" when neither word occurs as a token.
    """
    # Lower-case once so capitalised wording ("Minimum", "Less", "Three") is
    # recognised by the checks below and by the number lookup.
    directorText = findDirectorNumberText(fullText).lower()
    if "no minimum" in directorText:
        return "noMin"

    # Look for the markers among the tokens rather than as raw substrings:
    # "less" is a substring of "unless", and asking for the number after a
    # word that is not actually a token makes the lookup fail.
    tokens = word_tokenize(directorText)
    for marker in ("minimum", "less"):
        if marker in tokens:
            return findFirstNumberAfterWord(directorText, marker)
    return "1"

def findMaxDirectors(fullText):
    """Extract the maximum board size from a document.

    Args:
        fullText: Complete document text.

    Returns:
        * "noMax" when the director sentences say "no maximum",
        * otherwise whatever ``findFirstNumberAfterWord`` finds after the word
          "maximum", or failing that after "more" (for "not more than" and
          "shall not be more than"),
        * "noMax" when neither word occurs as a token.
    """
    # Lower-case once so capitalised wording ("Maximum", "More", "Nine") is
    # recognised by the checks below and by the number lookup.
    directorText = findDirectorNumberText(fullText).lower()
    if "no maximum" in directorText:
        return "noMax"

    # Look for the markers among the tokens rather than as raw substrings:
    # "more" is a substring of "furthermore", and asking for the number after
    # a word that is not actually a token makes the lookup fail.
    tokens = word_tokenize(directorText)
    for marker in ("maximum", "more"):
        if marker in tokens:
            return findFirstNumberAfterWord(directorText, marker)
    return "noMax"  # TODO: Use noMax if ran out of ideas

# Debug dump: every file is read; the director sentences and the extracted
# minimum and maximum are printed only while `output` is switched on.
for filename in filenames:
    text = readFile(filename)
    if not output:
        continue
    print(filename)
    print(findDirectorNumberText(text))
    print(findMinDirectors(text))
    print(findMaxDirectors(text))
    print("\n")

## Are the directors empowered to borrow?

In [7]:
def findDirectorsCanBorrowText(text):
    """Collect sentences about what the directors may or can do regarding debt.

    Runs one search per phrase ("directors may", then "directors can"), each
    also requiring one of the debt-related keywords, and joins the two results
    with a single space.
    """
    debtKeywords = ["borrow", "debt", "incur", "indebtedness"]
    passages = [
        findWithKeywords(text, anyKeywords=debtKeywords, allKeywords=[phrase])
        for phrase in ("directors may", "directors can")
    ]
    return " ".join(passages)

def findBoardCanBorrowText(text):
    """Return the sentences containing "the board may" plus a debt-related keyword."""
    debtKeywords = ["borrow", "debt", "incur", "indebtedness"]
    return findWithKeywords(
        text,
        anyKeywords=debtKeywords,
        allKeywords=["the board may"],
    )

def canDirectorsBorrow(text):
    """Answer whether the directors are empowered to borrow.

    Decision order:
      1. If sentences about the directors and debt exist, return the verdict
         of ``getSentiment`` on them.
      2. Otherwise return "no" if a "the board may" debt sentence exists.
      3. Otherwise return "yes".
    """
    directorsPassages = findDirectorsCanBorrowText(text)
    if directorsPassages.strip():
        return getSentiment(directorsPassages)
    # The board search only runs when nothing was found for the directors.
    return "no" if findBoardCanBorrowText(text).strip() else "yes"

## Is a resolution of directors required to borrow?

In [8]:
def resolutionNeeded(text):
    """Answer whether the directors need a resolution in order to borrow.

    Args:
        text: Complete document text.

    Returns:
        "yes" when ``canDirectorsBorrow`` answers "yes" for the document and
        the sentences about the directors' borrowing mention "resolution"
        (case-insensitive); "no" in every other case.
    """
    # canDirectorsBorrow() returns the strings "yes"/"no", which are both
    # truthy, so the answer has to be compared explicitly. It is given the
    # whole document, the same input the other callers pass.
    if canDirectorsBorrow(text) != "yes":
        return "no"
    directorsText = findDirectorsCanBorrowText(text)
    return "yes" if "resolution" in directorsText.lower() else "no"

# Debug dump: every file is read; the directors' borrowing sentences and the
# two derived answers are printed only while `output` is switched on.
for filename in filenames:
    text = readFile(filename)
    if not output:
        continue
    print(filename)
    print(findDirectorsCanBorrowText(text))
    print(canDirectorsBorrow(text))
    print(resolutionNeeded(text))
    print("\n")

## What is the quorum for such a resolution?

In [9]:
def findQuorumText(text, keywords=["quorum", "number"]):
    """Return the sentence windows that talk about a quorum.

    Delegates to ``findWithKeywordsSentenceWindow`` with ``windowSize=2``:
    a window must contain every entry of ``keywords`` and at least one of
    "directors" or "shareholders".
    """
    audience = ["directors", "shareholders"]
    return findWithKeywordsSentenceWindow(
        text,
        anyKeywords=audience,
        allKeywords=keywords,
        windowSize=2,
    )

def findQuorum(fullText):
    """Extract the quorum stated in a document.

    Args:
        fullText: Complete document text.

    Returns:
        The wording found between "not less than" and "of the" in the quorum
        sentences, with hyphens and em dashes replaced by spaces (so
        "one-third" becomes "one third"); "2" when no such wording is found.
    """
    quorumText = findQuorumText(fullText)
    if quorumText.strip() == "":
        # Second attempt with "meeting" instead of "number". It must search
        # the document passed in, not whatever a global `text` happens to hold.
        quorumText = findQuorumText(fullText, keywords=["quorum", "meeting"])

    match = re.search(r'not less than (.*?) of the', quorumText)
    if match is None:
        return "2"
    return match.group(1).translate(str.maketrans('-—', '  '))
    
# Debug dump: every file is read; the quorum sentences and the extracted
# quorum are printed only while `output` is switched on.
for filename in filenames:
    text = readFile(filename)
    if not output:
        continue
    print(filename)
    print(findQuorumText(text))
    print("quorum : " + findQuorum(text))
    print("\n")