## I. Setup, Installations, and Imports

#### (a) Installations

Uncomment and run these if the packages are not already installed on your computer.

In [1]:
# ! pip install spacy
# ! pip install nltk
# ! python -m spacy download en_core_web_sm
# ! pip install svgling

#### (b) Imports

In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import glob
from tqdm import tqdm
from bs4 import BeautifulSoup

#### (c) Downloads

Uncomment and run these if the NLTK resources are not already downloaded on your computer.

In [3]:
# nltk.download('words')
# nltk.download('punkt')
# nltk.download('maxent_ne_chunker')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('state_union')

## II. Defined Functions Used In Program


#### (a) Named Entity Recognizer Function
**Input:** A sentence <br> 
**Output:** A dictionary whose `entities` key holds the list of named entities found in that sentence (plus a temporary placeholder value, `random_out`)

In [4]:
# here is some temp code: i'm trying to figure out how to find/save the verb
# sentence = '13.7 In the event that a court of competent jurisdiction holds that\nparticular provisions or requirements of this Agreement are in violation of any\nlaw, such provisions or requirements shall be enforced and shall remain in full\nforce and effect to the extent they are not in violation of any such law or not\notherwise unenforceable, and all other provisions and requirements of this\nAgreement shall remain in full force and effect.'
# sentence = '13.6 This Agreement is deemed to have been entered into in the State of\nIllinois and its interpretations, construction, and the remedies for its\nenforcement of breach are to be applied pursuant to and in accordance with the\nlaws of the State of Illinois. 	'
# Active sample input for the experiment: a sentence taken from the ex10-11.txt contract addendum.
sentence = '\nEX-10\n2\nex10-11.txt\nCFC INTERNATIONAL, INC. - CONTRACT ADDENDUM\n\n ADDENDUM TO\n PURCHASE AGREEMENT - DATED MARCH 1, 2001\n\nThis Agreement (the "Addendum"), effective October 15, 2002, between CFC\nInternational, a Delaware corporation, ("CFC"), and Baxter Healthcare\nCorporation, a Delaware corporation, and its successors, affiliates and assigns\n("Baxter"), amends the Purchase Agreement ("Agreement") between the two\ncompanies dated March 1, 2001.'
words = nltk.word_tokenize(sentence)         # break the sentence down into word tokens
tagged = nltk.pos_tag(words)                 # tag each token with its part of speech
chunks = nltk.ne_chunk(tagged, binary=False) # binary=False: named entities are classified (e.g. PERSON, ORGANIZATION)

# TODO: experiment here to get the verb and subject; once done, implement it in the ner() function below

In [5]:
def ner(sentence):
    '''
    Run NLTK named-entity recognition on a single sentence.

    Input:  sentence - the text of one sentence (str)
    Output: dict of sentence-level outputs:

        {'entities'   : list of entity strings, in order of appearance,
         'random_out' : placeholder float from np.random.uniform()}
    '''
    tokens = nltk.word_tokenize(sentence)           # split the sentence into word tokens
    pos_tagged = nltk.pos_tag(tokens)               # attach a part-of-speech tag to every token
    tree = nltk.ne_chunk(pos_tagged, binary=False)  # binary=False -> entities are classified (e.g. PERSON, ORGANIZATION)

    # Only chunks carrying a `label` attribute are treated as named entities;
    # the first item of each of their leaves is joined with spaces to form the entity text.
    entities = [
        ' '.join(leaf[0] for leaf in node)
        for node in tree
        if hasattr(node, 'label')
    ]

    # todo add code here as needed to get the verb and subject
    # (if you get them by walking the chunk tree, fold that into the traversal above)

    # todo update the output dictionary to output the verb and subject
    # (and delete the placeholder "random" output)
    return {'entities': entities, 'random_out': np.random.uniform()}

#### (b) Filename Traversal Function
**Input:** A filename **(e.g. inputs/ex10.txt)** <br>
**Output:** A dictionary mapping each sentence in the text file to a dictionary of its sentence-level outputs **(e.g. {"sentence1": {"entities": [listOfEntities1], ...}, "sentence2": {"entities": [listOfEntities2], ...}})**

In [6]:
def doc_trawl(filename):
    '''
    Read one document and compute the sentence-level outputs for each sentence.

    Input:  filename - path of the text/HTML file to read

    Output is dict of dicts:

         {sentence: {sentence_level_outputs}}

    where sentence_level_outputs

        {'analysis type/function' : output thereof}

    Note: the result is keyed by the sentence text, so a sentence that occurs
    more than once in the file keeps only the outputs of its last occurrence.
    '''

    # Strip any markup with BeautifulSoup, then split the plain text into sentences.
    with open(filename, "r") as handle:
        plain_text = BeautifulSoup(handle.read(), 'html.parser').get_text()
        sentences = nltk.sent_tokenize(plain_text)

    results = {}

    for sent in sentences:

        # Start from everything the ner function reports for this sentence
        # (key = analysis type/function, value = output thereof).
        outputs = dict(ner(sent))

        # Outputs that do not rely on the ner tokenization belong here.
        # To show that the plumbing works correctly, add a second variable:
        outputs['random_num'] = np.random.uniform()

        results[sent] = outputs

    return results

## III. Automation

In [7]:
# Get all the files in the inputs folder. glob returns matches in a
# filesystem-dependent order, so sort them to keep the processing order
# (and the row order of the resulting table) reproducible across machines.
files = sorted(glob.glob("inputs/*"))

# Map each input path to the {sentence: {analysis: output}} dict built by doc_trawl.
file_sentence_dict = {}

for file in tqdm(files, total=len(files)):
    file_sentence_dict[file] = doc_trawl(file)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.05it/s]


## IV. Unpacking that into DF

The result is a DataFrame where
- the index is the (file, sentence) pair
- the columns are the sentence-level variables

Now we can do diagnostics, examine the output, and use it faster!

In [8]:
def unpack_tri_level_dict(a_dict):
    '''
    Flatten a three-level dict into a DataFrame.

    Input:  a_dict - dict of dicts of dicts:

        {file: {sentence: {variable: value}}}

    Output: DataFrame with a two-level index named ['file', 'sentence'] and
            one column per variable. An empty a_dict gives an empty DataFrame
            with the same index names.
    '''
    index_names = ['file', 'sentence']

    # pd.concat raises "No objects to concatenate" on an empty collection
    # (e.g. when no input files were found), so return an empty frame instead.
    if not a_dict:
        return pd.DataFrame(index=pd.MultiIndex.from_arrays([[], []], names=index_names))

    # orient='index' makes each sentence a row directly. Building the frame
    # column-wise and transposing it would coerce every column to object dtype,
    # because each sentence mixes lists and floats.
    frames = [
        pd.DataFrame.from_dict(sentence_outputs, orient='index')
        for sentence_outputs in a_dict.values()
    ]

    df = pd.concat(frames, keys=list(a_dict.keys()))
    df.index = df.index.rename(index_names)
    return df

# Unpack the per-file results into one DataFrame; as the last expression in the cell it is displayed as the output.
unpack_tri_level_dict(file_sentence_dict)


Unnamed: 0_level_0,Unnamed: 1_level_0,entities,random_out,random_num
file,sentence,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
inputs\ex10-11.txt,"\nEX-10\n2\nex10-11.txt\nCFC INTERNATIONAL, INC. - CONTRACT ADDENDUM\n\n ADDENDUM TO\n PURCHASE AGREEMENT - DATED MARCH 1, 2001\n\nThis Agreement (the ""Addendum""), effective October 15, 2002, between CFC\nInternational, a Delaware corporation, (""CFC""), and Baxter Healthcare\nCorporation, a Delaware corporation, and its successors, affiliates and assigns\n(""Baxter""), amends the Purchase Agreement (""Agreement"") between the two\ncompanies dated March 1, 2001.","[CFC, INC., CONTRACT, AGREEMENT, DATED, CFC In...",0.240199,0.598029
inputs\ex10-11.txt,1.,[],0.5136,0.043709
inputs\ex10-11.txt,"General Provisions\n----------------------\n\nAll ""terms and conditions"" of the original Agreement will remain effective as\nstated in the Agreement with only the specific revisions as stated below.",[],0.629221,0.963168
inputs\ex10-11.txt,This\naddendum applies to CFC products B10EK black and B5603AB black.,"[CFC, B10EK, B5603AB]",0.599191,0.369298
inputs\ex10-11.txt,"2.0 Distribution\n-----------------\n\nBuyer agrees to purchase foil requirements for current users, which are wholly\nowned subsidiaries of Baxter Healthcare Corporation.","[Buyer, Baxter Healthcare Corporation]",0.493433,0.015042
...,...,...,...,...
inputs\ex10.txt,"13.6 This Agreement is deemed to have been entered into in the State of\nIllinois and its interpretations, construction, and the remedies for its\nenforcement of breach are to be applied pursuant to and in accordance with the\nlaws of the State of Illinois.","[Illinois, Illinois]",0.462404,0.107635
inputs\ex10.txt,"13.7 In the event that a court of competent jurisdiction holds that\nparticular provisions or requirements of this Agreement are in violation of any\nlaw, such provisions or requirements shall be enforced and shall remain in full\nforce and effect to the extent they are not in violation of any such law or not\notherwise unenforceable, and all other provisions and requirements of this\nAgreement shall remain in full force and effect.",[],0.246763,0.471328
inputs\ex10.txt,"In Witness Whereof, the parties have caused this Agreement to be executed by\ntheir authorized representatives.",[Witness Whereof],0.356621,0.149796
inputs\ex10.txt,BAXTER HEALTHCARE CORP CFC INTERNATIONAL\n\n\nBy:_____________________________ By:_____________________________\n Dave Valentini Robert E. Jurgens\nTitle: V.P.,"[BAXTER, HEALTHCARE, CORP, CFC, Dave Valentini...",0.924464,0.925296
