## I. Setup, Installations, and Imports

#### (a) Installations

In [None]:
! pip install spacy
! pip install nltk
! python -m spacy download en_core_web_sm
! pip install svgling

#### (b) Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
import glob
import tqdm

#### (c) Downloads

In [None]:
# NLTK data packages fetched for this notebook, downloaded in this order.
# `state_union` matches the corpus imported at the top of the notebook.
# NOTE(review): recent NLTK releases may ask for differently named packages
# (e.g. `punkt_tab`) -- confirm against the installed NLTK version.
NLTK_RESOURCES = (
    'words',
    'punkt',
    'maxent_ne_chunker',
    'averaged_perceptron_tagger',
    'state_union',
)

# One loop instead of five copy-pasted calls; also avoids echoing the return
# value of the last call as the cell's output.
for resource in NLTK_RESOURCES:
    nltk.download(resource)

## II. Defined Functions Used In Program


#### (a) Named Entity Recognizer Function
**Input:** A sentence <br> 
**Output:** A list of the named entities recognized in that sentence

In [2]:
def ner(sentence):
    """Return the text of the named entities NLTK finds in ``sentence``.

    The sentence is word-tokenized, part-of-speech tagged and then passed to
    ``nltk.ne_chunk`` with ``binary=False`` so that entities are classified by
    type (e.g. PERSON, ORGANIZATION). The type label itself is not returned,
    only the entity text.

    Args:
        sentence: The sentence to analyse, as a string.

    Returns:
        A list of entity strings in order of appearance. The tokens of a
        multi-word entity are joined with single spaces. Nothing is
        de-duplicated, so an entity that occurs twice is listed twice.
    """
    tokens = nltk.word_tokenize(sentence)
    pos_tagged = nltk.pos_tag(tokens)
    chunked = nltk.ne_chunk(pos_tagged, binary=False)

    # Only items exposing a `label` attribute are treated as entities; every
    # other item in the chunk result is skipped. The first element of each
    # member of an entity is the word itself.
    return [
        ' '.join(leaf[0] for leaf in node)
        for node in chunked
        if hasattr(node, 'label')
    ]

#### (b) Filename Traversal Function
**Input:** A filename **(e.g. inputs/ex10.txt)** <br>
**Output:** A list of all the [sentence, entities] pairs in the text file **(e.g. [["sentence1", [listOfentities1]], ["sentence2", [listOfentities2]]])**

In [3]:
# NOTE: doc_trawl() builds and returns its own list; it never reads or writes
# this module-level name. It is kept only in case other cells refer to it.
all_combined_list = []

def doc_trawl(filename, encoding=None):
    """Split a text file into sentences and extract the entities of each one.

    Args:
        filename: Path of the text file to read (e.g. ``inputs/ex10.txt``).
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform default encoding.

    Returns:
        A list with one ``[sentence, entities]`` pair per sentence, in
        document order, where ``entities`` is the list returned by
        ``ner(sentence)``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, "r", encoding=encoding) as fp:
        raw_text_file = fp.read()

    raw_sentences = nltk.sent_tokenize(raw_text_file)

    # Pair every sentence with the entities found in it.
    return [[sentence, ner(sentence)] for sentence in raw_sentences]

## III. Automation

In [4]:
# Collect every path in the inputs folder. glob.glob() returns matches in
# arbitrary order, so sort them to make the dictionary order reproducible.
files = sorted(glob.glob("inputs/*"))

# Map each file path to the list of [sentence, entities] pairs for that file.
final_dict = {}
for file in files:
    final_dict[file] = doc_trawl(file)

In [5]:
# Display the full mapping: file path -> list of [sentence, entities] pairs.
final_dict

{'inputs/ex10.txt': [['<DOCUMENT>\n<TYPE>EX-10\n<SEQUENCE>4\n<FILENAME>ex10.txt\n<DESCRIPTION>CFC INTERNATIONAL, INC.-BAXTER PURCHASE AGREEMENT\n<TEXT>\nExhibit 10.9\n\n\n                               PURCHASE AGREEMENT\n\n         This Agreement, effective March 1, 2001 is between CFC International, a\nDelaware corporation, with offices at 500 State Street, Chicago Heights,\nIllinois 60411 ("Seller") and Baxter Healthcare Corporation, a Delaware\ncorporation, with offices at One Baxter Parkway, Deerfield, Illinois 60015 on\nbehalf or its self and its affiliates (entities controlling, controlled by, or\nunder common control with Baxter)("Buyer").',
   ['DOCUMENT',
    'FILENAME',
    'CFC',
    'PURCHASE',
    'CFC International',
    'Delaware',
    'State Street',
    'Chicago Heights',
    'Illinois',
    'Seller',
    'Baxter Healthcare Corporation',
    'Delaware',
    'One Baxter Parkway',
    'Deerfield',
    'Illinois',
    'Baxter']],
  ['1.0 Background\n\n\n         1.1 Sell