## Prereqs

1. You have jupyterlab running in your base conda environment. 
1. Create the `nlp_stanza` environment via the readme's instructions. 
1. Add that environment's Python kernel to JupyterLab by running the following commands in a terminal. They also download the spaCy model `en_core_web_sm`.
    ```
    $ conda activate nlp_stanza          
    (nlp_stanza)$ conda install ipykernel
    (nlp_stanza)$ python -m ipykernel install --user --name=nlp_stanza --display-name "NLP Stanza Env"
    (nlp_stanza)$ python -m spacy download en_core_web_sm
    (nlp_stanza)$ conda deactivate
    $ jupyter lab          
    ```
1. Make sure this notebook is running on the `nlp_stanza` conda environment's kernel: in the upper-right corner, click the kernel name and switch to "NLP Stanza Env".


In [7]:
# In a terminal:

# > conda activate nlp_stanza
# > 

In [8]:
import stanza
# stanza.download('en') # This downloads the English models for the neural pipeline
# NOTE(review): the download above is commented out and download_method=None is passed below,
# so this cell presumably relies on the English Stanza models already being present locally — confirm.
stanza_nlp = stanza.Pipeline('en', download_method=None) # This sets up a default neural pipeline in English

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

import glob
from tqdm import tqdm
from bs4 import BeautifulSoup
# NOTE(review): a __future__ import that is not the first statement is a SyntaxError in a plain .py module,
# so this line will break if the notebook is exported to a script; unicode_literals has no effect on Python 3.
from __future__ import unicode_literals
import spacy

# Load the small English spaCy model once at module level; ner() reuses this object on every call.
spacy_nlp = spacy.load("en_core_web_sm",)  

# Fetch NLTK data packages. The tokenizer, POS tagger and named-entity chunker called in ner()
# presumably need 'punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker' and 'words' — verify.
# 'state_union' backs the corpus imported above, which no visible cell uses.
nltk.download('words')
nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('averaged_perceptron_tagger')
nltk.download('state_union')

2023-03-30 14:28:08 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

2023-03-30 14:28:08 INFO: Using device: cpu
2023-03-30 14:28:08 INFO: Loading: tokenize
2023-03-30 14:28:08 INFO: Loading: pos
2023-03-30 14:28:08 INFO: Loading: lemma
2023-03-30 14:28:09 INFO: Loading: constituency
2023-03-30 14:28:10 INFO: Loading: depparse
2023-03-30 14:28:10 INFO: Loading: sentiment
2023-03-30 14:28:11 INFO: Loading: ner
2023-03-30 14:28:13 INFO: Done loading processors!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DonsLaptop\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DonsLaptop\AppData\Roaming\nltk_data...
[nltk

True

### One sentence test

In [9]:
# sentence = "Seller produces hot stamping foil which conforms and meets the Specification Requirements submitted, accepted and in Seller's possession for the Specification numbers listed attached in the Exhibit A., hereafter referred nto as 'Products'."

# doc = stanza_nlp(sentence)

# subj_verb_linkages = []

# for word in doc.sentences[0].words:    # for each word in the sentence
#     if word.deprel == 'root':          # if a word is the root, its dependency relation label is 'root'. thus if this is true, the curr word = root word
        
#         root_verb = word.text          # save the root verb  
#         root_id = word.id              # get the words id 
        
#         for w in doc.sentences[0].words:                      # loop over words in the sentence
#             if w.head == root_id and w.deprel == 'nsubj':     # if the word's head attribute equals root_id, it is a direct dependent (a child) of the root
                
#                 subject = w.text                              # save the associated subject
        

# subj_verb_linkages = [subject, root_verb]

# print(f"Root Verb: {root_verb}, Subject: {subject}")

# subj_verb_linkages


### New ner() function

In [10]:
def ner(sentence):
    """Extract entities, verbs, subjects and the root subject-verb link from text.

    Three libraries are run on the raw ``sentence`` text:

    * NLTK (``word_tokenize`` -> ``pos_tag`` -> ``ne_chunk``) for named entities.
    * spaCy (module-level ``spacy_nlp``) for tokens tagged ``VERB`` and tokens
      whose dependency label is ``nsubj``.
    * Stanza (module-level ``stanza_nlp``) for the dependency root of the
      first parsed sentence and the ``nsubj`` word attached directly to it.

    Parameters
    ----------
    sentence : str
        Text to analyse. ``None``, an empty string or whitespace-only text
        yields an all-empty result without invoking any model. If Stanza
        splits the text into several sentences, only the first one is used
        for ``subj_verb_linkages``.

    Returns
    -------
    dict
        ``'entities'``: list of entity strings (multi-token entities are
        joined with single spaces).
        ``'verbs'``: list of token texts spaCy tagged as ``VERB``.
        ``'subjects'``: list of token texts spaCy labelled ``nsubj``.
        ``'subj_verb_linkages'``: ``[subject, root]`` when the first sentence
        has both a root word and an ``nsubj`` child of it, otherwise ``[]``.
        The root is whatever word carries the ``root`` relation; it is not
        checked to be a verb.
    """
    entities = []
    verbs = []
    subjects = []
    subj_verb_linkages = []

    # Nothing to analyse: return early so blank input neither runs the models
    # nor reaches the sentences[0] lookup further down.
    if not sentence or not sentence.strip():
        return {'entities': entities,
                'verbs': verbs,
                'subjects': subjects,
                'subj_verb_linkages': subj_verb_linkages}

    # --- Named entities (NLTK) ---
    tokens = nltk.word_tokenize(sentence)            # split the text into word tokens
    tagged = nltk.pos_tag(tokens)                    # attach a part-of-speech tag to each token
    chunks = nltk.ne_chunk(tagged, binary=False)     # binary=False: entities get typed labels (e.g. PERSON, ORGANIZATION)

    # Entity chunks are subtrees exposing `label`; plain (token, tag) tuples do not.
    entities = [' '.join(leaf[0] for leaf in chunk)
                for chunk in chunks
                if hasattr(chunk, 'label')]

    # --- Verbs and subjects (spaCy) ---
    spacy_doc = spacy_nlp(sentence)
    verbs = [token.text for token in spacy_doc if token.pos_ == "VERB"]
    subjects = [token.text for token in spacy_doc if token.dep_ == "nsubj"]

    # --- Root subject-verb linkage (Stanza) ---
    stanza_doc = stanza_nlp(sentence)

    # Indexing sentences[0] on a document with no sentences would raise
    # IndexError, so only inspect the parse when at least one sentence exists.
    if stanza_doc.sentences:
        first_sentence_words = stanza_doc.sentences[0].words
        subject, root_verb = None, None

        for word in first_sentence_words:
            if word.deprel != 'root':                # only the word labelled 'root' is of interest
                continue

            root_verb = word.text

            # A word whose head is the root's id is a direct child of the root;
            # if several such 'nsubj' children exist, the last one is kept.
            for dependent in first_sentence_words:
                if dependent.head == word.id and dependent.deprel == 'nsubj':
                    subject = dependent.text

        if subject and root_verb:                    # both parts found
            subj_verb_linkages = [subject, root_verb]

    return {'entities': entities,
            'verbs': verbs,
            'subjects': subjects,
            'subj_verb_linkages': subj_verb_linkages}

In [13]:
# Longer sample clause kept for ad-hoc experiments; it is not passed to ner() in this cell.
sentence = "Seller produces hot stamping foil which conforms and meets the Specification Requirements submitted, accepted and in Seller's possession for the Specification numbers listed attached in the Exhibit A., hereafter referred nto as 'Products'."

# Run the extractor on a shorter contract clause; the following cell displays the result.
temp = ner(
    'Buyer agrees to purchase foil requirements for current users, which are wholly owned subsidiaries of Baxter Healthcare Corporation.'
)

In [14]:
temp

{'entities': ['Buyer', 'Baxter Healthcare Corporation'],
 'verbs': ['agrees', 'purchase', 'owned'],
 'subjects': ['Buyer', 'which'],
 'subj_verb_linkages': ['Buyer', 'agrees']}