In [4]:
import pandas as pd
import re
import nltk
nltk.download('averaged_perceptron_tagger')
!pip install bllipparser
from bllipparser import RerankingParser
import spacy

# Load the 'WSJ-PTB3' parsing model (the log below shows the download is skipped when the
# model directory already exists).
rrp = RerankingParser.fetch_and_load('WSJ-PTB3', verbose=True)

nlp = spacy.load("en_core_web_sm") # Must run "python -m spacy download en_core_web_sm" prior
# Keep a direct handle on the pipeline's named-entity recognizer component.
ner = nlp.get_pipe("ner")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /afs/crc.nd.edu/user/k/kmealey2/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Defaulting to user installation because normal site-packages is not writeable
Collecting bllipparser
  Using cached bllipparser-2021.11.7-cp311-cp311-linux_x86_64.whl
Installing collected packages: bllipparser
Successfully installed bllipparser-2021.11.7
Model directory: /afs/crc.nd.edu/user/k/kmealey2/.local/share/bllipparser/WSJ-PTB3
Model directory already exists, not reinstalling


In [2]:
# Example passage that the following cells tokenize and annotate.
part = "The San Francisco Examiner issued a special edition around noon yesterday that was filled entirely with earthquake news and information."

Columns consist of:
1. words
2. PoS tags
3. base chunks
4. clauses
5. named entities
6. target verbs
7-N. propositions of target verbs

In [11]:
# Split the passage into sentences and keep only the first one.
sentences = nltk.sent_tokenize(part)
sentence = sentences[0]
# Word-level tokens of that sentence; the later cells pass this list to the tagger, parser and NER.
words = nltk.word_tokenize(sentence)
print(words)

['The', 'San', 'Francisco', 'Examiner', 'issued', 'a', 'special', 'edition', 'around', 'noon', 'yesterday', 'that', 'was', 'filled', 'entirely', 'with', 'earthquake', 'news', 'and', 'information', '.']


In [10]:
# Keep only the second element (the tag) of each item returned by nltk.pos_tag.
pos_tags = [tag[1] for tag in nltk.pos_tag(words)]
print(pos_tags)

['DT', 'NNP', 'NNP', 'NNP', 'VBD', 'DT', 'JJ', 'NN', 'IN', 'NN', 'NN', 'WDT', 'VBD', 'VBN', 'RB', 'IN', 'NN', 'NN', 'CC', 'NN', '.']


In [12]:
# Parse the pre-tokenized sentence; the printed result is a bracketed tree string.
parse_output = rrp.simple_parse(words)
print(parse_output)

(S1 (S (NP (DT The) (NNP San) (NNP Francisco) (NNP Examiner)) (VP (VBD issued) (NP (DT a) (JJ special) (NN edition)) (PP (IN around) (NP (NN noon))) (NP (NN yesterday)) (SBAR (WHNP (WDT that)) (S (VP (VBD was) (VP (VBN filled) (ADVP (RB entirely)) (PP (IN with) (NP (NN earthquake) (NN news) (CC and) (NN information)))))))) (. .)))


In [38]:
def get_bio_chunks(parse_output, words):
    """Derive one BIO chunk tag per word from a bracketed constituency parse.

        Input
            parse_output: string type output of the RerankingParser from https://github.com/BLLIP/bllip-parser/tree/master (Charniak parser)
            words: [string] list of words in a sentence which was inputted to the RerankingParser in order to obtain its output

        Output
            bio_out: [string] exactly one tag per entry of `words`, in the same order:
                "B-XP"  a bracket whose label matches [A-Z]+P (NP, VP, PP, ADVP, WHNP, ...)
                        opens between the previous word and this one; XP is the last such
                        label in that stretch of the parse
                "I-XP"  no such bracket opens here; XP is the most recently opened phrase type
                "O"     the bracket depth is zero, no phrase has been opened yet, or the
                        word could not be located in the remaining parse text

        NOTE(review): the phrase type is never reset when a phrase closes, so a word that
        follows a closed phrase but is still inside the sentence brackets (e.g. the final
        ".") is tagged "I-XP" rather than "O". This matches the notebook's recorded output
        and is kept as-is -- confirm whether that is intended.
    """
    bio_out = []
    depth = 0          # open brackets seen so far, not counting the (TAG word) leaves
    phrase_type = ''   # label of the most recently opened phrase

    for word in words:

        # Literal brackets are looked up under their -LRB- / -RRB- escape tokens.
        if word == "(":
            word = "-LRB-"
        elif word == ")":
            word = "-RRB-"

        # Locate the next "(TAG word)" leaf. The word is escaped so that regex
        # metacharacters in it (".", "?", "+", "^", "[", ...) are matched literally.
        token_mo = re.search(r'\([A-Z#.:,$-`]+ ' + re.escape(word) + r'\)', parse_output)
        if not token_mo:
            # Keep the output aligned with `words` even when a word is not found.
            bio_out.append('O')
            continue

        # find '(NP' and '(VP' etc starts of phrases
        phrase_start = parse_output[:token_mo.start()]

        depth = depth + phrase_start.count('(')

        b_p_mo = re.match(r'.*\(([A-Z]+P).*', phrase_start)
        if b_p_mo:
            phrase_type = b_p_mo.groups()[0]
            bio = 'B-' + phrase_type
        elif depth == 0 or not phrase_type:
            # Outside every bracket, or no phrase opened yet (avoids a bare "I-").
            bio = 'O'
        else:
            bio = 'I-' + phrase_type

        bio_out.append(bio)

        # Consume the closing brackets that directly follow the leaf, then carry
        # the rest of the parse over to the next word.
        rem = parse_output[token_mo.end():].strip()
        unclosed = rem.lstrip(')')
        depth = depth - (len(rem) - len(unclosed))
        parse_output = unclosed

    return bio_out

In [39]:
# Chunk tags for the example sentence, computed from its parse string and word list.
bio_chunks = get_bio_chunks(parse_output, words)
print(bio_chunks)

['B-NP', 'I-NP', 'I-NP', 'I-NP', 'B-VP', 'B-NP', 'I-NP', 'I-NP', 'B-PP', 'B-NP', 'B-NP', 'B-WHNP', 'B-VP', 'B-VP', 'B-ADVP', 'B-PP', 'B-NP', 'I-NP', 'I-NP', 'I-NP', 'I-NP']


In [72]:
def get_s2e_clauses(parse_output, words):
    """Derive one start/end clause marker per word from a bracketed constituency parse.

        Input
            parse_output: string type output of the RerankingParser from https://github.com/BLLIP/bllip-parser/tree/master (Charniak parser)
            words: [string] list of words in a sentence which was inputted to the RerankingParser in order to obtain its output

        Output
            s2e_out: [string] exactly one marker per entry of `words`, in the same order.
                Each marker starts as "*" (or "(S*" when a bracket whose label begins
                with "S" -- S, SBAR, ... but not the top-level "S1" -- opens between the
                previous word and this one) and gets one "S)" appended for every recorded
                clause that closes right after the word, e.g. "*", "(S*", "*S)", "*S)S)".
                A word that cannot be located in the remaining parse text yields "*".

        NOTE(review): at most one clause start is recorded per word (the last "(S..."
        bracket before it), even if several S-type brackets open there.
    """
    s2e_out = []
    depth = 0                  # open brackets seen so far, not counting the (TAG word) leaves
    clause_start_depths = []   # stack: bracket depth of every clause that is still open

    for word in words:

        # Literal brackets are looked up under their -LRB- / -RRB- escape tokens.
        if word == "(":
            word = "-LRB-"
        elif word == ")":
            word = "-RRB-"

        # Locate the next "(TAG word)" leaf. The word is escaped so that regex
        # metacharacters in it (".", "?", "+", "^", "[", ...) are matched literally.
        token_mo = re.search(r'\([A-Z#.:,$-`]+ ' + re.escape(word) + r'\)', parse_output)
        if not token_mo:
            # Keep the output aligned with `words` even when a word is not found.
            s2e_out.append('*')
            continue

        clause = '*'

        # find '(S' starts of clauses
        phrase_start = parse_output[:token_mo.start()]
        phrase_start = phrase_start.replace('(S1 ', '(TOP ')  # don't want to count top 'S1' phrase

        s_mo = re.match(r'(.*\(S).*', phrase_start)
        if s_mo:
            clause = '(S*'
            # Depth of the clause bracket itself: brackets up to and including its "(".
            clause_start_depths.append(depth + s_mo.groups()[0].count('('))

        depth = depth + phrase_start.count('(')

        # Consume the closing brackets that directly follow the leaf, then carry
        # the rest of the parse over to the next word.
        rem = parse_output[token_mo.end():].strip()
        unclosed = rem.lstrip(')')
        depth = depth - (len(rem) - len(unclosed))
        parse_output = unclosed

        # Every clause that started deeper than the current depth has just closed.
        # Start depths are pushed in increasing order, so those are the top of the stack.
        while clause_start_depths and depth < clause_start_depths[-1]:
            clause_start_depths.pop()
            clause = clause + 'S)'

        s2e_out.append(clause)

    return s2e_out

In [73]:
# Clause start/end markers for the example sentence, computed from its parse string and word list.
s2e_clauses = get_s2e_clauses(parse_output, words)
print(s2e_clauses)

['(S*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '*', '(S*', '(S*', '*', '*', '*', '*', '*', '*', '*S)S)', '*S)']


In [78]:
def get_ner_labels(words):

    """ Tag every word with a BIO named-entity label using the notebook's `ner` pipe.

        Input: words: [string] tokenized sentence
        Output:
            ner_labels: [string] label corresponding to each word in words.
            Words outside every entity are labeled "O". The first word of an entity
            is labeled "B-LABEL" and each further word of the same entity "I-LABEL",
            where LABEL is the entity's label_ as reported by the pipe.
    """

    # Start with every word outside any entity.
    ner_labels = ["O" for _ in words]

    # Build the Doc straight from the given tokens so entity offsets index into `words`.
    doc = spacy.tokens.Doc(nlp.vocab, words=words)

    for entity in ner(doc).ents:
        tag = entity.label_
        ner_labels[entity.start] = f"B-{tag}"
        # Empty for single-word entities, so only the B- label is written for them.
        for position in range(entity.start + 1, entity.end):
            ner_labels[position] = f"I-{tag}"

    return ner_labels

In [80]:
# Named-entity labels for the example sentence, one per word.
ner_labels = get_ner_labels(words)
print(ner_labels)

['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'B-TIME', 'B-DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [85]:
!pip install torch

Defaulting to user installation because normal site-packages is not writeable


In [88]:
!pip install allennlp

Defaulting to user installation because normal site-packages is not writeable
Collecting allennlp
  Obtaining dependency information for allennlp from https://files.pythonhosted.org/packages/d3/80/25f92a8dcc47d86da251766107aa39dae1b903f5cfaddbf0e7cf33bc5957/allennlp-2.10.1-py3-none-any.whl.metadata
  Using cached allennlp-2.10.1-py3-none-any.whl.metadata (21 kB)
INFO: pip is looking at multiple versions of allennlp to determine which version is compatible with other requirements. This could take a while.
  Obtaining dependency information for allennlp from https://files.pythonhosted.org/packages/13/56/c26f4dd6f33fa817115b7cc7e919a754c4b18db82cccd207cd539e59742f/allennlp-2.10.0-py3-none-any.whl.metadata
  Using cached allennlp-2.10.0-py3-none-any.whl.metadata (20 kB)
  Obtaining dependency information for allennlp from https://files.pythonhosted.org/packages/a0/93/e377dbc50df72e0d7c5fcc604e1c90c85f6639d3d2aa391fbb1306ea2da9/allennlp-2.9.3-py3-none-any.whl.metadata
  Using cached allennl