In [5]:
import pandas as pd
import stanza
from stanza.server import CoreNLPClient

In [8]:
PIPE = stanza.Pipeline(lang="en", processors="tokenize,mwt,pos,lemma,depparse", verbose=False)
ROOT = 'C:\\Users\\timjo\\PycharmProjects\\newscript'

In [9]:
df = pd.read_pickle(ROOT + "\\src\\chambers11\\matrices\\dev_dummy.pkl")

In [10]:
with CoreNLPClient(endpoint='http://localhost:8000', timeout=30000, memory='4G', be_quiet=True) as client:
    annotations = [client.annotate(doc) for doc in df['text']]

In [11]:
df['text'][0]

'Salvadoran President Alfredo Cristiani today postponed his trip to the United States, which was scheduled for 16-23 January, until later this month, according to an announcement today by information Secretary Mauricio Sandoval . The Presidential spokesman explained that the postponement of Cristiani\'s trip is due to the President\'s interest in meeting with UN Secretary General Javier Perez de Cuellar to request that Perez de Cuellar act as mediator to achieve the resumption of dialogue with the Salvadoran guerrillas .``We have received information from New York to the effect that Perez de Cuellar will not be available on the dates initially set for the trip, thus, Cristiani\'s visit has been postponed, Sandoval said . According to Sandoval, Cristiani will meet with Perez de Cuellar on 31 January, after which he will go to Washington in early February to meet with U.S. President George Bush . Sandoval denied that the postponement of Cristiani\'s trip is related to possible reaction t

In [32]:
annotations[0].sentence[5]

token {
  word: "``"
  pos: "``"
  value: "``"
  before: ""
  after: ""
  originalText: "``"
  ner: "O"
  lemma: "``"
  beginChar: 1149
  endChar: 1151
  utterance: 0
  speaker: "PER0"
  tokenBeginIndex: 203
  tokenEndIndex: 204
  hasXmlContext: false
  isNewline: false
  coarseNER: "O"
  fineGrainedNER: "O"
  nerLabelProbs: "O=0.99999834481452"
}
token {
  word: "There"
  pos: "EX"
  value: "There"
  before: ""
  after: " "
  originalText: "There"
  ner: "O"
  lemma: "there"
  beginChar: 1151
  endChar: 1156
  utterance: 2
  speaker: "62"
  tokenBeginIndex: 204
  tokenEndIndex: 205
  hasXmlContext: false
  isNewline: false
  coarseNER: "O"
  fineGrainedNER: "O"
  nerLabelProbs: "O=0.999960090240432"
}
token {
  word: "is"
  pos: "VBZ"
  value: "is"
  before: " "
  after: " "
  originalText: "is"
  ner: "O"
  lemma: "be"
  beginChar: 1157
  endChar: 1159
  utterance: 2
  speaker: "62"
  tokenBeginIndex: 205
  tokenEndIndex: 206
  hasXmlContext: false
  isNewline: false
  coarseNER: "O"

In [27]:
class event_pattern:
    """Contains all data of event patterns necessary to allow template slot induction"""

    def __init__(self, lemma: str):
        self.lemma = lemma
        
        # for coreference resoluation
        self.sub_locs = []  # [doc.id, sent.id, token.id]
        self.obj_locs = []  # [doc.id, sent.id, token.id]

        # for selection preferences
        self.sub_lemmas = []
        self.obj_lemmas = []

    def add_sub(self, loc, lemma):
        self.sub_locs.append(loc)
        self.sub_lemmas.append(lemma)

    def add_obj(self, loc, lemma):
        self.obj_locs.append(loc)
        self.obj_lemmas.append(lemma)
        
def extract_event_patterns(annotations):
    """Use CoreNLP annotation to extract event patterns

    Chambers & Jurafsky (pp. 978) define event patterns as either:
    1a) a verb ("explode")
    1b) a verb and the head word of its syntactic object ("explode:bomb")
    2) a noun in WordNet under the Event synset ("explosion")
    
    :param annotations: list of CoreNLPClient annotations of documents

    :returns: list of event_pattern class instances
    """

    # 1) find verbs and their arguments
    event_patterns = {}
    for ann in annotations:
        doc_id = annotations.index(ann)

        for sent in ann.sentence:
            sent_id = sent.basicDependencies.node[0].sentenceIndex

            for token in sent.token:  # for each word
                if 'VB' in token.pos:
                    print(f'Found verb: {token.lemma}!')
                    if token.lemma not in event_patterns:
                        # create new event pattern
                        print('Not found in dict yet, creating new EP object')
                        event_patterns[token.lemma] = event_pattern(lemma=token.lemma)

                    # search for subjects and objects
                    for e in sent.basicDependencies.edge:
                        if ('sub' in e.dep or 'obj' in e.dep) and e.source == token.tokenEndIndex:
                            print(f'Found subject or object for {token.lemma}')

                            # get lemma/NER tag
                            sub_obj_token = sent.token[e.target - 1]
                            if sub_obj_token.coarseNER == "O":
                                lemma = sub_obj_token.lemma
                            else:
                                lemma = sub_obj_token.coarseNER
                            print(f'Found lemma {lemma} for subject/object of {token.lemma}')

                            # get location info
                            loc = [doc_id, sent_id, e.target - 1]  # e.target index starts at 1

                            # add to subject or object to event pattern
                            if 'sub' in e.dep:
                                print(f'{lemma} identified as subject of {token.lemma}')
                                event_patterns[token.lemma].add_sub(loc, lemma)
                            else:
                                print(f'{lemma} identified as object of {token.lemma}')
                                event_patterns[token.lemma].add_obj(loc, lemma)
                    print('----------------------------------------------------------------')



    # 2) find nouns that are in the WordNet Event synsets
    # TODO: implement
    # event_nouns = h.get_event_nouns()

    return event_patterns

In [28]:
eps = extract_event_patterns(annotations)

Found verb: postpone!
Not found in dict yet, creating new EP object
Found subject or object for postpone
Found lemma PERSON for subject/object of postpone
PERSON identified as subject of postpone
Found subject or object for postpone
Found lemma trip for subject/object of postpone
trip identified as object of postpone
----------------------------------------------------------------
Found verb: be!
Not found in dict yet, creating new EP object
----------------------------------------------------------------
Found verb: schedule!
Not found in dict yet, creating new EP object
Found subject or object for schedule
Found lemma which for subject/object of schedule
which identified as subject of schedule
----------------------------------------------------------------
Found verb: accord!
Not found in dict yet, creating new EP object
----------------------------------------------------------------
Found verb: explain!
Not found in dict yet, creating new EP object
--------------------------------

In [45]:
subjects = [sub_lem.lower() for value in eps.values() for sub_lem in value.sub_lemmas]
objects = [obj_lem.lower() for value in eps.values() for obj_lem in value.obj_lemmas]
entities = sorted(list(set(subjects + objects)))
entities

['attack',
 'bomb',
 'government',
 'guerrilla',
 'intelligence',
 'it',
 'mayor',
 'member',
 'officer',
 'person',
 'presence',
 'source',
 'station',
 'survivor',
 'they',
 'trip',
 'which']