In [1]:
#!/usr/bin/env python3

import sys
import os
import logging
import json
import glob
import pandas as pd
import spacy
from stanfordcorenlp import StanfordCoreNLP
import re


#sys.path.append('/Users/chilv/Documents/proj-wm/event_extraction/bert-event-extraction-master/ace2005-preprocessing-master')

from tqdm import tqdm
# from main import find_token_index
# from _parser import Parser
# import main

In [2]:
# Load the large English spaCy model with the "tagger" and "parser" components disabled.
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser"])
# Optional override: when MAX_DOCUMENT_LENGTH is set (and non-empty) in the
# environment, use it as nlp.max_length.
max_length = os.getenv("MAX_DOCUMENT_LENGTH")
if max_length:
    nlp.max_length = int(max_length)

In [3]:
# globbed_files = glob.glob("/Users/chilv/Documents/proj-wm/bias-stance/bias_stance/MITRE Six-Twelve Month and November Docs CDRs/*.cdr")
# data = []
# for one_file in globbed_files:
#     frame = pd.read_json(one_file, lines=True)
#     data.append(frame)

In [4]:
# cdr_data = pd.concat(data, ignore_index = True, sort = False);

In [5]:
# txt = cdr_data['extracted_text'][1]

In [6]:
# The parser is disabled when the model is loaded, so sentence boundaries are
# provided by the "sentencizer" component instead.
# Guarded so this cell can be re-run in the same kernel: adding a component
# whose name is already in the pipeline raises an error.
if "sentencizer" in nlp.pipe_names:
    sentencizer = nlp.get_pipe("sentencizer")
else:
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

In [7]:
# Two-sentence sample input that the following cells run through the pipeline.
text = "caitie is providing a couple of sample sentences. caitie might need some help from jewell or max."

In [8]:
# Display the sample text.
text

'caitie is providing a couple of sample sentences. caitie might need some help from jewell or max.'

In [9]:
# Run the spaCy pipeline (including the sentencizer added earlier) over the sample text.
doc = nlp(text)

In [10]:
def sentence_dict_list(doc):
    """Build one record per sentence of a processed document.

    Only a subset of the fields the downstream model needs is filled in here.

    Args:
        doc: object exposing ``sents``; each sentence must provide ``text``,
            ``start_char``, ``end_char`` and ``ents``, and each entity must
            provide ``text``, ``start_char``, ``end_char`` and ``label_``.

    Returns:
        A list of dicts with the keys 'sentence', 'position'
        ([start_char, end_char]), 'golden-entity-mentions' (one dict per
        entity with 'text', 'position' and 'entity-type') and
        'golden-event-mentions' (always an empty list).
    """
    def _entity_record(ent):
        # Character offsets are copied verbatim from the entity span.
        return {
            'text': ent.text,
            'position': [ent.start_char, ent.end_char],
            'entity-type': ent.label_,
        }

    return [
        {
            'sentence': sent.text,
            'position': [sent.start_char, sent.end_char],
            'golden-entity-mentions': [_entity_record(ent) for ent in sent.ents],
            'golden-event-mentions': [],
        }
        for sent in doc.sents
    ]

In [11]:
# Build the per-sentence dicts for the sample document and display them.
sentences = sentence_dict_list(doc)
sentences

[{'sentence': 'caitie is providing a couple of sample sentences.',
  'position': [0, 49],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [0, 6],
    'entity-type': 'ORG'}],
  'golden-event-mentions': []},
 {'sentence': 'caitie might need some help from jewell or max.',
  'position': [50, 97],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [50, 56],
    'entity-type': 'PERSON'},
   {'text': 'jewell', 'position': [83, 89], 'entity-type': 'PERSON'},
   {'text': 'max', 'position': [93, 96], 'entity-type': 'PERSON'}],
  'golden-event-mentions': []}]

In [12]:
class StanfordNLP:
    """Thin wrapper around a StanfordCoreNLP client with a fixed annotator set.

    Attributes:
        nlp: the underlying ``StanfordCoreNLP`` client.
        props: the properties dict passed along with every ``annotate`` call.
    """

    def __init__(self, host='http://localhost', port=9000):
        """Create the client for ``host``/``port`` and store the request properties."""
        client = StanfordCoreNLP(host, port=port, timeout=60000)
        self.nlp = client
        self.props = dict(
            annotators='tokenize,ssplit,pos,lemma,parse',
            pipelineLanguage='en',
            outputFormat='json',
        )

    def annotate(self, sentence):
        """Annotate ``sentence`` with the stored properties and return the client's result unchanged."""
        response = self.nlp.annotate(sentence, properties=self.props)
        return response

In [13]:
# Module-level CoreNLP client using the class defaults (http://localhost, port 9000);
# the functions below read this global directly.
Snlp = StanfordNLP()

In [14]:
def get_stanford_core_data(sentences):
    """Fills in the stanford core values needed for the model.

    Every sentence is annotated through the module-level ``Snlp`` client and
    the first sentence of the response is turned into model fields. Sentences
    whose annotation fails (request error, non-JSON response, or a response
    without any sentence) are reported on stdout and skipped.

    Args:
        sentences: iterable of dicts providing at least the keys 'sentence',
            'position' and 'golden-entity-mentions'.

    Returns:
        A list with exactly one new dict per successfully annotated sentence,
        in input order. Each dict carries 'sentence', 'position' and
        'golden-entity-mentions' from the input (the entity list is shared,
        not copied), an empty 'golden-event-mentions' list, and the derived
        keys 'stanford-colcc', 'words', 'pos-tags', 'lemma' and 'parse'.
    """
    result = []
    for item in sentences:
        data = dict()
        data['sentence'] = item['sentence']
        data['position'] = item['position']
        data['golden-entity-mentions'] = item['golden-entity-mentions']
        data['golden-event-mentions'] = []

        # Pre-bind so the warning below can still be printed when annotate()
        # itself raises before anything was assigned.
        nlp_res_raw = None
        try:
            nlp_res_raw = Snlp.annotate(item['sentence'])
            nlp_res = json.loads(nlp_res_raw)
            first_sentence = nlp_res['sentences'][0]
        except Exception:
            print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.')
            print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1')
            continue

        tokens = first_sentence['tokens']
        # 'dependent' and 'governor' are shifted down by one before being
        # written out, so a governor of 0 becomes -1.
        data['stanford-colcc'] = [
            '{}/dep={}/gov={}'.format(dep['dep'], dep['dependent'] - 1, dep['governor'] - 1)
            for dep in first_sentence['enhancedPlusPlusDependencies']
        ]

        data['words'] = [token['word'] for token in tokens]
        data['pos-tags'] = [token['pos'] for token in tokens]
        data['lemma'] = [token['lemma'] for token in tokens]
        data['parse'] = first_sentence['parse']

        # Append exactly once, after all fields are filled in.
        result.append(data)
    return result

In [15]:
# Add the CoreNLP-derived fields (dependencies, words, POS tags, lemmas, parse) to each sentence dict.
post11 = get_stanford_core_data(sentences)

In [16]:
# Display the enriched sentence dicts.
post11

[{'sentence': 'caitie is providing a couple of sample sentences.',
  'position': [0, 49],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [0, 6],
    'entity-type': 'ORG'}],
  'golden-event-mentions': [],
  'stanford-colcc': ['ROOT/dep=2/gov=-1',
   'nsubj/dep=0/gov=2',
   'aux/dep=1/gov=2',
   'det:qmod/dep=3/gov=7',
   'mwe/dep=4/gov=3',
   'mwe/dep=5/gov=3',
   'compound/dep=6/gov=7',
   'dobj/dep=7/gov=2',
   'punct/dep=8/gov=2'],
  'words': ['caitie',
   'is',
   'providing',
   'a',
   'couple',
   'of',
   'sample',
   'sentences',
   '.'],
  'pos-tags': ['NN', 'VBZ', 'VBG', 'DT', 'NN', 'IN', 'NN', 'NNS', '.'],
  'lemma': ['caitie',
   'be',
   'provide',
   'a',
   'couple',
   'of',
   'sample',
   'sentence',
   '.'],
  'parse': '(ROOT\n  (S\n    (NP (NN caitie))\n    (VP (VBZ is)\n      (VP (VBG providing)\n        (NP\n          (NP (DT a) (NN couple))\n          (PP (IN of)\n            (NP (NN sample) (NNS sentences))))))\n    (. .)))'},
 {'sentence': 'cai

In [17]:
def find_token_index(tokens, start_pos, end_pos, phrase):
    """Locate the token span that covers ``phrase``.

    The start token is the last token whose 'characterOffsetBegin' is at or
    before ``start_pos``. From there, tokens' 'originalText' values are
    concatenated (with every non-word character removed) until the equally
    stripped ``phrase`` appears in the accumulated text.

    Args:
        tokens: list of dicts with 'characterOffsetBegin' and 'originalText'.
        start_pos: character offset of the phrase start.
        end_pos: character offset of the phrase end; only used in the
            assertion message.
        phrase: the mention text to cover.

    Returns:
        ``(start_idx, end_idx)`` where ``end_idx`` is exclusive.

    Raises:
        AssertionError: if no start token or no covering span is found.
    """
    def _word_chars_only(raw):
        # Drop everything that is not a word character.
        return re.sub(r'[^\w]', '', raw)

    # Keep the index of the last token starting at or before start_pos.
    at_or_before = [
        position
        for position, tok in enumerate(tokens)
        if tok['characterOffsetBegin'] <= start_pos
    ]
    start_idx = at_or_before[-1] if at_or_before else -1
    end_idx = -1

    assert start_idx != -1, "start_idx: {}, start_pos: {}, phrase: {}, tokens: {}".format(start_idx, start_pos, phrase, tokens)

    chars = ''
    for offset, tok in enumerate(tokens[start_idx:]):
        chars += _word_chars_only(tok['originalText'])
        if _word_chars_only(phrase) in chars:
            # Exclusive end: one past the token that completed the phrase.
            end_idx = start_idx + offset + 1
            break

    assert end_idx != -1, "end_idx: {}, end_pos: {}, phrase: {}, tokens: {}, chars:{}".format(end_idx, end_pos, phrase, tokens, chars)
    return start_idx, end_idx

In [18]:
# for item in sentences:
    
#     data = item 
    
#     nlp_res_raw = Snlp.annotate(item['sentence'])
#     nlp_res = json.loads(nlp_res_raw)
#     tokens = nlp_res['sentences'][0]['tokens']
    
#     sent_start_pos = item['position'][0]
    
#     for entity_mention in item['golden-entity-mentions']:
        
#         position = entity_mention['position']
        
#         start_idx, end_idx = find_token_index(
#             tokens=tokens,
#             start_pos=position[0] - sent_start_pos,
#             end_pos=position[1] - sent_start_pos + 1,
#             phrase=entity_mention['text'],
#     )

#         entity_mention['start'] = start_idx
#         entity_mention['end'] = end_idx

#         del entity_mention['position']
    
#     data['golden-entity-mentions'].append(entity_mention)
        
#     results.append(data)

In [19]:
def fix_entity_index(item):
    """Attach token indices to every entity mention of one sentence dict.

    The sentence is annotated through the module-level ``Snlp`` client and the
    tokens of the first sentence in the response are used to translate each
    mention's document-level character 'position' into token indices via
    ``find_token_index``.

    Args:
        item: dict with 'sentence', 'position' ([start, end] character
            offsets of the sentence) and 'golden-entity-mentions' (dicts with
            'text' and 'position').

    Returns:
        The same ``item`` object. Its entity-mention dicts are modified in
        place: 'start' and 'end' are added, 'position' is kept.
    """
    annotation = json.loads(Snlp.annotate(item['sentence']))
    tokens = annotation['sentences'][0]['tokens']

    # Mention positions are document-level; subtracting the sentence start
    # makes them relative to the sentence text that was annotated.
    sentence_offset = item['position'][0]

    for mention in item['golden-entity-mentions']:
        span = mention['position']
        mention['start'], mention['end'] = find_token_index(
            tokens=tokens,
            start_pos=span[0] - sentence_offset,
            end_pos=span[1] - sentence_offset + 1,
            phrase=mention['text'],
        )

    return item

In [20]:
def fix_entity_indices(sentences):
    """Apply ``fix_entity_index`` to every sentence dict.

    Args:
        sentences: iterable of sentence dicts accepted by ``fix_entity_index``.

    Returns:
        A new list holding the value returned by ``fix_entity_index`` for each
        input, in input order.
    """
    return [fix_entity_index(item) for item in sentences]

In [21]:
# First sentence dict as produced by sentence_dict_list.
print(sentences[0])

{'sentence': 'caitie is providing a couple of sample sentences.', 'position': [0, 49], 'golden-entity-mentions': [{'text': 'caitie', 'position': [0, 6], 'entity-type': 'ORG'}], 'golden-event-mentions': []}


In [22]:
# The first entry of the get_stanford_core_data result, for comparison.
print(post11[0])

{'sentence': 'caitie is providing a couple of sample sentences.', 'position': [0, 49], 'golden-entity-mentions': [{'text': 'caitie', 'position': [0, 6], 'entity-type': 'ORG'}], 'golden-event-mentions': [], 'stanford-colcc': ['ROOT/dep=2/gov=-1', 'nsubj/dep=0/gov=2', 'aux/dep=1/gov=2', 'det:qmod/dep=3/gov=7', 'mwe/dep=4/gov=3', 'mwe/dep=5/gov=3', 'compound/dep=6/gov=7', 'dobj/dep=7/gov=2', 'punct/dep=8/gov=2'], 'words': ['caitie', 'is', 'providing', 'a', 'couple', 'of', 'sample', 'sentences', '.'], 'pos-tags': ['NN', 'VBZ', 'VBG', 'DT', 'NN', 'IN', 'NN', 'NNS', '.'], 'lemma': ['caitie', 'be', 'provide', 'a', 'couple', 'of', 'sample', 'sentence', '.'], 'parse': '(ROOT\n  (S\n    (NP (NN caitie))\n    (VP (VBZ is)\n      (VP (VBG providing)\n        (NP\n          (NP (DT a) (NN couple))\n          (PP (IN of)\n            (NP (NN sample) (NNS sentences))))))\n    (. .)))'}


In [23]:
# Add token-level 'start'/'end' to every entity mention.
# NOTE: fix_entity_index writes into the entity dicts in place, and
# get_stanford_core_data reuses the 'golden-entity-mentions' lists from
# `sentences`, so `post11` and `sentences` see the new keys as well.
reusltzzz = fix_entity_indices(post11)

In [24]:
# Display the sentence dicts with token indices attached to each entity mention.
reusltzzz

[{'sentence': 'caitie is providing a couple of sample sentences.',
  'position': [0, 49],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [0, 6],
    'entity-type': 'ORG',
    'start': 0,
    'end': 1}],
  'golden-event-mentions': [],
  'stanford-colcc': ['ROOT/dep=2/gov=-1',
   'nsubj/dep=0/gov=2',
   'aux/dep=1/gov=2',
   'det:qmod/dep=3/gov=7',
   'mwe/dep=4/gov=3',
   'mwe/dep=5/gov=3',
   'compound/dep=6/gov=7',
   'dobj/dep=7/gov=2',
   'punct/dep=8/gov=2'],
  'words': ['caitie',
   'is',
   'providing',
   'a',
   'couple',
   'of',
   'sample',
   'sentences',
   '.'],
  'pos-tags': ['NN', 'VBZ', 'VBG', 'DT', 'NN', 'IN', 'NN', 'NNS', '.'],
  'lemma': ['caitie',
   'be',
   'provide',
   'a',
   'couple',
   'of',
   'sample',
   'sentence',
   '.'],
  'parse': '(ROOT\n  (S\n    (NP (NN caitie))\n    (VP (VBZ is)\n      (VP (VBG providing)\n        (NP\n          (NP (DT a) (NN couple))\n          (PP (IN of)\n            (NP (NN sample) (NNS sentences))))))\n   

In [25]:
# TODO: fix the above function so that it returns all of the already existing values.