In [1]:
#!/usr/bin/env python3

import sys
import os
import logging
import json
import glob
import pandas as pd
import spacy
from stanfordcorenlp import StanfordCoreNLP

#sys.path.append('/Users/chilv/Documents/proj-wm/event_extraction/bert-event-extraction-master/ace2005-preprocessing-master')

from tqdm import tqdm
# from main import find_token_index
# from _parser import Parser
# import main

In [2]:
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser"])
max_length = os.getenv("MAX_DOCUMENT_LENGTH")
if max_length:
    nlp.max_length = int(max_length)

In [3]:
globbed_files = glob.glob("/Users/chilv/Documents/proj-wm/bias-stance/bias_stance/MITRE Six-Twelve Month and November Docs CDRs/*.cdr")
data = []
for one_file in globbed_files:
    frame = pd.read_json(one_file, lines=True)
    data.append(frame)

In [4]:
cdr_data = pd.concat(data, ignore_index = True, sort = False);

In [5]:
txt = cdr_data['extracted_text'][1]

In [6]:
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

In [7]:
text = "caitie is providing a couple of sample sentences. caitie might need some help from jewell or max."

In [8]:
doc = nlp(text)
doc

caitie is providing a couple of sample sentences. caitie might need some help from jewell or max.

In [9]:
def sentence_dict_list(doc):
    """Returns a list of dictionaries for each sentence in a CDR.
    This is just a few of those necessary for the model.
    """
    sentences = []
    for sent in doc.sents:
        sentence_dict = {}
        sentence_dict['sentence'] = sent.text
        sentence_dict['position'] = [sent.start_char, sent.end_char]
        entities = []
        for ent in sent.ents:
            entity_dict = {}
            entity_dict['text'] = ent.text
            entity_dict['position'] = [ent.start_char, ent.end_char]
            entity_dict['entity-type'] = ent.label_
            entities.append(entity_dict)
        sentence_dict['golden-entity-mentions'] = entities
        sentence_dict['golden-event-mentions'] = []
        sentences.append(sentence_dict)
    return sentences

In [12]:
sentences = sentence_dict_list(doc)
sentences

[{'sentence': 'caitie is providing a couple of sample sentences.',
  'position': [0, 49],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [0, 6],
    'entity-type': 'ORG'}],
  'golden-event-mentions': []},
 {'sentence': 'caitie might need some help from jewell or max.',
  'position': [50, 97],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [50, 56],
    'entity-type': 'PERSON'},
   {'text': 'jewell', 'position': [83, 89], 'entity-type': 'PERSON'},
   {'text': 'max', 'position': [93, 96], 'entity-type': 'PERSON'}],
  'golden-event-mentions': []}]

In [13]:
class StanfordNLP:
    """Getting Stanford running with necessary annotators"""
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=60000)  # , quiet=False, logging_level=logging.DEBUG)
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,parse',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def annotate(self, sentence):
        return self.nlp.annotate(sentence, properties=self.props)

In [14]:
Snlp = StanfordNLP()

In [15]:
def get_stanford_core_data(sentences):
    """Fills in the stanford core values needed for the model."""
    result = []
    for item in sentences:
        data = dict()
        data['sentence'] = item['sentence']
        data['golden-entity-mentions'] = item['golden-entity-mentions']
        data['golden-event-mentions'] = []
        try:
            nlp_res_raw = Snlp.annotate(item['sentence'])
            nlp_res = json.loads(nlp_res_raw)
            result.append(data)
        except Exception as e:
            print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.')
            print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1')
            continue
        tokens = nlp_res['sentences'][0]['tokens']
        data['stanford-colcc'] = []
        for dep in nlp_res['sentences'][0]['enhancedPlusPlusDependencies']:
            data['stanford-colcc'].append('{}/dep={}/gov={}'.format(dep['dep'], dep['dependent'] - 1, dep['governor'] - 1))

        data['words'] = list(map(lambda x: x['word'], tokens))
        data['pos-tags'] = list(map(lambda x: x['pos'], tokens))
        data['lemma'] = list(map(lambda x: x['lemma'], tokens))
        data['parse'] = nlp_res['sentences'][0]['parse']
        result.append(data)
    return result

In [17]:
almost_there = get_stanford_core_data(sentences)
almost_there

[{'sentence': 'caitie is providing a couple of sample sentences.',
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [0, 6],
    'entity-type': 'ORG'}],
  'golden-event-mentions': [],
  'stanford-colcc': ['ROOT/dep=2/gov=-1',
   'nsubj/dep=0/gov=2',
   'aux/dep=1/gov=2',
   'det:qmod/dep=3/gov=7',
   'mwe/dep=4/gov=3',
   'mwe/dep=5/gov=3',
   'compound/dep=6/gov=7',
   'dobj/dep=7/gov=2',
   'punct/dep=8/gov=2'],
  'words': ['caitie',
   'is',
   'providing',
   'a',
   'couple',
   'of',
   'sample',
   'sentences',
   '.'],
  'pos-tags': ['NN', 'VBZ', 'VBG', 'DT', 'NN', 'IN', 'NN', 'NNS', '.'],
  'lemma': ['caitie',
   'be',
   'provide',
   'a',
   'couple',
   'of',
   'sample',
   'sentence',
   '.'],
  'parse': '(ROOT\n  (S\n    (NP (NN caitie))\n    (VP (VBZ is)\n      (VP (VBG providing)\n        (NP\n          (NP (DT a) (NN couple))\n          (PP (IN of)\n            (NP (NN sample) (NNS sentences))))))\n    (. .)))'},
 {'sentence': 'caitie is providing a coup

In [119]:
def get_entity_token_index(sentences):
    result = []
    start_idx, end_idx = -1, -1
    for item in sentences:
        data = item   
        nlp_res_raw = Snlp.annotate(item['sentence'])
        nlp_res = json.loads(nlp_res_raw)
        tokens = nlp_res['sentences'][0]['tokens']
        for entity_mention in item['golden-entity-mentions']:
            if tokens['characterOffsetBegin'] == entity_mention['position'][0]:
                entity_mention['start'] = tokens['index']
            else:
                print("")                
                
        result.append(data)
    return result
        

In [120]:
get_entity_token_index(almost_there)

TypeError: list indices must be integers or slices, not str

In [121]:
%debug

> [0;32m<ipython-input-119-6de86cbf41f9>[0m(14)[0;36mget_entity_token_index[0;34m()[0m
[0;32m     12 [0;31m        [0mtokens[0m [0;34m=[0m [0mnlp_res[0m[0;34m[[0m[0;34m'sentences'[0m[0;34m][0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m[[0m[0;34m'tokens'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     13 [0;31m        [0;32mfor[0m [0mentity_mention[0m [0;32min[0m [0mitem[0m[0;34m[[0m[0;34m'golden-entity-mentions'[0m[0;34m][0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 14 [0;31m            [0;32mif[0m [0mtokens[0m[0;34m[[0m[0;34m'characterOffsetBegin'[0m[0;34m][0m [0;34m==[0m [0mentity_mention[0m[0;34m[[0m[0;34m'position'[0m[0;34m][0m[0;34m[[0m[0;36m0[0m[0;34m][0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     15 [0;31m                [0mentity_mention[0m[0;34m[[0m[0;34m'start'[0m[0;34m][0m [0;34m=[0m [0mtokens[0m[0;34m[[0m[0;34m'index'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m

In [91]:
def get_tokens(sentences):
    tokens_all = []
    for item in sentences:
        try:
            nlp_res_raw = Snlp.annotate(item['sentence'])
            nlp_res = json.loads(nlp_res_raw)
        except Exception as e:
            print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.')
            continue
        tokens = nlp_res['sentences'][0]['tokens']
    return tokens

In [92]:
get_tokens(sentences)

[{'index': 1,
  'word': 'caitie',
  'originalText': 'caitie',
  'lemma': 'caitie',
  'characterOffsetBegin': 0,
  'characterOffsetEnd': 6,
  'pos': 'NN',
  'before': '',
  'after': ' '},
 {'index': 2,
  'word': 'might',
  'originalText': 'might',
  'lemma': 'might',
  'characterOffsetBegin': 7,
  'characterOffsetEnd': 12,
  'pos': 'MD',
  'before': ' ',
  'after': ' '},
 {'index': 3,
  'word': 'need',
  'originalText': 'need',
  'lemma': 'need',
  'characterOffsetBegin': 13,
  'characterOffsetEnd': 17,
  'pos': 'VB',
  'before': ' ',
  'after': ' '},
 {'index': 4,
  'word': 'some',
  'originalText': 'some',
  'lemma': 'some',
  'characterOffsetBegin': 18,
  'characterOffsetEnd': 22,
  'pos': 'DT',
  'before': ' ',
  'after': ' '},
 {'index': 5,
  'word': 'help',
  'originalText': 'help',
  'lemma': 'help',
  'characterOffsetBegin': 23,
  'characterOffsetEnd': 27,
  'pos': 'NN',
  'before': ' ',
  'after': ' '},
 {'index': 6,
  'word': 'from',
  'originalText': 'from',
  'lemma': 'from'

In [21]:
def preprocessing(data_type, files):
    result = []
    #event_count, entity_count, sent_count, argument_count = 0, 0, 0, 0
    
    print('=' * 20)
    print('[preprocessing] type: ', data_type)
    for file in tqdm(files):
        #parser = Parser(path=file)
        #entity_count += len(parser.entity_mentions)
        #event_count += len(parser.event_mentions)
        #sent_count += len(parser.sents_with_pos)

        for item in sentences:
            data = dict()
            data['sentence'] = item['sentence']
            data['golden-entity-mentions'] = []
            data['golden-event-mentions'] = []

            try:
                nlp_res_raw = Snlp.annotate(item['sentence'])
                nlp_res = json.loads(nlp_res_raw)
            except Exception as e:
                print('[Warning] StanfordCore Exception: ', nlp_res_raw, 'This sentence will be ignored.')
                print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1')
                continue

            tokens = nlp_res['sentences'][0]['tokens']

            if len(nlp_res['sentences']) >= 2:
                # TODO: issue where the sentence segmentation of NTLK and StandfordCoreNLP do not match
                # This error occurred so little that it was temporarily ignored (< 20 sentences).
                continue

            data['stanford-colcc'] = []
            for dep in nlp_res['sentences'][0]['enhancedPlusPlusDependencies']:
                data['stanford-colcc'].append('{}/dep={}/gov={}'.format(dep['dep'], dep['dependent'] - 1, dep['governor'] - 1))

            data['words'] = list(map(lambda x: x['word'], tokens))
            data['pos-tags'] = list(map(lambda x: x['pos'], tokens))
            data['lemma'] = list(map(lambda x: x['lemma'], tokens))
            data['parse'] = nlp_res['sentences'][0]['parse']

            sent_start_pos = item['position'][0]

            for entity_mention in item['golden-entity-mentions']:
               # position = entity_mention['position']
                start_idx, end_idx = find_token_index(
                    tokens=tokens,
                    start_pos= entity_mention['start'],
                    end_pos=entity_mention['end'],
                    phrase=entity_mention['text'],
                )

                entity_mention['start'] = entity_mention['position'][0]
                entity_mention['end'] = entity_mention['position'][1]

               # del entity_mention['position']

                data['golden-entity-mentions'].append(entity_mention)

            result.append(data)
            
   # return result #verify_result(result)
    with open('output/{}.json'.format(data_type), 'w') as f:
         json.dump(result, f, indent=2)

In [22]:
preprocessing("caitie", sentences)

  0%|          | 0/2 [00:00<?, ?it/s]

[preprocessing] type:  caitie


  0%|          | 0/2 [00:00<?, ?it/s]


KeyError: 'start'

In [44]:
%debug

> [0;32m<ipython-input-42-47c24e330399>[0m(49)[0;36mpreprocessing[0;34m()[0m
[0;32m     47 [0;31m                start_idx, end_idx = find_token_index(
[0m[0;32m     48 [0;31m                    [0mtokens[0m[0;34m=[0m[0mtokens[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 49 [0;31m                    [0mstart_pos[0m[0;34m=[0m [0mentity_mention[0m[0;34m[[0m[0;34m'start'[0m[0;34m][0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     50 [0;31m                    [0mend_pos[0m[0;34m=[0m[0mentity_mention[0m[0;34m[[0m[0;34m'end'[0m[0;34m][0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     51 [0;31m                    [0mphrase[0m[0;34m=[0m[0mentity_mention[0m[0;34m[[0m[0;34m'text'[0m[0;34m][0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> entity_mention
{'text': 'caitie', 'position': [50, 56], 'entity-type': 'PERSON'}
ipdb> q


In [31]:
ls -ltr output

total 16
-rw-r--r--  1 chilv  staff  5410 Jan  6 14:48 caitie.json
