In [1]:
#!/usr/bin/env python3

import glob
import json
import logging
import os
import sys
from collections import defaultdict

import pandas as pd
import spacy
from stanfordcorenlp import StanfordCoreNLP

sys.path.append('/Users/chilv/Documents/proj-wm/event_extraction/bert-event-extraction-master/ace2005-preprocessing-master')

from tqdm import tqdm
from main import find_token_index
from _parser import Parser
import main

In [2]:
# Load the large English spaCy model with the tagger and parser components disabled.
nlp = spacy.load('en_core_web_lg', disable=["tagger", "parser"])

# Optionally override spaCy's maximum document length from the environment.
max_length = os.environ.get("MAX_DOCUMENT_LENGTH")
if max_length:
    nlp.max_length = int(max_length)

In [3]:
# Read every *.cdr file (JSON Lines, one record per line) into its own DataFrame.
# glob returns matches in arbitrary order, so the list is sorted to keep the row
# order of the concatenated table stable between runs.
# NOTE(review): absolute, machine-specific path -- consider moving it to a config cell.
globbed_files = sorted(glob.glob("/Users/chilv/Documents/proj-wm/bias-stance/bias_stance/MITRE Six-Twelve Month and November Docs CDRs/*.cdr"))
data = [pd.read_json(one_file, lines=True) for one_file in globbed_files]

In [4]:
# Stack the per-file frames into a single table with a fresh 0..n-1 index,
# keeping columns in their original (unsorted) order.
cdr_data = pd.concat(objs=data, sort=False, ignore_index=True)
cdr_data.head()

Unnamed: 0,capture_source,extracted_metadata,content_type,team,document_id,extracted_text,uri,source_uri,extracted_ntriples,timestamp,annotations,categories,extracted_numeric
0,BackgroundSource,"{'CreationDate': '2017-09-14', 'ModDate': '201...",application/pdf,Two Six Labs,ee0e47a89787f974467b5118885fdb06,GIEWS global information and early warning sys...,http://graph.causeex.com/documents/sources#ee0...,ee0e47a89787f974467b5118885fdb06.pdf,<http://graph.causeex.com/documents/sources#ee...,2019-09-24 12:23:38+00:00,"[{'type': 'tags', 'label': 'Qntfy Event detect...","[November 2019 SSudan Docs, Six-Month Evaluati...",
1,BackgroundSource,"{'CreationDate': '2014-08-05', 'ModDate': '201...",application/pdf,Two Six Labs,bd5386a9044fb2a16783e60da70d74e3,GOVERNMENT OF THE REPUBLIC OF SOUTH SUDAN PART...,http://graph.causeex.com/documents/sources#bd5...,bd5386a9044fb2a16783e60da70d74e3.pdf,<http://graph.causeex.com/documents/sources#bd...,2019-09-24 14:24:57+00:00,"[{'type': 'tags', 'label': 'Qntfy Event detect...",[Twelve-Month Eval Docs],
2,BackgroundSource,"{'CreationDate': '2018-04-13', 'ModDate': '201...",application/pdf,Two Six Labs,cacfddfc1b3e72cb0a23c3b6dd3e81c4,Shelter NFI Cluster South Sudan\n\nShelter/NFI...,http://graph.causeex.com/documents/sources#cac...,cacfddfc1b3e72cb0a23c3b6dd3e81c4.pdf,<http://graph.causeex.com/documents/sources#ca...,2019-09-24 12:23:41+00:00,"[{'type': 'tags', 'label': 'Qntfy Event detect...",[November 2019 SSudan Docs],
3,BackgroundSource,"{'CreationDate': '2017-06-09', 'ModDate': '201...",application/pdf,Two Six Labs,0fc79680cfa986f5f0c0321d52134e3d,"Abathok, Abyei Intentions Survey Intentions Su...",http://graph.causeex.com/documents/sources#0fc...,0fc79680cfa986f5f0c0321d52134e3d.pdf,<http://graph.causeex.com/documents/sources#0f...,2019-09-24 12:26:57+00:00,"[{'type': 'tags', 'label': 'Qntfy NER', 'versi...",[November 2019 SSudan Docs],
4,BackgroundSource,"{'CreationDate': '2017-04-20', 'ModDate': '201...",application/pdf,Two Six Labs,b82f4e5c37a793a08fc93a2cace2b03b,In this issue Thousands flee Jonglei clashes P...,http://graph.causeex.com/documents/sources#b82...,b82f4e5c37a793a08fc93a2cace2b03b.pdf,<http://graph.causeex.com/documents/sources#b8...,2019-09-24 12:27:03+00:00,"[{'type': 'tags', 'label': 'Qntfy NER', 'versi...",[November 2019 SSudan Docs],


In [5]:
# Extracted text of the row labelled 1 (the frame was concatenated with ignore_index=True).
txt = cdr_data.loc[1, 'extracted_text']

In [6]:
# The dependency parser is disabled when the model is loaded, so sentence
# boundaries come from the rule-based sentencizer instead.
# Guarded so that re-running this cell does not try to register a second
# component named "sentencizer", which spaCy rejects with an error.
if "sentencizer" not in nlp.pipe_names:
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)

In [7]:
# Two-sentence sample used to exercise the pipeline end to end.
text = (
    "caitie is providing a couple of sample sentences. "
    "caitie might need some help from jewell or max."
)

In [8]:
# Run the spaCy pipeline on the sample text; sentence spans (`doc.sents`) and
# their entity spans are read from this object in the next cell.
doc = nlp(text)

In [9]:
# Turn each spaCy sentence span into a plain-dict record. Offsets are character
# positions within the whole document; event mentions are left empty.
sentences = []
for sent in doc.sents:
    mentions = [
        {
            'text': ent.text,
            'position': [ent.start_char, ent.end_char],
            'entity-type': ent.label_,
        }
        for ent in sent.ents
    ]
    sentences.append({
        'sentence': sent.text,
        'position': [sent.start_char, sent.end_char],
        'golden-entity-mentions': mentions,
        'golden-event-mentions': [],
    })

In [10]:
# Display the sentence records to eyeball the character offsets and entity labels.
sentences

[{'sentence': 'caitie is providing a couple of sample sentences.',
  'position': [0, 49],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [0, 6],
    'entity-type': 'ORG'}],
  'golden-event-mentions': []},
 {'sentence': 'caitie might need some help from jewell or max.',
  'position': [50, 97],
  'golden-entity-mentions': [{'text': 'caitie',
    'position': [50, 56],
    'entity-type': 'PERSON'},
   {'text': 'jewell', 'position': [83, 89], 'entity-type': 'PERSON'},
   {'text': 'max', 'position': [93, 96], 'entity-type': 'PERSON'}],
  'golden-event-mentions': []}]

In [11]:
class StanfordNLP:
    """Small convenience wrapper around a ``StanfordCoreNLP`` client.

    The wrapper stores the client in ``self.nlp`` and a fixed property set in
    ``self.props`` (tokenize, ssplit, pos, lemma, parse; English; JSON output)
    that is passed along by :meth:`annotate`.
    """

    def __init__(self, host='http://localhost', port=9000):
        """Create the client for the CoreNLP server at ``host``:``port``."""
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=60000)  # , quiet=False, logging_level=logging.DEBUG)
        self.props = {
            'annotators': 'tokenize,ssplit,pos,lemma,parse',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def word_tokenize(self, sentence):
        """Return the client's tokenization of ``sentence``."""
        return self.nlp.word_tokenize(sentence)

    def pos(self, sentence):
        """Return the client's part-of-speech tagging of ``sentence``."""
        return self.nlp.pos_tag(sentence)

    def parse(self, sentence):
        """Return the client's parse of ``sentence``."""
        return self.nlp.parse(sentence)

    def annotate(self, sentence):
        """Annotate ``sentence`` with ``self.props`` and return the raw response.

        Callers in this notebook decode the return value with ``json.loads``.
        """
        return self.nlp.annotate(sentence, properties=self.props)

    @staticmethod
    def tokens_to_dict(_tokens):
        """Index CoreNLP token dicts by their integer ``index`` field.

        Args:
            _tokens: Iterable of token dicts carrying ``index``, ``word``,
                ``lemma`` and ``pos``; ``ner`` is optional.

        Returns:
            A ``defaultdict(dict)`` mapping ``int(index)`` to a dict with the
            keys ``word``, ``lemma``, ``pos`` and ``ner``. ``ner`` is ``None``
            when the token carries no such field.
        """
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                # The annotators configured in `props` do not include `ner`,
                # so this key may be absent from the server's tokens.
                'ner': token.get('ner'),
            }
        return tokens

In [12]:
# Module-level CoreNLP wrapper used by `preprocessing`; built with the class
# defaults (host 'http://localhost', port 9000).
Snlp = StanfordNLP()

In [13]:
def preprocessing(data_type, files):
    """Annotate sentence records with Stanford CoreNLP and dump them to JSON.

    Every record is sent to the module-level ``Snlp`` wrapper. The tokens,
    POS tags, lemmas, parse string and enhanced++ dependencies of the
    response are copied into an output dict, and each entity mention is
    aligned to the tokens with ``find_token_index``. The collected dicts are
    written to ``output/<data_type>.json``.

    Args:
        data_type: Label printed in the banner and used as the output file stem.
        files: Iterable of sentence records. Each record is a dict with
            ``'sentence'`` (text), ``'position'`` (``[start_char, end_char]``
            of the sentence) and ``'golden-entity-mentions'`` (dicts with
            ``'text'`` and ``'position'``). The records are not modified.

    Returns:
        None. The result is only written to disk.

    Notes:
        * Records whose CoreNLP response cannot be obtained/decoded, or does
          not contain exactly one sentence, are skipped.
        * ``'golden-event-mentions'`` is always emitted empty.
          TODO: align event triggers and arguments the same way as entities
          once event annotations are available.
    """
    result = []

    print('=' * 20)
    print('[preprocessing] type: ', data_type)
    # Iterate the records passed in; previously an outer loop over `files`
    # wrapped an inner loop over the global `sentences`, which emitted every
    # record len(files) times.
    for item in tqdm(files):
        data = dict()
        data['sentence'] = item['sentence']
        data['golden-entity-mentions'] = []
        data['golden-event-mentions'] = []

        # Pre-bind so the warning below can be printed even when annotate() itself fails.
        nlp_res_raw = None
        try:
            nlp_res_raw = Snlp.annotate(item['sentence'])
            nlp_res = json.loads(nlp_res_raw)
        except Exception as e:
            print('[Warning] StanfordCore Exception: ', repr(e), nlp_res_raw, 'This sentence will be ignored.')
            print('If you want to include all sentences, please refer to this issue: https://github.com/nlpcl-lab/ace2005-preprocessing/issues/1')
            continue

        parsed_sentences = nlp_res['sentences']
        if len(parsed_sentences) != 1:
            # TODO: the upstream sentence segmentation and Stanford CoreNLP's
            # do not always agree. Such records (and empty responses) are
            # skipped for now.
            continue

        parsed = parsed_sentences[0]
        tokens = parsed['tokens']

        # Dependency indices are shifted by one to become 0-based.
        data['stanford-colcc'] = [
            '{}/dep={}/gov={}'.format(dep['dep'], dep['dependent'] - 1, dep['governor'] - 1)
            for dep in parsed['enhancedPlusPlusDependencies']
        ]

        data['words'] = [token['word'] for token in tokens]
        data['pos-tags'] = [token['pos'] for token in tokens]
        data['lemma'] = [token['lemma'] for token in tokens]
        data['parse'] = parsed['parse']

        # The sentence is annotated on its own, so mention offsets are made
        # relative to the start of the sentence before alignment.
        sent_start_pos = item['position'][0]

        for entity_mention in item['golden-entity-mentions']:
            # Work on a copy so the caller's records stay untouched and the
            # function can be re-run on the same input.
            mention = dict(entity_mention)
            position = mention['position']
            # NOTE(review): assumes find_token_index returns token-level
            # (start, end) indices and takes an exclusive end offset --
            # confirm against main.find_token_index.
            start_idx, end_idx = find_token_index(
                tokens=tokens,
                start_pos=position[0] - sent_start_pos,
                end_pos=position[1] - sent_start_pos,
                phrase=mention['text'],
            )

            mention['start'] = start_idx
            mention['end'] = end_idx

            data['golden-entity-mentions'].append(mention)

        result.append(data)

    # Make sure the target directory exists before writing.
    os.makedirs('output', exist_ok=True)
    with open('output/{}.json'.format(data_type), 'w') as f:
        json.dump(result, f, indent=2)

In [14]:
# Annotate the sample sentence records and write them to output/caitie.json.
preprocessing("caitie", sentences)

 50%|█████     | 1/2 [00:00<00:00,  5.09it/s]

[preprocessing] type:  caitie


100%|██████████| 2/2 [00:00<00:00,  5.96it/s]


In [44]:
# NOTE(review): leftover post-mortem debugging cell (its execution count is out of
# sequence with the cells above). The trace below shows `preprocessing` reading
# entity_mention['start'] on a mention that only has 'text', 'position' and
# 'entity-type'. Clear this cell and its output before sharing the notebook.
%debug

> [0;32m<ipython-input-42-47c24e330399>[0m(49)[0;36mpreprocessing[0;34m()[0m
[0;32m     47 [0;31m                start_idx, end_idx = find_token_index(
[0m[0;32m     48 [0;31m                    [0mtokens[0m[0;34m=[0m[0mtokens[0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 49 [0;31m                    [0mstart_pos[0m[0;34m=[0m [0mentity_mention[0m[0;34m[[0m[0;34m'start'[0m[0;34m][0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     50 [0;31m                    [0mend_pos[0m[0;34m=[0m[0mentity_mention[0m[0;34m[[0m[0;34m'end'[0m[0;34m][0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     51 [0;31m                    [0mphrase[0m[0;34m=[0m[0mentity_mention[0m[0;34m[[0m[0;34m'text'[0m[0;34m][0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> entity_mention
{'text': 'caitie', 'position': [50, 56], 'entity-type': 'PERSON'}
ipdb> q


In [31]:
# List the output directory (long format, oldest first) to confirm the JSON file was written.
ls -ltr output

total 16
-rw-r--r--  1 chilv  staff  5410 Jan  6 14:48 caitie.json
