In [657]:
import glob
import xml.etree.ElementTree as ET
import re
import json
from nltk.tokenize import word_tokenize
import random
import math
from pycorenlp import StanfordCoreNLP

## Class Definitions:

In [658]:
class Argument(object):
    def __init__(self, id_, text, role, start, end, entity_type):
        self.id_ = id_
        self.text = text
        self.role = role
        self.start = start
        self.end = end
        self.entity_type = entity_type

    def to_string(self):
        return "Argument: {id_ = " + self.id_ + ", text = " + self.text + ", role = " + self.role + ", start =" + str(self.start) + ", end =" + str(self.end) + "}"
        
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [659]:
class Trigger(object):
    def __init__(self, start, text, end, id_, event_type):
        self.start = start
        self.text = text
        self.end = end
        self.id_ = id_
        self.event_type = event_type
        
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [660]:
class Entity(object):
    def __init__(self, id_, text, entity_type, phrase_type, start, end):
        self.id_ = id_
        self.text = text
        self.entity_type = entity_type
        self.phrase_type = phrase_type
        self.start = start
        self.end = end

    def to_string(self):
        return "Entity: {id_ = " + self.id_ + ", text = " + self.text + ", entity_type = " + self.entity_type + ", phrase_type=" + self.phrase_type + ", start =" + str(self.start) + ", end =" + str(self.end) + "}"
    
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [661]:
class Sentence(object):
    def __init__(self, text, start, end):
        self.text = text
        self.start = start
        self.end = end

    def to_string(self):
        return "Sentence: {text = " + self.text + ", start = " + self.start + ", end = " + self.end + "}"
    
    def __str__(self):
        return str(self.__dict__)

    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [662]:
class Event(object):
    def __init__(self, event_id, mention_id, type_, subtype, modality, polarity, genericity, tense, extent, extent_start, extent_end, scope, scope_start, scope_end, trig_text, trig_start, trig_end, arguments, entities):
        self.event_id = event_id
        self.mention_id = mention_id
        self.type_ = type_
        self.subtype = subtype
        self.modality = modality
        self.polarity = polarity
        self.genericity = genericity
        self.tense = tense
        self.extent = extent
        self.extent_start = extent_start
        self.extent_end = extent_end
        self.scope = scope
        self.scope_start = scope_start
        self.scope_end = scope_end
        self.trig_text = trig_text
        self.trig_start = trig_start
        self.trig_end = trig_end
        self.arguments = arguments
        self.entities = entities


    def to_string(self):
        return "Event: { event_id = " + self.event_id + "mention_id = " + self.mention_id + ", type = " + self.type_ + ", subtype = " +self.subtype + ", modality = " \
               + self.modality + ", polarity = " + self.polarity + ", genericity= " + self.genericity + ", tense = " +\
               self.tense + ", extent = " +self.extent + ", scope = " + self.scope  + ", trigger = " + self.trig_text
    
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [663]:
stanford = StanfordCoreNLP('http://localhost:9001')

In [664]:
### Get parse, conll dependencies, list of words, list of pos, dictionary of offsets: (penn-treebank, stanford-colcc, words, offset_start, offset_end):
def find_dep_words_pos_offsets(sent):
    output = stanford.annotate(sent, properties={'annotators': 'tokenize,ssplit,pos,depparse,parse', 'outputFormat' : 'json'})
    #print("output=", output)
    #print("sent= ", sent)
    penn_treebank = output['sentences'][0]['parse'].replace("\n", "")
    triples = []
    for part in output['sentences'][0]['enhancedPlusPlusDependencies']:
         triples.append(part['dep']+"/dep="+str(part['dependent']-1)+"/gov="+str(part['governor']-1))
          
    words = []
    words_dict = {}
    pos_tags = []
    offset_start_dic = {}
    offset_end_dic = {}
    for i, word in enumerate(output['sentences'][0]['tokens']):
        words.append(word["word"])
        pos_tags.append(word["pos"])
        offset_start_dic.update({word["characterOffsetBegin"]: word["index"]-1})
        offset_end_dic.update({word["characterOffsetEnd"]-1: word["index"]})
        words_dict.update({word["index"]-1: word["word"]})
        
    return penn_treebank, triples, words, pos_tags, offset_start_dic, offset_end_dic, words_dict

## I. Annotation/Offset Extraction:

In [665]:
def extract_entity_info(entity, scope_start, scope_end, sent, words, offset_start_dic, offset_end_dic):
    entity_id = entity.attrib["ID"]
    phrase_type = entity.attrib["TYPE"] + ":" + entity.attrib["SUBTYPE"]
    entity_class = entity.attrib["CLASS"]
    #print("offset_start_dic:", offset_start_dic)
    #print("offset_end_dic:", offset_end_dic)
    entities = []
    for mention in entity.iter('entity_mention'):
        entity_type = mention.attrib["LDCTYPE"]
        for child in mention:
            if child.tag == "extent":
                for chil2 in child:
                    text = chil2.text
                    start = int(chil2.attrib["START"])
                    end = int(chil2.attrib["END"])
                    
        if scope_start <= start and  scope_end >= end:
            #print("offset_start_dic[start - scope_start]= ", offset_start_dic[start - scope_start])
            #print("offset_end_dic[end - scope_start]= ", offset_end_dic[end - scope_start])
            
            """
            if end - scope_start +1 in offset_end_dic:
                end_off = offset_end_dic[end - scope_start +1]
            elif end - scope_start + 2 in offset_end_dic:
                end_off = offset_end_dic[end - scope_start +2]
            elif end - scope_start in offset_end_dic:
                end_off = offset_end_dic[end - scope_start]
            elif end - scope_start - 1 in offset_end_dic:
                end_off = offset_end_dic[end - scope_start -1]
            elif end - scope_start - 2 in offset_end_dic:
                end_off = offset_end_dic[end - scope_start - 2]
                
            if start - scope_start +1 in offset_start_dic:
                start_off = offset_start_dic[start - scope_start +1]
            elif start - scope_start + 2 in offset_start_dic:
                start_off = offset_start_dic[start - scope_start +2]
            elif start - scope_start in offset_start_dic:
                start_off = offset_start_dic[start - scope_start]
            elif start - scope_start - 1 in offset_start_dic:
                start_off = offset_start_dic[start - scope_start -1]
            elif start - scope_start - 2 in offset_start_dic:
                start_off = offset_start_dic[start - scope_start - 2]  
                
            start_off = 0
            end_off = 2
            for i in range(-10, 10):
                if end - scope_start +i in offset_end_dic:
                    end_off = offset_end_dic[end - scope_start +i]

            for i in range(-10, 10):
                if start - scope_start +i in offset_start_dic:
                    start_off = offset_start_dic[start - scope_start +i]
            """
            try:
                try:
                    start_off = max(offset_start_dic[k] for k in offset_start_dic if k <= start - scope_start) #offset_start_dic[start - scope_start]
                except:
                    start_off = offset_start_dic[list(offset_start_dic.keys())[0]]
                #print("start_off:", start_off)
                try:
                    end_off = max(offset_end_dic[k] for k in offset_end_dic if k <= end - scope_start) #offset_end_dic[end - scope_start]
                except:
                    end_off = offset_end_dic[list(offset_start_dic.keys())[0]] + 1
                #print("end_off:", end_off)
                ent = Entity(entity_id, text, entity_type, phrase_type, start_off, end_off)
                entities.append(ent)

            except:
                print("Problematic sentence:", sent)
                print("words: ", words)
                print("offset_start_dic:", offset_start_dic)
                print("offset_end_dic:", offset_end_dic)
                print("entity:", text)
                print("start - scope_start:", start - scope_start)
                print("end - scope_start:", end - scope_start)
                continue
                
        
    return entities

In [666]:
def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z

In [667]:
def extract_event_info(root, event):
    event_id = event.attrib["ID"]
    event_type = event.attrib["TYPE"]
    subtype = event.attrib["SUBTYPE"]
    modality = event.attrib["MODALITY"]
    polarity = event.attrib["POLARITY"]
    genericity = event.attrib["GENERICITY"]
    tense = event.attrib["TENSE"]

    ## Looking at event mentions
    for mention in event.iter('event_mention'):
        mention_id = mention.attrib["ID"]
        for child in mention:
            if child.tag == "extent":
                for chil2 in child:
                    extent = chil2.text.replace("&", "&amp;")
                    extent_start = int(chil2.attrib["START"])
                    extent_end = int(chil2.attrib["END"]) 
                    sent = Sentence(extent, extent_start, extent_end)
                    penn_treebank, triples, words, pos_tags, offset_start_dic, offset_end_dic, words_dict = find_dep_words_pos_offsets(extent)
                                    
            ## SCOPE USED AS SENTENCE
            elif child.tag == "ldc_scope":
                for chil2 in child:
                    scope = chil2.text
                    scope_start = int(chil2.attrib["START"])
                    scope_end = int(chil2.attrib["END"])
                
            ## TRIGGER EXTRACTION
            elif child.tag == "anchor":
                for chil2 in child:
                    trig_text = chil2.text
                    start = int(chil2.attrib["START"]) - extent_start
                    end = int(chil2.attrib["END"]) - extent_start
                    
                    """
                    start_off = 0
                    end_off = 2
                    for i in range(0, 10):
                        if end +i in offset_end_dic:
                            end_off = offset_end_dic[end +i]
                            
                    for i in range(0, 10):
                        if start +i in offset_start_dic:
                            start_off = offset_start_dic[start +i]
                    
                    
                    elif end + 2 in offset_end_dic:
                        print(offset_end_dic)
                        end_off = offset_end_dic[end +2]
                    elif end -1 in offset_end_dic:
                        end_off = offset_end_dic[end -1]
                    elif end in offset_end_dic:
                        end_off = offset_end_dic[end]
                    elif end +2 in offset_end_dic:
                        print(offset_end_dic)
                        end_off = offset_end_dic[end -2]
                    
                    if start +1 in offset_start_dic:
                        start_off = offset_start_dic[start +1]
                    elif start +2 in offset_start_dic:
                        start_off = offset_start_dic[start +2]
                    elif start  in offset_start_dic:
                        start_off = offset_start_dic[start ]
                    elif start - 1 in offset_start_dic:
                        start_off = offset_start_dic[start -1]
                    elif start+2 in offset_start_dic:
                        start_off = offset_start_dic[start -2]
                    
                        
                    
                
                    trig_start = start_off
                    trig_end = end_off
                    """
            
                    
                    try:
                    
                        try:
                            trig_start = max(offset_start_dic[k] for k in offset_start_dic if k <= start) #offset_start_dic[start]
                        except:
                            trig_start = offset_start_dic[list(offset_start_dic.keys())[0]]
                        try:
                            trig_end = max(offset_end_dic[k] for k in offset_end_dic if k <= end)
                        except:
                            trig_end = offset_end_dic[list(offset_start_dic.keys())[0]] + 1
                        
                        
                    except:
                        print("Problematic sentence:", sent)
                        print("words: ", words)
                        print("offset_start_dic:", offset_start_dic)
                        print("offset_end_dic:", offset_end_dic)
                        print("trig_text:", trig_text)
                        print("start:", start)
                        print("end:", end)
                        trig_start = -1
                        trig_end = -1
                        continue
                    
        
        ## Looking at entity mentions with that same event
        entities = []
        ent_id_role_dict = {}
        for entity in root.iter('entity'):
            ents = extract_entity_info(entity, extent_start, extent_end, sent, words, offset_start_dic, offset_end_dic)
            entities.extend(ents)
            if len(ents) > 0:
                ent_id_role_dict.update({ents[0].id_: ents[0].entity_type})
        
        arguments = []           
        for argument in mention.iter('event_mention_argument'):
            arg_id = argument.attrib["REFID"]
            role = argument.attrib["ROLE"]
            for child in argument:
                for chil2 in child:
                    arg_text = chil2.text
                    start = int(chil2.attrib["START"]) - extent_start
                    end = int(chil2.attrib["END"]) - extent_start
                    
                    """
                    start_off = 0
                    end_off = 2
                    for i in range(0, 10):
                        if end +i in offset_end_dic:
                            end_off = offset_end_dic[end +i]
                            
                    for i in range(0, 10):
                        if start +i in offset_start_dic:
                            start_off = offset_start_dic[start +i]
                    
                    
                    arg_start = start_off
                    arg_end = end_off
                    """
                    
                    try:
                        try:
                            arg_start = max(offset_start_dic[k] for k in offset_start_dic if k <= start) #offset_start_dic[start]
                        except:
                            arg_start = offset_start_dic[list(offset_start_dic.keys())[0]]
                        try:
                            arg_end = max(offset_end_dic[k] for k in offset_end_dic if k <= end) #offset_end_dic[end]
                        except:
                            arg_end = offset_end_dic[list(offset_start_dic.keys())[0]] + 1
                            
                        if "-".join(arg_id.split("-")[:-1]) in ent_id_role_dict:
                            type_ = ent_id_role_dict["-".join(arg_id.split("-")[:-1])]
                        elif arg_id in ent_id_role_dict:
                            type_ = ent_id_role_dict[arg_id]
                        else:
                            type_ = "--"
                        arg = Argument(arg_id, arg_text, role, arg_start, arg_end,type_) 

                        arguments.append(arg)
                    except:
                        print("Problematic sentence:", sent)
                        print("words: ", words)
                        print("offset_start_dic:", offset_start_dic)
                        print("offset_end_dic:", offset_end_dic)
                        print("argument:", arg_text)
                        continue
                    
                    
       
    ev = Event(event_id, mention_id, event_type, subtype, modality, polarity, genericity, tense, extent, extent_start, extent_end,
               scope, scope_start, scope_end, trig_text, trig_start, trig_end, arguments, entities)

    return sent, ev, penn_treebank, triples, words, pos_tags

In [680]:
from tqdm import tqdm
def extract_from_xml(root_path, files):
    events = {}
    #print(root_path + language + "/" + domain + "/adj/*.apf.xml")
    files_processed = 0
    for div in files: #tqdm(        
        # Get the event + argument annotation
        file_name = root_path + "English/"+ div.strip("\n") +".apf.xml"
        files_processed += 1
        tree = ET.parse(file_name, ET.XMLParser(encoding='utf-8'))
        root = tree.getroot()
        
        for event in root.iter('event'):
            sent, ev, penn_treebank, triples, words, pos_tags = extract_event_info(root, event)
            if sent.text not in events:
                events.update({sent.text: {"events":[ev], "penn_treebank" : penn_treebank, "triples": triples, "words": words, "pos_tags" :pos_tags}}) 
            else:
                ev_list = events[sent.text]["events"]
                ev_list.append(ev)
                events[sent.text]["events"] = ev_list
                
        print("---------------------------------------------------------")
    return events, files_processed

In [681]:
#root_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/Raw/LDC2006T06/data/"
root_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/Raw/ACE2005-TrainingData-V6.0/"

In [682]:
languages = [file_.split("/")[-1] for file_ in glob.glob(root_path + "*") if "Icon\r" not in file_]

### Executing over the domains/languages:

In [683]:
files_splits = {}
with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_training") as file:
    files_splits.update({"train": file.readlines()})

with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_dev") as file:
    files_splits.update({"dev": file.readlines()})
    
with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_test") as file:
    files_splits.update({"test": file.readlines()})

In [688]:
events_lang_splits = {}
files_num = 0
for split in ["train", "dev", "test"]:
    events, files_processed = extract_from_xml(root_path, files_splits[split])
    files_num += files_processed
    events_lang_splits.update({split: events})

    print("Number of files processed for language= ", language, " is= ", files_num)


---------------------------------------------------------
---------------------------------------------------------
Problematic sentence: {'text': 'Mr. Begala and Mr. Carville just donate', 'start': 1148, 'end': 1186}
words:  ['Mr.', 'Begala', 'and', 'Mr.', 'Carville', 'just', 'donate']
offset_start_dic: {0: 0, 4: 1, 11: 2, 15: 3, 19: 4, 28: 5, 33: 6}
offset_end_dic: {2: 1, 9: 2, 13: 3, 17: 4, 26: 5, 31: 6, 38: 7}
entity: Mr
start - scope_start: 0
end - scope_start: 1
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
-----

---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
Problematic sentence: {'text': 'u.s. marines and residents exchanged gunfire in the\ncity of mosul today', 'start': 87, 'end': 157}
words:  ['u.s.', 'marines', 'and', 'residents', 'exchanged', 'gunfire', 'in', 'the', 'city', 'of', 'mosul', 'today']
offset_start_dic: {0: 0, 5: 1, 13: 2, 17: 3, 27: 4, 37: 5, 45: 6, 48: 7, 52: 8, 57: 9, 60: 10, 66: 11}
offset_end_dic: {3: 1, 11: 2, 15: 3, 25: 4, 35: 5, 43: 6, 46: 7, 50: 8, 55: 9, 58: 10, 64: 11, 70: 12}
entity: u.s
start - scope_start: 0
end - scope_start: 2
Problematic sentence: {'text': 'u.s. marines, who say they were responding to\nsnip

---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
Problematic sentence: {'text': 'u.s. charges which have expired but could, i\nam told, possibly be re -- restarted for piracy, hostage\ntaking and conspiracy', 'start': 706, 'end': 828}
words:  ['u.s.', 'charges', 'which', 'have', 'expired', 'but', 'could', ',', 'i', 'am', 'told', ',', 'possibly', 'be', 're', '--', 'restarted', 'for', 'piracy', ',', 'hostage', 'taking', 'and', 'conspiracy']
offset_start_dic: {0: 0, 5: 1, 13: 2, 19: 3, 24: 4, 32: 5, 36: 6, 41: 7, 43: 8, 45: 9, 48: 10, 52: 11, 54: 12, 63: 13, 66: 14, 69: 15, 72: 16,

---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
Problematic sentence: {'text': "u.s. forces still haven't found any weapons\nof mass destruction, the new cia pentagon report says\nofficials are now confident the mysterious trucks filled\nwith high-tech equipment found in iraq are indeed mobile\nbiological weapons production facilities, just as secretary\nof state powell predicted and presented the united nations\nbefore the war", 'start': 283, 'end': 627}
words:  ['u.s.', 'forces', 'still', 'have', "n't", 'found', 'any', 'weapons', 'of', 'mass', 'destruction', ',', 'the', 'new', 'cia', 'pentagon', 'report', 'says', 'officials', 'are', 'now', 'confident', 'the', 'mysterious', 'trucks', 'filled', 'with', 'high-tech', 'equipment', 'found', 'in', 'iraq', 'are', 'indeed', 'mobile', 'biological', 'weapons', 'production', 'facilities', ',', 'just', 'as', 'secretary', '

---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
--------------

---------------------------------------------------------
---------------------------------------------------------
Problematic sentence: {'text': "U.S. forces came under\n``significant'' fire from those buildings", 'start': 2480, 'end': 2543}
words:  ['U.S.', 'forces', 'came', 'under', '``', 'significant', "''", 'fire', 'from', 'those', 'buildings']
offset_start_dic: {0: 0, 5: 1, 12: 2, 17: 3, 23: 4, 25: 5, 36: 6, 39: 7, 44: 8, 49: 9, 55: 10}
offset_end_dic: {3: 1, 10: 2, 15: 3, 21: 4, 24: 5, 35: 6, 37: 7, 42: 8, 47: 9, 53: 10, 63: 11}
entity: U.S
start - scope_start: 0
end - scope_start: 2
Problematic sentence: {'text': 'U.S. forces tightened their grip throughout Baghdad, entering the\ncity for the first time from the north, seizing a military airport', 'start': 156, 'end': 287}
words:  ['U.S.', 'forces', 'tightened', 'their', 'grip', 'throughout', 'Baghdad', ',', 'entering', 'the', 'city', 'for', 'the', 'first', 'time', 'from', 'the', 'north', ',', 'seizing', 'a', 'military', 'airp

Problematic sentence: {'text': "U.S.-sponsored forum that brought Iraqi opposition leaders together\nto shape the country's postwar government began Tuesday with a U.S.\npromise not to rule Iraq", 'start': 147, 'end': 306}
words:  ['U.S.-sponsored', 'forum', 'that', 'brought', 'Iraqi', 'opposition', 'leaders', 'together', 'to', 'shape', 'the', 'country', "'s", 'postwar', 'government', 'began', 'Tuesday', 'with', 'a', 'U.S.', 'promise', 'not', 'to', 'rule', 'Iraq']
offset_start_dic: {0: 0, 15: 1, 21: 2, 26: 3, 34: 4, 40: 5, 51: 6, 59: 7, 68: 8, 71: 9, 77: 10, 81: 11, 88: 12, 91: 13, 99: 14, 110: 15, 116: 16, 124: 17, 129: 18, 131: 19, 136: 20, 144: 21, 148: 22, 151: 23, 156: 24}
offset_end_dic: {13: 1, 19: 2, 24: 3, 32: 4, 38: 5, 49: 6, 57: 7, 66: 8, 69: 9, 75: 10, 79: 11, 87: 12, 89: 13, 97: 14, 108: 15, 114: 16, 122: 17, 127: 18, 129: 19, 134: 20, 142: 21, 146: 22, 149: 23, 154: 24, 159: 25}
entity: U.S
start - scope_start: 0
end - scope_start: 2
Problematic sentence: {'text': "U.S.-s

Problematic sentence: {'text': 'U.S. special\noperations raids into western Iraq', 'start': 5523, 'end': 5569}
words:  ['U.S.', 'special', 'operations', 'raids', 'into', 'western', 'Iraq']
offset_start_dic: {0: 0, 5: 1, 13: 2, 24: 3, 30: 4, 35: 5, 43: 6}
offset_end_dic: {3: 1, 11: 2, 22: 3, 28: 4, 33: 5, 41: 6, 46: 7}
entity: U.S
start - scope_start: 0
end - scope_start: 2
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
Problematic sentence: {'text': 'U.S. President George W. Bush and South\nKorean President Roh Moo-hyun were about to meet Wednesday in\nWashington', 'start': 1201, 'end': 1312}
words:  ['U.S.', 'President', 'George', 'W.', 'Bush', 'and', 'South', 'Korean', 'President', 'Roh', 'Moo-hyun', 'were', 'about', 'to', 'meet', 'Wednesday', 'in', 'Washington']
offset_start_dic: {0: 0, 5: 1, 15: 2, 22: 3, 25: 4, 30: 5, 34: 6, 40: 7, 47: 8, 57: 9, 61: 10, 70

---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
Problematic sentence: {'text': 'Sino-US war', 'start': 3044, 'end': 3054}
words:  ['Sino-US', 'war']
offset_start_dic: {0: 0, 8: 1}
offset_end_dic: {6: 1, 10: 2}
entity: Sino
start - scope_start: 0
end - scope_start: 3
Problematic sentence: {'tex

---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
--------------

---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
---------------------------------------------------------
Problematic se

Problematic sentence: {'text': 'U.S. and British troops had encountered little\nresistance, even when they seized nearby Umm Qasr, and moved to\nsecure key oil fields', 'start': 1945, 'end': 2076}
words:  ['U.S.', 'and', 'British', 'troops', 'had', 'encountered', 'little', 'resistance', ',', 'even', 'when', 'they', 'seized', 'nearby', 'Umm', 'Qasr', ',', 'and', 'moved', 'to', 'secure', 'key', 'oil', 'fields']
offset_start_dic: {0: 0, 5: 1, 9: 2, 17: 3, 24: 4, 28: 5, 40: 6, 47: 7, 57: 8, 59: 9, 64: 10, 69: 11, 74: 12, 81: 13, 88: 14, 92: 15, 96: 16, 98: 17, 102: 18, 108: 19, 111: 20, 118: 21, 122: 22, 126: 23}
offset_end_dic: {3: 1, 7: 2, 15: 3, 22: 4, 26: 5, 38: 6, 45: 7, 56: 8, 57: 9, 62: 10, 67: 11, 72: 12, 79: 13, 86: 14, 90: 15, 95: 16, 96: 17, 100: 18, 106: 19, 109: 20, 116: 21, 120: 22, 124: 23, 131: 24}
entity: U.S
start - scope_start: 0
end - scope_start: 2
Problematic sentence: {'text': 'U.S. and British troops had encountered little\nresistance, even when they seized nearby U

In [687]:
events_lang_splits["train"]

{'الهجمات من الإسلامويين': {'events': [<__main__.Event at 0x1a19d01ac8>],
  'penn_treebank': '(ROOT  (X    (NP      (QP (CD الهجمات) (CD من))      (NN الإسلامويين))))',
  'triples': ['ROOT/dep=2/gov=-1',
   'compound/dep=0/gov=1',
   'nummod/dep=1/gov=2'],
  'words': ['الهجمات', 'من', 'الإسلامويين'],
  'pos_tags': ['CD', 'CD', 'NN']},
 'يتسلق جندي عراقي ينزف من جرح في بطنه انقاض بناية': {'events': [<__main__.Event at 0x1a1a2dee10>],
  'penn_treebank': '(ROOT  (NP    (NP      (NP        (QP (SYM يتسلق) (CD جندي) (CD عراقي)))      (NP (CD ينزف) (NN من)))    (NP      (NP (CD جرح) (NN في))      (NP        (QP (CD بطنه) (CD انقاض))        (CD بناية)))))',
  'triples': ['ROOT/dep=2/gov=-1',
   'dep/dep=0/gov=2',
   'compound/dep=1/gov=2',
   'nummod/dep=3/gov=4',
   'dep/dep=4/gov=2',
   'nummod/dep=5/gov=6',
   'dep/dep=6/gov=2',
   'compound/dep=7/gov=8',
   'nummod/dep=8/gov=9',
   'dep/dep=9/gov=6'],
  'words': ['يتسلق',
   'جندي',
   'عراقي',
   'ينزف',
   'من',
   'جرح',
   'في',
   'ب

In [494]:
#sent = "There was the free press in Qatar, al Jeezera but its' offices in\nKabul and Baghdad were bombed by Americans"
#sent = "Compare & Contrast GWB's Vision, Leadership, Excellent Management\nTalents, Endurance, Faith And Intelligence With LBJ's Lack Of Same In\nThe Vietnam War"
sent = "Compare &amp; Contrast GWB's Vision, Leadership, Excellent Management
Talents, Endurance, Faith And Intelligence With LBJ's Lack Of Same In
The Vietnam War"
start = 589 - 437
end = 591 - 437 
print(start)
print(sent[152:])
print(len(sent))
print(end)

I don't mean to suggest that I agree with anything David Frum says,
but these Arafat-had-AIDS allegations remind me of what happened with
the late Israeli singer Ofra Haza, where there was a lot of
speculation, a mysterious death from a 'blood disease,' and then it
came out eventually that she'd died of AIDS
152
li singer Ofra Haza, where there was a lot of
speculation, a mysterious death from a 'blood disease,' and then it
came out eventually that she'd died of AIDS
309
154


In [17]:
len(events_list_lang["English"])

3177

In [18]:
len(events_list_lang["Chinese"])

1931

In [19]:
len(events_list_lang["Arabic"])

1650

## Stanford Parser:

In [616]:
sent = "The skeleton of a second baby has been found at a rural Wisconsin home where a 22-year-old woman's dead infant was discovered in a blue container June 8, officials sais Monday."
penn_treebank, triples, words, pos_tags, offset_start_dic, offset_end_dic, words_dict \
= find_dep_words_pos_offsets(sent)
print(triples)

['ROOT/dep=9/gov=0', 'det/dep=1/gov=2', 'nsubj:pass/dep=2/gov=9', 'case/dep=3/gov=6', 'det/dep=4/gov=6', 'amod/dep=5/gov=6', 'nmod:of/dep=6/gov=2', 'aux/dep=7/gov=9', 'aux:pass/dep=8/gov=9', 'case/dep=10/gov=14', 'det/dep=11/gov=14', 'amod/dep=12/gov=14', 'compound/dep=13/gov=14', 'obl:at/dep=14/gov=9', 'advmod/dep=15/gov=23', 'det/dep=16/gov=18', 'amod/dep=17/gov=18', 'nmod:poss/dep=18/gov=21', 'case/dep=19/gov=18', 'amod/dep=20/gov=21', 'nsubj:pass/dep=21/gov=23', 'aux:pass/dep=22/gov=23', 'acl:relcl/dep=23/gov=14', 'case/dep=24/gov=27', 'det/dep=25/gov=27', 'amod/dep=26/gov=27', 'obl:in/dep=27/gov=23', 'obl:tmod/dep=28/gov=27', 'nummod/dep=29/gov=28', 'punct/dep=30/gov=27', 'appos/dep=31/gov=27', 'dep/dep=32/gov=31', 'obl:tmod/dep=33/gov=23', 'punct/dep=34/gov=9']


In [416]:
penn_treebank, triples, words, pos_tags, offset_start_dic, offset_end_dic, words_dict \
= find_dep_words_pos_offsets(sent)

print("words: ", words)
print("offset_start_dic: ", offset_start_dic)
print("offset_end_dic: ", offset_end_dic)

words:  ['Please', 'take', 'some', 'time', 'next', 'week', 'to', 'welcome', 'a', 'new', 'staff', 'member', 'for', 'Landmark', 'Education', 'in', 'Houston']
offset_start_dic:  {0: 0, 7: 1, 12: 2, 17: 3, 22: 4, 27: 5, 32: 6, 35: 7, 43: 8, 45: 9, 49: 10, 55: 11, 62: 12, 66: 13, 75: 14, 85: 15, 88: 16}
offset_end_dic:  {6: 1, 11: 2, 16: 3, 21: 4, 26: 5, 31: 6, 34: 7, 42: 8, 44: 9, 48: 10, 54: 11, 61: 12, 65: 13, 74: 14, 84: 15, 87: 16, 95: 17}


In [411]:
for sent in list(events_list_lang["English"])[0:2]:
    penn_treebank, triples, words, pos_tags, offset_start_dic, offset_end_dic, words_dict = find_dep_words_pos_offsets(sent)
    print("words: ", words)
    print("offset_start_dic: ", offset_start_dic)
    print("offset_end_dic: ", offset_end_dic)
    for event in events_list_lang["English"][sent]:
        for entity in event.entities:
            print("start:", entity.start, " end:", entity.end)
    print("------------------------------------------------\n\n")

SyntaxError: invalid syntax (<ipython-input-411-4b2401425358>, line 1)

## Create json object:

In [632]:
import json
from tqdm import tqdm

data_json_dic = {}
for lang in ['English', 'Chinese', 'Arabic']:
    sent_json = []
    data_json = []
    for sent in tqdm(events_list_lang[lang]):
        sent_json.append(sent)
        data_sub = {}
        data_sub["golden-event-mentions"] = []
        entities_unique = {}
        for event in events_list_lang[lang][sent]["events"]:
            event_info = {}
            #(self, event_id, mention_id, type_, subtype, modality, polarity, genericity, 
            # tense, extent, scope, trig_text, trig_start, trig_end, arguments):

            event_info["trigger"] = {"start": event.trig_start, "end": event.trig_end, "text": event.trig_text}
            event_info["arguments"] = []
            for arg in event.arguments:
                arg_info = {"start": arg.start, "role": arg.role, "end": arg.end, "text": arg.text}
                event_info["arguments"].append(arg_info)

            event_info["id"] = event.event_id
            event_info["event_type"] = event.type_
            data_sub["golden-event-mentions"].append(event_info) 

            # Loading entities for that event and adding it to the list of entities
            for entity in event.entities:
                entities_unique.update({entity.id_:entity})

        data_sub["golden-entity-mentions"] = []
        for entity_id in entities_unique.keys():
            entity_info = {"phrase-type": entities_unique[entity_id].phrase_type, "end": entities_unique[entity_id].end, "text": entities_unique[entity_id].text, "entity-type": entities_unique[entity_id].entity_type, "start": entities_unique[entity_id].start, "id": entity_id}
            data_sub["golden-entity-mentions"].append(entity_info)

        #triples, words, pos_tags = find_stanford_colcc(sent)
        data_json.append({"penn_treebank": events_list_lang[lang][sent]["penn_treebank"], "stanford-colcc": events_list_lang[lang][sent]["triples"], "words": events_list_lang[lang][sent]["words"], "pos-tags" :events_list_lang[lang][sent]["pos_tags"], "golden-entity-mentions": data_sub["golden-entity-mentions"], "golden-event-mentions": data_sub["golden-event-mentions"]})
    data_json_dic.update({lang: data_json})




  0%|          | 0/3640 [00:00<?, ?it/s][A[A[A


100%|██████████| 3640/3640 [00:00<00:00, 54074.81it/s][A[A[A


  0%|          | 0/2436 [00:00<?, ?it/s][A[A[A


100%|██████████| 2436/2436 [00:00<00:00, 53262.39it/s][A[A[A


  0%|          | 0/2196 [00:00<?, ?it/s][A[A[A


100%|██████████| 2196/2196 [00:00<00:00, 53241.30it/s][A[A[A

## Train/Test/Dev:

In [633]:
import random
random.shuffle(data_json)

In [653]:
save_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/ace-05-splits/"
lang = "Arabic"

In [654]:
random.shuffle(data_json_dic[lang])
with open(save_path+ lang + '/train.json', 'w') as outfile:
    json.dump(data_json_dic[lang][:2000], outfile)

In [655]:
with open(save_path + lang+ '/dev.json', 'w') as outfile:
    json.dump(data_json_dic[lang][2000:2250], outfile)

In [656]:
with open(save_path + lang + '/test.json', 'w') as outfile:
    json.dump(data_json_dic[lang][2250:], outfile)