In [148]:
import glob
import xml.etree.ElementTree as ET
import re

In [49]:
class Event(object):
    """One annotated event mention together with its event-level attributes.

    Every constructor argument is stored unchanged on the instance under the
    same name (``type_`` keeps its trailing underscore to avoid shadowing the
    ``type`` builtin).

    Args:
        event_id: value of the event's ``ID`` attribute.
        mention_id: value of the mention's ``ID`` attribute.
        type_: event type.
        subtype: event subtype.
        modality: event modality.
        polarity: event polarity.
        genericity: event genericity.
        tense: event tense.
        extent: text of the mention extent.
        scope: text of the mention scope.
        trigger: text of the mention trigger (anchor).
        arguments: collection of argument objects attached to the mention.
    """

    def __init__(self, event_id, mention_id, type_, subtype, modality, polarity, genericity, tense, extent, scope, trigger, arguments):
        self.event_id = event_id
        self.mention_id = mention_id
        self.type_ = type_
        self.subtype = subtype
        self.modality = modality
        self.polarity = polarity
        self.genericity = genericity
        self.tense = tense
        self.extent = extent
        self.scope = scope
        self.trigger = trigger
        self.arguments = arguments

    def to_string(self):
        """Return a one-line rendering of the scalar fields.

        ``arguments`` is not part of the rendering. Fields are inserted with
        ``str.format`` so a ``None`` field is rendered as ``None`` instead of
        raising ``TypeError``.
        """
        # "{{" and "}}" are literal braces; the closing brace and the comma
        # after event_id were missing from the previous concatenation.
        template = (
            "Event: {{ event_id = {}, mention_id = {}, type = {}, subtype = {}, modality = {}"
            ", polarity = {}, genericity= {}, tense = {}, extent = {}, scope = {}, trigger = {}}}"
        )
        return template.format(
            self.event_id, self.mention_id, self.type_, self.subtype, self.modality,
            self.polarity, self.genericity, self.tense, self.extent, self.scope, self.trigger,
        )


In [50]:
class Argument(object):
    """A single event argument: an id, the role it plays, and its text."""

    def __init__(self, id_, role, text):
        # Stored as given; ``id_`` avoids shadowing the ``id`` builtin.
        self.id_, self.role, self.text = id_, role, text

    def to_string(self):
        """Return a one-line rendering of the argument's three fields."""
        pieces = (
            "Argument: {id_ = ", self.id_,
            ", role = ", self.role,
            ", text = ", self.text,
            "}",
        )
        # str.join only accepts strings, like the plain concatenation it replaces.
        return "".join(pieces)


In [199]:
class Document(object):
    """Metadata and text pieces of one source document."""

    def __init__(self, id_, source, datetime, text):
        # ``text`` is iterated piece by piece in to_string().
        self.id_, self.source = id_, source
        self.datetime, self.text = datetime, text

    def to_string(self):
        """Return a one-line rendering; every text piece is followed by a comma."""
        text_str = "[" + "".join(piece + "," for piece in self.text) + "]"
        pieces = (
            "Document: {id_ = ", self.id_,
            ", source = ", self.source,
            ", datetime = ", self.datetime,
            ", text = ", text_str,
            "}",
        )
        return "".join(pieces)
    

In [59]:
# Root folder of the raw corpus; language folders are globbed directly beneath it.
root_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/"
# One concrete language/domain folder below the root.
data_path = root_path + "English/bc/adj/"

# Name of a single annotation file.
file_name = "CNN_CF_20030303.1900.00.apf.xml"

In [66]:
# Names of the entries directly under root_path; any path containing "Icon\r" is skipped.
languages = [
    entry.split("/")[-1]
    for entry in glob.glob(root_path + "*")
    if "Icon\r" not in entry
]

In [201]:
def extract_doc_info(root):
    """Build a ``Document`` from the parsed root element of a raw source file.

    Args:
        root: root ``Element`` of the parsed source document.

    Returns:
        ``Document(doc_id, source, datetime, turns)`` where ``doc_id`` is the
        text of the last ``DOCID`` element, ``source`` is the ``SOURCE``
        attribute of the last ``DOCTYPE`` element, ``datetime`` is the text of
        the last ``DATETIME`` element, and ``turns`` holds, for every
        ``SPEAKER`` element inside a ``TURN``, the text that follows it
        (its ``tail``). A field whose tag is absent is ``None``; ``turns`` is
        empty when the document has no ``TURN`` elements. The headline is not
        extracted.

    Raises:
        KeyError: if a ``DOCTYPE`` element has no ``SOURCE`` attribute.
    """
    # Default to None so a missing tag no longer fails with UnboundLocalError.
    doc_id = None
    source = None
    doc_datetime = None

    # When a tag occurs several times the last occurrence wins.
    for docid in root.iter('DOCID'):
        doc_id = docid.text

    for doctype in root.iter('DOCTYPE'):
        source = doctype.attrib['SOURCE']

    for datetime_el in root.iter('DATETIME'):
        doc_datetime = datetime_el.text

    turns = []
    for turn in root.iter('TURN'):
        # Search inside the current turn only. Searching from the root
        # appended every speaker's text once per TURN in the document.
        for sp in turn.iter('SPEAKER'):
            turns.append(sp.tail)

    return Document(doc_id, source, doc_datetime, turns)

In [202]:
def _last_child_text(element, default=None):
    """Return the text of ``element``'s last direct child, or ``default`` if it has none."""
    text = default
    for child in element:
        text = child.text
    return text


def extract_event_info(event):
    """Build an ``Event`` from one ``event`` annotation element.

    Event-level attributes (``ID``, ``TYPE``, ``SUBTYPE``, ``MODALITY``,
    ``POLARITY``, ``GENERICITY``, ``TENSE``) are read from ``event`` itself.
    Extent, scope (``ldc_scope``), trigger (``anchor``) and the arguments are
    read from its ``event_mention`` descendants.

    NOTE: when an event has several mentions, only the LAST mention is
    returned, because callers expect a single ``Event``. Earlier mentions are
    parsed and discarded.

    Args:
        event: the ``event`` ``Element``.

    Returns:
        The ``Event`` for the last ``event_mention``. ``extent``, ``scope`` or
        ``trigger`` is ``None`` when that mention lacks the corresponding
        child. An argument's text is ``None`` when it has no nested text
        element.

    Raises:
        KeyError: if a required attribute is missing.
        ValueError: if ``event`` contains no ``event_mention``.
    """
    event_id = event.attrib["ID"]
    event_type = event.attrib["TYPE"]
    subtype = event.attrib["SUBTYPE"]
    modality = event.attrib["MODALITY"]
    polarity = event.attrib["POLARITY"]
    genericity = event.attrib["GENERICITY"]
    tense = event.attrib["TENSE"]

    ev = None
    for mention in event.iter('event_mention'):
        mention_id = mention.attrib["ID"]

        # Reset per mention so a missing child cannot inherit the value of
        # the previous mention.
        extent = None
        scope = None
        trigger = None
        for child in mention:
            if child.tag == "extent":
                extent = _last_child_text(child, extent)
            elif child.tag == "ldc_scope":
                scope = _last_child_text(child, scope)
            elif child.tag == "anchor":
                trigger = _last_child_text(child, trigger)

        arguments = []
        for argument in mention.iter('event_mention_argument'):
            # Reset per argument for the same reason as above.
            arg_text = None
            for child in argument:
                arg_text = _last_child_text(child, arg_text)
            arguments.append(Argument(argument.attrib["REFID"], argument.attrib["ROLE"], arg_text))

        ev = Event(event_id, mention_id, event_type, subtype, modality, polarity, genericity, tense, extent, scope, trigger, arguments)

    if ev is None:
        raise ValueError("event " + repr(event_id) + " has no event_mention element")

    return ev

In [212]:
def extract_from_xml(root_path, language, domain):
    """Parse every annotated document of one language/domain folder.

    For each ``*.apf.xml`` file under ``<root_path><language>/<domain>/adj/``
    (in sorted order) the ``.sgm`` file with the same stem is parsed for the
    document information, then the annotation file is parsed for its events.
    The path of each ``.sgm`` file is printed before it is parsed.

    Args:
        root_path: corpus root, expected to end with "/".
        language: name of the language folder.
        domain: name of the domain folder.

    Returns:
        ``(doc_events, files_num)``: a list of ``{"doc": ..., "events": [...]}``
        dicts, one per annotation file, and the number of files processed.
    """
    adj_dir = root_path + language + "/" + domain + "/adj/"
    parsed_docs = []

    for apf_path in sorted(glob.glob(adj_dir + "*.apf.xml")):
        # The raw text lives next to the annotation file under the same stem.
        stem = apf_path.split("/")[-1].split(".apf.xml")[0]
        raw_path = adj_dir + stem + ".sgm"
        print(raw_path)

        raw_root = ET.parse(raw_path, ET.XMLParser(encoding='utf-8')).getroot()
        doc = extract_doc_info(raw_root)

        # Event and argument annotations.
        apf_root = ET.parse(apf_path, ET.XMLParser(encoding='utf-8')).getroot()
        events = [extract_event_info(event) for event in apf_root.iter('event')]

        parsed_docs.append({"doc": doc, "events": events})

    # One entry is appended per processed file, so the length is the file count.
    return parsed_docs, len(parsed_docs)

In [216]:
doc_events_dict = {}
events_list_lang = {}

# Collect, per language, the events of every document in every domain folder.
for language in languages:
    # Entries directly under the language folder are treated as domains.
    domains = [
        entry.split("/")[-1]
        for entry in glob.glob(root_path + language + "/*" )
        if "Icon\r" not in entry
    ]

    files_num = 0
    events_lang = []
    for domain in domains:
        doc_events, num = extract_from_xml(root_path, language, domain)
        files_num += num
        # Flatten the per-document event lists into one list per language.
        events_lang.extend(ev for events_doc in doc_events for ev in events_doc["events"])

    print("******** Language: "+language+ " Number of Processed files is: ", files_num)

    events_list_lang[language] = events_lang

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/un/adj/Austin-Grad-Community_20050212.2454.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/un/adj/Integritas-Group-Community-Forum_20050110.0557.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/un/adj/alt.atheism_20041104.2428.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/un/adj/alt.collecting.autographs_20050224.2438.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/un/adj/alt.corel_20041228.0503.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/un/adj/alt.gossip.celebrities_20041118.2331.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/un/adj/alt.politics.economics_20041206.1835.sgm
/Users/d22a

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29192.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29195.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29226.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29272.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29302.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29303.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29326.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/cts/adj/fsh_29336.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNNHL_ENG_20030410_193626.13.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNNHL_ENG_20030411_230640.38.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNNHL_ENG_20030415_193729.5.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNNHL_ENG_20030416_133739.13.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNNHL_ENG_20030416_133739.9.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNNHL_ENG_20030416_193742.26.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNNHL_ENG_20030416_193742.7.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Da

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNN_ENG_20030524_143511.4.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNN_ENG_20030525_143522.8.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNN_ENG_20030525_160525.13.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNN_ENG_20030526_133535.4.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNN_ENG_20030526_180540.6.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNN_ENG_20030526_183538.3.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/bn/adj/CNN_ENG_20030527_195948.3.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ac

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AFP_ENG_20030502.0614.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AFP_ENG_20030509.0345.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AFP_ENG_20030514.0706.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AFP_ENG_20030519.0049.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AFP_ENG_20030519.0372.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AFP_ENG_20030527.0616.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AFP_ENG_20030530.0132.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/English/nw/adj/AF

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20050119.1316.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20050121.1237.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20050124.1829.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20050124.1833.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20050125.1321.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20050125.1704.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20050127.1720.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/wl/adj/DAVYZW_20

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CBS20001130.1000.1061.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CBS20001203.1000.0378.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CBS20001205.1000.0150.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CBS20001205.1000.0731.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CBS20001206.1000.1074.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CBS20001207.1000.0835.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CBS20001212.1000.0797.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CB

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CTS20001118.1300.0757.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CTS20001118.1300.0859.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CTS20001119.1300.0564.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CTS20001120.1300.0506.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CTS20001120.1300.0737.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CTS20001120.1300.1052.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CTS20001121.1300.0182.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/CT

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VOM20001006.1800.2111.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VOM20001008.1800.0011.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VOM20001014.1800.0337.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VOM20001018.1800.0119.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VOM20001022.1800.0231.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VOM20001022.1800.1220.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VOM20001024.1800.1241.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/bn/adj/VO

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001122.1400.0074.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001122.2000.0154.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001123.0200.0009.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001123.0800.0075.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001124.0200.0001.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001124.0200.0025.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001124.1400.0090.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XI

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001224.0800.0046.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001224.1400.0066.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001224.2000.0076.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001225.0200.0001.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001225.0800.0058.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001225.2000.0134.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XIN20001226.0800.0050.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Chinese/nw/adj/XI

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/EBREAZ_20041212.0822.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/EGYDAYS_20050221.1227.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/EGYDAYS_20050226.1158.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/ENSANKHORDA_20041104.1315.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/ENSANKHORDA_20041118.1417.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/GLIMMERMAN_20050118.0339.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/HADOUTA_20041215.0106.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/wl/adj/

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR20001002.1300.0442.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR20001002.1300.0631.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR20001002.1300.1422.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR20001005.1300.0564.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR20001005.1300.0690.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR20001005.1300.0800.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR20001005.1300.1052.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/bn/adj/VAR2000100

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001014.0000.0011.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001014.0000.0041.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001014.1400.0153.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001014.1800.0188.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001016.0000.0009.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001017.1800.0245.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001018.0000.0006.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA2000101

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001208.1000.0046.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001208.1000.0059.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001208.1000.0065.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001208.1400.0073.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001208.1400.0107.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001208.1400.0118.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA20001208.1800.0145.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/AFA2000120

/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH20001124.1900.0127.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH20001125.0700.0024.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH20001128.1300.0081.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH20001201.1300.0071.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH20001201.1900.0126.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH20001209.0700.0031.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH20001210.1300.0101.sgm
/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Raw/ace_2005/data/Arabic/nw/adj/ALH2000121

In [211]:
# Total number of parsed events over the three languages.
sum(len(events_list_lang[lang]) for lang in ("English", "Chinese", "Arabic"))

8129

In [234]:
# Interactive inspection: show the help text for one parsed English event object.
help(events_list_lang["English"][4])

Help on Event in module __main__ object:

class Event(builtins.object)
 |  Event(event_id, mention_id, type_, subtype, modality, polarity, genericity, tense, extent, scope, trigger, arguments)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, event_id, mention_id, type_, subtype, modality, polarity, genericity, tense, extent, scope, trigger, arguments)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  to_string(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [242]:
# Interactive inspection: show the documentation of the `re` module.
# NOTE(review): looks like a leftover lookup unrelated to the pipeline; consider removing.
help(re)

Help on module re:

NAME
    re - Support for regular expressions (RE).

MODULE REFERENCE
    https://docs.python.org/3.7/library/re
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

DESCRIPTION
    This module provides regular expression matching operations similar to
    those found in Perl.  It supports both 8-bit and Unicode strings; both
    the pattern and the strings being processed can contain null bytes and
    characters outside the US ASCII range.
    
    Regular expressions can contain both special and ordinary characters.
    Most ordinary characters, like "A", "a", or "0", are the simplest
    regular expressions; they simply match themselves.  You can
    concatenate ordinary characters, so last mat

In [244]:
# Preview the first English event scope with line breaks turned into spaces.
events_list_lang["English"][0].scope.replace("\n", " ")

'She will be the Registration Manager for the Houston Center, accountable for causing registrations into the Landmark Forum'

In [276]:
# Display the list of parsed English events.
# NOTE(review): the AttributeError saved as this cell's output cannot come from this
# expression; the output looks stale from an earlier edit. Re-run to confirm.
events_list_lang["English"]

AttributeError: 'list' object has no attribute 'trigger'

In [257]:
# Text of the first argument of the first English event.
events_list_lang["English"][0].arguments[0].text

'She'

### Saving to text files: one file each for event scopes (X), trigger labels (y), and event types:

In [261]:
import os
pre_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/ACE/Preprocessed/"


def _write_one_per_line(path, values):
    """Write each value on its own line, replacing embedded newlines with spaces.

    The file is written as UTF-8 explicitly: the data includes Chinese and
    Arabic text, and the platform default encoding may not be able to encode it.
    """
    with open(path, "wt", encoding="utf-8") as fout:
        for value in values:
            fout.write(re.sub("\n", " ", value)+"\n")


# For every language, write three line-aligned files: line i of each file
# belongs to the i-th event of that language.
for language in languages:
    model_results_dir = pre_path+language+"/"
    os.makedirs(model_results_dir, exist_ok=True)

    events = events_list_lang[language]

    ## Event Scopes
    _write_one_per_line(model_results_dir+"event_scope.txt", (ev.scope for ev in events))

    ## Triggers
    _write_one_per_line(model_results_dir+"event_trigger.txt", (ev.trigger for ev in events))

    ## Event Types
    _write_one_per_line(model_results_dir+"event_type.txt", (ev.type_ for ev in events))

    ## Arguments
    # TODO: event_argument.txt is not written yet. The disabled draft that stood
    # here wrote the trigger text again instead of the arguments.

 ## Argument_type (BIO notation):

In [272]:
# Raw scope text of the first English event (line breaks are still present here).
events_list_lang["English"][0].scope

'She will be the Registration\nManager for the Houston Center, accountable for causing registrations into\nthe Landmark Forum'

In [273]:
# Event type of the first English event.
events_list_lang["English"][0].type_

'Personnel'

In [274]:
# Trigger text of the first English event.
events_list_lang["English"][0].trigger

'be'

In [275]:
# List the text and role of every argument of the first English event.
first_event = events_list_lang["English"][0]
for arg in first_event.arguments:
    print(f"Text: {arg.text} Role: {arg.role}")

Text: She Role: Person
Text: Houston Role: Place
Text: the Houston Center Role: Entity
Text: Registration
Manager Role: Position


In [None]:
# Print the text and role of every argument of the first English event.
# NOTE(review): unexecuted cell that repeats the loop run as In [275]; consider removing.
for arg in events_list_lang["English"][0].arguments:
    print("Text:", arg.text, "Role:", arg.role)