In [210]:
import glob
import xml.etree.ElementTree as ET
import re
import json
from nltk.tokenize import word_tokenize
import random
import math
from tqdm import tqdm

## Class Definitions:

In [211]:
class Argument(object):
    def __init__(self, id_, text, role, start, end):
        self.id_ = id_
        self.text = text
        self.role = role
        self.start = start
        self.end = end

    def to_string(self):
        return "Argument: {id_ = " + self.id_ + ", text = " + self.text + ", role = " + self.role + ", start =" + str(self.start) + ", end =" + str(self.end) + "}"
        
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [212]:
class Trigger(object):
    def __init__(self, start, text, end, id_, event_type):
        self.start = start
        self.text = text
        self.end = end
        self.id_ = id_
        self.event_type = event_type
        
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [213]:
class Entity(object):
    def __init__(self, id_, text, entity_type, phrase_type, start, end):
        self.id_ = id_
        self.text = text
        self.entity_type = entity_type
        self.phrase_type = phrase_type
        self.start = start
        self.end = end

    def to_string(self):
        return "Entity: {id_ = " + self.id_ + ", text = " + self.text + ", entity_type = " + self.entity_type + ", phrase_type=" + self.phrase_type + ", start =" + str(self.start) + ", end =" + str(self.end) + "}"
    
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [214]:
class Sentence(object):
    def __init__(self, text, start, end):
        self.text = text
        self.start = start
        self.end = end

    def to_string(self):
        return "Sentence: {text = " + self.text + ", start = " + self.start + ", end = " + self.end + "}"
    
    def __str__(self):
        return str(self.__dict__)

    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [215]:
class Event(object):
    def __init__(self, event_id, mention_id, type_, subtype, modality, polarity, genericity, tense, extent, extent_start, extent_end, scope, scope_start, scope_end, trig_text, trig_start, trig_end, arguments, entities):
        self.event_id = event_id
        self.mention_id = mention_id
        self.type_ = type_
        self.subtype = subtype
        self.modality = modality
        self.polarity = polarity
        self.genericity = genericity
        self.tense = tense
        self.extent = extent
        self.extent_start = extent_start
        self.extent_end = extent_end
        self.scope = scope
        self.scope_start = scope_start
        self.scope_end = scope_end
        self.trig_text = trig_text
        self.trig_start = trig_start
        self.trig_end = trig_end
        self.arguments = arguments
        self.entities = entities


    def to_string(self):
        return "Event: { event_id = " + self.event_id + "mention_id = " + self.mention_id + ", type = " + self.type_ + ", subtype = " +self.subtype + ", modality = " \
               + self.modality + ", polarity = " + self.polarity + ", genericity= " + self.genericity + ", tense = " +\
               self.tense + ", extent = " +self.extent + ", scope = " + self.scope  + ", trigger = " + self.trig_text
    
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

## I. Annotation/Offset Extraction:

In [216]:
def extract_entity_info(entity, scope_start, scope_end):
    entity_id = entity.attrib["ID"]
    phrase_type = entity.attrib["TYPE"] + ":" + entity.attrib["SUBTYPE"]
    entity_class = entity.attrib["CLASS"]

    entities = []
    for mention in entity.iter('entity_mention'):
        entity_type = mention.attrib["LDCTYPE"]
        for child in mention:
            if child.tag == "extent":
                for chil2 in child:
                    text = chil2.text
                    start = int(chil2.attrib["START"])
                    end = int(chil2.attrib["END"])
                    
        if scope_start <= start and  scope_end >= end:      
            ent = Entity(entity_id, text, entity_type, phrase_type, start - scope_start, end - scope_start)
            entities.append(ent)
        
    return entities

In [217]:
def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z

In [218]:
def extract_event_info(root, event):
    event_id = event.attrib["ID"]
    event_type = event.attrib["TYPE"]
    subtype = event.attrib["SUBTYPE"]
    modality = event.attrib["MODALITY"]
    polarity = event.attrib["POLARITY"]
    genericity = event.attrib["GENERICITY"]
    tense = event.attrib["TENSE"]

    ## Looking at event mentions
    for mention in event.iter('event_mention'):
        mention_id = mention.attrib["ID"]
        for child in mention:
            if child.tag == "extent":
                for chil2 in child:
                    extent = chil2.text
                    extent_start = int(chil2.attrib["START"])
                    extent_end = int(chil2.attrib["END"]) 
                
            ## SCOPE USED AS SENTENCE
            elif child.tag == "ldc_scope":
                for chil2 in child:
                    scope = chil2.text
                    scope_start = int(chil2.attrib["START"])
                    scope_end = int(chil2.attrib["END"])
                sent = Sentence(scope, scope_start, scope_end)
                
            ## TRIGGER EXTRACTION
            elif child.tag == "anchor":
                for chil2 in child:
                    trig_text = chil2.text
                    trig_start = int(chil2.attrib["START"]) - scope_start
                    trig_end = int(chil2.attrib["END"]) - scope_start
        

        arguments = []           
        for argument in mention.iter('event_mention_argument'):
            arg_id = argument.attrib["REFID"]
            role = argument.attrib["ROLE"]
            for child in argument:
                for chil2 in child:
                    arg_text = chil2.text
                    arg_start = int(chil2.attrib["START"]) - scope_start
                    arg_end = int(chil2.attrib["END"]) - scope_start
                        
            arg = Argument(arg_id, arg_text, role , arg_start, arg_end) 

            arguments.append(arg)
       
        ## Looking at entity mentions with that same event
        entities = []
        for entity in root.iter('entity'):
            entities.extend(extract_entity_info(entity, scope_start, scope_end))

    ev = Event(event_id, mention_id, event_type, subtype, modality, polarity, genericity, tense, extent, extent_start, extent_end,
               scope, scope_start, scope_end, trig_text, trig_start, trig_end, arguments, entities)

    return sent, ev

In [219]:
def extract_from_xml(root_path, language, domain):
    events = {}
    #print(root_path + language + "/" + domain + "/adj/*.apf.xml")
    files_processed = 0
    if language == "English":
        subpath = "timex2norm"
    else:
        subpath = "adj"
        
    for file_name in sorted(glob.glob(root_path + language + "/" + domain + "/"+subpath+"/*.apf.xml")):        
        # Get the event + argument annotation
        #print("file_name=", file_name)
        files_processed += 1
        tree = ET.parse(file_name, ET.XMLParser(encoding='utf-8'))
        root = tree.getroot()
        
        for event in root.iter('event'):
            sent, ev = extract_event_info(root, event)
            if sent.text not in events:
                events.update({sent.text: [ev]}) 
            else:
                ev_list = events[sent.text]
                ev_list.append(ev)
                events.update({sent.text: ev_list})
                
        
    return events, files_processed

In [220]:
def extract_from_xml_english(root_path, files):
    events = {}
    #print(root_path + language + "/" + domain + "/adj/*.apf.xml")
    files_processed = 0
    for div in tqdm(files): #       
        # Get the event + argument annotation
        file_name = root_path + "English/"+ div.strip("\n") +".apf.xml"
        files_processed += 1
        tree = ET.parse(file_name, ET.XMLParser(encoding='utf-8'))
        root = tree.getroot()
        
        for event in root.iter('event'):
            sent, ev = extract_event_info(root, event)
            if sent.text not in events:
                events.update({sent.text: [ev]}) 
            else:
                ev_list = events[sent.text]
                ev_list.append(ev)
                events.update({sent.text: ev_list})
    return events, files_processed

In [221]:
#root_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/Raw/LDC2006T06/data/"
root_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/Raw/ACE2005-TrainingData-V6.0/"



In [222]:
languages = [file_.split("/")[-1] for file_ in glob.glob(root_path + "*") if "Icon\r" not in file_]


### Executing over the domains/languages:

In [223]:
files_splits = {}
with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_training") as file:
    files_splits.update({"train": file.readlines()})

with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_dev") as file:
    files_splits.update({"dev": file.readlines()})
    
with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_test") as file:
    files_splits.update({"test": file.readlines()})

In [224]:
events_lang_splits = {}
events_list_lang = {}
for language in ["English", "Chinese", "Arabic"]:
    print("Processing language: ", language)
    if language == "English":
        for split in ["train", "dev", "test"]:
            print("Processing train split")
            events, files_processed = extract_from_xml_english(root_path, files_splits[split])
            events_lang_splits.update({split: events})

            print("Number of files processed for language= ", language, " is= ", files_processed)
    else:
        domains = [file_.split("/")[-1] for file_ in glob.glob(root_path + language + "/*" ) if "Icon\r" not in file_]
        events_lang = {}
        for domain in domains:
            events, files_processed = extract_from_xml(root_path, language, domain)

            events_lang = merge_two_dicts(events_lang, events)
        events_list_lang.update({language: events_lang})

  1%|          | 5/529 [00:00<00:12, 43.65it/s]

Processing language:  English
Processing train split


100%|██████████| 529/529 [00:07<00:00, 68.71it/s] 
  7%|▋         | 2/30 [00:00<00:01, 15.90it/s]

Number of files processed for language=  English  is=  529
Processing train split


100%|██████████| 30/30 [00:00<00:00, 42.76it/s]
 18%|█▊        | 7/40 [00:00<00:00, 67.52it/s]

Number of files processed for language=  English  is=  30
Processing train split


100%|██████████| 40/40 [00:00<00:00, 71.14it/s]


Number of files processed for language=  English  is=  40
Processing language:  Chinese
Processing language:  Arabic


In [225]:
len(events_lang_splits["train"])

2690

In [226]:
len(events_lang_splits["dev"])

269

In [227]:
len(events_lang_splits["test"])

220

In [228]:
len(events_list_lang["English"])

KeyError: 'English'

In [229]:
len(events_list_lang["Chinese"])

1931

In [230]:
len(events_list_lang["Arabic"])

1650

## I. Preparing new data for JMEE:

### Entities with words:

In [22]:
new_ent_lang_dict = {}
for lang in languages:
    new_ent = {}
    for sent in events_list_lang[lang].keys():
        new_entities = []
        events_ent = []
        for event in events_list_lang[lang][sent]:
            events_ent.extend(event.entities)

        events_ent.sort(key=lambda x: x.start, reverse=False)

        end = 0
        for entity in events_ent:
            if entity.start >= end:
                new_entities.append(entity)
            end = entity.end
            #print(argument.start, " ", argument.end, " ", argument.role)

        new_ent.update({sent: new_entities})
    new_ent_lang_dict.update({lang: new_ent})

In [31]:
def get_entities_positions(sent, language, new_ent_lang_dict):
    words = {}
    entities = []
    i = 0
    start = 0
    for entity in new_ent_lang_dict[language][sent]:
        end = entity.start

        for word in word_tokenize(sent[start:end]):
            words.update({i: word})
            i += 1

        ### Tokenize trigger part and annotate each word as 'B' or 'I'
        start = entity.start
        end = entity.end+1 
        print("start=", start)
        entity_tok = word_tokenize(sent[start:end])
        print("Entity text= ", sent[start:end])
        pos_start = i
        print("start i =", i)
        for word in entity_tok:
            print("word:", word)
            words.update({i:word})
            i += 1
        pos_end = i
        
        print("i=", i)
        
        entities.append({"phrase-type": entity.phrase_type, "end": pos_end, "text": entity.text, "entity-type": entity.entity_type, "start": pos_start, " id": entity.id_})
        start = entity.end + 1
        
    for word in word_tokenize(sent[start:]):
        words.update({i: word})
        i += 1
        
    return words, entities, triggers

In [32]:
sent = 'She will be the Registration\nManager for the Houston Center, accountable for causing registrations into\nthe Landmark Forum'
words, entities = get_entities_positions(sent, "English", new_ent_lang_dict)

start= 0
Entity text=  She
start i = 0
word: She
i= 1
start= 12
Entity text=  the Registration
Manager for the Houston Center
start i = 3
word: the
word: Registration
word: Manager
word: for
word: the
word: Houston
word: Center
i= 10
start= 108
Entity text=  Landmark
start i = 17
word: Landmark
i= 18


In [33]:
list(new_ent_lang_dict["English"].keys())[0]

'She will be the Registration\nManager for the Houston Center, accountable for causing registrations into\nthe Landmark Forum'

In [34]:
words

{0: 'She',
 1: 'will',
 2: 'be',
 3: 'the',
 4: 'Registration',
 5: 'Manager',
 6: 'for',
 7: 'the',
 8: 'Houston',
 9: 'Center',
 10: ',',
 11: 'accountable',
 12: 'for',
 13: 'causing',
 14: 'registrations',
 15: 'into',
 16: 'the',
 17: 'Landmark',
 18: 'Forum'}

In [35]:
entities

[{'phrase-type': 'PER:Individual',
  'end': 1,
  'text': 'She',
  'entity-type': 'PRO',
  'start': 0,
  ' id': 'Austin-Grad-Community_20050212.2454-E31'},
 {'phrase-type': 'PER:Individual',
  'end': 10,
  'text': 'the Registration\nManager for the Houston Center',
  'entity-type': 'NOM',
  'start': 3,
  ' id': 'Austin-Grad-Community_20050212.2454-E31'},
 {'phrase-type': 'ORG:Educational',
  'end': 18,
  'text': 'Landmark',
  'entity-type': 'NAMPRE',
  'start': 17,
  ' id': 'Austin-Grad-Community_20050212.2454-E2'}]

### Triggers with words:

In [46]:
events_list_lang["English"][sent][1].arguments

[<__main__.Argument at 0x1a1a280e80>, <__main__.Argument at 0x1a1a280940>]

In [55]:
new_trig_lang_dict = {}
for lang in languages:
    new_trig = {}
    for sent in events_list_lang[lang].keys():
        triggers = []
        new_triggers = []
        for event in events_list_lang[lang][sent]:
            triggers.append(Trigger(event.trig_start, event.trig_text, event.trig_end, event.event_id, event.type_+":"+event.subtype))
        triggers.sort(key=lambda x: x.start, reverse=False)
        end = 0
        for trig in triggers:
            if trig.start >= end:
                new_triggers.append(trig)
            end = trig.end
        new_trig.update({sent: new_triggers})
    new_trig_lang_dict.update({language: new_trig})

In [None]:
new_arg_lang_dict = {}
for lang in languages:
    for sent in events_list_lang[lang].keys():
        new_event_arg = []
        for event in events_list_lang[lang][sent]:
            event_args = event.arguments
            event_args.sort(key=lambda x: x.start, reverse=False)
            
            end = 0
            new_arguments = []
            for argument in event_args:
                if argument.start >= end:
                    new_arguments.append(argument)
                end = argument.end
            
            new_event_arg.append({"arguments": new_arguments, })

In [None]:
new_lang_dict = {}
for lang in languages:
    new_arg = {}
    for sent in events_list_lang[lang].keys():
        new_arguments = []
        events_arg = []
        for event in events_list_lang[lang][sent]:
            events_arg.extend(event.arguments)

        events_arg.sort(key=lambda x: x.start, reverse=False)

        end = 0
        for argument in events_arg:
            if argument.start >= end:
                new_arguments.append(argument)
            end = argument.end
            #print(argument.start, " ", argument.end, " ", argument.role)

        new_arg.update({sent: new_arguments})
    new_lang_dict.update({lang: new_arg})

## II. BIO Annotation for Triggers:

In [231]:
for sent in events_lang_splits["train"]:
    print(sent)
    print(events_lang_splits["train"][sent])
    for event in events_lang_splits["train"][sent]:
        print(event.to_string())
        print("----")

And these
bozos let four armed Cubans land on our shores when they're trying to
make a high terrorist alert
[<__main__.Event object at 0x1a21f155c0>]
Event: { event_id = CNN_CF_20030303.1900.00-EV1mention_id = CNN_CF_20030303.1900.00-EV1-2, type = Movement, subtype = Transport, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = these
bozos let four armed Cubans land on our shores, scope = And these
bozos let four armed Cubans land on our shores when they're trying to
make a high terrorist alert, trigger = land
----
He
lost an election to a dead man
[<__main__.Event object at 0x1a220382b0>]
Event: { event_id = CNN_CF_20030303.1900.00-EV2mention_id = CNN_CF_20030303.1900.00-EV2-1, type = Personnel, subtype = Elect, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = election to a dead man, scope = He
lost an election to a dead man, trigger = election
----
Paul, as I understand your definition of a political -- of a pr

----
Event: { event_id = CNN_IP_20030414.1600.04-EV12mention_id = CNN_IP_20030414.1600.04-EV12-1, type = Contact, subtype = Phone-Write, modality = Other, polarity = Positive, genericity= Generic, tense = Unspecified, extent = they were all doing the most natural thing
during war, trying to reach, scope = On
this one sidewalk in Baghdad, they were all doing the most natural thing
during war, trying to reach their loved ones, trigger = reach
----
And the first to call is the mother
[<__main__.Event object at 0x1a22a2d2b0>]
Event: { event_id = CNN_IP_20030414.1600.04-EV7mention_id = CNN_IP_20030414.1600.04-EV7-1, type = Contact, subtype = Phone-Write, modality = Asserted, polarity = Positive, genericity= Specific, tense = Unspecified, extent = call is the mother, scope = And the first to call is the mother, trigger = call
----
Saba hasn't
delivered yet
[<__main__.Event object at 0x1a22a2d518>]
Event: { event_id = CNN_IP_20030414.1600.04-EV8mention_id = CNN_IP_20030414.1600.04-EV8-1, type

----
Event: { event_id = CNN_ENG_20030507_160538.15-EV4mention_id = CNN_ENG_20030507_160538.15-EV4-1, type = Life, subtype = Die, modality = Asserted, polarity = Negative, genericity= Specific, tense = Unspecified, extent = i'm not paralyzed or dead, scope = i was definitely blessed that i'm not paralyzed or dead, trigger = dead
----
24 hours before
the fall of baghdad, 24 hours before his fellow marines engaged in a
fierce firefight at baghdad university, teeterman's orders had him
just south of the capital taking out iraqi resistance a top a
warehouse roof
[<__main__.Event object at 0x1a2695fe48>, <__main__.Event object at 0x1a2695f5c0>]
Event: { event_id = CNN_ENG_20030507_160538.15-EV5mention_id = CNN_ENG_20030507_160538.15-EV5-1, type = Conflict, subtype = Attack, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = his fellow marines engaged in a
fierce firefight at baghdad university, scope = 24 hours before
the fall of baghdad, 24 hours before 

people, trigger = fight
----
Event: { event_id = fsh_29348-EV2mention_id = fsh_29348-EV2-1, type = Movement, subtype = Transport, modality = Asserted, polarity = Positive, genericity= Generic, tense = Unspecified, extent = going into their
territory, scope = By going into their
territory and building these walls or building settlements -- pushing
the Palestinians out and -- Palestinians -- fight -- with rocks and -- sticks
and -- Israelis have their machine guns and bulldozers and -- I just -- t-
really unpalatable that the -- whole idea, the -- the way they're going
about -- this thing and -- I don't really feel qualified -- to say how it
should be, but -- I -- see it -- being wrong to -- hurt children and -- innocent
people, trigger = going
----
Event: { event_id = fsh_29348-EV6mention_id = fsh_29348-EV6-1, type = Life, subtype = Injure, modality = Asserted, polarity = Positive, genericity= Generic, tense = Unspecified, extent = hurt children and -- innocent
people, scope = By going 

Event: { event_id = APW_ENG_20030510.0228-EV3mention_id = APW_ENG_20030510.0228-EV3-1, type = Conflict, subtype = Attack, modality = Asserted, polarity = Positive, genericity= Specific, tense = Unspecified, extent = the south of the country, where a civil war
has been waged for 20 years, scope = The sentence was the latest in a series of state actions against the
Monitor, the only English language daily in Sudan and a leading
critic of conditions in the south of the country, where a civil war
has been waged for 20 years, trigger = war
----
The court ruling stated that the verdict and sentence would be
submitted to the National Press Council, a government-appointed body,
for approval, Ngor said
[<__main__.Event object at 0x1a26549320>]
Event: { event_id = APW_ENG_20030510.0228-EV4mention_id = APW_ENG_20030510.0228-EV4-2, type = Justice, subtype = Sentence, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = court ruling stated that the verdict and sent

----
Event: { event_id = AGGRESSIVEVOICEDAILY_20041101.1806-EV5mention_id = AGGRESSIVEVOICEDAILY_20041101.1806-EV5-1, type = Life, subtype = Injure, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = the past two years, there have
been cases of schoolchildren and a baby, killed or injured by guns, scope = In the past two years, there have
been cases of schoolchildren and a baby, killed or injured by guns, trigger = injured
----
The sad truth is that while the police officers
are inspecting farmers' gun cabinets to see if they comply with
regulations somewhere in the UK, someone, who has not filled in a
firearms certificate form, will be smuggling a gun into the country or
selling one to an inner city youth
[<__main__.Event object at 0x1a264f6be0>, <__main__.Event object at 0x1a25bee5c0>]
Event: { event_id = AGGRESSIVEVOICEDAILY_20041101.1806-EV6mention_id = AGGRESSIVEVOICEDAILY_20041101.1806-EV6-1, type = Movement, subtype = Transport, modality = Oth

### Example:

In [186]:
sent = "She will be the Registration\nManager for the Houston Center, accountable for causing registrations into\nthe Landmark Forum"
for event in events_lang_splits["train"][sent]:
    print(event.scope)
    #print(event.scope_start)
    #print(event.scope_end)

    print("--------")
    print(event.trig_text)
    print(event.trig_start)
    print(event.trig_end)
    print(event.type_)
    print(event.subtype)

    print("-------")
    print(event.scope[event.trig_start:event.trig_end+1])

She will be the Registration
Manager for the Houston Center, accountable for causing registrations into
the Landmark Forum
--------
be
9
10
Personnel
Start-Position
-------
be
She will be the Registration
Manager for the Houston Center, accountable for causing registrations into
the Landmark Forum
--------
Forum
117
121
Contact
Meet
-------
Forum


In [187]:
words_split_dict = {}
for split in events_lang_splits:
    words_split = []
    for sent in events_lang_splits[split]:
        start = 0
        words = []
        for event in events_lang_splits[split][sent]:
            end = event.trig_start

            ### Tokenize that part that doesn't have to do with triggers and annotate each word as 'O'
            words.extend([(word, "O") for word in  word_tokenize(sent[start:end])])

            ### Tokenize trigger part and annotate each word as 'B' or 'I'
            start = event.trig_start
            end = event.trig_end+1
            #print(sent[start:end])
            trigger_tok = word_tokenize(sent[start:end])
            flag = True
            for word in trigger_tok:
                if flag:
                    flag = False
                    words.append((word, "B-" + event.type_ + ":" + event.subtype))
                else:
                    words.append((word, "I-" + event.type_ + ":" + event.subtype))

            start = event.trig_end + 1
        words_split.append(words)
    words_split_dict.update({split: words_split})

In [188]:
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/TriggerIdentification/"
for split in words_split_dict:
    print("Split: ", split)
    random.shuffle(words_split_dict[split])
    total = len(words_split_dict[split])
    print("len("+split+"):", len(words_split_dict[split]))
    
    with open(out_path+ "English/"+split+".txt", "w") as file:
        for sent in words_split_dict[split]:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                

Split:  train
len(train): 2690
Split:  dev
len(dev): 269
Split:  test
len(test): 220


In [54]:
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/TriggerIdentification/"
for lang in ["Chinese", "Arabic"]:
    print("Language: ", lang)
    random.shuffle(words_lang_dict[lang])
    total = len(words_lang_dict[lang])
    train_num = math.floor(total*0.88)
    dev_num = math.floor(total*0.07)
    train = words_lang_dict[lang][:train_num]
    dev = words_lang_dict[lang][train_num:train_num+dev_num]
    test = words_lang_dict[lang][train_num+dev_num:]
    
    print("len(train):", len(train))
    print("len(dev):", len(dev))
    print("len(test):", len(test))
    
    with open(out_path+lang +"/train.txt", "w") as file:
        for sent in train:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/dev.txt", "w") as file:
        for sent in dev:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/test.txt", "w") as file:
        for sent in test:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")

Split:  train
len(train): 2367
len(dev): 188
len(test): 135
Split:  dev
len(train): 236
len(dev): 18
len(test): 15
Split:  test
len(train): 193
len(dev): 15
len(test): 12


In [58]:
words_split_dict["train"][1]

[('I', 'O'),
 ('do', 'O'),
 ("n't", 'O'),
 ('care', 'O'),
 ('what', 'O'),
 ('anyone', 'O'),
 ('says', 'O'),
 ('about', 'O'),
 ('``', 'O'),
 ('you', 'O'),
 ('ca', 'O'),
 ("n't", 'O'),
 ('judge', 'O'),
 ('intent', 'O'),
 ('.', 'O'),
 ("''", 'O'),
 ("Darius'", 'O'),
 ('intent', 'O'),
 ('was', 'O'),
 ('to', 'O'),
 ('dismember', 'O'),
 (',', 'O'),
 ('injure', 'B-Life:Injure'),
 ('dismember', 'B-Life:Injure'),
 (',', 'O'),
 ('injure', 'O'),
 ('or', 'O'),
 ('even', 'O'),
 ('kill', 'B-Life:Die')]

In [59]:
print(words)

[('Earlier', 'O'), ('documents', 'O'), ('in', 'O'), ('the', 'O'), ('case', 'O'), ('have', 'O'), ('included', 'O'), ('embarrassing', 'O'), ('details', 'O'), ('about', 'O'), ('perks', 'O'), ('Welch', 'O'), ('received', 'O'), ('as', 'O'), ('part', 'O'), ('of', 'O'), ('his', 'O'), ('retirement', 'B-Personnel:End-Position')]


## III. BIO Annotation for Arguments: 

In [232]:
new_split_dict = {}
for split in events_lang_splits:
    new_arg = {}
    for sent in events_lang_splits[split].keys():
        new_arguments = []
        events_arg = []
        for event in events_lang_splits[split][sent]:
            events_arg.extend(event.arguments)

        events_arg.sort(key=lambda x: x.start, reverse=False)

        end = 0
        for argument in events_arg:
            if argument.start >= end:
                new_arguments.append(argument)
            end = argument.end

        new_arg.update({sent: new_arguments})
    new_split_dict.update({split: new_arg})

In [233]:
for sent in events_lang_splits["train"].keys():
    for event in events_lang_splits["train"][sent]:
        print(event)
        for arg in event.arguments:
            print(arg.start, " ", arg.end)
        print("--------")
    print("==========")

{'event_id': 'CNN_CF_20030303.1900.00-EV1', 'mention_id': 'CNN_CF_20030303.1900.00-EV1-2', 'type_': 'Movement', 'subtype': 'Transport', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'these\nbozos let four armed Cubans land on our shores', 'extent_start': 1831, 'extent_end': 1882, 'scope': "And these\nbozos let four armed Cubans land on our shores when they're trying to\nmake a high terrorist alert", 'scope_start': 1827, 'scope_end': 1933, 'trig_text': 'land', 'trig_start': 38, 'trig_end': 41, 'arguments': [<__main__.Argument object at 0x1a23938588>, <__main__.Argument object at 0x1a231ca128>, <__main__.Argument object at 0x1a22038e80>], 'entities': [<__main__.Entity object at 0x1a22493c88>, <__main__.Entity object at 0x1a22743c88>, <__main__.Entity object at 0x1a22cad278>, <__main__.Entity object at 0x1a2309eeb8>, <__main__.Entity object at 0x1a2309efd0>, <__main__.Entity object at 0x1a22faa320>]}
20   36
4   14
46   55
--------
{'

{'event_id': 'CNN_IP_20030402.1600.02-1-EV5', 'mention_id': 'CNN_IP_20030402.1600.02-1-EV5-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Unspecified', 'extent': 'British forces continue\ntheir attack on targets around Basra', 'extent_start': 392, 'extent_end': 451, 'scope': 'And to the south, British forces continue\ntheir attack on targets around Basra', 'scope_start': 374, 'scope_end': 451, 'trig_text': 'attack', 'trig_start': 48, 'trig_end': 53, 'arguments': [<__main__.Argument object at 0x1a233e6438>, <__main__.Argument object at 0x1a233e6cc0>], 'entities': [<__main__.Entity object at 0x1a233e62e8>, <__main__.Entity object at 0x1a233e6588>, <__main__.Entity object at 0x1a233e6c50>, <__main__.Entity object at 0x1a233e6390>]}
18   31
73   77
--------
{'event_id': 'CNN_IP_20030402.1600.02-1-EV6', 'mention_id': 'CNN_IP_20030402.1600.02-1-EV6-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Ass

75   80
--------
{'event_id': 'CNN_IP_20030407.1600.05-EV17', 'mention_id': 'CNN_IP_20030407.1600.05-EV17-1', 'type_': 'Contact', 'subtype': 'Phone-Write', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Present', 'extent': 'I tell you', 'extent_start': 2654, 'extent_end': 2663, 'scope': 'I tell you all this not to worry\nyou, but to give you comfort knowing what you hear and see is true and\nthat, despite that, I am safe and will continue to be', 'scope_start': 2654, 'scope_end': 2810, 'trig_text': 'tell', 'trig_start': 2, 'trig_end': 5, 'arguments': [<__main__.Argument object at 0x1a26da8ef0>, <__main__.Argument object at 0x1a26da8710>], 'entities': [<__main__.Entity object at 0x1a26a95978>, <__main__.Entity object at 0x1a26a95470>, <__main__.Entity object at 0x1a26a95a90>, <__main__.Entity object at 0x1a26a95f28>, <__main__.Entity object at 0x1a26a95320>, <__main__.Entity object at 0x1a26a95c88>]}
0   0
7   9
--------
{'event_id': 'CNN_IP_20030408

22   31
--------
{'event_id': 'CNNHL_ENG_20030610_133347.6-EV6', 'mention_id': 'CNNHL_ENG_20030610_133347.6-EV6-1', 'type_': 'Justice', 'subtype': 'Fine', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Unspecified', 'extent': "he's also being\nfined $3 million and was ordered to pay $1.2 million", 'extent_start': 278, 'extent_end': 345, 'scope': "he's also being\nfined $3 million and was ordered to pay $1.2 million in\nrestitution to the new york state tax commission", 'scope_start': 278, 'scope_end': 397, 'trig_text': 'pay', 'trig_start': 52, 'trig_end': 54, 'arguments': [<__main__.Argument object at 0x1a26f66940>, <__main__.Argument object at 0x1a2227f710>], 'entities': [<__main__.Entity object at 0x1a2227f048>, <__main__.Entity object at 0x1a2227fef0>]}
0   1
56   67
--------
{'event_id': 'CNNHL_ENG_20030610_133347.6-EV4', 'mention_id': 'CNNHL_ENG_20030610_133347.6-EV4-1', 'type_': 'Justice', 'subtype': 'Charge-Indict', 'modality': 'Asserted', 'p

181   194
251   292
96   99
--------
{'event_id': 'CNN_ENG_20030415_173752.0-EV4', 'mention_id': 'CNN_ENG_20030415_173752.0-EV4-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Generic', 'tense': 'Unspecified', 'extent': 'terrorism', 'extent_start': 869, 'extent_end': 877, 'scope': 'so this is one of the oldest cases\nof terrorism that is one of the oldest cases that has not\nbeen closed to date', 'scope_start': 831, 'scope_end': 942, 'trig_text': 'terrorism', 'trig_start': 38, 'trig_end': 46, 'arguments': [], 'entities': []}
--------
{'event_id': 'CNN_ENG_20030415_173752.0-EV5', 'mention_id': 'CNN_ENG_20030415_173752.0-EV5-1', 'type_': 'Life', 'subtype': 'Die', 'modality': 'Other', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'he is wanted for murder in italy italy', 'extent_start': 647, 'extent_end': 684, 'scope': "this man, abu abbas has lived an i tin\nrant life, he's been in tunas. he's bee

0   6
43   107
--------
{'event_id': 'CNN_ENG_20030528_172957.18-EV4', 'mention_id': 'CNN_ENG_20030528_172957.18-EV4-3', 'type_': 'Justice', 'subtype': 'Release-Parole', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'sentence', 'extent_start': 1875, 'extent_end': 1882, 'scope': "serious' defense attorney who persuaded the judge\nreduce the sentence, told the post and i'm quoting, i'm very saddened.\ni", 'scope_start': 1814, 'scope_end': 1935, 'trig_text': 'sentence', 'trig_start': 61, 'trig_end': 68, 'arguments': [], 'entities': [<__main__.Entity object at 0x1a26e49a58>, <__main__.Entity object at 0x1a26e49908>, <__main__.Entity object at 0x1a269fb5c0>, <__main__.Entity object at 0x1a269fb710>, <__main__.Entity object at 0x1a269fbcc0>, <__main__.Entity object at 0x1a269fb358>, <__main__.Entity object at 0x1a2273dc18>]}
--------
{'event_id': 'CNN_ENG_20030528_172957.18-EV5', 'mention_id': 'CNN_ENG_20030528_172957.18-EV5-1', 'type_':

147   159
--------
{'event_id': 'CNN_ENG_20030625_220123.3-EV6', 'mention_id': 'CNN_ENG_20030625_220123.3-EV6-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Generic', 'tense': 'Unspecified', 'extent': 'these places with respect defenseless people\nare being attacked', 'extent_start': 2392, 'extent_end': 2454, 'scope': 'is it doing anything to tighten up\nsecurity in these places with respect defenseless people\nare being attacked', 'scope_start': 2345, 'scope_end': 2454, 'trig_text': 'attacked', 'trig_start': 102, 'trig_end': 109, 'arguments': [<__main__.Argument object at 0x1a2684b358>, <__main__.Argument object at 0x1a2684b400>], 'entities': [<__main__.Entity object at 0x1a2684b160>, <__main__.Entity object at 0x1a2684b748>, <__main__.Entity object at 0x1a2684bf60>]}
47   58
73   90
--------
{'event_id': 'CNN_ENG_20030626_193133.8-EV1', 'mention_id': 'CNN_ENG_20030626_193133.8-EV1-1', 'type_': 'Transaction', 'subtype': 'Tr

87   99
--------
{'event_id': 'fsh_29592-EV5', 'mention_id': 'fsh_29592-EV5-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Generic', 'tense': 'Unspecified', 'extent': "they're fighting", 'extent_start': 3120, 'extent_end': 3135, 'scope': "They're fi- they don't even know why they're fighting anymore", 'scope_start': 3083, 'scope_end': 3143, 'trig_text': 'fighting', 'trig_start': 45, 'trig_end': 52, 'arguments': [<__main__.Argument object at 0x1a217bb198>, <__main__.Argument object at 0x1a217bbc18>], 'entities': [<__main__.Entity object at 0x1a217bbba8>, <__main__.Entity object at 0x1a217bbe10>, <__main__.Entity object at 0x1a217bb6a0>]}
37   40
37   40
--------
{'event_id': 'fsh_29592-EV6', 'mention_id': 'fsh_29592-EV6-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Generic', 'tense': 'Unspecified', 'extent': 'war', 'extent_start': 3380, 'extent_end': 3382, 'scope'

67   70
--------
{'event_id': 'APW_ENG_20030409.0013-EV1', 'mention_id': 'APW_ENG_20030409.0013-EV1-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Other', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': "U.S. forces came under\n``significant'' fire from those buildings", 'extent_start': 2480, 'extent_end': 2543, 'scope': "U.S. Central Command said it\nfired on the TV office and the hotel after U.S. forces came under\n``significant'' fire from those buildings", 'scope_start': 2408, 'scope_end': 2543, 'trig_text': 'fire', 'trig_start': 111, 'trig_end': 114, 'arguments': [<__main__.Argument object at 0x1a26c56940>, <__main__.Argument object at 0x1a26c56470>], 'entities': [<__main__.Entity object at 0x1a264dce80>, <__main__.Entity object at 0x1a2667ab00>, <__main__.Entity object at 0x1a2667a320>, <__main__.Entity object at 0x1a26935a20>, <__main__.Entity object at 0x1a267d84e0>, <__main__.Entity object at 0x1a21b6cc18>, <__main__.Entity object at 0x

45   73
--------
{'event_id': 'APW_ENG_20030519.0367-EV7', 'mention_id': 'APW_ENG_20030519.0367-EV7-1', 'type_': 'Contact', 'subtype': 'Meet', 'modality': 'Asserted', 'polarity': 'Negative', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'He was to have\ndiscussed his objections with U.S. President George W. Bush this\nweek, but canceled his trip', 'extent_start': 4431, 'extent_end': 4537, 'scope': "He was to have\ndiscussed his objections with U.S. President George W. Bush this\nweek, but canceled his trip after Sunday's bus bombing", 'scope_start': 4431, 'scope_end': 4564, 'trig_text': 'trip', 'trig_start': 103, 'trig_end': 106, 'arguments': [<__main__.Argument object at 0x1a224f9d68>, <__main__.Argument object at 0x1a21b3d4e0>], 'entities': [<__main__.Entity object at 0x1a21b3d9e8>, <__main__.Entity object at 0x1a21d532b0>, <__main__.Entity object at 0x1a21d53898>, <__main__.Entity object at 0x1a21d53128>, <__main__.Entity object at 0x1a21d53eb8>, <__main__.Entity object at 0x

52   58
--------
{'event_id': 'XIN_ENG_20030523.0202-EV3', 'mention_id': 'XIN_ENG_20030523.0202-EV3-1', 'type_': 'Justice', 'subtype': 'Arrest-Jail', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Unspecified', 'extent': 'detained for almost two years at the Veterans\nMemorial Medical Center in Metro Manila on plunder', 'extent_start': 1269, 'extent_end': 1364, 'scope': 'Estrada, who was ousted in a military-backed popular revolt in\nJanuary 2001, has been detained for almost two years at the Veterans\nMemorial Medical Center in Metro Manila on plunder charges', 'scope_start': 1183, 'scope_end': 1372, 'trig_text': 'detained', 'trig_start': 86, 'trig_end': 93, 'arguments': [<__main__.Argument object at 0x1a21cda240>, <__main__.Argument object at 0x1a217a0d68>], 'entities': [<__main__.Entity object at 0x1a217a0ba8>, <__main__.Entity object at 0x1a25c19438>, <__main__.Entity object at 0x1a25c19be0>, <__main__.Entity object at 0x1a25c19c18>]}
175   181


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [234]:
new_arg

{"Former senior banker Callum McCarthy begins what is one of the most\nimportant jobs in London's financial world in September, when\nincumbent Howard Davies steps down": [<__main__.Argument at 0x1a21d392e8>,
  <__main__.Argument at 0x1a230b0d68>,
  <__main__.Argument at 0x1a230b0358>],
 'Davies is leaving to become chairman of the London School of\nEconomics, one of the best-known parts of the University of London': [<__main__.Argument at 0x1a230b02e8>,
  <__main__.Argument at 0x1a22742a20>],
 'As well as previously holding senior positions at Barclays Bank, BZW\nand Kleinwort Benson, McCarthy was formerly a top civil servant at\nthe Department of Trade and Industry': [<__main__.Argument at 0x1a22742438>,
  <__main__.Argument at 0x1a227426a0>,
  <__main__.Argument at 0x1a22742470>,
  <__main__.Argument at 0x1a22742080>,
  <__main__.Argument at 0x1a2372b550>],
 "British Chancellor of the Exchequer Gordon Brown on Tuesday named the\ncurrent head of the country's energy regulator as the 

In [235]:
sent = "Former senior banker Callum McCarthy begins what is one of the most\nimportant jobs in London's financial world in September, when\nincumbent Howard Davies steps down"
print(new_arg[sent][1].start, " ", new_arg[sent][1].end," " ,new_arg[sent][1].role)

52   109   Position


In [236]:
len(events_lang_splits["dev"])

269

In [237]:
len(events_lang_splits["train"])

2690

In [238]:
len(events_lang_splits["test"])

220

In [239]:
arguments_split_dict = {}
for split in events_lang_splits:
    print("Processing split => ", split)
    words_split = []
    for sent in tqdm(events_lang_splits[split]):
        words = []
        for argument in new_split_dict[split][sent]:
            start = 0
            end = argument.start

            ### Tokenize that part that doesn't have to do with triggers and annotate each word as 'O'
            words.extend([(word, "O") for word in  word_tokenize(sent[start:end])])
            #print(sent[start:end])

            ### Tokenize trigger part and annotate each word as 'B' or 'I'
            start = argument.start
            end = argument.end+1
            #print(sent[start:end])
            arg_tok = word_tokenize(sent[start:end])
            flag = True
            for word in arg_tok:
                if flag:
                    flag = False
                    words.append((word, "B-" + argument.role))
                else:
                    words.append((word, "I-" + argument.role))

            start = argument.end + 1
        words_split.append(words)
    arguments_split_dict.update({split: words_split})

 12%|█▏        | 312/2690 [00:00<00:00, 3105.16it/s]

Processing split =>  train


100%|██████████| 2690/2690 [00:00<00:00, 2718.88it/s]
100%|██████████| 269/269 [00:00<00:00, 2473.25it/s]
  0%|          | 0/220 [00:00<?, ?it/s]

Processing split =>  dev
Processing split =>  test


100%|██████████| 220/220 [00:00<00:00, 2012.75it/s]


In [240]:
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/ArgumentIdentification/"
for split in arguments_split_dict:
    print("Split: ", split)
    random.shuffle(arguments_split_dict[split])
    total = len(arguments_split_dict[split])
    print("len(train):", len(arguments_split_dict[split]))
    
    with open(out_path+ "English/" + split + ".txt", "w") as file:
        for sent in arguments_split_dict[split]:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                

Split:  train
len(train): 2690
Split:  dev
len(train): 269
Split:  test
len(train): 220


## New Format:

In [242]:
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/tagging/ArgumentIdentification/"
for split in arguments_split_dict:
    print("Split: ", split)
    random.shuffle(arguments_split_dict[split])
    total = len(arguments_split_dict[split])
    print("len("+split+"):", len(arguments_split_dict[split]))
    
    with open(out_path+ "English/" + split + ".words.txt", "w") as file:
        for sent in arguments_split_dict[split]:
            for word, _ in sent:
                file.write(word+ " ")
            file.write("\n")
            
    with open(out_path+ "English/" + split + ".tags.txt", "w") as file:
        for sent in arguments_split_dict[split]:
            for _, ann in sent:
                file.write(ann+ " ")
            file.write("\n")

Split:  train
len(train): 2690
Split:  dev
len(dev): 269
Split:  test
len(test): 220


### Other languages:

In [203]:
new_lang_arg_dict = {}
for lang in events_list_lang:
    new_arg = {}
    for sent in events_list_lang[lang].keys():
        new_arguments = []
        events_arg = []
        for event in events_list_lang[lang][sent]:
            events_arg.extend(event.arguments)

        events_arg.sort(key=lambda x: x.start, reverse=False)

        end = 0
        for argument in events_arg:
            if argument.start >= end:
                new_arguments.append(argument)
            end = argument.end

        new_arg.update({sent: new_arguments})
    new_lang_arg_dict.update({lang: new_arg})

In [204]:
arguments_lang_dict = {}
for lang in events_list_lang:
    print("Processing Language => ", lang)
    words_lang = []
    for sent in tqdm(events_list_lang[lang]):
        words = []
        for argument in new_lang_arg_dict[lang][sent]:
            start = 0
            end = argument.start

            ### Tokenize that part that doesn't have to do with triggers and annotate each word as 'O'
            words.extend([(word, "O") for word in  word_tokenize(sent[start:end])])
            #print(sent[start:end])

            ### Tokenize trigger part and annotate each word as 'B' or 'I'
            start = argument.start
            end = argument.end+1
            #print(sent[start:end])
            arg_tok = word_tokenize(sent[start:end])
            flag = True
            for word in arg_tok:
                if flag:
                    flag = False
                    words.append((word, "B-" + argument.role))
                else:
                    words.append((word, "I-" + argument.role))

            start = argument.end + 1
        words_lang.append(words)
    arguments_lang_dict.update({lang: words_lang})

 17%|█▋        | 545/3177 [00:00<00:01, 2484.31it/s]

Processing Language =>  English


100%|██████████| 3177/3177 [00:01<00:00, 2508.75it/s]
  9%|▉         | 177/1931 [00:00<00:00, 1765.94it/s]

Processing Language =>  Chinese


100%|██████████| 1931/1931 [00:00<00:00, 2583.01it/s]
 14%|█▍        | 231/1650 [00:00<00:00, 2301.28it/s]

Processing Language =>  Arabic


100%|██████████| 1650/1650 [00:01<00:00, 1324.07it/s]


In [205]:
#out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/ArgumentExtraction/"
#out_path = "/Users/d22admin/Desktop/vista_cluster/nas/home/meryem/sequence_tagging/data/ACE05/ArgumentExtraction/"
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/ArgumentIdentification/"
for lang in ["English", "Chinese", "Arabic"]:
    random.shuffle(arguments_lang_dict[lang])
    total = len(arguments_lang_dict[lang])
    train_num = math.floor(total*0.88)
    dev_num = math.floor(total*0.07)
    train = arguments_lang_dict[lang][:train_num]
    dev = arguments_lang_dict[lang][train_num:train_num+dev_num]
    test = arguments_lang_dict[lang][train_num+dev_num:]
    
    print("len(train):", len(train))
    print("len(dev):", len(dev))
    print("len(test):", len(test))
    
    with open(out_path+lang +"/train.txt", "w") as file:
        for sent in train:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/dev.txt", "w") as file:
        for sent in dev:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/test.txt", "w") as file:
        for sent in test:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")

len(train): 2795
len(dev): 222
len(test): 160
len(train): 1699
len(dev): 135
len(test): 97
len(train): 1452
len(dev): 115
len(test): 83


### Checking the number of Sentences:

def extract_from_xml_check(root_path, language, domain):
    events = {}
    #print(root_path + language + "/" + domain + "/adj/*.apf.xml")
    files_processed = 0
    scopes = {}
    events_num = 0
    if language == "English":
        subpath = "timex2norm"
    else:
        subpath = "adj"
    for file_name in sorted(glob.glob(root_path + language + "/" + domain + "/"+subpath+"/*.apf.xml")):  #in [root_path + language + "/bc/adj/CNN_CF_20030303.1900.00.apf.xml"]:       
        # Get the event + argument annotation
        print("file_name=", file_name)
        files_processed += 1
        tree = ET.parse(file_name, ET.XMLParser(encoding='utf-8'))
        root = tree.getroot()
    
        events_num += len(list(root.iter('event')))
        for event in root.iter('event'):
            for mention in event.iter('event_mention'):
                for child in mention:
                    if child.tag == "ldc_scope":
                        for chil2 in child:
                            scope = chil2.text
                            scope_start = int(chil2.attrib["START"])
                            scope_end = int(chil2.attrib["END"])
                            scopes.update({scope: "scope"})
                            
    print("Number of scopes per domain: ", domain, " is: ", len(scopes))
    print("Number of events: ", events_num)
        
    return scopes, files_processed, events_num

scopes_list_lang = {}

for language in languages:
    print("Processing language: ", language)
    files_num = 0
    events_num_total = 0
    domains = [file_.split("/")[-1] for file_ in glob.glob(root_path + language + "/*" ) if "Icon\r" not in file_]
    scopes_lang = {}
    for domain in domains:
        scopes, files_processed, events_num = extract_from_xml_check(root_path, language, domain)
        files_num += files_processed
        events_num_total += events_num
        
        scopes_lang = merge_two_dicts(scopes_lang, scopes)
    
    print("Number of files processed for language= ", language, " is= ", files_num)
    print("Number of events for language= ", events_num_total)
    
    scopes_list_lang.update({language: scopes_lang})