In [12]:
import glob
import xml.etree.ElementTree as ET
import re
import json
from nltk.tokenize import word_tokenize
import random
import math
from tqdm import tqdm

## Class Definitions:

In [3]:
class Argument(object):
    def __init__(self, id_, text, role, start, end):
        self.id_ = id_
        self.text = text
        self.role = role
        self.start = start
        self.end = end

    def to_string(self):
        return "Argument: {id_ = " + self.id_ + ", text = " + self.text + ", role = " + self.role + ", start =" + str(self.start) + ", end =" + str(self.end) + "}"
        
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [4]:
class Trigger(object):
    def __init__(self, start, text, end, id_, event_type):
        self.start = start
        self.text = text
        self.end = end
        self.id_ = id_
        self.event_type = event_type
        
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [5]:
class Entity(object):
    def __init__(self, id_, text, entity_type, phrase_type, start, end):
        self.id_ = id_
        self.text = text
        self.entity_type = entity_type
        self.phrase_type = phrase_type
        self.start = start
        self.end = end

    def to_string(self):
        return "Entity: {id_ = " + self.id_ + ", text = " + self.text + ", entity_type = " + self.entity_type + ", phrase_type=" + self.phrase_type + ", start =" + str(self.start) + ", end =" + str(self.end) + "}"
    
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [6]:
class Sentence(object):
    def __init__(self, text, start, end):
        self.text = text
        self.start = start
        self.end = end

    def to_string(self):
        return "Sentence: {text = " + self.text + ", start = " + self.start + ", end = " + self.end + "}"
    
    def __str__(self):
        return str(self.__dict__)

    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

In [7]:
class Event(object):
    def __init__(self, event_id, mention_id, type_, subtype, modality, polarity, genericity, tense, extent, extent_start, extent_end, scope, scope_start, scope_end, trig_text, trig_start, trig_end, arguments, entities):
        self.event_id = event_id
        self.mention_id = mention_id
        self.type_ = type_
        self.subtype = subtype
        self.modality = modality
        self.polarity = polarity
        self.genericity = genericity
        self.tense = tense
        self.extent = extent
        self.extent_start = extent_start
        self.extent_end = extent_end
        self.scope = scope
        self.scope_start = scope_start
        self.scope_end = scope_end
        self.trig_text = trig_text
        self.trig_start = trig_start
        self.trig_end = trig_end
        self.arguments = arguments
        self.entities = entities


    def to_string(self):
        return "Event: { event_id = " + self.event_id + "mention_id = " + self.mention_id + ", type = " + self.type_ + ", subtype = " +self.subtype + ", modality = " \
               + self.modality + ", polarity = " + self.polarity + ", genericity= " + self.genericity + ", tense = " +\
               self.tense + ", extent = " +self.extent + ", scope = " + self.scope  + ", trigger = " + self.trig_text
    
    def __str__(self):
        return str(self.__dict__)
    
    def __eq__(self, other): 
        return self.__dict__ == other.__dict__
    
    def __hash__(self):
        return hash(self)

## I. Annotation/Offset Extraction:

In [8]:
def extract_entity_info(entity, scope_start, scope_end):
    entity_id = entity.attrib["ID"]
    phrase_type = entity.attrib["TYPE"] + ":" + entity.attrib["SUBTYPE"]
    entity_class = entity.attrib["CLASS"]

    entities = []
    for mention in entity.iter('entity_mention'):
        entity_type = mention.attrib["LDCTYPE"]
        for child in mention:
            if child.tag == "extent":
                for chil2 in child:
                    text = chil2.text
                    start = int(chil2.attrib["START"])
                    end = int(chil2.attrib["END"])
                    
        if scope_start <= start and  scope_end >= end:      
            ent = Entity(entity_id, text, entity_type, phrase_type, start - scope_start, end - scope_start)
            entities.append(ent)
        
    return entities

In [9]:
def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z

In [10]:
def extract_event_info(root, event):
    event_id = event.attrib["ID"]
    event_type = event.attrib["TYPE"]
    subtype = event.attrib["SUBTYPE"]
    modality = event.attrib["MODALITY"]
    polarity = event.attrib["POLARITY"]
    genericity = event.attrib["GENERICITY"]
    tense = event.attrib["TENSE"]

    ## Looking at event mentions
    for mention in event.iter('event_mention'):
        mention_id = mention.attrib["ID"]
        for child in mention:
            if child.tag == "extent":
                for chil2 in child:
                    extent = chil2.text
                    extent_start = int(chil2.attrib["START"])
                    extent_end = int(chil2.attrib["END"]) 
                
            ## SCOPE USED AS SENTENCE
            elif child.tag == "ldc_scope":
                for chil2 in child:
                    scope = chil2.text
                    scope_start = int(chil2.attrib["START"])
                    scope_end = int(chil2.attrib["END"])
                sent = Sentence(scope, scope_start, scope_end)
                
            ## TRIGGER EXTRACTION
            elif child.tag == "anchor":
                for chil2 in child:
                    trig_text = chil2.text
                    trig_start = int(chil2.attrib["START"]) - scope_start
                    trig_end = int(chil2.attrib["END"]) - scope_start
        

        arguments = []           
        for argument in mention.iter('event_mention_argument'):
            arg_id = argument.attrib["REFID"]
            role = argument.attrib["ROLE"]
            for child in argument:
                for chil2 in child:
                    arg_text = chil2.text
                    arg_start = int(chil2.attrib["START"]) - scope_start
                    arg_end = int(chil2.attrib["END"]) - scope_start
                        
            arg = Argument(arg_id, arg_text, +":"+role, arg_start, arg_end) 

            arguments.append(arg)
       
        ## Looking at entity mentions with that same event
        entities = []
        for entity in root.iter('entity'):
            entities.extend(extract_entity_info(entity, scope_start, scope_end))

    ev = Event(event_id, mention_id, event_type, subtype, modality, polarity, genericity, tense, extent, extent_start, extent_end,
               scope, scope_start, scope_end, trig_text, trig_start, trig_end, arguments, entities)

    return sent, ev

In [24]:
def extract_from_xml(root_path, language, domain):
    events = {}
    #print(root_path + language + "/" + domain + "/adj/*.apf.xml")
    files_processed = 0
    if language == "English":
        subpath = "timex2norm"
    else:
        subpath = "adj"
        
    for file_name in sorted(glob.glob(root_path + language + "/" + domain + "/"+subpath+"/*.apf.xml")):        
        # Get the event + argument annotation
        #print("file_name=", file_name)
        files_processed += 1
        tree = ET.parse(file_name, ET.XMLParser(encoding='utf-8'))
        root = tree.getroot()
        
        for event in root.iter('event'):
            sent, ev = extract_event_info(root, event)
            if sent.text not in events:
                events.update({sent.text: [ev]}) 
            else:
                ev_list = events[sent.text]
                ev_list.append(ev)
                events.update({sent.text: ev_list})
                
        
    return events, files_processed

In [25]:
def extract_from_xml_english(root_path, files):
    events = {}
    #print(root_path + language + "/" + domain + "/adj/*.apf.xml")
    files_processed = 0
    for div in tqdm(files): #       
        # Get the event + argument annotation
        file_name = root_path + "English/"+ div.strip("\n") +".apf.xml"
        files_processed += 1
        tree = ET.parse(file_name, ET.XMLParser(encoding='utf-8'))
        root = tree.getroot()
        
        for event in root.iter('event'):
            sent, ev = extract_event_info(root, event)
            if sent.text not in events:
                events.update({sent.text: [ev]}) 
            else:
                ev_list = events[sent.text]
                ev_list.append(ev)
                events.update({sent.text: ev_list})
    return events, files_processed

In [26]:
#root_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/Raw/LDC2006T06/data/"
root_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/3Datasets/EventsExtraction/ACE/Raw/ACE2005-TrainingData-V6.0/"



In [27]:
languages = [file_.split("/")[-1] for file_ in glob.glob(root_path + "*") if "Icon\r" not in file_]


### Executing over the domains/languages:

In [28]:
files_splits = {}
with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_training") as file:
    files_splits.update({"train": file.readlines()})

with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_dev") as file:
    files_splits.update({"dev": file.readlines()})
    
with open("/Users/d22admin/Documents/JMEE/qi_filelist/new_filelist_ACE_test") as file:
    files_splits.update({"test": file.readlines()})

In [41]:
events_lang_splits = {}
events_list_lang = {}
for language in ["English", "Chinese", "Arabic"]:
    print("Processing language: ", language)
    if language == "English":
        for split in ["train", "dev", "test"]:
            print("Processing train split")
            events, files_processed = extract_from_xml_english(root_path, files_splits[split])
            events_lang_splits.update({split: events})

            print("Number of files processed for language= ", language, " is= ", files_processed)
    else:
        domains = [file_.split("/")[-1] for file_ in glob.glob(root_path + language + "/*" ) if "Icon\r" not in file_]
        events_lang = {}
        for domain in domains:
            events, files_processed = extract_from_xml(root_path, language, domain)

            events_lang = merge_two_dicts(events_lang, events)
        events_list_lang.update({language: events_lang})

  1%|          | 6/529 [00:00<00:10, 48.33it/s]

Processing language:  English
Processing train split


100%|██████████| 529/529 [00:03<00:00, 173.24it/s]
 27%|██▋       | 8/30 [00:00<00:00, 54.07it/s]

Number of files processed for language=  English  is=  529
Processing train split


100%|██████████| 30/30 [00:00<00:00, 94.59it/s]
 22%|██▎       | 9/40 [00:00<00:00, 86.37it/s]

Number of files processed for language=  English  is=  30
Processing train split


100%|██████████| 40/40 [00:00<00:00, 119.33it/s]


Number of files processed for language=  English  is=  40
Processing language:  Chinese
Processing language:  Arabic


In [43]:
len(events_lang_splits["train"])

2690

In [44]:
len(events_lang_splits["dev"])

269

In [45]:
len(events_lang_splits["test"])

220

In [39]:
len(events_list_lang["Chinese"])

1931

In [40]:
len(events_list_lang["Arabic"])

1650

## I. Preparing new data for JMEE:

### Entities with words:

In [22]:
new_ent_lang_dict = {}
for lang in languages:
    new_ent = {}
    for sent in events_list_lang[lang].keys():
        new_entities = []
        events_ent = []
        for event in events_list_lang[lang][sent]:
            events_ent.extend(event.entities)

        events_ent.sort(key=lambda x: x.start, reverse=False)

        end = 0
        for entity in events_ent:
            if entity.start >= end:
                new_entities.append(entity)
            end = entity.end
            #print(argument.start, " ", argument.end, " ", argument.role)

        new_ent.update({sent: new_entities})
    new_ent_lang_dict.update({lang: new_ent})

In [31]:
def get_entities_positions(sent, language, new_ent_lang_dict):
    words = {}
    entities = []
    i = 0
    start = 0
    for entity in new_ent_lang_dict[language][sent]:
        end = entity.start

        for word in word_tokenize(sent[start:end]):
            words.update({i: word})
            i += 1

        ### Tokenize trigger part and annotate each word as 'B' or 'I'
        start = entity.start
        end = entity.end+1 
        print("start=", start)
        entity_tok = word_tokenize(sent[start:end])
        print("Entity text= ", sent[start:end])
        pos_start = i
        print("start i =", i)
        for word in entity_tok:
            print("word:", word)
            words.update({i:word})
            i += 1
        pos_end = i
        
        print("i=", i)
        
        entities.append({"phrase-type": entity.phrase_type, "end": pos_end, "text": entity.text, "entity-type": entity.entity_type, "start": pos_start, " id": entity.id_})
        start = entity.end + 1
        
    for word in word_tokenize(sent[start:]):
        words.update({i: word})
        i += 1
        
    return words, entities, triggers

In [32]:
sent = 'She will be the Registration\nManager for the Houston Center, accountable for causing registrations into\nthe Landmark Forum'
words, entities = get_entities_positions(sent, "English", new_ent_lang_dict)

start= 0
Entity text=  She
start i = 0
word: She
i= 1
start= 12
Entity text=  the Registration
Manager for the Houston Center
start i = 3
word: the
word: Registration
word: Manager
word: for
word: the
word: Houston
word: Center
i= 10
start= 108
Entity text=  Landmark
start i = 17
word: Landmark
i= 18


In [33]:
list(new_ent_lang_dict["English"].keys())[0]

'She will be the Registration\nManager for the Houston Center, accountable for causing registrations into\nthe Landmark Forum'

In [34]:
words

{0: 'She',
 1: 'will',
 2: 'be',
 3: 'the',
 4: 'Registration',
 5: 'Manager',
 6: 'for',
 7: 'the',
 8: 'Houston',
 9: 'Center',
 10: ',',
 11: 'accountable',
 12: 'for',
 13: 'causing',
 14: 'registrations',
 15: 'into',
 16: 'the',
 17: 'Landmark',
 18: 'Forum'}

In [35]:
entities

[{'phrase-type': 'PER:Individual',
  'end': 1,
  'text': 'She',
  'entity-type': 'PRO',
  'start': 0,
  ' id': 'Austin-Grad-Community_20050212.2454-E31'},
 {'phrase-type': 'PER:Individual',
  'end': 10,
  'text': 'the Registration\nManager for the Houston Center',
  'entity-type': 'NOM',
  'start': 3,
  ' id': 'Austin-Grad-Community_20050212.2454-E31'},
 {'phrase-type': 'ORG:Educational',
  'end': 18,
  'text': 'Landmark',
  'entity-type': 'NAMPRE',
  'start': 17,
  ' id': 'Austin-Grad-Community_20050212.2454-E2'}]

### Triggers with words:

In [46]:
events_list_lang["English"][sent][1].arguments

[<__main__.Argument at 0x1a1a280e80>, <__main__.Argument at 0x1a1a280940>]

In [55]:
new_trig_lang_dict = {}
for lang in languages:
    new_trig = {}
    for sent in events_list_lang[lang].keys():
        triggers = []
        new_triggers = []
        for event in events_list_lang[lang][sent]:
            triggers.append(Trigger(event.trig_start, event.trig_text, event.trig_end, event.event_id, event.type_+":"+event.subtype))
        triggers.sort(key=lambda x: x.start, reverse=False)
        end = 0
        for trig in triggers:
            if trig.start >= end:
                new_triggers.append(trig)
            end = trig.end
        new_trig.update({sent: new_triggers})
    new_trig_lang_dict.update({language: new_trig})

In [None]:
new_arg_lang_dict = {}
for lang in languages:
    for sent in events_list_lang[lang].keys():
        new_event_arg = []
        for event in events_list_lang[lang][sent]:
            event_args = event.arguments
            event_args.sort(key=lambda x: x.start, reverse=False)
            
            end = 0
            new_arguments = []
            for argument in event_args:
                if argument.start >= end:
                    new_arguments.append(argument)
                end = argument.end
            
            new_event_arg.append({"arguments": new_arguments, })

In [None]:
new_lang_dict = {}
for lang in languages:
    new_arg = {}
    for sent in events_list_lang[lang].keys():
        new_arguments = []
        events_arg = []
        for event in events_list_lang[lang][sent]:
            events_arg.extend(event.arguments)

        events_arg.sort(key=lambda x: x.start, reverse=False)

        end = 0
        for argument in events_arg:
            if argument.start >= end:
                new_arguments.append(argument)
            end = argument.end
            #print(argument.start, " ", argument.end, " ", argument.role)

        new_arg.update({sent: new_arguments})
    new_lang_dict.update({lang: new_arg})

## II. BIO Annotation for Triggers:

In [47]:
for sent in events_lang_splits["train"]:
    print(sent)
    print(events_lang_splits["train"][sent])
    for event in events_lang_splits["train"][sent]:
        print(event.to_string())
        print("----")

And these
bozos let four armed Cubans land on our shores when they're trying to
make a high terrorist alert
[<__main__.Event object at 0x1a22cddcc0>]
Event: { event_id = CNN_CF_20030303.1900.00-EV1mention_id = CNN_CF_20030303.1900.00-EV1-2, type = Movement, subtype = Transport, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = these
bozos let four armed Cubans land on our shores, scope = And these
bozos let four armed Cubans land on our shores when they're trying to
make a high terrorist alert, trigger = land
----
He
lost an election to a dead man
[<__main__.Event object at 0x1a228d1128>]
Event: { event_id = CNN_CF_20030303.1900.00-EV2mention_id = CNN_CF_20030303.1900.00-EV2-1, type = Personnel, subtype = Elect, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = election to a dead man, scope = He
lost an election to a dead man, trigger = election
----
Paul, as I understand your definition of a political -- of a pr

----
Event: { event_id = CNN_IP_20030408.1600.03-EV17mention_id = CNN_IP_20030408.1600.03-EV17-1, type = Movement, subtype = Transport, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = the president having flown back to the
United States from that meeting in Northern Ireland in Belfast, scope = And, as we listened to John King's report, we can tell you that President
Bush has just -- his plane has just landed at Andrews Air Force Base,
just close to the Washington area, the president having flown back to the
United States from that meeting in Northern Ireland in Belfast with the
British prime minister, Tony Blair, trigger = flown
----
But they also discussed the Middle East and, as well, getting the
Northern Ireland peace process started up again
[<__main__.Event object at 0x1a224cfd30>]
Event: { event_id = CNN_IP_20030408.1600.03-EV18mention_id = CNN_IP_20030408.1600.03-EV18-3, type = Contact, subtype = Meet, modality = Asserted, polarity = Positi

[<__main__.Event object at 0x1a21b7dc18>]
Event: { event_id = CNN_ENG_20030429_110706.7-EV1mention_id = CNN_ENG_20030429_110706.7-EV1-1, type = Movement, subtype = Transport, modality = Other, polarity = Positive, genericity= Specific, tense = Future, extent = ari fleischer expected to come into the west wing briefing room, scope = ari fleischer expected to come into the west wing briefing room to
brief reporters on a wide variety of subjekts, trigger = come
----
about 40 miles or so outside baghdad in
fallujah where there was h a shootout between u.s. troops and iraqies
who had been demonstrating there
[<__main__.Event object at 0x1a23af3b70>, <__main__.Event object at 0x1a23af3a20>]
Event: { event_id = CNN_ENG_20030429_110706.7-EV2mention_id = CNN_ENG_20030429_110706.7-EV2-1, type = Conflict, subtype = Attack, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = fallujah where there was h a shootout between u.s. troops and iraqies
who had been demons

to the, ah, nine-eleven tragedy and terrorism and this kind of stuff, trigger = tragedy
----
In fact, we just got back from Disney World,  you know, about an hour
ago and they're down there and they're but they're both, uh
[<__main__.Event object at 0x1a21ccaf60>]
Event: { event_id = fsh_29521-EV1mention_id = fsh_29521-EV1-1, type = Movement, subtype = Transport, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = we just got back from Disney World,  you know, about an hour
ago, scope = In fact, we just got back from Disney World,  you know, about an hour
ago and they're down there and they're but they're both, uh, trigger = got back
----
and, um,  but then -- Edwards fought back kind of -- you know and then all
the stuff you hear about Edwards too, about the, uh,  how he's a trial
lawyer and how he made all his millions -- suing insurance companies
and, um, you know, there- there's -- bad stuff you hear about both of
them that, you know, obviously th

Event: { event_id = NYT_ENG_20030403.0008-EV24mention_id = NYT_ENG_20030403.0008-EV24-1, type = Conflict, subtype = Attack, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = Bush invaded, scope = But he also said, ``He may destroy them to embarrass us,'' to show
the world he never had such weapons and that Bush invaded for no
reason other than aggression, trigger = invaded
----
But there
would be lots of casualties on all sides
[<__main__.Event object at 0x1a22660c50>]
Event: { event_id = NYT_ENG_20030403.0008-EV25mention_id = NYT_ENG_20030403.0008-EV25-1, type = Life, subtype = Die, modality = Other, polarity = Positive, genericity= Generic, tense = Unspecified, extent = casualties, scope = But there
would be lots of casualties on all sides, trigger = casualties
----
They are likely to mount small probing patrols into the
city to determine the level of Iraqi resistance
[<__main__.Event object at 0x1a22660550>]
Event: { event_id = NYT_ENG_20030403.0

----
Deputy governor of Diyala along with several council members from
Ba'quba were ambushed and killed in Latifiya south of Baghdad, an area
which has supposedly been 'cleared' from insurgents a couple of weeks
ago
[<__main__.Event object at 0x1a22a3e748>, <__main__.Event object at 0x1a23f514a8>]
Event: { event_id = HEALINGIRAQ_20041108.1942.05-EV3mention_id = HEALINGIRAQ_20041108.1942.05-EV3-1, type = Conflict, subtype = Attack, modality = Asserted, polarity = Positive, genericity= Specific, tense = Past, extent = Deputy governor of Diyala along with several council members from
Ba'quba were ambushed and killed in Latifiya, scope = Deputy governor of Diyala along with several council members from
Ba'quba were ambushed and killed in Latifiya south of Baghdad, an area
which has supposedly been 'cleared' from insurgents a couple of weeks
ago, trigger = ambushed
----
Event: { event_id = HEALINGIRAQ_20041108.1942.05-EV4mention_id = HEALINGIRAQ_20041108.1942.05-EV4-1, type = Life, subtype 

### Example:

In [49]:
sent = "She will be the Registration\nManager for the Houston Center, accountable for causing registrations into\nthe Landmark Forum"
for event in events_lang_splits["train"][sent]:
    print(event.scope)
    #print(event.scope_start)
    #print(event.scope_end)

    print("--------")
    print(event.trig_text)
    print(event.trig_start)
    print(event.trig_end)
    print(event.type_)
    print(event.subtype)

    print("-------")
    print(event.scope[event.trig_start:event.trig_end+1])

She will be the Registration
Manager for the Houston Center, accountable for causing registrations into
the Landmark Forum
--------
be
9
10
Personnel
Start-Position
-------
be
She will be the Registration
Manager for the Houston Center, accountable for causing registrations into
the Landmark Forum
--------
Forum
117
121
Contact
Meet
-------
Forum


In [53]:
words_split_dict = {}
for split in events_lang_splits:
    words_split = []
    for sent in events_lang_splits[split]:
        start = 0
        words = []
        for event in events_lang_splits[split][sent]:
            end = event.trig_start

            ### Tokenize that part that doesn't have to do with triggers and annotate each word as 'O'
            words.extend([(word, "O") for word in  word_tokenize(sent[start:end])])

            ### Tokenize trigger part and annotate each word as 'B' or 'I'
            start = event.trig_start
            end = event.trig_end+1
            #print(sent[start:end])
            trigger_tok = word_tokenize(sent[start:end])
            flag = True
            for word in trigger_tok:
                if flag:
                    flag = False
                    words.append((word, "B-" + event.type_ + ":" + event.subtype))
                else:
                    words.append((word, "I-" + event.type_ + ":" + event.subtype))

            start = event.trig_end + 1
        words_split.append(words)
    words_split_dict.update({split: words_split})

In [80]:
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/TriggerIdentification/"
for split in words_split_dict:
    print("Split: ", split)
    random.shuffle(words_split_dict[split])
    total = len(words_split_dict[split])
    print("len(train):", len(words_split_dict[split]))
    
    with open(out_path+ "English/split.txt", "w") as file:
        for sent in words_split_dict[split]:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                

Split:  train
len(train): 2690
Split:  dev
len(train): 269
Split:  test
len(train): 220


In [54]:
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/"
for lang in ["English", "Chinese", "Arabic"]:
    print("Language: ", lang)
    random.shuffle(words_lang_dict[lang])
    total = len(words_lang_dict[lang])
    train_num = math.floor(total*0.88)
    dev_num = math.floor(total*0.07)
    train = words_lang_dict[lang][:train_num]
    dev = words_lang_dict[lang][train_num:train_num+dev_num]
    test = words_lang_dict[lang][train_num+dev_num:]
    
    print("len(train):", len(train))
    print("len(dev):", len(dev))
    print("len(test):", len(test))
    
    with open(out_path+lang +"/train.txt", "w") as file:
        for sent in train:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/dev.txt", "w") as file:
        for sent in dev:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/test.txt", "w") as file:
        for sent in test:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")

Split:  train
len(train): 2367
len(dev): 188
len(test): 135
Split:  dev
len(train): 236
len(dev): 18
len(test): 15
Split:  test
len(train): 193
len(dev): 15
len(test): 12


In [58]:
words_split_dict["train"][1]

[('I', 'O'),
 ('do', 'O'),
 ("n't", 'O'),
 ('care', 'O'),
 ('what', 'O'),
 ('anyone', 'O'),
 ('says', 'O'),
 ('about', 'O'),
 ('``', 'O'),
 ('you', 'O'),
 ('ca', 'O'),
 ("n't", 'O'),
 ('judge', 'O'),
 ('intent', 'O'),
 ('.', 'O'),
 ("''", 'O'),
 ("Darius'", 'O'),
 ('intent', 'O'),
 ('was', 'O'),
 ('to', 'O'),
 ('dismember', 'O'),
 (',', 'O'),
 ('injure', 'B-Life:Injure'),
 ('dismember', 'B-Life:Injure'),
 (',', 'O'),
 ('injure', 'O'),
 ('or', 'O'),
 ('even', 'O'),
 ('kill', 'B-Life:Die')]

In [59]:
print(words)

[('Earlier', 'O'), ('documents', 'O'), ('in', 'O'), ('the', 'O'), ('case', 'O'), ('have', 'O'), ('included', 'O'), ('embarrassing', 'O'), ('details', 'O'), ('about', 'O'), ('perks', 'O'), ('Welch', 'O'), ('received', 'O'), ('as', 'O'), ('part', 'O'), ('of', 'O'), ('his', 'O'), ('retirement', 'B-Personnel:End-Position')]


In [219]:
events_list_lang["English"].keys()



## III. BIO Annotation for Arguments: 

In [None]:
words_split_dict = {}
for split in events_lang_splits:
    words_split = []
    for sent in events_lang_splits[split]:
        start = 0
        words = []
        for event in events_lang_splits[split][sent]:
            end = event.trig_start

            ### Tokenize that part that doesn't have to do with triggers and annotate each word as 'O'
            words.extend([(word, "O") for word in  word_tokenize(sent[start:end])])

            ### Tokenize trigger part and annotate each word as 'B' or 'I'
            start = event.trig_start
            end = event.trig_end+1
            #print(sent[start:end])
            trigger_tok = word_tokenize(sent[start:end])
            flag = True
            for word in trigger_tok:
                if flag:
                    flag = False
                    words.append((word, "B-" + event.type_ + ":" + event.subtype))
                else:
                    words.append((word, "I-" + event.type_ + ":" + event.subtype))

            start = event.trig_end + 1
        words_split.append(words)
    words_split_dict.update({split: words_split})

In [61]:
new_split_dict = {}
for split in events_lang_splits:
    new_arg = {}
    for sent in events_lang_splits[split].keys():
        new_arguments = []
        events_arg = []
        for event in events_lang_splits[split][sent]:
            events_arg.extend(event.arguments)

        events_arg.sort(key=lambda x: x.start, reverse=False)

        end = 0
        for argument in events_arg:
            if argument.start >= end:
                new_arguments.append(argument)
            end = argument.end

        new_arg.update({sent: new_arguments})
    new_split_dict.update({split: new_arg})

In [65]:
for sent in events_lang_splits["train"].keys():
    for event in events_lang_splits["train"][sent]:
        print(event)
        for arg in event.arguments:
            print(arg.start, " ", arg.end)
        print("--------")
    print("==========")

{'event_id': 'CNN_CF_20030303.1900.00-EV1', 'mention_id': 'CNN_CF_20030303.1900.00-EV1-2', 'type_': 'Movement', 'subtype': 'Transport', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'these\nbozos let four armed Cubans land on our shores', 'extent_start': 1831, 'extent_end': 1882, 'scope': "And these\nbozos let four armed Cubans land on our shores when they're trying to\nmake a high terrorist alert", 'scope_start': 1827, 'scope_end': 1933, 'trig_text': 'land', 'trig_start': 38, 'trig_end': 41, 'arguments': [<__main__.Argument object at 0x1a23222080>, <__main__.Argument object at 0x1a22a16748>, <__main__.Argument object at 0x1a226a3b00>], 'entities': [<__main__.Entity object at 0x1a22ba29e8>, <__main__.Entity object at 0x1a22bcf630>, <__main__.Entity object at 0x1a232d5c88>, <__main__.Entity object at 0x1a239388d0>, <__main__.Entity object at 0x1a23938898>, <__main__.Entity object at 0x1a22a57a58>]}
20   36
4   14
46   55
--------
{'

23   54
96   116
67   89
0   20
--------
{'event_id': 'CNN_IP_20030402.1600.00-2-EV21', 'mention_id': 'CNN_IP_20030402.1600.00-2-EV21-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'the U.S. 101st Airborne Division has driven Iraqi\nFedayeen fighters from the key city of Najaf, pushed north in a\nseveral-pronged attack', 'extent_start': 3303, 'extent_end': 3438, 'scope': 'In less than 24 hours, the U.S. 101st Airborne Division has driven Iraqi\nFedayeen fighters from the key city of Najaf, pushed north in a\nseveral-pronged attack', 'scope_start': 3280, 'scope_end': 3438, 'trig_text': 'attack', 'trig_start': 153, 'trig_end': 158, 'arguments': [<__main__.Argument object at 0x1a222c00b8>], 'entities': [<__main__.Entity object at 0x1a222ae390>, <__main__.Entity object at 0x1a222ae828>, <__main__.Entity object at 0x1a222ae080>, <__main__.Entity object at 0x1a222ae780>, <__main__.Entity objec

0   1
--------
{'event_id': 'CNN_IP_20030408.1600.03-EV6', 'mention_id': 'CNN_IP_20030408.1600.03-EV6-2', 'type_': 'Life', 'subtype': 'Die', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'People died', 'extent_start': 1566, 'extent_end': 1576, 'scope': 'People died', 'scope_start': 1566, 'scope_end': 1576, 'trig_text': 'died', 'trig_start': 7, 'trig_end': 10, 'arguments': [<__main__.Argument object at 0x1a2395fcf8>], 'entities': [<__main__.Entity object at 0x1a2395f390>]}
0   5
--------
{'event_id': 'CNN_IP_20030408.1600.03-EV7', 'mention_id': 'CNN_IP_20030408.1600.03-EV7-1', 'type_': 'Life', 'subtype': 'Injure', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'he was\ntortured here, along with his two brothers, who were executed', 'extent_start': 3222, 'extent_end': 3289, 'scope': "Hamid Fatil (ph) may look like he's acting, but he was\ntortured here, along with his two brothers

9   10
101   108
--------
{'event_id': 'CNNHL_ENG_20030624_230338.34-EV4', 'mention_id': 'CNNHL_ENG_20030624_230338.34-EV4-1', 'type_': 'Contact', 'subtype': 'Phone-Write', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': "the baby's mother made a frantic call to\npolice", 'extent_start': 1154, 'extent_end': 1200, 'scope': "then the baby's mother made a frantic call to\npolice", 'scope_start': 1149, 'scope_end': 1200, 'trig_text': 'call', 'trig_start': 38, 'trig_end': 41, 'arguments': [<__main__.Argument object at 0x10eb854a8>, <__main__.Argument object at 0x1a23a355f8>], 'entities': [<__main__.Entity object at 0x1a24057be0>, <__main__.Entity object at 0x1a240579e8>, <__main__.Entity object at 0x1a24057eb8>]}
46   51
5   21
--------
{'event_id': 'CNNHL_ENG_20030624_230338.34-EV5', 'mention_id': 'CNNHL_ENG_20030624_230338.34-EV5-1', 'type_': 'Life', 'subtype': 'Die', 'modality': 'Other', 'polarity': 'Positive', 'genericity': 'Specific'

52   56
87   104
--------
{'event_id': 'CNN_ENG_20030424_113549.11-EV5', 'mention_id': 'CNN_ENG_20030424_113549.11-EV5-1', 'type_': 'Contact', 'subtype': 'Meet', 'modality': 'Asserted', 'polarity': 'Negative', 'genericity': 'Specific', 'tense': 'Unspecified', 'extent': 'neither of the women have ever met\nthe flight attendants', 'extent_start': 1448, 'extent_end': 1503, 'scope': "even though neither of the women have ever met\nthe flight attendants, they say they'll never forget their\nact of kindness", 'scope_start': 1436, 'scope_end': 1556, 'trig_text': 'met', 'trig_start': 43, 'trig_end': 45, 'arguments': [<__main__.Argument object at 0x1a22269898>, <__main__.Argument object at 0x1a222696d8>], 'entities': [<__main__.Entity object at 0x1a22269550>, <__main__.Entity object at 0x1a22269860>, <__main__.Entity object at 0x1a22e9c278>, <__main__.Entity object at 0x1a22e9c780>, <__main__.Entity object at 0x1a22e9c828>]}
12   31
47   67
--------
{'event_id': 'CNN_ENG_20030424_113549.11-EV6'

91   107
0   7
--------
{'event_id': 'CNN_ENG_20030605_065831.18-EV7', 'mention_id': 'CNN_ENG_20030605_065831.18-EV7-1', 'type_': 'Justice', 'subtype': 'Charge-Indict', 'modality': 'Asserted', 'polarity': 'Negative', 'genericity': 'Specific', 'tense': 'Unspecified', 'extent': 'bernie evers is living his life and unindicted', 'extent_start': 916, 'extent_end': 961, 'scope': 'bernie evers is living his life and unindicted', 'scope_start': 916, 'scope_end': 961, 'trig_text': 'unindicted', 'trig_start': 36, 'trig_end': 45, 'arguments': [<__main__.Argument object at 0x1a225f0390>], 'entities': [<__main__.Entity object at 0x1a225f0d30>, <__main__.Entity object at 0x1a225f07b8>]}
0   11
--------
{'event_id': 'CNN_ENG_20030605_065831.18-EV8', 'mention_id': 'CNN_ENG_20030605_065831.18-EV8-1', 'type_': 'Justice', 'subtype': 'Charge-Indict', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Unspecified', 'extent': "charges brought by the justice department and th

179   191
--------
{'event_id': 'fsh_29195-EV4', 'mention_id': 'fsh_29195-EV4-1', 'type_': 'Personnel', 'subtype': 'Start-Position', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Generic', 'tense': 'Unspecified', 'extent': 'appoint new people', 'extent_start': 170, 'extent_end': 187, 'scope': 'President Bush has just been elected to another term\nin office, and he has already started to appoint new people to his\npolitical cabinet as current members like Colin Powell and John\nAshcroft resign', 'scope_start': 76, 'scope_end': 274, 'trig_text': 'appoint', 'trig_start': 94, 'trig_end': 100, 'arguments': [<__main__.Argument object at 0x1a22d12a58>], 'entities': [<__main__.Entity object at 0x1a222bd860>, <__main__.Entity object at 0x1a222bd908>, <__main__.Entity object at 0x1a222bd940>, <__main__.Entity object at 0x1a222bd828>, <__main__.Entity object at 0x1a222bd748>, <__main__.Entity object at 0x1a222bd7b8>, <__main__.Entity object at 0x1a222bd7f0>, <__main__.Entity obje

10   18
--------
{'event_id': 'fsh_29782_2-EV9', 'mention_id': 'fsh_29782_2-EV9-3', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Unspecified', 'extent': 'war', 'extent_start': 3971, 'extent_end': 3973, 'scope': 'uh, spoke out against the war publicly and he got reprimanded for it', 'scope_start': 3945, 'scope_end': 4012, 'trig_text': 'war', 'trig_start': 26, 'trig_end': 28, 'arguments': [], 'entities': [<__main__.Entity object at 0x1a239850b8>]}
--------
{'event_id': 'fsh_29782_2-EV10', 'mention_id': 'fsh_29782_2-EV10-1', 'type_': 'Movement', 'subtype': 'Transport', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Future', 'extent': "my cousin's getting ready to go over there", 'extent_start': 586, 'extent_end': 627, 'scope': "I have -- my cousin's getting ready to go over there and it just, oh,\nit just makes me sick", 'scope_start': 576, 'scope_end': 666, 'trig_text': '

31   41
--------
{'event_id': 'APW_ENG_20030416.0581-EV6', 'mention_id': 'APW_ENG_20030416.0581-EV6-2', 'type_': 'Justice', 'subtype': 'Appeal', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': "the court's military board had refused to launch\nproceedings on the appeal", 'extent_start': 1311, 'extent_end': 1384, 'scope': "The lawyer said the court's military board had refused to launch\nproceedings on the appeal", 'scope_start': 1295, 'scope_end': 1384, 'trig_text': 'appeal', 'trig_start': 84, 'trig_end': 89, 'arguments': [<__main__.Argument object at 0x1a23987048>], 'entities': [<__main__.Entity object at 0x1a23991c18>, <__main__.Entity object at 0x1a23991a90>, <__main__.Entity object at 0x1a23991ba8>, <__main__.Entity object at 0x1a23991c50>]}
16   41
--------
{'event_id': 'APW_ENG_20030416.0581-EV7', 'mention_id': 'APW_ENG_20030416.0581-EV7-1', 'type_': 'Justice', 'subtype': 'Trial-Hearing', 'modality': 'Other', 'polarity': 'Negat

25   41
95   116
3   41
--------
{'event_id': 'APW_ENG_20030520.0757-EV3', 'mention_id': 'APW_ENG_20030520.0757-EV3-1', 'type_': 'Movement', 'subtype': 'Transport', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Generic', 'tense': 'Past', 'extent': 'Canberra sent\ntroops to fight in the war on terror in Afghanistan and alongside\nU.S. and British forces in Iraq', 'extent_start': 751, 'extent_end': 861, 'scope': 'Canberra sent\ntroops to fight in the war on terror in Afghanistan and alongside\nU.S. and British forces in Iraq', 'scope_start': 751, 'scope_end': 861, 'trig_text': 'sent', 'trig_start': 9, 'trig_end': 12, 'arguments': [<__main__.Argument object at 0x1a23d78be0>, <__main__.Argument object at 0x1a234f3588>, <__main__.Argument object at 0x1a234f3c18>, <__main__.Argument object at 0x1a234f3ba8>], 'entities': [<__main__.Entity object at 0x1a234f3128>, <__main__.Entity object at 0x1a234f35c0>, <__main__.Entity object at 0x1a234f3d68>, <__main__.Entity object at 0x1

44   54
--------
{'event_id': 'alt.obituaries_20041121.1339-EV6', 'mention_id': 'alt.obituaries_20041121.1339-EV6-1', 'type_': 'Conflict', 'subtype': 'Attack', 'modality': 'Asserted', 'polarity': 'Positive', 'genericity': 'Specific', 'tense': 'Past', 'extent': 'Lebanese one a generation\nago', 'extent_start': 2784, 'extent_end': 2812, 'scope': 'Are Kissinger Associates trying to start a\n"Palestinian Civil War" like their Lebanese one a generation\nago', 'scope_start': 2706, 'scope_end': 2812, 'trig_text': 'one', 'trig_start': 87, 'trig_end': 89, 'arguments': [<__main__.Argument object at 0x1a219ad208>, <__main__.Argument object at 0x1a21564da0>, <__main__.Argument object at 0x1a23c004a8>], 'entities': [<__main__.Entity object at 0x1a219caf28>, <__main__.Entity object at 0x1a219ca438>, <__main__.Entity object at 0x1a219caeb8>, <__main__.Entity object at 0x1a219caf98>]}
78   85
78   85
91   106
--------
{'event_id': 'alt.obituaries_20041121.1339-EV7', 'mention_id': 'alt.obituaries_20041

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [67]:
new_arg

{"Former senior banker Callum McCarthy begins what is one of the most\nimportant jobs in London's financial world in September, when\nincumbent Howard Davies steps down": [<__main__.Argument at 0x1a22174668>,
  <__main__.Argument at 0x1a23f1dd30>,
  <__main__.Argument at 0x1a23f1df60>],
 'Davies is leaving to become chairman of the London School of\nEconomics, one of the best-known parts of the University of London': [<__main__.Argument at 0x1a23f1da90>,
  <__main__.Argument at 0x1a2286c828>],
 'As well as previously holding senior positions at Barclays Bank, BZW\nand Kleinwort Benson, McCarthy was formerly a top civil servant at\nthe Department of Trade and Industry': [<__main__.Argument at 0x1a2288b470>,
  <__main__.Argument at 0x1a2288bcc0>,
  <__main__.Argument at 0x1a2288b240>,
  <__main__.Argument at 0x1a2286c518>,
  <__main__.Argument at 0x1a2288bbe0>],
 "British Chancellor of the Exchequer Gordon Brown on Tuesday named the\ncurrent head of the country's energy regulator as the 

In [84]:
sent = "Former senior banker Callum McCarthy begins what is one of the most\nimportant jobs in London's financial world in September, when\nincumbent Howard Davies steps down"
print(new_arg[sent][2].start, " ", new_arg[sent][2].end," " ,new_arg[sent][2].role)

114   122   Time-Within


In [78]:
len(events_lang_splits["dev"])

269

In [87]:
arguments_split_dict = {}
for split in events_lang_splits:
    print("Processing split => ", split)
    words_split = []
    for sent in tqdm(events_lang_splits[split]):
        words = []
        for argument in new_split_dict[split][sent]:
            start = 0
            end = argument.start

            ### Tokenize that part that doesn't have to do with triggers and annotate each word as 'O'
            words.extend([(word, "O") for word in  word_tokenize(sent[start:end])])
            #print(sent[start:end])

            ### Tokenize trigger part and annotate each word as 'B' or 'I'
            start = argument.start
            end = argument.end+1
            #print(sent[start:end])
            arg_tok = word_tokenize(sent[start:end])
            flag = True
            for word in arg_tok:
                if flag:
                    flag = False
                    words.append((word, "B-" + arg.role))
                else:
                    words.append((word, "I-" + arg.role))

            start = argument.end + 1
        words_split.append(words)
    arguments_split_dict.update({split: words_split})

 22%|██▏       | 586/2690 [00:00<00:00, 2897.62it/s]

Processing split =>  train


100%|██████████| 2690/2690 [00:01<00:00, 2332.70it/s]
 50%|████▉     | 134/269 [00:00<00:00, 1338.07it/s]

Processing split =>  dev


100%|██████████| 269/269 [00:00<00:00, 1287.25it/s]
100%|██████████| 220/220 [00:00<00:00, 1151.66it/s]

Processing split =>  test





In [254]:
#out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/ArgumentExtraction/"
out_path = "/Users/d22admin/Desktop/vista_cluster/nas/home/meryem/sequence_tagging/data/ACE05/ArgumentExtraction/"
#for lang in ["English", "Chinese", "Arabic"]:
    random.shuffle(arguments_split_dict[lang])
    total = len(words_lang_dict[lang])
    train_num = math.floor(total*0.88)
    dev_num = math.floor(total*0.07)
    train = arguments_split_dict[lang][:train_num]
    dev = arguments_split_dict[lang][train_num:train_num+dev_num]
    test = arguments_split_dict[lang][train_num+dev_num:]
    
    print("len(train):", len(train))
    print("len(dev):", len(dev))
    print("len(test):", len(test))
    
    with open(out_path+lang +"/train.txt", "w") as file:
        for sent in train:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/dev.txt", "w") as file:
        for sent in dev:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                
    with open(out_path+lang +"/test.txt", "w") as file:
        for sent in test:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")

Language:  English
len(train): 2795
len(dev): 222
len(test): 160
Language:  Chinese
len(train): 1699
len(dev): 135
len(test): 97
Language:  Arabic
len(train): 1452
len(dev): 115
len(test): 83


In [83]:
out_path = "/Users/d22admin/USCGDrive/Spring19/ISI/EventExtraction/5Algorithms/EventDetection/sequence_tagging/data/ACE05/ArgumentIdentification/"
for split in arguments_split_dict:
    print("Split: ", split)
    random.shuffle(arguments_split_dict[split])
    total = len(arguments_split_dict[split])
    print("len(train):", len(arguments_split_dict[split]))
    
    with open(out_path+ "English/" + split + ".txt", "w") as file:
        for sent in arguments_split_dict[split]:
            for word, ann in sent:
                file.write(word+ " "+ ann+"\n")
            file.write("\n")
                

Split:  train
len(train): 2690
Split:  dev
len(train): 269
Split:  test
len(train): 220


### Checking the number of Sentences:

def extract_from_xml_check(root_path, language, domain):
    events = {}
    #print(root_path + language + "/" + domain + "/adj/*.apf.xml")
    files_processed = 0
    scopes = {}
    events_num = 0
    if language == "English":
        subpath = "timex2norm"
    else:
        subpath = "adj"
    for file_name in sorted(glob.glob(root_path + language + "/" + domain + "/"+subpath+"/*.apf.xml")):  #in [root_path + language + "/bc/adj/CNN_CF_20030303.1900.00.apf.xml"]:       
        # Get the event + argument annotation
        print("file_name=", file_name)
        files_processed += 1
        tree = ET.parse(file_name, ET.XMLParser(encoding='utf-8'))
        root = tree.getroot()
    
        events_num += len(list(root.iter('event')))
        for event in root.iter('event'):
            for mention in event.iter('event_mention'):
                for child in mention:
                    if child.tag == "ldc_scope":
                        for chil2 in child:
                            scope = chil2.text
                            scope_start = int(chil2.attrib["START"])
                            scope_end = int(chil2.attrib["END"])
                            scopes.update({scope: "scope"})
                            
    print("Number of scopes per domain: ", domain, " is: ", len(scopes))
    print("Number of events: ", events_num)
        
    return scopes, files_processed, events_num

scopes_list_lang = {}

for language in languages:
    print("Processing language: ", language)
    files_num = 0
    events_num_total = 0
    domains = [file_.split("/")[-1] for file_ in glob.glob(root_path + language + "/*" ) if "Icon\r" not in file_]
    scopes_lang = {}
    for domain in domains:
        scopes, files_processed, events_num = extract_from_xml_check(root_path, language, domain)
        files_num += files_processed
        events_num_total += events_num
        
        scopes_lang = merge_two_dicts(scopes_lang, scopes)
    
    print("Number of files processed for language= ", language, " is= ", files_num)
    print("Number of events for language= ", events_num_total)
    
    scopes_list_lang.update({language: scopes_lang})