In [1]:
#Set the two environment variables below before running this Jupyter notebook under Python 2.7.
#DATAPATH - set to TEES/data folder
#TEES_SETTINGS - set to TEES/settings/.tees_local_settings.py

import sys, os, types
sys.path.append(os.path.dirname(os.curdir))
import TEES.Core.SentenceGraph

import pickle

In [2]:
# DATAPATH must point at the TEES/data folder itself; the corpus XML files are
# read from its 'corpora' subdirectory.
DATAPATH = os.getenv('DATAPATH')
if not DATAPATH:
    # os.getenv returns None for an unset variable, which would otherwise
    # surface as an opaque TypeError on the string concatenation below.
    raise RuntimeError(
        "Environment variable DATAPATH is not set; "
        "point it at the TEES/data folder before running this notebook."
    )
CORPUS_DIR = DATAPATH + '/corpora'

In [3]:
# Paths of the GE11 train / devel / test corpus files inside CORPUS_DIR.
folder = CORPUS_DIR
prefix = '/GE11'
trfile, devfile, testfile = (
    folder + prefix + suffix
    for suffix in ("-train.xml", "-devel.xml", "-test.xml")
)

In [4]:
def getSentences(input, parse, tokenization, removeNameInfo=False):
    """Return the sentences of a corpus as ``[sentenceGraph, None]`` pairs.

    Parameters:
        input: either a list (returned unchanged, assumed to already be a
            list of sentences) or anything else, which is handed to
            ``TEES.Core.SentenceGraph.loadCorpus`` as the corpus to load.
        parse, tokenization: forwarded unchanged to ``loadCorpus``.
        removeNameInfo: forwarded to ``loadCorpus``; must be False when
            ``input`` is already a list.

    Returns:
        A list of two-element lists ``[sentenceGraph, None]``, one for every
        loaded sentence whose ``sentenceGraph`` is not None, or ``input``
        itself when it is a list.

    Raises:
        AssertionError: if ``input`` is a list and ``removeNameInfo`` is set.
    """
    # isinstance also accepts list subclasses and, unlike types.ListType,
    # exists on both Python 2 and Python 3.
    if isinstance(input, list):
        # Assume input is already a list of sentences.
        assert removeNameInfo == False
        return input

    # Load corpus and make sentence graphs.
    corpusElements = TEES.Core.SentenceGraph.loadCorpus(
        input, parse, tokenization, removeNameInfo=removeNameInfo)
    # Sentences without a graph are skipped (a graph is required for event detection).
    return [
        [sentence.sentenceGraph, None]
        for sentence in corpusElements.sentences
        if sentence.sentenceGraph is not None
    ]

In [5]:
def getIpData(aSent):
    """Build one feature dictionary per token of a sentence.

    ``aSent`` must expose ``entities``, ``tokens`` and ``dependencies``
    iterables whose items support ``.get(name)``.

    Each returned dictionary holds the token's own text / POS / id and relative
    position (``sentLoc`` = index / token count), the text and POS of its
    neighbours in token order (``ltoken``, ``rtoken``, ``lcpos``, ``llcpos``,
    ``rcpos``, ``rrcpos``), an ``event`` label, and features taken from the
    dependency list (``ldep``, ``rdep``, ``ldtoken``, ``rdtoken``, ``ldpos``,
    ``rdpos``, ``lldpos``, ``rrdpos``, ``lldep``, ``rrdep``, ``sentLocldep``,
    ``sentLocrdep``). Features with no value keep the string ``'None'`` (or
    ``-0.1`` for the two dependency positions).

    Returns the list of dictionaries in token order.
    """
    def _blank_features():
        # A fresh dictionary per token, pre-filled with the "missing" defaults.
        return {'token':'', 'ltoken':'None', 'sentLoc':'', 'sentLocldep':-0.1, 'sentLocrdep':-0.1, 'rtoken':'None', 'ldtoken':'None', 'rdtoken':'None', 'event':'None', 'pos':'', 'ldpos':'None', 'lldpos':'None', 'rdpos':'None', 'rrdpos':'None', 'lcpos':'None', 'llcpos':'None', 'rcpos':'None', 'rrcpos':'None', 'ldep':'None', 'lldep':'None', 'rdep':'None', 'rrdep':'None', 'id':''}

    # Entity text -> entity type, for entities whose 'event' attribute is truthy.
    # A later entity with the same text overwrites an earlier one.
    trigger_types = {}
    for entity in aSent.entities:
        if entity.get('event'):
            trigger_types[entity.get('text')] = entity.get('type')

    # Pass 1: per-token features plus neighbours in token order.
    features = []
    tokens_by_id = {}
    position_by_id = {}
    n_tokens = len(aSent.tokens)
    last = None         # feature dict of the previous token
    second_last = None  # feature dict of the token before that
    for index, token in enumerate(aSent.tokens):
        row = _blank_features()
        tok_id = token.get('id')
        text = token.get('text')
        tokens_by_id[tok_id] = token
        row['token'] = text
        row['pos'] = token.get('POS')
        row['id'] = tok_id
        row['sentLoc'] = float(index) / float(n_tokens)
        position_by_id[tok_id] = row['sentLoc']
        # A token is labelled as an event when its text equals an event entity's text.
        if text in trigger_types:
            row['event'] = trigger_types[text]
        if last is not None:
            last['rcpos'] = row['pos']
            row['lcpos'] = last['pos']
            row['ltoken'] = last['token']
            last['rtoken'] = row['token']
        if second_last is not None:
            second_last['rrcpos'] = row['pos']
            row['llcpos'] = second_last['pos']
        features.append(row)
        second_last = last
        last = row

    # Pass 2: for every token id, remember the most recent dependency in which
    # it was 't1' (stored as rid / rdep) and in which it was 't2' (lid / ldep).
    links = {}
    for dep in aSent.dependencies:
        first_id = dep.get('t1')
        second_id = dep.get('t2')
        label = dep.get('type')

        outgoing = links.setdefault(first_id, {'lid':'', 'rid':'', 'ldep':'None', 'rdep':'None'})
        outgoing['rid'] = second_id
        outgoing['rdep'] = label

        incoming = links.setdefault(second_id, {'lid':'', 'rid':'', 'ldep':'None', 'rdep':'None'})
        incoming['lid'] = first_id
        incoming['ldep'] = label

    # Pass 3: copy dependency-neighbour features (one and two hops) onto each token.
    for row in features:
        link = links.get(row['id'])
        if link is None:
            # Token takes part in no dependency; keep the defaults.
            continue
        left_id = link['lid']
        right_id = link['rid']
        if left_id:
            left_token = tokens_by_id[left_id]
            row['ldpos'] = left_token.get('POS')
            row['ldtoken'] = left_token.get('text')
            row['sentLocldep'] = position_by_id[left_id]
            left_link = links[left_id]
            far_left_id = left_link['lid']
            if far_left_id:
                row['lldpos'] = tokens_by_id[far_left_id].get('POS')
            row['lldep'] = left_link['ldep']
        if right_id:
            right_token = tokens_by_id[right_id]
            row['rdpos'] = right_token.get('POS')
            row['rdtoken'] = right_token.get('text')
            row['sentLocrdep'] = position_by_id[right_id]
            right_link = links[right_id]
            far_right_id = right_link['rid']
            if far_right_id:
                row['rrdpos'] = tokens_by_id[far_right_id].get('POS')
            row['rrdep'] = right_link['rdep']
        row['ldep'] = link['ldep']
        row['rdep'] = link['rdep']

    return features

In [6]:
def writeToFile(aData, aFile):
    """Pickle ``aData`` to the path ``aFile`` using pickle protocol 2.

    The parent directory of ``aFile`` is created when it does not exist yet,
    so a missing output folder no longer aborts the write. An existing file
    at ``aFile`` is overwritten.
    """
    parent = os.path.dirname(aFile)
    # dirname is '' for a bare file name, which needs no directory.
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)
    with open(aFile, 'wb') as handle:
        pickle.dump(aData, handle, protocol=2)

In [7]:
def ConvertDataToPickle(aIpFile, aOpFile):
    """Convert the corpus ``aIpFile`` into per-token features pickled at ``aOpFile``.

    Sentences are obtained from ``getSentences`` with the 'McCC' parse and no
    explicit tokenization; each sentence graph is turned into a feature list by
    ``getIpData``, and the list of those lists is written with ``writeToFile``.
    """
    graph_pairs = getSentences(aIpFile, 'McCC', None)
    # Each pair is [sentenceGraph, None]; only the graph is needed here.
    per_sentence = [getIpData(pair[0]) for pair in graph_pairs]
    writeToFile(per_sentence, aOpFile)

In [8]:
# Convert the train, devel and test corpora, in that order.
for corpus_xml, pickle_path in (
    (trfile, 'processed_data/GE11_tr.pickle'),
    (devfile, 'processed_data/GE11_dev.pickle'),
    (testfile, 'processed_data/GE11_test.pickle'),
):
    ConvertDataToPickle(corpus_xml, pickle_path)

Loading corpus file /Volumes/Study/course_work/ljmu_ml_ai/thesis_work/Code/bio_event_detection/TEES/data/corpora/GE11-train.xml
908 documents, 8679 sentences
Making sentence graphs (GE11.d1166.s9): 100.00 % (0:0:15.890)                
Skipped 2095 duplicate interaction edges in SentenceGraphs
Loading corpus file /Volumes/Study/course_work/ljmu_ml_ai/thesis_work/Code/bio_event_detection/TEES/data/corpora/GE11-devel.xml
259 documents, 2902 sentences
Making sentence graphs (GE11.d258.s5): 100.00 % (0:0:4.786)           
Skipped 754 duplicate interaction edges in SentenceGraphs
Loading corpus file /Volumes/Study/course_work/ljmu_ml_ai/thesis_work/Code/bio_event_detection/TEES/data/corpora/GE11-test.xml
347 documents, 3377 sentences
Making sentence graphs (GE11.d1513.s8): 100.00 % (0:0:3.723)         
Skipped 0 duplicate interaction edges in SentenceGraphs
