<a href="https://colab.research.google.com/github/lzanellac/events_into_graphs/blob/main/Graphs_of_events_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [450]:
import os
import networkx as nx
import holoviews as hv
import glob
import nltk
import spacy
import ast
import numpy as np
import matplotlib.pyplot as plt
nltk.download('punkt')
#!python -m spacy download en_core_web_md

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Reading files from directory .txt and .ann
def files_(directory):
  """Return the distinct base names of the entries in ``directory``.

  Every entry name is cut at its first dot, so ``PMID-1.txt`` and
  ``PMID-1.ann`` both contribute ``PMID-1``. The resulting list is printed
  before it is returned.

  Args:
    directory: Path of the directory to list.

  Returns:
    A list of unique base names, in the order ``os.listdir`` first yields them.
  """
  # dict.fromkeys drops duplicates while keeping first-seen order; doing it once
  # over the whole listing avoids rebuilding the list for every directory entry.
  files = list(dict.fromkeys(name.split(".")[0] for name in os.listdir(directory)))
  print(files)
  return files

In [3]:
# Reformatting the files
def reformat_events(lines):
  """Convert the event lines of an annotation listing into parallel lists.

  Each line is expected to be tab-separated with its id first. Lines starting
  with ``E`` are events of the form ``E1<TAB>Type:T4 Role:T1 ...``; the lines
  they reference carry ``<id><TAB><type> ...<TAB><text>``.

  For every argument of an event the role and the referenced id are recorded.
  The type and text are taken from the referenced line; when the reference is
  itself an event, the type and text of that event's trigger (its first
  argument) are used instead, while the recorded id stays the event id.

  Args:
    lines: Annotation lines, without trailing newlines.

  Returns:
    A tuple ``(listevents, num_nod)``. ``listevents`` holds one
    ``(ids, types, entities, roles)`` tuple of lists per event, in file order.
    ``num_nod`` holds, per event, the number of resolved types.

  Note:
    An argument whose id cannot be resolved still contributes to ``ids`` and
    ``roles`` but not to ``types``/``entities``, as in the previous behaviour.
  """
  # Index annotations by their exact id (text before the first tab). Matching on
  # the exact id prevents "E1" from also matching "E10", or "T4" matching "T40".
  by_id = {}
  for line in lines:
    if '\t' in line:
      by_id.setdefault(line.split('\t')[0], line)

  listevents, num_nod = [], []
  for event_line in lines:
    if not event_line.startswith('E'):
      continue
    ids, types, entities, roles = [], [], [], []
    for arg in event_line.split('\t')[1].split():
      parts = arg.split(':')
      role, ref = parts[0], parts[1]
      roles.append(role)
      ids.append(ref)

      target = by_id.get(ref)
      if target is not None and target.startswith('E'):
        # Nested event: describe it through its trigger, i.e. the id of the
        # first "Type:Id" pair on the referenced event line.
        trigger_id = target.split('\t')[1].split(' ')[0].split(':')[1]
        target = by_id.get(trigger_id)
      if target is None:
        continue

      fields = target.split('\t')
      types.append(fields[1].split(' ')[0])
      entities.append(fields[2])
    num_nod.append(len(types))
    listevents.append((ids, types, entities, roles))

  return listevents, num_nod


In [83]:
#Add multiple nodes with node level attributes and edges with edge level attributes
def graph(listevents, num_nod, PMID):
  """Build one directed graph holding every event of an abstract.

  Each event argument becomes an integer-keyed node carrying ``ent_id``,
  ``type`` and ``entity`` attributes. The first argument of an event is its
  trigger; an edge labelled with the argument's role goes from the trigger
  node to each of the event's other nodes.

  Args:
    listevents: Sequence of ``(ids, types, entities, roles)`` records.
    num_nod: Per-event node counts; their sum caps the node counter.
    PMID: Not used by this function.

  Returns:
    The populated ``networkx.DiGraph``.
  """
  G = nx.DiGraph()
  G.graph["Content"] = "Events"

  node_cap = sum(num_nod)
  next_node = 0
  trigger_nodes = []
  for event_index, record in enumerate(listevents):
    ent_ids = record[0]
    for position, ent_id in enumerate(ent_ids):
      G.add_node(next_node, ent_id=ent_id, type=record[1][position], entity=record[2][position])
      if position == 0:
        trigger_nodes.append(next_node)
      if next_node < node_cap:
        next_node += 1

    # The nodes of this event were inserted right after its trigger, so they
    # sit at consecutive positions following the trigger in insertion order.
    ordered_nodes = list(G.nodes)
    trigger = trigger_nodes[event_index]
    for offset in range(1, len(ent_ids)):
      G.add_edge(trigger, ordered_nodes[trigger + offset], role=record[3][offset])

  return G

In [445]:
# Visualization
def vis_graph(G, pmid):
  """Render ``G`` as an interactive holoviews graph with entity labels.

  Args:
    G: networkx graph whose nodes carry ``type`` and ``entity`` attributes.
    pmid: Text used as the plot title.

  Returns:
    The overlay of the styled graph and its node labels.
  """
  hv.extension('bokeh')
  styling = dict(
    tools=['hover'], directed=True, arrowhead_length=0.04, node_color='type',
    cmap=['lightsteelblue','steelblue'], node_size=40, edge_hover_line_color='black',
    node_hover_fill_color='lightseagreen', edge_color='lightgray', width=700, height=600,
    title=pmid,
  )
  drawn = hv.Graph.from_networkx(G, nx.layout.fruchterman_reingold_layout).opts(**styling)
  # Labels are placed at the node coordinates computed by the layout above.
  captions = hv.Labels(drawn.nodes, ['x', 'y'], 'entity')
  captions = captions.opts(text_font_size='10pt', text_color='black')
  return drawn * captions

In [6]:
# Tokenization
def token(PMID):
  """Split the text stored in ``PMID + '.txt'`` into sentences.

  Args:
    PMID: Path of the abstract without its extension.

  Returns:
    The list of sentences produced by ``nltk.tokenize.sent_tokenize``.
  """
  # A context manager guarantees the handle is closed once the text is read.
  with open(PMID + '.txt', "r") as text_file:
    text = text_file.read()
  return nltk.tokenize.sent_tokenize(text)


In [10]:
# Look for the entities and their ids that are contained in each sentence
def nodes_sentence(PMID):
  """Group the ``T`` annotations of an abstract by the sentence containing them.

  The text in ``PMID + '.txt'`` is parsed with the spaCy ``en_core_web_sm``
  model. Every line of ``PMID + '.ann'`` that starts with ``T`` is mapped to a
  span through its two character offsets and filed under that span's sentence.

  Args:
    PMID: Path of the abstract without its extension.

  Returns:
    A tuple ``(entities_per_sentence, ids_per_sentence)``.
    ``entities_per_sentence`` is a list of ``[sentence, (id, span), ...]``
    lists; ``ids_per_sentence`` holds the matching lists of ids only.

  Note:
    The model is loaded on every call. An annotation whose second field does
    not hold exactly two offsets after the type raises ``ValueError`` when
    unpacked.
  """
  nlp = spacy.load('en_core_web_sm')

  # Context managers close both files as soon as their content has been read.
  with open(PMID + '.txt', "r") as text_file:
    doc = nlp(text_file.read())

  with open(PMID + '.ann') as ann_file:
    lines = [line.rstrip() for line in ann_file]

  # sentence span -> [sentence, (id, entity span), ...], in first-seen order
  grouped = {}
  for line in lines:
    if not line.startswith('T'):
      continue
    fields = line.split('\t')
    offset_i, offset_o = fields[1].split(' ')[1:]
    word = doc.char_span(int(offset_i), int(offset_o))
    if word is None:
      # No span could be built for these offsets; the annotation is left out.
      continue
    sent = word.sent
    grouped.setdefault(sent, [sent]).append((fields[0], word))

  entities_per_sentence = list(grouped.values())
  # Drop the leading sentence of each group and keep only the ids.
  ids_per_sentence = [[pair[0] for pair in group[1:]] for group in entities_per_sentence]

  return entities_per_sentence, ids_per_sentence

In [8]:
# Merge lists that contain the same first element (authorship: Stack Overflow)
def merge_subs(lst_of_lsts):
    """Merge rows that share the same first element.

    The first row seen with a given head is kept; the tail (everything after
    the head) of each later row with an equal head is appended to it with
    ``+=``. Rows keep the order in which their head first appeared.

    Args:
        lst_of_lsts: Sequence of rows (lists or tuples) with at least one item.

    Returns:
        A list with one merged row per distinct head.
    """
    merged = []
    for row in lst_of_lsts:
        # Position of the already-kept row with the same head, if there is one.
        match = next(
            (pos for pos, kept in enumerate(merged) if row[0] == kept[0]),
            None,
        )
        if match is None:
            merged.append(row)
        else:
            merged[match] += row[1:]
    return merged

In [11]:
# Look for the lines in .ann that correspond to the entities in the sentence (given by ids_per_sentence)
def nodes_per_sentence(ids_per_sentence, lines):
  """Collect, per sentence, the ``T`` annotation lines of its entity ids.

  Args:
    ids_per_sentence: List of lists of entity ids, one inner list per sentence.
    lines: All annotation lines of the abstract.

  Returns:
    A list parallel to ``ids_per_sentence``; each item lists the lines that
    start with ``T`` and whose id (the text before the first tab) is one of
    that sentence's ids, ordered by id and then by position in ``lines``.
  """
  text_bound = [line for line in lines if line.startswith('T')]
  # The trailing tab makes the match exact, so "T1" does not pick up "T10".
  return [
    [ann for ent_id in sentence_ids for ann in text_bound if ann.startswith(ent_id + '\t')]
    for sentence_ids in ids_per_sentence
  ]

In [13]:
# Finds all triggers contained among the entities of each sentence (if the sentence doesn't contain triggers, it is left out)
def trig_sentence(line_per_sentence, trigger_types):
  """Keep the sentences that contain a trigger and list their trigger ids.

  Args:
    line_per_sentence: List of lists of annotation lines, one list per sentence.
    trigger_types: Collection of type names that count as triggers.

  Returns:
    The result of ``merge_subs`` over ``(sentence_lines, trigger_id)`` pairs,
    i.e. one ``(sentence_lines, trigger_id, ...)`` row per sentence that has
    at least one line whose type is in ``trigger_types``.
  """
  # One pair per trigger line; merge_subs then folds pairs of the same sentence.
  pairs = [
    (sentence_lines, ann.split('\t')[0])
    for sentence_lines in line_per_sentence
    for ann in sentence_lines
    if ann.split('\t')[1].split(' ')[0] in trigger_types
  ]
  return merge_subs(pairs)

In [23]:
# Read the file with the dictionary containing the pre-defined trigger types of the model
def list_triggers(triggers_file):
  """Read the pre-defined trigger types from a Python-literal dictionary file.

  Args:
    triggers_file: Path of a file whose content is a dict literal with a
      ``'trigger_types'`` key.

  Returns:
    The value stored under ``'trigger_types'``.

  Raises:
    KeyError: If the dictionary has no ``'trigger_types'`` entry.
  """
  # The context manager closes the file even when reading fails.
  with open(triggers_file, 'r') as handle:
    content = handle.read()
  # literal_eval only evaluates Python literals, never arbitrary expressions.
  trigger_dict = ast.literal_eval(content)
  return trigger_dict['trigger_types']

In [195]:
# Finds events corresponding to each sentence
def event_per_sentence(triggers_per_sentence, list_events):
  """Match events to the sentences that contain their trigger.

  An event belongs to a sentence when its first id (``event[0][0]``) is among
  the sentence's trigger ids (every element of the sentence row after the
  first).

  Args:
    triggers_per_sentence: Nested sequence indexed as ``[abstract][sentence]``,
      each sentence being a row ``(lines, trigger_id, ...)``.
    list_events: Sequence of event records whose first item is the list of ids.

  Returns:
    A tuple ``(evs_per_abstract, evs_per_sentence)``. ``evs_per_sentence`` is a
    list of ``(trigger_ids, matched_event_id_lists)`` pairs, one per sentence
    with at least one matching event; ``evs_per_abstract`` holds the same pairs,
    each wrapped in its own single-element list.
  """
  evs_per_abstract, evs_per_sentence = [], []
  for abstract in triggers_per_sentence:
    for sentence in abstract:
      sentence_triggers = list(sentence[1:])
      matched = [event[0] for event in list_events if event[0][0] in sentence_triggers]
      if matched:
        pair = (sentence_triggers, matched)
        # Accumulate instead of resetting, so the results survive the loop; the
        # former code appended to an undefined name and raised NameError here.
        evs_per_sentence.append(pair)
        evs_per_abstract.append([pair])
  return evs_per_abstract, evs_per_sentence


In [63]:
# Reads triggers_per_sentence ([entities], triggers) and reformats it for the creation of graphs
def reformat_sent(triggers_per_sentence):
  """Split the annotation lines of each sentence into id/type/text columns.

  Args:
    triggers_per_sentence: Sequence of rows whose first item is the list of
      tab-separated annotation lines of one sentence.

  Returns:
    A tuple ``(format_sent, num_nod_sent)``. ``format_sent`` has one
    ``(ids, types, texts)`` tuple of lists per sentence; ``num_nod_sent`` has
    the number of lines of each sentence.
  """
  format_sent, num_nod_sent = [], []
  for row in triggers_per_sentence:
    columns = [ann.split('\t') for ann in row[0]]
    ent_ids = [fields[0] for fields in columns]
    # The second column is "<type> <offsets...>"; only the type is kept.
    ent_types = [fields[1].split(' ')[0] for fields in columns]
    ent_texts = [fields[2] for fields in columns]
    num_nod_sent.append(len(ent_ids))
    format_sent.append((ent_ids, ent_types, ent_texts))
  return format_sent, num_nod_sent

In [418]:
#Add multiple nodes with node level attributes and edges with edge level attributes
def graph_sent(format_sent, num_nod_sent):
  """Create one edge-less directed graph per sentence.

  Nodes are numbered from 0 within each graph and carry the ``ent_id``,
  ``type`` and ``entity`` of the corresponding annotation.

  Args:
    format_sent: Sequence of ``(ids, types, texts)`` records, one per sentence.
    num_nod_sent: Per-sentence node counts; their sum caps the node counter.

  Returns:
    A list of ``networkx.DiGraph`` objects, parallel to ``format_sent``.
  """
  # Computed once; the former code re-summed the list for every node.
  node_cap = sum(num_nod_sent)
  graphs = []
  for record in format_sent:
    G = nx.DiGraph()
    G.graph["Content"] = "Events"
    # A single pass over the ids is enough: the former extra loop over the
    # record's fields only re-inserted the very same nodes again.
    if record:
      node = 0
      for position, ent_id in enumerate(record[0]):
        G.add_nodes_from([(node, {'ent_id' : ent_id, 'type' : record[1][position], 'entity' : record[2][position]})])
        if node < node_cap:
          node += 1
    graphs.append(G)
  return graphs

In [525]:
def edges_sentence(graphs, list_events):
  """Add event edges to the sentence graphs, in place.

  For each graph the ``ent_id`` of nodes ``0 .. len(graph) - 1`` is read. Every
  event whose ids are all among them gets one edge from the node of its first
  id to the node of each further id, labelled with that argument's role. Each
  graph is printed once it has been processed.

  Args:
    graphs: List of ``networkx.DiGraph`` objects with integer node keys and an
      ``ent_id`` node attribute.
    list_events: Sequence of ``(ids, types, entities, roles)`` records.

  Returns:
    The same ``graphs`` list, with the edges added.
  """
  for sentence_graph in graphs:
    ent_ids = [sentence_graph.nodes[j]['ent_id'] for j in range(len(sentence_graph))]
    for record in list_events:
      event_ids = record[0]
      # Only events fully contained in this sentence are drawn.
      if not all(ent in ent_ids for ent in event_ids):
        continue
      # Node keys of the event's arguments, in argument order.
      positions = [
        key
        for arg in event_ids
        for key, ent in enumerate(ent_ids)
        if arg == ent
      ]
      for offset in range(1, len(positions)):
        sentence_graph.add_edge(positions[0], positions[offset], role=record[3][offset])
    print(sentence_graph)
  return graphs


In [557]:
 def static_vis(graphs, PMID):
   """Draw each graph on its own 12x12 matplotlib figure.

   Nodes are labelled with their ``entity`` attribute and edges with their
   ``role`` attribute; ``PMID`` is used as the title of every figure.
   """
   for sentence_graph in graphs:
     role_by_edge = nx.get_edge_attributes(sentence_graph, "role")
     text_by_node = nx.get_node_attributes(sentence_graph, "entity")
     layout = nx.spring_layout(sentence_graph, k = 0.2)
     fig, ax = plt.subplots(figsize=(12,12))
     ax.set_title(PMID)
     ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True)
     nx.draw(
       sentence_graph, layout, labels=text_by_node, node_size=1500,
       node_color="steelblue", edgecolors='black', arrows = True, ax = ax,
     )
     nx.draw_networkx_edges(sentence_graph, pos=layout)
     nx.draw_networkx_edge_labels(sentence_graph, layout, role_by_edge)
     plt.axis('on') # turns on axis
     ax.tick_params(left=True, bottom=True, labelleft=True, labelbottom=True)

    

In [547]:
# Single-abstract run: build the sentence graphs of one PMID step by step.
PMID = 'pmid/PMID-26986801'
triggers_file = "cg.types"
trigger_types = list_triggers(triggers_file)

with open(PMID + '.ann') as file:
      lines = file.readlines()
      lines = [line.rstrip() for line in lines] # lines returns a list of the lines of each abstract
    
entities_per_sentence, ids_per_sentence = nodes_sentence(PMID) # get the entities and ids that correspond to each sentence
lines_per_sentence = nodes_per_sentence(ids_per_sentence, lines) # get the lines of the .ann that correspond to each sentence of the abstract
triggers_per_sentence = trig_sentence(lines_per_sentence, trigger_types)
list_events, num_nod = reformat_events(lines)
format_sent, num_nod_sent = reformat_sent(triggers_per_sentence)
# Build the node-only graphs once and add the event edges to those same graphs,
# instead of constructing them a second time just to pass them on.
graphs = graph_sent(format_sent, num_nod_sent)
graphs = edges_sentence(graphs, list_events)
print(graphs[0].nodes(data=True))

DiGraph with 3 nodes and 2 edges
DiGraph with 4 nodes and 0 edges
DiGraph with 7 nodes and 2 edges
DiGraph with 3 nodes and 0 edges
DiGraph with 7 nodes and 3 edges
[(0, {'ent_id': 'T1', 'type': 'Simple_chemical', 'entity': 'Gallate'}), (1, {'ent_id': 'T2', 'type': 'Simple_chemical', 'entity': 'Gallate Derivatives'}), (2, {'ent_id': 'T32', 'type': 'Planned_process', 'entity': 'Encapsulation'})]


In [None]:
# Rebuild the sentence graphs with their event edges and visualise them.
# Relies on format_sent, num_nod_sent, list_events and PMID from an earlier cell.
#graphs = graph_sent(format_sent, num_nod_sent)
graphs = edges_sentence(graph_sent(format_sent, num_nod_sent), list_events)
print(graphs[0])
# NOTE(review): the object returned by vis_graph is neither assigned nor the
# last expression of the cell, so presumably it is not displayed -- confirm.
vis_graph(graphs[0], 'PMID-26986801')
# One static matplotlib figure per sentence graph.
static_vis(graphs, PMID)



In [549]:
def main():
  """Build and draw the sentence graphs of every annotated abstract.

  Reads the module-level ``path`` (directory holding the ``.txt``/``.ann``
  pairs) and ``triggers_file`` (file with the pre-defined trigger types).

  Returns:
    The list of sentence graphs of the last abstract processed, or an empty
    list when no abstract was found.
  """
  files = files_(path)
  graphs = []
  trigger_types = list_triggers(triggers_file) # get the list of triggers from the cg.types file

  # Keep only the base names that really have an .ann file. Indexing ``files``
  # by the number of .ann files picks wrong entries as soon as the directory
  # holds anything other than matching .txt/.ann pairs.
  annotated = [stem for stem in files if os.path.isfile(os.path.join(path, stem + '.ann'))]

  for pmid in annotated:
    PMID = os.path.join(path, pmid)
    print(PMID)

    with open(PMID + '.ann') as file:
      lines = [line.rstrip() for line in file] # lines of the abstract's annotation file

    _, ids_per_sentence = nodes_sentence(PMID) # ids of the entities found in each sentence
    lines_per_sentence = nodes_per_sentence(ids_per_sentence, lines) # .ann lines of each sentence
    triggers_per_sentence = trig_sentence(lines_per_sentence, trigger_types) # ([lines], trigger ids...) per sentence with triggers
    # Changing the format of files to be used for the graphs
    list_events, num_nod = reformat_events(lines) # list of events per abstract
    # event_per_sentence(triggers_per_sentence, list_events) is not part of this pipeline.
    # Changing the format of lines per sentence to be used for the graphs
    format_sent, num_nod_sent = reformat_sent(triggers_per_sentence)
    print(format_sent)
    print(len(format_sent))
    print(num_nod_sent)
    # Construction of graphs: nodes first, then the event edges on the same graphs
    graphs = graph_sent(format_sent, num_nod_sent)
    graphs = edges_sentence(graphs, list_events)
    print(graphs)
    for sentence_graph in graphs:
      # The returned overlay is not kept; the call is retained because
      # vis_graph also initialises the holoviews bokeh extension.
      vis_graph(sentence_graph, pmid)
    static_vis(graphs, pmid)

  return graphs

In [None]:
# Entry point: defines the globals that main() reads, then runs the pipeline.
if __name__ == "__main__":
  #path = '/home/lzanella/orpailleur/lzanella/biomolecules-project/Information_extraction/Event_extraction/Graphs/abstracts'
  path = "pmid" # path to the .ann and .txt files
  triggers_file = "cg.types" # path to the file with the triggers types
  # main() returns the graphs of the last abstract it processed
  graphs = main()