In [1]:
import math
from tqdm import tqdm

In [2]:
from collections import Counter

In [3]:
def pmi(pair, c_coref_event_count, c_all_events, event_counter, len_coref_event):
  """Base-2 pointwise mutual information of an ordered event pair.

  Args:
    pair: two-element sequence; pair[0] and pair[1] are looked up in
      c_all_events, and pair itself is looked up in c_coref_event_count.
    c_coref_event_count: mapping indexed by pair, giving the pair's count.
    c_all_events: mapping indexed by a single event, giving its count.
    event_counter: total that the single-event counts are divided by.
    len_coref_event: total that the pair count is divided by.

  Returns:
    log2(p_pair / (p1 * p2)), or 0 when the value is undefined: either total
    is zero, either event has zero probability, or the ratio is not positive.
  """
  # Empty statistics (e.g. empty input files) would otherwise raise
  # ZeroDivisionError; treat them like every other degenerate case.
  if not event_counter or not len_coref_event:
    return 0

  p1 = c_all_events[pair[0]] / event_counter
  p2 = c_all_events[pair[1]] / event_counter
  p_pair = c_coref_event_count[pair] / len_coref_event

  independent = p1 * p2
  if independent == 0:
    return 0

  ratio = p_pair / independent
  if ratio <= 0:
    # A pair that was never observed has no defined logarithm; score it as 0.
    return 0

  # math.log2 is more accurate than math.log(x, 2).
  return math.log2(ratio)

In [4]:
def sim(pair, argument, lmbd=1):
  """Similarity of an event pair with respect to one argument.

  Starts from the PMI of the pair (computed from the notebook-level count
  tables) and, when the (pair[0], pair[1], argument) triple has a positive
  count in c_coref_role_count, adds lmbd * log2 of that count.
  """
  score = pmi(pair, c_coref_event_count, c_all_events, event_counter, len_coref_event)
  role_freq = c_coref_role_count[(pair[0], pair[1], argument)]
  if role_freq > 0:
    score = score + (lmbd * math.log(role_freq, 2))
  return score

In [5]:
def chain_score(chain, argument, lmbd=1):
  """Score how well `argument` fits a chain of events.

  Args:
    chain: sequence of events.
    argument: the argument the chain is scored against.
    lmbd: weight passed through to `sim` and applied to the single-event term.

  Returns:
    * empty chain: 0;
    * one event: lmbd * log2(freq), where freq is the summed count of all
      c_coref_role_count keys containing both `argument` and the event
      (0 when there is no such key);
    * two or more events: the sum of `sim` over every pair of chain
      positions (i, j) with i < j.
  """
  if not chain:
    # Nothing to score; also avoids an IndexError on chain[0] below.
    return 0

  if len(chain) > 1:
    score = 0
    # Each unordered pair is scored exactly once. Pairing chain[:-1] with
    # chain[1:] directly would also score an event against itself and, for
    # longer chains, count some pairs a second time in reversed order.
    for i, event1 in enumerate(chain[:-1]):
      for event2 in chain[i + 1:]:
        score += sim((event1, event2), argument, lmbd=lmbd)
    return score

  event = chain[0]
  freq = sum(
    count
    for key, count in c_coref_role_count.items()
    if argument in key and event in key
  )
  if freq == 0:
    return 0
  return lmbd * math.log2(freq)

In [6]:
# chainsim for a single candidate event
def event_arg_similarity(chain, new_ev, arg_list, lmbd=1):
  """Find the argument under which `new_ev` best fits `chain`.

  For every argument in arg_list the score is chain_score(chain, argument)
  plus the sum of sim((ev, new_ev), argument) over the events of the chain.

  Returns:
    (best_score, best_arg). Only scores strictly greater than 0 are accepted,
    so the result is (0, None) when no argument scores above 0.
  """
  best_score, best_arg = 0, None
  for argument in arg_list:
    total = chain_score(chain, argument, lmbd=lmbd) + sum(
      sim((ev, new_ev), argument, lmbd=lmbd) for ev in chain
    )
    if total > best_score:
      best_score, best_arg = total, argument
  return best_score, best_arg

In [None]:
# event_count.txt holds whitespace-separated integers; their sum is the
# denominator used for single-event probabilities.
with open('event_count.txt', 'r', encoding='utf-8') as f:
  counts = f.read().split()

event_counter = sum(int(c) for c in counts)

In [None]:
with open('coref_event_count.txt', 'r', encoding='utf-8') as f:
  cor_lines = f.readlines()

# One record per line: (first two tokens, last two tokens), each as a tuple.
coref_event_count = [
  (tuple(fields[:2]), tuple(fields[-2:]))
  for fields in (line.split() for line in cor_lines)
]

In [None]:
with open('coref_role_count.txt', 'r', encoding='utf-8') as f:
  cor_lines = f.readlines()

# One record per line: (first two tokens, the two tokens before the last,
# last token). The last token is what the notebook later treats as the argument.
coref_role_count = [
  (tuple(fields[:2]), tuple(fields[-3:-1]), fields[-1])
  for fields in map(str.split, cor_lines)
]

In [None]:
with open('all_events.txt', 'r', encoding='utf-8') as f:
  ev_lines = f.readlines()

# Each line becomes one event: the tuple of its whitespace-separated tokens.
all_events = [tuple(line.split()) for line in ev_lines]

In [None]:
# Dependency relations tried for every candidate verb.
dep_types = ['nsubj', 'obj']

# Frequency tables read by pmi(): counts of single events and of event pairs,
# plus the total number of pair records (the pair-probability denominator).
c_all_events = Counter(all_events)
c_coref_event_count = Counter(coref_event_count)
len_coref_event = len(coref_event_count)

In [None]:
# Occurrence count of each record in coref_role_count; sim() and chain_score()
# look these counts up by (first, second, argument) key.
c_coref_role_count = Counter(coref_role_count)

In [None]:
# Collect every distinct argument (the last element of each key) in first-seen
# order. dict.fromkeys de-duplicates in linear time while keeping insertion
# order, instead of re-scanning a growing list for every key.
arg_list = list(dict.fromkeys(key[-1] for key in c_coref_role_count))

In [None]:
# Argument of every distinct key, then how many distinct keys carry each argument.
all_args = [key[-1] for key in c_coref_role_count]

c_all_args = Counter(all_args)

In [None]:
# Collect every distinct verb (first element of each event) in first-seen
# order. dict.fromkeys de-duplicates in linear time while keeping insertion
# order; scanning a growing list for every event occurrence is quadratic.
verb_list = list(dict.fromkeys(event[0] for event in all_events))

In [None]:
# Seed schema. nar_scheme[0] is the list of chains; the loop below reads
# chain[0] as the chain's event list (the second slot is presumably the
# chain's arguments -- TODO confirm). nar_scheme[-1] lists the verbs already
# in the schema.
nar_scheme = [
  [[[('купаться', 'nsubj')], []]],
  ['купаться'],
]

new_args = []
new_verb, max_score = None, 0

# Pick the verb, not yet in the schema, with the highest total score, where
# the total adds up, per dependency type, the best chain similarity (floored at 0).
for verb in verb_list:
  if verb in nar_scheme[-1]:
    continue
  verb_score = 0
  for dep in dep_types:
    chain_scores = [
      event_arg_similarity(chain[0], (verb, dep), arg_list, lmbd=1)[0]
      for chain in nar_scheme[0]
    ]
    verb_score += max([0, *chain_scores])
  if verb_score > max_score:
    new_verb, max_score = verb, verb_score