<a href="https://colab.research.google.com/github/nastyaskutina/fairytales/blob/main/results_check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import math
from tqdm import tqdm

In [2]:
from collections import Counter

In [3]:
def pmi(pair, c_coref_event_count, c_all_events, event_counter, len_coref_event):
  p1 = c_all_events[pair[0]] / event_counter
  p2 = c_all_events[pair[1]] / event_counter
  p_pair = c_coref_event_count[pair] / len_coref_event
  if (p1 * p2) == 0:
    result = 0
  elif p_pair / (p1 * p2) <= 0:
    result = 0
  else:
    result = math.log(p_pair / (p1 * p2), 2)
  return result

In [4]:
def sim(pair, argument, lmbd=1):
  pair_pmi = pmi(pair, c_coref_event_count, c_all_events, event_counter, len_coref_event)
  if c_coref_role_count[(pair[0], pair[1], argument)] > 0:
    result = pair_pmi + (lmbd * math.log(c_coref_role_count[(pair[0], pair[1], argument)], 2))
  else:
    result = pair_pmi
  return result

In [5]:
def chain_score(chain, argument, lmbd=1):
  if len(chain) > 1:
    score = 0
    for event1 in chain[:-1]:
      for event2 in chain[1:]:
        score += sim((event1, event2), argument, lmbd=lmbd)
  else:
    freq = 0
    for key in c_coref_role_count:
      if argument in key and chain[0] in key:
        freq += c_coref_role_count[key]
    if freq != 0:
      score = lmbd * math.log(freq, 2)
    else:
      score = 0
  
  return score

In [6]:
#chainsim for 1 event
def event_arg_similarity(chain, new_ev, arg_list, lmbd=1):
  best_arg = None
  sim_sc = 0
  for argument in arg_list:
    arg_sc = chain_score(chain, argument, lmbd=lmbd)
    new_ev_sc = 0
    for ev in chain:
      new_ev_sc += sim((ev, new_ev), argument, lmbd=lmbd)
    if (arg_sc + new_ev_sc) > sim_sc:
      best_arg = argument
      sim_sc = arg_sc + new_ev_sc
  return sim_sc, best_arg

In [7]:
with open('event_count.txt', 'r', encoding='utf-8') as f:
  counts = f.read().split()

event_counter = 0
for c in counts:
  event_counter += int(c)

In [8]:
with open('coref_event_count.txt', 'r', encoding='utf-8') as f:
  cor_lines = f.readlines()

coref_event_count = []
for line in cor_lines:
  coref_event_count.append((tuple(line.split()[:2]), tuple(line.split()[-2:])))

In [9]:
with open('coref_role_count.txt', 'r', encoding='utf-8') as f:
  cor_lines = f.readlines()

coref_role_count = []
for line in cor_lines:
  coref_role_count.append((tuple(line.split()[:2]), tuple(line.split()[-3:-1]), line.split()[-1]))

In [10]:
with open('all_events.txt', 'r', encoding='utf-8') as f:
  ev_lines = f.readlines()

all_events = []
for line in ev_lines:
  all_events.append(tuple(line.split()))

In [11]:
dep_types = ['nsubj', 'obj']
c_coref_event_count = Counter(coref_event_count)
c_all_events = Counter(all_events)
len_coref_event = len(coref_event_count)

In [12]:
c_coref_role_count = Counter(coref_role_count)

In [13]:
#collecting all arguments
arg_list = []
for key in c_coref_role_count:
  if key[-1] not in arg_list:
    arg_list.append(key[-1])

In [14]:
all_args = []
for key in c_coref_role_count:
  all_args.append(key[-1])

c_all_args = Counter(all_args)

In [34]:
all_args = []
for key in coref_role_count:
  all_args.append(key[-1])

c_all_args2 = Counter(all_args)

[('королевна', 17805),
 ('Иван', 12837),
 ('буря', 8995),
 ('голова', 8204),
 ('царь', 6753),
 ('царица', 6337),
 ('конь', 6076),
 ('время', 5990),
 ('девица', 5874),
 ('Василис', 5511),
 ('брат', 5252),
 ('орел', 5106),
 ('Елена', 4659),
 ('царевна', 4379),
 ('корабль', 4197),
 ('сын', 3799),
 ('книга', 3177),
 ('нога', 3160),
 ('вода', 3076),
 ('войско', 3072)]

In [15]:
c_all_args.most_common(20)

[('королевна', 11037),
 ('Иван', 7747),
 ('буря', 4712),
 ('голова', 4525),
 ('царь', 4430),
 ('время', 4168),
 ('брат', 3917),
 ('Елена', 3774),
 ('царица', 3593),
 ('девица', 3582),
 ('царевна', 3295),
 ('Василис', 3178),
 ('корабль', 2797),
 ('сын', 2704),
 ('орел', 2392),
 ('нога', 2302),
 ('конь', 2204),
 ('книга', 2124),
 ('вода', 2060),
 ('сестра', 2057)]

In [16]:
# collecting all verbs
verb_list = []
for event in all_events:
  if event[0] not in verb_list:
    verb_list.append(event[0])

In [None]:
c_all_events.most_common(20)


[(('говорить', 'nsubj'), 928),
 (('брать', 'obj'), 848),
 (('стать', 'nsubj'), 528),
 (('быть', 'nsubj'), 491),
 (('брать', 'nsubj'), 476),
 (('пойти', 'nsubj'), 444),
 (('сказать', 'nsubj'), 339),
 (('делать', 'obj'), 335),
 (('отвечать', 'nsubj'), 326),
 (('прийти', 'nsubj'), 318),
 (('жить', 'nsubj'), 262),
 (('отдать', 'obj'), 255),
 (('идти', 'nsubj'), 245),
 (('дать', 'obj'), 231),
 (('нет', 'nsubj'), 227),
 (('стоить', 'nsubj'), 223),
 (('сидеть', 'nsubj'), 198),
 (('выйти', 'nsubj'), 193),
 (('делать', 'nsubj'), 191),
 (('спрашивать', 'nsubj'), 179)]

In [17]:
chain_list = []
for event in tqdm(c_all_events.most_common(20)):
  nar_chain = [event[0]]

  for _ in range(6):
    pmi_sc = 0
    new_ev = None

    for ev in c_all_events:
      pmi_list = []
      for chain_ev in nar_chain:
        pmi_list.append(pmi((chain_ev, ev), c_coref_event_count, c_all_events, event_counter, len_coref_event))
      if sum(pmi_list) > pmi_sc and ev not in nar_chain:
        pmi_sc = sum(pmi_list)
        new_ev = ev
    
    nar_chain.append(new_ev)

  chain_list.append(nar_chain)



100%|██████████| 20/20 [00:02<00:00,  6.79it/s]


In [20]:
sivko_chain = [('бежать', 'nsubj'), ('поласкать', 'obj'), ('убежать', 'nsubj'), ('отпустить', 'obj')]

In [21]:
best_arg = None
best_sc = 0
for arg in arg_list:
  sc = chain_score(chain, arg, lmbd=1)
  if sc > best_sc:
    best_arg = arg
    best_sc = sc

best_arg

'царь'

In [None]:
chain_list

In [18]:
best_args = []
for chain in tqdm(chain_list):
  best_arg = None
  best_sc = 0
  for arg in arg_list:
    sc = chain_score(chain, arg, lmbd=1)
    if sc > best_sc:
      best_arg = arg
      best_sc = sc
  best_args.append(best_arg)

100%|██████████| 20/20 [00:01<00:00, 16.51it/s]


In [28]:
best_args

['образ',
 'жар-птица',
 'конь',
 'де´вица',
 'Булат',
 'голова',
 'Василис',
 'конь',
 'Василис',
 'царь',
 'девица',
 'платье',
 'мост',
 'образ',
 'троижд',
 'вихрь',
 'королевна',
 'королевич',
 'царь',
 'царь']

In [None]:
chain_list[17] #королевич

[('выйти', 'nsubj'),
 ('выучиться', 'nsubj'),
 ('стегнуть', 'nsubj'),
 ('вытащить', 'obj'),
 ('ударить', 'nsubj'),
 ('заскакать', 'nsubj'),
 ('утащить', 'obj')]

In [None]:
chain_list[15] #вихрь

[('стоить', 'nsubj'),
 ('помутиться', 'nsubj'),
 ('уехать', 'obj'),
 ('пить', 'nsubj'),
 ('рвастись', 'nsubj'),
 ('вылетать', 'nsubj'),
 ('ослабеть', 'nsubj')]

In [None]:
chain_list[10] #девица

[('жить', 'nsubj'),
 ('плестись', 'nsubj'),
 ('молвать', 'nsubj'),
 ('отвернуться', 'nsubj'),
 ('прикинуться', 'nsubj'),
 ('расстрелять', 'obj'),
 ('спасти', 'obj')]

In [None]:
chain_list[-2] #царь

[('делать', 'nsubj'),
 ('вздыхать', 'nsubj'),
 ('покличь', 'obj'),
 ('вытянуть', 'obj'),
 ('вынести', 'nsubj'),
 ('требовать', 'nsubj'),
 ('достать', 'nsubj')]

In [None]:
chain_list[9] #царь

[('прийти', 'nsubj'),
 ('вздыхать', 'nsubj'),
 ('покличь', 'obj'),
 ('вытянуть', 'obj'),
 ('вынести', 'nsubj'),
 ('требовать', 'nsubj'),
 ('достать', 'nsubj')]

In [None]:
chain_list[8] #Василис

[('отвечать', 'nsubj'),
 ('приготовить', 'nsubj'),
 ('впустить', 'obj'),
 ('умыться', 'nsubj'),
 ('носить', 'obj'),
 ('понести', 'nsubj'),
 ('отговориться', 'nsubj')]

In [None]:
chain_list[6] #Василис

[('сказать', 'nsubj'),
 ('приготовить', 'nsubj'),
 ('впустить', 'obj'),
 ('умыться', 'nsubj'),
 ('носить', 'obj'),
 ('понести', 'nsubj'),
 ('найти', 'obj')]

In [None]:
chain_list[4] #Булат

[('брать', 'nsubj'),
 ('запрятать', 'nsubj'),
 ('выспросить', 'nsubj'),
 ('разбудить', 'nsubj'),
 ('отрубить', 'nsubj'),
 ('ожить', 'nsubj'),
 ('махнуть', 'nsubj')]

In [None]:
chain_list[1] #жар-птица

[('брать', 'obj'),
 ('испугаться', 'obj'),
 ('сослужить', 'obj'),
 ('сослужать', 'obj'),
 ('увезти', 'obj'),
 ('начать', 'obj'),
 ('удариться', 'obj')]

In [None]:
chain_list[2] #конь

[('стать', 'nsubj'),
 ('усмириться', 'nsubj'),
 ('зацепить', 'nsubj'),
 ('задушить', 'nsubj'),
 ('оживеть', 'nsubj'),
 ('сказаться', 'nsubj'),
 ('запасить', 'obj')]

In [19]:
def arg_to_chain(argument, chain_length=6):
  events = []
  for pair in coref_role_count:
    if argument in pair:
      events.extend(pair[:-1])
  chain = [Counter(events).most_common(1)[0][0]]
  for _ in range(chain_length-1):
    best_sc = 0
    best_ev = None
    for event in c_all_events:
      sc = chain_score(chain+[event], argument, lmbd=1)
      if sc > best_sc and event not in chain:
        best_sc = sc
        best_ev = event
    chain.append(best_ev)
  
  return chain

In [23]:
 arg_to_chain('Сивко', chain_length=6)

[('бежать', 'nsubj'),
 ('почать', 'nsubj'),
 ('сбросить', 'nsubj'),
 ('спрыснуть', 'obj'),
 ('целовать', 'obj'),
 ('остановиться', 'nsubj')]

In [61]:
arg_to_chain('жар-птица', chain_length=6)

[('брать', 'obj'),
 ('испугаться', 'obj'),
 ('сослужить', 'obj'),
 ('сослужать', 'obj'),
 ('увезти', 'obj'),
 ('начать', 'obj')]

In [None]:
b = [1]
print(b+[3])
b

[1, 3]


[1]

In [None]:
def arg_to_chain(argument, chain_length=6):
  chain = []
  for _ in tqdm(range(chain_length)):
    best_sc = 0
    best_ev = None
    for event in c_all_events:
      sc = chain_score(chain+[event], argument, lmbd=1)
      if sc > best_sc and event not in chain:
        best_sc = sc
        best_ev = event
    chain.append(best_ev)
  
  return chain