In [10]:
import os
import gc
import sys
import time
import math
import json
import spacy
import random
import logging
import argparse
import itertools
import numpy as np
import _pickle as cPickle
import torch
# NOTE: both chdir calls are relative, so this cell is not idempotent --
# re-running it resolves the paths against the already-changed directory.
# Presumably the first chdir makes the project-local `classes` module
# importable from the working directory -- TODO confirm against sys.path.
os.chdir("../../src/shared/")
# Wildcard import: pulls every public name of `classes` into the notebook namespace.
from classes import *
# Files opened later by relative name ('training_data', 'dev_data') are
# resolved against this directory.
os.chdir("../../data/feature/")

In [11]:
# Read the pickled training and dev corpora from the current working directory.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
# NOTE(review): unless logging was configured elsewhere, the root logger's
# default WARNING level hides this INFO message -- confirm.
logging.info('Loading training and dev data...')
with open('training_data', mode="rb") as train_file:
    training_data = cPickle.load(train_file)
with open("dev_data", mode='rb') as dev_file:
    dev_data = cPickle.load(dev_file)

In [116]:
# Walk topics -> documents -> sentences of the training data and gather every
# gold entity / event mention into two flat lists.
entity_mentions, event_mentions = [], []
for topic in training_data.topics.values():
    for doc in topic.docs.values():
        for sent in doc.sentences.values():
            entity_mentions += sent.gold_entity_mentions
            event_mentions += sent.gold_event_mentions

In [117]:
# Group the gold entity mentions by their gold_tag: {gold_tag: [mention, ...]}.
entity_d, event_d = {}, {}

for mention in entity_mentions:
    # setdefault creates the bucket the first time a tag is seen.
    entity_d.setdefault(mention.gold_tag, []).append(mention)

entity_d

{'HUM16236184328979740': [<classes.EntityMention at 0x7f4ecb41ea20>,
  <classes.EntityMention at 0x7f4ecb41eb70>,
  <classes.EntityMention at 0x7f4ecb3d29e8>,
  <classes.EntityMention at 0x7f4ecb3d2ba8>,
  <classes.EntityMention at 0x7f4ecb39d4a8>,
  <classes.EntityMention at 0x7f4ecb350a90>,
  <classes.EntityMention at 0x7f4ecb350c50>,
  <classes.EntityMention at 0x7f4ecb372da0>,
  <classes.EntityMention at 0x7f4ecb3274e0>,
  <classes.EntityMention at 0x7f4ecb3276a0>,
  <classes.EntityMention at 0x7f4ecb327dd8>,
  <classes.EntityMention at 0x7f4ecb2fa358>,
  <classes.EntityMention at 0x7f4ecb2abf98>,
  <classes.EntityMention at 0x7f4ecb266198>,
  <classes.EntityMention at 0x7f4ecb266358>,
  <classes.EntityMention at 0x7f4ecb266518>,
  <classes.EntityMention at 0x7f4ecb2114a8>,
  <classes.EntityMention at 0x7f4ecb211668>,
  <classes.EntityMention at 0x7f4ecb211828>,
  <classes.EntityMention at 0x7f4ecb23a6d8>,
  <classes.EntityMention at 0x7f4ecb23a898>,
  <classes.EntityMention at 0x7

In [118]:
# Group the gold event mentions by their gold_tag: {gold_tag: [mention, ...]}.
event_d = {}
for mention in event_mentions:
    # setdefault creates the bucket the first time a tag is seen.
    event_d.setdefault(mention.gold_tag, []).append(mention)

event_d

{'ACT16236402809085484': [<classes.EventMention at 0x7f4ecb41e6d8>,
  <classes.EventMention at 0x7f4ecb3d20f0>,
  <classes.EventMention at 0x7f4ecb3d2320>,
  <classes.EventMention at 0x7f4ecb38cf60>,
  <classes.EventMention at 0x7f4ecb3506d8>,
  <classes.EventMention at 0x7f4ecb372c18>,
  <classes.EventMention at 0x7f4ecb310f28>,
  <classes.EventMention at 0x7f4ecf77eac8>,
  <classes.EventMention at 0x7f4ecb2eef60>,
  <classes.EventMention at 0x7f4ecb2abcf8>,
  <classes.EventMention at 0x7f4ecb266f60>,
  <classes.EventMention at 0x7f4ecb2112e8>,
  <classes.EventMention at 0x7f4ecb23a3c8>,
  <classes.EventMention at 0x7f4ecb1e2cc0>,
  <classes.EventMention at 0x7f4ecb1e2e48>,
  <classes.EventMention at 0x7f4ecb1448d0>,
  <classes.EventMention at 0x7f4ecb0bd0b8>,
  <classes.EventMention at 0x7f4ecb07e780>,
  <classes.EventMention at 0x7f4ecb09c908>,
  <classes.EventMention at 0x7f4ecb03ecc0>,
  <classes.EventMention at 0x7f4ecb00d0b8>,
  <classes.EventMention at 0x7f4ecafc7c88>,
  <class

In [146]:
import itertools
def findsubsets(s, n):
    """Return every ``n``-element combination of ``s`` as a list of tuples.

    The combinations come from ``itertools.combinations``, so they follow the
    order in which the elements appear in ``s``; the result is empty when
    ``n`` is larger than the number of elements.
    """
    combos = itertools.combinations(s, n)
    return [*combos]

# Cosine similarity computed along dim 0 of its two inputs.
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

# For every tag group with more than one mention (singletons are skipped),
# score each unordered pair of mentions by the cosine similarity of their
# head_elmo_embeddings, rounded to 4 decimals.
# assumes head_elmo_embeddings is a 1-D tensor so the similarity is a scalar -- TODO confirm
result = [
    round(cos(first.head_elmo_embeddings, second.head_elmo_embeddings).tolist(), 4)
    for mentions in entity_d.values()
    if len(mentions) > 1
    for first, second in findsubsets(mentions, 2)
]

In [147]:
# Share of the scores in `result` that are strictly above 0.5.
pos = list(filter(lambda score: score > 0.5, result))
l = len(result)
len(pos) / l

0.4093120934282641

In [148]:
from collections import Counter
# Tally identical scores and list them most frequent first.
# NOTE: `result` is rebound here from the list of scores to a list of
# (score, count) pairs, so re-running this cell (or any cell that expects the
# raw scores) requires recomputing `result` first.
score_counts = Counter(result)
result = score_counts.most_common()
result

[(1.0, 124),
 (0.1069, 20),
 (0.2763, 19),
 (0.1154, 18),
 (0.2212, 17),
 (0.3263, 17),
 (0.36, 17),
 (0.3264, 17),
 (0.2331, 16),
 (0.2781, 15),
 (0.2216, 15),
 (0.1797, 15),
 (0.1974, 15),
 (0.1638, 15),
 (0.1789, 15),
 (0.2523, 15),
 (0.2429, 15),
 (0.1636, 14),
 (0.2959, 14),
 (0.1029, 14),
 (0.1389, 14),
 (0.2457, 14),
 (0.2412, 14),
 (0.208, 14),
 (0.1747, 14),
 (0.2582, 14),
 (0.1745, 14),
 (0.1151, 14),
 (0.1828, 14),
 (0.1785, 14),
 (0.2736, 14),
 (0.1701, 14),
 (0.9999, 14),
 (0.1692, 14),
 (0.1763, 14),
 (0.3027, 14),
 (0.1784, 13),
 (0.3375, 13),
 (0.1518, 13),
 (0.1566, 13),
 (0.1543, 13),
 (0.1602, 13),
 (0.1782, 13),
 (0.1781, 13),
 (0.3238, 13),
 (0.1874, 13),
 (0.1867, 13),
 (0.2068, 13),
 (0.2685, 13),
 (0.2182, 13),
 (0.1829, 13),
 (0.2044, 13),
 (0.1454, 13),
 (0.202, 13),
 (0.2159, 13),
 (0.2643, 13),
 (0.1838, 13),
 (0.2642, 13),
 (0.2061, 13),
 (0.2509, 13),
 (0.2041, 13),
 (0.2696, 13),
 (0.2259, 13),
 (0.343, 13),
 (0.176, 13),
 (0.2015, 13),
 (0.2401, 13),
 (0

## Testing the structure

In [15]:
# Shortcut to the topics mapping of the training data (topic id -> Topic).
topics = training_data.topics

In [16]:
# Display the full topic id -> Topic mapping.
topics

{'1_ecb': <classes.Topic at 0x7f4ecb40c860>,
 '3_ecb': <classes.Topic at 0x7f4ecac0ce80>,
 '4_ecb': <classes.Topic at 0x7f4ecaa32c18>,
 '6_ecb': <classes.Topic at 0x7f4eca5fec50>,
 '7_ecb': <classes.Topic at 0x7f4eca46f7f0>,
 '8_ecb': <classes.Topic at 0x7f4eca1ecfd0>,
 '9_ecb': <classes.Topic at 0x7f4ec9f4f780>,
 '10_ecb': <classes.Topic at 0x7f4ec9be5a90>,
 '11_ecb': <classes.Topic at 0x7f4ec99f8160>,
 '13_ecb': <classes.Topic at 0x7f4ec9771780>,
 '14_ecb': <classes.Topic at 0x7f4ec928ac50>,
 '16_ecb': <classes.Topic at 0x7f4ec90ea240>,
 '19_ecb': <classes.Topic at 0x7f4ec9034cf8>,
 '20_ecb': <classes.Topic at 0x7f4ec8c0ecf8>,
 '22_ecb': <classes.Topic at 0x7f4ec8ab77b8>,
 '24_ecb': <classes.Topic at 0x7f4ec87bda90>,
 '25_ecb': <classes.Topic at 0x7f4ec84aba90>,
 '26_ecb': <classes.Topic at 0x7f4ec7f36278>,
 '27_ecb': <classes.Topic at 0x7f4ec78ac5f8>,
 '28_ecb': <classes.Topic at 0x7f4ec6f5f5f8>,
 '29_ecb': <classes.Topic at 0x7f4ec68f4898>,
 '30_ecb': <classes.Topic at 0x7f4ec63cf1

In [19]:
# One `docs` mapping per topic, in the iteration order of `topics`;
# then preview the documents of the first topic.
all_docs = [topic.docs for topic in topics.values()]
all_docs[0]

{'1_10ecb': <classes.Document at 0x7f4ecb40cd30>,
 '1_11ecb': <classes.Document at 0x7f4ecb38c518>,
 '1_12ecb': <classes.Document at 0x7f4ecb372400>,
 '1_13ecb': <classes.Document at 0x7f4ecb310358>,
 '1_14ecb': <classes.Document at 0x7f4ecb3ffe80>,
 '1_15ecb': <classes.Document at 0x7f4ecb211a20>,
 '1_17ecb': <classes.Document at 0x7f4ecb1f9c50>,
 '1_18ecb': <classes.Document at 0x7f4ecb0fddd8>,
 '1_19ecb': <classes.Document at 0x7f4ecb07e2e8>,
 '1_1ecb': <classes.Document at 0x7f4ecb05d7f0>,
 '1_2ecb': <classes.Document at 0x7f4ecafea940>,
 '1_3ecb': <classes.Document at 0x7f4ecaf6b080>,
 '1_4ecb': <classes.Document at 0x7f4ecaf7b828>,
 '1_5ecb': <classes.Document at 0x7f4ecaee09e8>,
 '1_6ecb': <classes.Document at 0x7f4ecae70a58>,
 '1_7ecb': <classes.Document at 0x7f4ecada0128>,
 '1_8ecb': <classes.Document at 0x7f4ecacec9b0>,
 '1_9ecb': <classes.Document at 0x7f4ecac489e8>}

In [100]:
# One `sentences` mapping per document of the first topic.
sentences = [doc.sentences for doc in all_docs[0].values()]
sentences

[{0: <classes.Sentence at 0x7f4ecb40cda0>,
  3: <classes.Sentence at 0x7f4ecb3c32e8>},
 {0: <classes.Sentence at 0x7f4ecb38c550>,
  2: <classes.Sentence at 0x7f4ecb39dda0>},
 {1: <classes.Sentence at 0x7f4ecb372438>},
 {1: <classes.Sentence at 0x7f4ecb310390>},
 {0: <classes.Sentence at 0x7f4ecb3ffa90>,
  1: <classes.Sentence at 0x7f4ecb2ee1d0>,
  2: <classes.Sentence at 0x7f4ecb2fac18>,
  5: <classes.Sentence at 0x7f4ecb266a20>},
 {0: <classes.Sentence at 0x7f4ecb211a58>,
  1: <classes.Sentence at 0x7f4ecb23ae48>},
 {0: <classes.Sentence at 0x7f4ecb1f9c88>,
  1: <classes.Sentence at 0x7f4ecb144438>,
  2: <classes.Sentence at 0x7f4ecb144dd8>},
 {0: <classes.Sentence at 0x7f4ecb0fde10>},
 {0: <classes.Sentence at 0x7f4ecb07e320>,
  1: <classes.Sentence at 0x7f4ecb07ec88>,
  3: <classes.Sentence at 0x7f4ecb03e3c8>},
 {1: <classes.Sentence at 0x7f4ecb05d828>,
  2: <classes.Sentence at 0x7f4ecafc75c0>},
 {0: <classes.Sentence at 0x7f4ecafea978>,
  2: <classes.Sentence at 0x7f4ecaf92a58>},


In [101]:
# Keep only the sentence with id 0 of the first document and store its gold
# mention lists as single nested entries.
# NOTE: this rebinds `entity_mentions` / `event_mentions` to lists of lists,
# unlike the flat mention lists built from the whole training set.
event_mentions, entity_mentions = [], []
for sent_id, sent in sentences[0].items():
    if sent_id != 0:
        continue
    entity_mentions.append(sent.gold_entity_mentions)
    event_mentions.append(sent.gold_event_mentions)
        


In [102]:
# Display the collected gold event mentions.
event_mentions

[[<classes.EventMention at 0x7f4ecb41e6d8>,
  <classes.EventMention at 0x7f4ecb41e898>]]

In [103]:
# Display the collected gold entity mentions.
entity_mentions

[[<classes.EntityMention at 0x7f4ecb41ea20>,
  <classes.EntityMention at 0x7f4ecb41eb70>,
  <classes.EntityMention at 0x7f4ecb41ed68>,
  <classes.EntityMention at 0x7f4ecb41ef28>,
  <classes.EntityMention at 0x7f4ecb3c3128>]]

In [106]:
# Tokens of the first entity mention in the first collected group.
entity_mentions[0][0].get_tokens()

['Tara', 'Reid']

In [107]:
# gold_tag of the first event mention in the first collected group.
event_mentions[0][0].gold_tag

'ACT16236402809085484'