In [28]:
import json
from pathlib import Path
from collections import Counter

dir_matched_wiki = Path(f"/Users/mengxiayu/Documents/Research/ComparisonSentences/data/statement_scoring/wiki_v1")
dir_matched_news = Path(f"/Users/mengxiayu/Documents/Research/ComparisonSentences/data/statement_scoring/news_v1")
# dir_matched = Path("/afs/crc.nd.edu/group/dmsquare/vol2/myu2/ComparisonSentences/data/statement_scoring/wiki_v1")



def load_positive_pairs(path):
    # load positive pairs from news matched data.

    properties_to_remove = ["P735", "P31"] # human-defined unwanted properties

    def reorder_pair(pair):
        e1, e2, p, v1, v2 = pair
        if e1 > e2:
            return e2, e1, p, v2, v1
        return pair

    def validate_pair(pair):
        e1, e2, p, v1, v2 = pair
        if p in properties_to_remove:
            return False
        if e1 == v2 and e2 == v1: # remove symmetric pair
            return False
        return True
    positive_pair_freq = Counter()
    with open(path) as f:
        for line in f:
            obj = json.loads(line)
            p = obj["property"]
            e1, e2 = obj["entity_pair"]
            values_e1 = set([x[0][1] for x in obj["evidence_e1"]])
            values_e2 = set([x[0][1] for x in obj["evidence_e2"]])
            positive_pair_freq.update([reorder_pair((e1, e2, p, v1, v2)) for v1 in values_e1 for v2 in values_e2 if validate_pair((e1, e2, p, v1, v2))]) # e1, e2 will in order (e1 < e2), to avoid duplicate pairs
            
    return positive_pair_freq


def load_all_positive_pairs(dir_matched):
    etype2positive_pairs_freq = {}
    for path_matched in dir_matched.glob("*_matched.json"):
        etype = path_matched.stem.rstrip("_matched")
        pairs = load_positive_pairs(path_matched)
        if len(pairs) > 0:
            etype2positive_pairs_freq[etype] = load_positive_pairs(path_matched)
    return etype2positive_pairs_freq
etype2positive_pairs_freq_wiki = load_all_positive_pairs(dir_matched_wiki)
etype2positive_pairs_freq_news = load_all_positive_pairs(dir_matched_news)

In [29]:
len(etype2positive_pairs_freq_wiki.keys())

880

In [30]:
len(etype2positive_pairs_freq_news.keys())

393

In [31]:
def etype2entities(etype2positive_pairs_freq):
    etype2entities = {}
    for etype, positive_pair_freq in etype2positive_pairs_freq.items():
        etype2entities[etype] = set()
        for pair, freq in positive_pair_freq.items():
            e1, e2, p, v1, v2 = pair
            etype2entities[etype].add(e1)
            etype2entities[etype].add(e2)
    return etype2entities
etype2entities_news = etype2entities(etype2positive_pairs_freq_news)
etype2entities_wiki = etype2entities(etype2positive_pairs_freq_wiki)

In [32]:
len(etype2entities_wiki['Q5'])


26368

In [33]:
overlap = set(etype2positive_pairs_freq_wiki.keys()) | set(etype2positive_pairs_freq_news.keys())
print(len(overlap))
for etype in overlap:
    entities_news = etype2entities_news[etype] if etype in etype2entities_news else set()
    entities_wiki = etype2entities_wiki[etype] if etype in etype2entities_wiki else set()
    print(etype, len(entities_news), len(entities_wiki), len(entities_news & entities_wiki))

904
Q66344 0 2 0
Q15265344 0 2 0
Q149621 0 28 0
Q42744322 121 375 70
Q1363599 0 4 0
Q54074585 0 2 0
Q63998451 14 46 11
Q189118 0 2 0
Q70208 65 787 38
Q2927074 0 2 0
Q1147395 28 75 10
Q5356187 0 2 0
Q6558431 6 9 0
Q33506 89 83 11
Q3778417 0 10 0
Q695850 42 36 5
Q32880 0 6 0
Q644371 24 42 5
Q18663579 0 37 0
Q18564289 0 20 0
Q64037785 0 7 0
Q191992 0 4 0
Q726 52 160 20
Q1500350 0 2 0
Q891723 675 143 82
Q167346 22 13 2
Q1802963 0 2 0
Q967098 0 4 0
Q3024240 27 108 19
Q220659 0 2 0
Q1093829 26 52 16
Q1713379 0 4 0
Q748149 0 203 0
Q245065 0 2 0
Q29517555 0 25 0
Q3491915 0 2 0
Q133311 0 2 0
Q71962386 0 2 0
Q15630849 2 60 0
Q32815 8 34 4
Q3192808 0 4 0
Q215380 108 434 23
Q22222786 0 3 0
Q15773347 55 60 16
Q23442 274 1074 151
Q475061 0 4 0
Q7841907 6 9 1
Q193430 0 2 0
Q16024164 4 4 0
Q12042110 0 10 0
Q2154519 7 47 2
Q12140 2 8 0
Q2514025 0 2 0
Q1307276 5 14 2
Q17376093 4 8 3
Q7058673 19 20 5
Q7187 0 26 0
Q150784 3 9 1
Q2292572 0 4 0
Q1855011 0 2 0
Q245016 24 4 1
Q341 0 113 0
Q1107679 0 10 0
Q449