In [182]:
import json
import numpy as np
import torch
import nltk
from nltk.tokenize import word_tokenize
from textstat import textstat
import pandas as pd

In [157]:
# Load the per-triple rank predictions and the id maps used by the cells below.
# map_location='cpu' keeps this notebook runnable on a machine without a GPU
# even if the artifacts were saved from a CUDA device; the visible cells only
# call pred.tolist() and index into maps, so CPU storage is sufficient.
# NOTE(review): torch.load unpickles the file -- only load trusted artifacts.
pred = torch.load('./output/models/triple_ranks_all-2023-12-13-13-59.pt', map_location='cpu')
maps = torch.load('./data/FB15k-237/maps.pt', map_location='cpu')

In [158]:
# Invert the stored maps so that an id (the map's value) looks up its original key.
ent_ids, rel_ids = maps['ent_ids'], maps['rel_ids']
ent_uris = {v: k for k, v in ent_ids.items()}
rel_uris = {v: k for k, v in rel_ids.items()}

# Read one tab-separated row per line: first field is the entity key, second
# field is its long description. The context manager guarantees the file
# handle is closed once the loop finishes (the bare open() leaked it).
ent_desc = {}
with open('./data/FB15k-237/entity2textlong.txt') as desc_file:
    for line in desc_file:
        fields = line.strip().split('\t')
        ent_desc[fields[0]] = fields[1]

In [162]:
# Build one record per predicted triple: the head/tail ids, their looked-up
# keys, both ranks and their mean, plus the long description of each side
# when one exists in ent_desc. The relation column is unpacked but not stored.
triple_char = []
for head, relation, tail, head_rank, tail_rank in pred.tolist():
    head_uri = ent_uris[head]
    tail_uri = ent_uris[tail]

    record = dict(
        head_id=head,
        head=head_uri,
        head_rank=head_rank,
        tail_id=tail,
        tail=tail_uri,
        tail_rank=tail_rank,
        avg_ranks=(head_rank + tail_rank) / 2,
    )

    # Descriptions are optional: the key is only added when the entity has one.
    for desc_key, uri in (('head_desc', head_uri), ('tail_desc', tail_uri)):
        if uri in ent_desc:
            record[desc_key] = ent_desc[uri]

    triple_char.append(record)

Add description length

In [163]:
# Attach the token count of each available description to its triple.
# The same description text can occur in many triples, so the count is
# memoised per description string instead of re-tokenising it every time;
# the stored values are identical to tokenising each occurrence separately.
_desc_len_cache = {}
for triple in triple_char:
    for desc_key, len_key in (('head_desc', 'head_desc_len'), ('tail_desc', 'tail_desc_len')):
        if desc_key not in triple:
            continue
        desc = triple[desc_key]
        if desc not in _desc_len_cache:
            _desc_len_cache[desc] = len(word_tokenize(desc))
        triple[len_key] = _desc_len_cache[desc]


Add Flesch-Kincaid Grade Level

In [164]:
# Attach the Flesch-Kincaid grade of each available description to its triple.
# The same description text can occur in many triples, so the score is
# memoised per description string rather than recomputed for every triple;
# the stored values are identical to scoring each occurrence separately.
_flesch_cache = {}
for triple in triple_char:
    for desc_key, flesch_key in (('head_desc', 'head_flesch'), ('tail_desc', 'tail_flesch')):
        if desc_key not in triple:
            continue
        desc = triple[desc_key]
        if desc not in _flesch_cache:
            _flesch_cache[desc] = textstat.flesch_kincaid_grade(desc)
        triple[flesch_key] = _flesch_cache[desc]

Add number of links

In [168]:
# map_location='cpu' keeps the load working on a machine without a GPU even if
# the graph was saved from a CUDA device; only .item() is called on it below.
# NOTE(review): torch.load unpickles the file -- only load trusted artifacts.
page_link_graph = torch.load('./data/FB15k-237/page_link_graph_typed.pt', map_location='cpu')

# Count, per id, how many edges have that id in the first (head) position.
# Only the head position is counted; the tail of an edge does not contribute.
ent_num_links = {}
for head, relation, tail in page_link_graph:
    head = head.item()
    ent_num_links[head] = ent_num_links.get(head, 0) + 1

# Copy the counts onto the triples. Ids that never appear as an edge head get
# no key at all (rather than 0), so they are skipped by later key checks.
for triple in triple_char:
    for id_key, links_key in (('head_id', 'head_num_links'), ('tail_id', 'tail_num_links')):
        if triple[id_key] in ent_num_links:
            triple[links_key] = ent_num_links[triple[id_key]]

In [189]:
# Pearson correlation between every rank type (rows) and every entity
# characteristic (columns). Each coefficient uses only the triples that
# carry both keys, since descriptions and link counts are optional.
pearson = []
for rank_type in ['head_rank', 'tail_rank', 'avg_ranks']:
    row = []
    for characteristic in ['head_desc_len', 'tail_desc_len', 'head_flesch', 'tail_flesch', 'head_num_links', 'tail_num_links']:
        paired = [
            (triple[rank_type], triple[characteristic])
            for triple in triple_char
            if rank_type in triple and characteristic in triple
        ]
        ranks = [pair[0] for pair in paired]
        characteristics = [pair[1] for pair in paired]

        # corrcoef returns the 2x2 correlation matrix; [0, 1] is the cross term.
        corr_matrix = np.corrcoef(np.array(ranks), np.array(characteristics))
        row.append(corr_matrix[0, 1])
    pearson.append(row)

In [190]:
# Show the correlation matrix as a labelled table: one row per rank type,
# one column per entity characteristic (same order as the loops that built it).
pd.DataFrame(
    data=pearson,
    index=['head_rank', 'tail_rank', 'avg_ranks'],
    columns=[
        'head_desc_len',
        'tail_desc_len',
        'head_flesch',
        'tail_flesch',
        'head_num_links',
        'tail_num_links',
    ],
)

Unnamed: 0,head_desc_len,tail_desc_len,head_flesch,tail_flesch,head_num_links,tail_num_links
head_rank,-0.031793,-0.015324,-0.065488,-0.015589,0.052671,-0.005927
tail_rank,-0.004427,-0.025346,0.019666,-0.075631,0.000998,0.077713
avg_ranks,-0.022076,-0.023101,-0.029986,-0.050222,0.033164,0.038237


n-gram overlap between neighbors
number of entities detected by spaCy
https://arxiv.org/pdf/1904.09675.pdf