# POC Re-Implementation of Procedure Outlined in "Entity Profiling in Knowledge Graphs" (Zhang Et al.)
Using a subset of wikidata related to Q44 ("beer")

## Creating type mapping

In [47]:
import pandas as pd
from copy import deepcopy

In [73]:
# helper for examining contents of dictionary
def peek_dict(d, max_items = 1):
    count = 1
    for k, v in d.items():
        print(k)
        print(v)
        count += 1
        if count > max_items: break

In [48]:
%%bash
kgtk filter -i Q44/Q44.part.wikibase-item.tsv -o entity_types.tsv --pattern ' ; P31 ; '

In [49]:
entity_types_df = pd.read_csv("entity_types.tsv", sep = "\t")
entity_types_df.drop(columns = ["id","label"], inplace = True)
entity_types_df = entity_types_df.groupby("node1")["node2"].apply(list)
entity_types_df = entity_types_df.T
type_mapping = entity_types_df.to_dict()

In [74]:
peek_dict(type_mapping)

Q1000597
['Q3957']


## Creating label sets
### (AVL and REL only - i.e. no discretization of continuous values, no label containing another label as its value)

In [60]:
def create_label_sets(tsv_edge_file):
    label_sets_by_type = {}
    num_no_type = 0
    num_labels = 0
    quantity_df = pd.read_csv(tsv_edge_file, "\t")
    
    
    for index, row in quantity_df.iterrows():
        entity = row["node1"]
        prop = row["label"]
        value = row["node2"]

        # Is it possible that we find an entity that doesn't have a type?
        if entity not in type_mapping:
            num_no_type += 1
            continue
        # Add this <property, value> pair to the label sets corresponding to this entity's types
        for entity_type in type_mapping[entity]:
            if entity_type not in label_sets_by_type:
                label_sets_by_type[entity_type] = [0,{}]
            if prop not in label_sets_by_type[entity_type][1]:
                label_sets_by_type[entity_type][1][prop] = {}
            if value not in label_sets_by_type[entity_type][1][prop]:
                label_sets_by_type[entity_type][1][prop][value] = 0
            label_sets_by_type[entity_type][1][prop][value] += 1
            label_sets_by_type[entity_type][0] += 1
            num_labels += 1
            
    print("# entities for which we didn't have a type: " + str(num_no_type))
    print("# labels created: " + str(num_labels))
    return label_sets_by_type
        

#### Taking a look at what this does for continuous values...

In [61]:
quantity_label_sets = create_label_sets("Q44/Q44.part.quantity.tsv")

# entities for which we didn't have a type: 0
# labels created: 85379


In [75]:
peek_dict(quantity_label_sets)

Q3957
[3, {'P1082': {'+75074': 1, '+64764': 1}, 'P2044': {'+525Q11573': 1}}]


#### For discrete values...

In [63]:
string_label_sets = create_label_sets("Q44/Q44.part.string.tsv")

# entities for which we didn't have a type: 24
# labels created: 3836


In [76]:
peek_dict(string_label_sets)

Q3957
[11, {'P281': {'DE14': 1, '21400–21499': 1, 'LE67': 1}, 'P373': {'Burton upon Trent': 1, 'Tecate': 1, 'Coalville': 1}, 'P473': {'01283': 1, '665': 1, '01530': 1}, 'P613': {'SK245225': 1, 'SK433138': 1}}]


## TODO:
## 1. Discretization for continuous values --> AIL
## 2. Create RALs

## Initial filtering
### Simple rule-based filter to remove labels that are trivially either unrepresentative or indistinctive

In [65]:
# From paper - alpha empirically set to 1
def get_filtered_labels(label_sets, alpha = .1):
    filtered_labels = deepcopy(label_sets)
    num_filtered = 0
    for entity_type, labels in label_sets.items():
        num_entities = labels[0]
        for prop in labels[1].keys():
            for value, num_pos_entities in labels[1][prop].items():
                support = num_pos_entities / num_entities
                if support < alpha or support > (1 - alpha):
                    num_filtered += 1
                    del filtered_labels[entity_type][1][prop][value]
                    if not filtered_labels[entity_type][1][prop]:
                        del filtered_labels[entity_type][1][prop]
                        if  not filtered_labels[entity_type][1]:
                            del filtered_labels[entity_type]
    print("# labels filtered out: " + str(num_filtered))
    return filtered_labels
                

In [66]:
filtered_string_labels = get_filtered_labels(string_label_sets)

# labels filtered out: 3359


In [77]:
peek_dict(filtered_string_labels, max_items = 5)

Q902814
[8, {'P281': {'21400–21499': 1, '9400': 1}, 'P373': {'Tecate': 1, 'Sopron': 1}, 'P473': {'665': 1, '99': 1}, 'P898': {'ˈʃopron': 1}, 'P935': {'Sopron': 1}}]
Q2221906
[9, {'P2258': {'617': 1}, 'P2979': {'645': 1}, 'P3067': {'609': 1}, 'P373': {'Mauritius': 1}, 'P395': {'MS': 1}, 'P474': {'+230': 1}, 'P487': {'🇲🇺': 1}, 'P898': {"mæʉ\\\\\\\\'ɾɪtsɪʉs": 1}, 'P935': {'Mauritius': 1}}]
Q4198907
[9, {'P2258': {'617': 1}, 'P2979': {'645': 1}, 'P3067': {'609': 1}, 'P373': {'Mauritius': 1}, 'P395': {'MS': 1}, 'P474': {'+230': 1}, 'P487': {'🇲🇺': 1}, 'P898': {"mæʉ\\\\\\\\'ɾɪtsɪʉs": 1}, 'P935': {'Mauritius': 1}}]
Q213907
[5, {'P225': {'Zea mays': 1}, 'P373': {'Zea mays': 1}, 'P487': {'🌽': 1}, 'P627': {'77726273': 1}, 'P935': {'Zea mays': 1}}]
Q15634554
[6, {'P3067': {'390': 1}, 'P3238': {'0': 1}, 'P373': {'Kosovo': 1}, 'P474': {'+383': 1}, 'P487': {'🇽🇰': 1}, 'P935': {'Kosovo': 1}}]


In [69]:
filtered_quantity_labels = get_filtered_labels(quantity_label_sets)

# labels filtered out: 73714


In [78]:
peek_dict(filtered_quantity_labels, max_items = 5)

Q3957
[3, {'P1082': {'+75074': 1, '+64764': 1}, 'P2044': {'+525Q11573': 1}}]
Q902814
[6, {'P1082': {'+64764': 1, '+60528': 1, '+61249': 1, '+62454': 1}, 'P2044': {'+525Q11573': 1}, 'P2046': {'+169.01Q712226': 1}}]
Q131734
[4, {'P2226': {'+3896025.70': 1}, 'P1128': {'+443': 1}, 'P2139': {'+191000000Q4916': 1, '+400000000Q4916': 1}}]
Q15075508
[12, {'P6088': {'+20[+18,+22]': 2}}]
Q40050
[12, {'P6088': {'+20[+18,+22]': 2}}]
