# Perform coreference resolution

In [3]:
import json

def read_text(filename):
    """Return the full contents of the text file *filename* as one string."""
    with open(filename) as handle:
        return handle.read()

def write_text(text, filename):
    """Write *text* to *filename*, replacing any existing content.

    Args:
        text: Either a single string or an iterable of strings. The pieces
            are written back to back; no separator or newline is added.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as file:
        if isinstance(text, str):
            # Iterating a str yields single characters; write it in one
            # call instead of issuing one write per character.
            file.write(text)
        else:
            file.writelines(text)
            
def read_json(filename):
    """Parse the JSON document stored in *filename* and return the result."""
    with open(filename) as source:
        return json.load(source)

def write_json(data, filename):
    """Serialise *data* as JSON into *filename*, overwriting the file."""
    with open(filename, 'w') as sink:
        json.dump(data, sink)

## Manually define canonicals and replace them in the triples

In [4]:
def replace_predefined_canonicals(triples, canonicals):
    """Rewrite subjects and objects in *triples* to their canonical names.

    Args:
        triples: Mapping from sentence to a list of extraction dicts, each
            with a 'subject' string and an 'object' list. Modified in place.
        canonicals: Mapping from canonical name to the collection of surface
            forms that should be replaced by that name.

    Canonicals are applied in mapping order, so a value rewritten by one
    entry can be rewritten again by a later entry that lists the new value.
    """
    for extractions in triples.values():
        for extraction in extractions:
            for canonical, aliases in canonicals.items():
                if extraction['subject'] in aliases:
                    extraction['subject'] = canonical
                objects = extraction['object']
                for position, value in enumerate(objects):
                    if value in aliases:
                        objects[position] = canonical


## Group similar entities and replace each with one representative of its group

In [5]:
class DSU:
    """Disjoint-set union over the indices ``0 .. len(array) - 1``.

    Only the length of *array* is used; elements are referred to by index.
    ``parent[i]`` is the parent index of ``i`` (a root is its own parent) and
    ``size[r]`` is the number of elements in the set rooted at ``r``.
    """

    def __init__(self, array):
        count = len(array)
        self.parent = list(range(count))
        self.size = [1] * count

    def find(self, x):
        """Return the root of the set containing index *x*.

        Every node visited on the way up is re-pointed directly at the root.
        """
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # Second pass over the same path: attach each node to the root.
        node = x
        while node != root:
            next_node = self.parent[node]
            self.parent[node] = root
            node = next_node
        return root

    def combine(self, x, y):
        """Merge the sets containing *x* and *y*.

        The strictly larger set absorbs the other; on a tie the root of
        *x*'s set is attached under the root of *y*'s set.
        """
        root_x, root_y = self.find(x), self.find(y)
        if root_x == root_y:
            return
        if self.size[root_x] > self.size[root_y]:
            keeper, absorbed = root_x, root_y
        else:
            keeper, absorbed = root_y, root_x
        self.parent[absorbed] = keeper
        self.size[keeper] += self.size[absorbed]

# similar if word overlap is greater than 50%
def similar(entity1, entity2):
    words1 = list(set(entity1.split()))
    words2 = list(set(entity2.split()))
    common = 0
    for word in words1:
        common += words2.count(word)
    return (2 * common > 0.5 * (len(words1) + len(words2)))


def cluster_similar_words(triples):
    """Collapse similar entities in *triples* onto one representative each.

    Every distinct subject and object string is treated as an entity.
    Entities judged alike by ``similar`` are merged transitively with a
    ``DSU``, and each subject/object is then replaced by the representative
    of its cluster. The entity counts before and after are printed.

    Args:
        triples: Mapping from sentence to a list of extraction dicts, each
            with a 'subject' string and an 'object' list of strings.
            Modified in place.
    """
    unique = set()
    for extractions in triples.values():
        for extraction in extractions:
            unique.add(extraction['subject'])
            unique.update(extraction['object'])
    # Sorted so the representative picked for a cluster does not depend on
    # set iteration order, which varies between interpreter runs.
    entities = sorted(unique)

    dsu = DSU(entities)

    # similar() is symmetric, so each unordered pair is checked once.
    for first in range(len(entities)):
        for second in range(first + 1, len(entities)):
            if similar(entities[first], entities[second]):
                dsu.combine(first, second)

    parent_of_word = {
        name: entities[dsu.find(index)] for index, name in enumerate(entities)
    }

    # Replace every subject and every object of every extraction. The object
    # loop must sit inside the per-extraction loop, otherwise only the last
    # extraction of a sentence is updated.
    for extractions in triples.values():
        for extraction in extractions:
            extraction['subject'] = parent_of_word[extraction['subject']]
            objects = extraction['object']
            for position, value in enumerate(objects):
                objects[position] = parent_of_word[value]

    print('original entities:', len(entities))
    print('reduced entities:', len(set(parent_of_word.values())))

## Create relation "similar to" between canonicals

In [6]:
def get_similar_to_edges(triples):
    """Build 'similar to' edges between the entities found in *triples*.

    Every distinct subject and object string is an entity. For each ordered
    pair of different entities that ``similar`` accepts, one edge of the form
    ``[entity_a, 'similar to', entity_b]`` is produced, so a similar pair
    appears in both directions.

    Args:
        triples: Mapping from sentence to a list of extraction dicts, each
            with a 'subject' and an iterable 'object'.

    Returns:
        A list of three-element lists as described above.
    """
    seen = set()
    for sentence in triples:
        for extraction in triples[sentence]:
            seen.add(extraction['subject'])
            for value in extraction['object']:
                seen.add(value)
    names = list(seen)

    return [
        [left, 'similar to', right]
        for left_index, left in enumerate(names)
        for right_index, right in enumerate(names)
        if left_index != right_index and similar(left, right)
    ]

# Canonicalisation using CESI

In [16]:
from nltk.stem import WordNetLemmatizer
import requests

API_ENDPOINT = "https://www.wikidata.org/w/api.php"

def _wikidata_id(term):
    """Return the id of the first Wikidata search hit for *term*, or None."""
    response = requests.get(
        API_ENDPOINT,
        params={
            'action': 'wbsearchentities',
            'format': 'json',
            'language': 'en',
            'search': term,
        },
        # Without a timeout a stalled connection blocks the whole run.
        timeout=30,
    )
    response.raise_for_status()
    # A payload without a 'search' key is treated as "no hit".
    hits = response.json().get('search') or []
    return hits[0]['id'] if hits else None


def _lemmatize_phrase(lemmatizer, phrase):
    """Lemmatize each whitespace-separated word of *phrase* and rejoin."""
    return ' '.join(lemmatizer.lemmatize(word) for word in phrase.split())


def prepare_cesi_triples(triples, filename):
    """Convert *triples* to one JSON object per line and write to *filename*.

    For every extraction a record is built containing the raw triple
    (subject, relation, first object), a lemmatized copy of it, the Wikidata
    ids found for the subject and the first object (None when the search
    returns nothing), and the source sentence. Progress is printed per
    extraction and the linked/unlinked totals are printed at the end.

    Args:
        triples: Mapping from sentence to a list of extraction dicts with
            'subject', 'relation' and 'object' keys. Only the first element
            of 'object' is used, so each extraction is assumed to carry at
            least one object.
        filename: Output path, written through ``write_text``.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    cesi_triples = []
    count_linked = 0
    count_unlinked = 0
    # The same entity recurs across extractions; query each term only once.
    link_cache = {}

    def lookup(term):
        if term not in link_cache:
            link_cache[term] = _wikidata_id(term)
        return link_cache[term]

    for sentence in triples:
        for extraction in triples[sentence]:
            subject = extraction['subject']
            relation = extraction['relation']
            # Link the same object that goes into the triple. Passing the
            # whole list as the search parameter would query for something
            # other than the emitted object.
            first_object = extraction['object'][0]

            sub_true_link = lookup(subject)
            ob_true_link = lookup(first_object)
            for link in (sub_true_link, ob_true_link):
                if link is None:
                    count_unlinked += 1
                else:
                    count_linked += 1

            print('sub:', extraction['subject'], sub_true_link, 'obj:', extraction['object'], ob_true_link)
            cesi_triples.append({
                'id_': len(cesi_triples),
                'triple': [subject, relation, first_object],
                'triple_norm': [
                    _lemmatize_phrase(wordnet_lemmatizer, subject),
                    _lemmatize_phrase(wordnet_lemmatizer, relation),
                    _lemmatize_phrase(wordnet_lemmatizer, first_object),
                ],
                'true_link': {
                    'subject': sub_true_link,
                    'object': ob_true_link,
                },
                'src_sentences': [sentence],
                'entity_linking': {},
                'kbp_info': [],
            })

    write_text('\n'.join([json.dumps(triple) for triple in cesi_triples]), filename)
    print('linked:', count_linked, 'unlinked:', count_unlinked)

In [17]:
# Hand-written canonical forms: each key is the canonical name and its list
# holds the surface forms that replace_predefined_canonicals rewrites to that
# key. Only the disabled replace_predefined_canonicals step below uses it.
canonicals = {
    'hostel student': ['resident student', 'resident students', 'hostel student', 'hostel students', 'hostel resident', 'hostel residents', 'hosteller', 'hostellers'],
    'student': ['student', 'students'],
    'instructor': ['instructor', 'professor', 'faculty'],
    'campus': ['campus', 'on campus', 'in campus', 'inside campus', 'iiitd', 'in iiitd', 'inside iiitd', 'iiitd campus', 'in iiitd campus', 'inside iiitd campus', 'in college', 'inside college']
}

# Active step: load the extracted triples and write them out in the format
# built by prepare_cesi_triples (this queries Wikidata for every extraction).
triples = read_json('../data/ollie_triples.json')
prepare_cesi_triples(triples, '../data/ollie_cesi_triples.txt')
# Disabled alternative 1: canonicalise with the predefined table above.
# replace_predefined_canonicals(triples, canonicals)
# write_json(triples, '../data/ollie_canonicalised_1_triples.json')


# Disabled alternative 2: canonicalise by clustering similar entities.
# triples = read_json('../data/ollie_triples.json')
# cluster_similar_words(triples)
# write_json(triples, '../data/ollie_canonicalised_2_triples.json')

# Disabled alternative 3: keep entities as-is and list "similar to" edges.
# triples = read_json('../data/ollie_triples.json')
# similar_edges = get_similar_to_edges(triples)
# print(similar_edges[:10])

sub: courier boys vehicle None obj: ['main parking'] None
sub: courier boys None obj: ['gf old building or reception'] None
sub: courier boys None obj: ['security guard'] Q856887
sub: courier boys None obj: ['no 1'] Q3346364
sub: courier boys None obj: ['delivery of courier'] None
sub: food items Q39342213 obj: ['delivery boy'] Q848466
sub: food items Q39342213 obj: ['reception'] Q31948
sub: food delivery boys vehicle None obj: ['main parking'] None
sub: food delivery boys None obj: ['reception'] Q31948
sub: food delivery boys None obj: ['delivery of food'] None
sub: food items Q39342213 obj: ['delivery boy'] Q848466
sub: visitor entry None obj: ['8'] Q23355
sub: access and no courier boy details None obj: ['provided register'] None
sub: visitor Q830719 obj: ['building complex'] Q1497364
sub: visitors entering campus None obj: ['cctv surveillance and number plate'] None
sub: campus Q209465 obj: ['self-driven cars'] Q58078234
sub: visitor coming to building complex in chauffeur driven c