#### This code tests the following configuration:

Entity Representation: Doc2vec of Context words + Words in Entity

Type Representation: Type representation learned on training corpus

Ranking Score: Cosine distance

Word Representation: Word vectors trained on the corpus, initialized with pretrained GloVe embeddings

In [7]:
import os, pickle
from operator import itemgetter
import numpy as np

In [8]:
# Root folder that holds every artefact produced by the entity-ranking pipeline.
entity_ranking_op_folder = "../../Data/output"

# Entity representations of the test documents (sub-folder + pickle name).
entity_rep_test_folder = "EntityRep/test"
entity_rep_file_name = "doc_role_entity_context_word_doc2vec_dict.p"

# Type (role tag) representations (sub-folder + pickle name).
type_rep_folder = "TypeRep/type_word/train"
type_rep_file_name = "tag_vec_dict.p"

# Full paths of the two pickles, both resolved under the shared output root.
entity_rep_file = os.path.join(
    entity_ranking_op_folder,
    entity_rep_test_folder,
    entity_rep_file_name,
)
type_rep_file = os.path.join(
    entity_ranking_op_folder,
    type_rep_folder,
    type_rep_file_name,
)

In [9]:
# Role tags, grouped by the entity type they apply to.
loc_tag_list = ['LOC_Event', 'LOC_Accused', 'LOC_Victim', 'LOC_Others']
org_tag_list = ['ORG_Accused', 'ORG_Victim', 'ORG_Others']
per_tag_list = ['PER_Victim', 'PER_Others', 'PER_Accused']

# Every role tag: locations first, then organisations, then persons.
tag_list = loc_tag_list + org_tag_list + per_tag_list

In [10]:
# Load the pickled representations. The files are opened with ``with`` so the
# handles are closed as soon as unpickling finishes.
# NOTE(security): pickle can execute arbitrary code while loading; only load
# files that were produced by this project's own pipeline.
with open(entity_rep_file, 'rb') as entity_rep_fh:
    # Indexed below as [doc_id][role] -> entities of that document with that role.
    entity_rep_doc_role_context_dict = pickle.load(entity_rep_fh)

with open(type_rep_file, 'rb') as type_rep_fh:
    # Indexed below as [role] -> vector representing that role tag.
    type_rep_dict = pickle.load(type_rep_fh)

In [11]:
def cosine_distance(a, b):
    """Return the cosine distance ``1 - cos(a, b)`` between two vectors.

    Args:
        a: First vector (anything accepted by ``np.dot`` / ``np.linalg.norm``).
        b: Second vector, same length as ``a``.

    Returns:
        0 for vectors pointing in the same direction, 1 for orthogonal
        vectors and 2 for opposite vectors. When either vector has zero
        norm the cosine is undefined; 1.0 is returned (similarity treated as
        0) instead of dividing by zero and producing NaN, which would make
        any ranking sorted on this value unreliable.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # At least one all-zero vector: no direction to compare against.
        return 1.0
    return 1 - np.dot(a, b) / norm_product

In [12]:
# Precision at K
K = 3

# Role tags grouped by entity type. When ranking for a role, an entity only
# competes against the document's other entities of the same type.
_ROLE_GROUPS = (per_tag_list, loc_tag_list, org_tag_list)


def _candidates_by_group(doc_role_dict):
    """Split one document's entities into a candidate list per role group.

    Returns a list parallel to ``_ROLE_GROUPS``; each element holds
    ``(entity_representation, gold_role)`` pairs in the iteration order of
    ``doc_role_dict`` (this order decides ties in the later stable sort).
    """
    grouped = [[] for _ in _ROLE_GROUPS]
    for role, entities in doc_role_dict.items():
        for group_roles, bucket in zip(_ROLE_GROUPS, grouped):
            if role in group_roles:
                bucket.extend((entity, role) for entity in entities)
    return grouped


def _precision_for_role(role, role_vector, candidates, num_gold, k):
    """Score one role of one document.

    ``candidates`` are ranked by ascending cosine distance to ``role_vector``
    and the top ``min(k, len(candidates))`` are inspected. The score is 1.0
    when the number of hits equals ``num_gold`` (every gold entity of the role
    was retrieved), otherwise hits divided by the number of inspected entities.
    ``candidates`` must be non-empty.
    """
    ranked = sorted(
        candidates,
        key=lambda candidate: cosine_distance(candidate[0], role_vector),
    )
    cutoff = min(k, len(candidates))
    hits = sum(1 for _, gold_role in ranked[:cutoff] if gold_role == role)
    if hits == num_gold:
        return 1.0
    return float(hits) / cutoff


# Per tag: sum of per-document scores, and number of documents in which the
# tag has at least one gold entity.
precision1 = {tag: 0 for tag in tag_list}
doc_count_for_tag = {tag: 0 for tag in tag_list}

for doc_role_dict in entity_rep_doc_role_context_dict.values():
    grouped_candidates = _candidates_by_group(doc_role_dict)
    for group_roles, candidates in zip(_ROLE_GROUPS, grouped_candidates):
        for role in group_roles:
            # A role missing from the document counts as having no entities
            # instead of raising KeyError.
            num_gold = len(doc_role_dict.get(role, ()))
            if num_gold == 0:
                continue
            doc_count_for_tag[role] += 1
            precision1[role] += _precision_for_role(
                role, type_rep_dict[role], candidates, num_gold, K
            )

# Report the mean score per tag as a percentage; tags that never occur in the
# evaluated documents are skipped to avoid dividing by zero.
for tag in tag_list:
    if doc_count_for_tag[tag] > 0:
        print(tag, float(precision1[tag] * 100)/doc_count_for_tag[tag], '%')

LOC_Event 56.261022927689595 %
LOC_Accused 26.041666666666664 %
LOC_Victim 36.11111111111111 %
LOC_Others 67.1666666666667 %
ORG_Accused 61.162079510703386 %
ORG_Victim 51.111111111111114 %
ORG_Others 84.33333333333331 %
PER_Victim 63.541666666666664 %
PER_Others 83.90151515151511 %
PER_Accused 66.13756613756613 %


In [41]:
from operator import itemgetter
# Scratch check: order (label, score) pairs by score, highest first.
data = [
    ('abc', 1.21),
    ('abc', 2.31),
    ('abc', 1.48),
    ('abc', 2.21),
]
data.sort(key=itemgetter(1), reverse=True)
data[:4]

[('abc', 2.31), ('abc', 2.21), ('abc', 1.48), ('abc', 1.21)]