#### This code Tests following configuration:

Entity Representation: Words in Entity and Context Words

Type Representation: Similar words to type representation learned on training corpus

Ranking Score: Group Average

Word Representation: Trained word vectors on corpus initialized with pretrained glove vector embeddings

In [1]:
import os, logging, pickle
from operator import itemgetter
import numpy as np
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [26]:
entity_ranking_op_folder = '../../Data/output'
type_rep_folder = 'TypeRep/type_word/train'
entity_rep_test_folder = 'EntityRep/test'
entity_context_word_file = 'doc_entity_context_word.p'
entity_doc_level_context_word_file = 'doc_entity_doc_level_context_word.p'
type_word_file_name = 'tag_word_dict.p'

doc_level = True
sent_level = False

word_emnbedding_pretrained_trained_on_corpus = '../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt'

entity_context_word_file = os.path.join(entity_ranking_op_folder, entity_rep_test_folder, entity_context_word_file)
entity_doc_level_context_word_file = os.path.join(entity_ranking_op_folder, entity_rep_test_folder, entity_doc_level_context_word_file)
type_word_file = os.path.join(entity_ranking_op_folder, type_rep_folder, type_word_file_name)

In [27]:
tag_list = ['LOC_Event', 'LOC_Accused', 'LOC_Victim', 'LOC_Others', 'ORG_Accused', 'ORG_Victim', 'ORG_Others', 'PER_Victim', 'PER_Others', 'PER_Accused']
per_tag_list = ['PER_Victim', 'PER_Others', 'PER_Accused']
loc_tag_list = ['LOC_Event', 'LOC_Accused', 'LOC_Victim', 'LOC_Others']
org_tag_list = ['ORG_Accused', 'ORG_Victim', 'ORG_Others']

In [28]:
if sent_level:
    entity_rep_context_word_file_dict = pickle.load(open(entity_context_word_file, 'rb'))
if doc_level:
    entity_rep_context_word_file_dict = pickle.load(open(entity_doc_level_context_word_file, 'rb'))
type_word_dict = pickle.load(open(type_word_file, 'rb'))
for tag in tag_list:
    print(tag, type_word_dict[tag], '\n')

LOC_Event [('298.8', 0.9452037215232849), ('friday', 0.9345641136169434), ('one', 0.9336049556732178), ('bomb', 0.9318419694900513), ('near', 0.9280843734741211), ('blast', 0.9222268462181091), ('monday', 0.9185738563537598), ('two', 0.9151285886764526), ('waste-papers', 0.9148021936416626), ('wednesday', 0.9094426035881042), ('riyas', 0.9089627861976624), ('people', 0.9072010517120361), ('brought', 0.9038133025169373), ('area', 0.9037970304489136), ('killed', 0.9035921692848206), ('blasts', 0.9032737016677856), ('attack', 0.9032616019248962), ('another', 0.9013210535049438), ('first', 0.8993592262268066), ('said', 0.8977240920066833), ('asimanand', 0.8961100578308105), ('time', 0.894812822341919), ('outside', 0.8946843147277832), ('explosion', 0.8944391012191772), ('day', 0.8940559029579163), ('just', 0.8935253620147705), ('khallikote', 0.8933696746826172), ('since', 0.8933420181274414), ('ttp-pakistan', 0.8924744129180908), ('went', 0.8914841413497925)] 

LOC_Accused [('asimanand', 0

In [5]:
word_vectors = KeyedVectors.load_word2vec_format(word_emnbedding_pretrained_trained_on_corpus, binary=False)

2018-05-25 06:34:57,024 : INFO : loading projection weights from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-25 06:37:04,361 : INFO : loaded (403308, 300) matrix from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [6]:
word_vector = word_vectors.get_vector('narendra')
size = len(word_vector)

In [7]:
def group_average(entity, role_tuple):
    num_of_words_in_entity = len(entity)
    num_of_words_in_role = len(role)
    total_number_of_words = num_of_words_in_entity + num_of_words_in_role
    
    sum_of_entity_type_word_vec = np.zeros(size)
    for word in entity:
        try:
            word_vec = word_vectors.get_vector(word)
            word_vec = word_vec/np.linalg.norm(word_vec)
        except KeyError:
            word_vec = np.zeros(size)
            
        sum_of_entity_type_word_vec = np.add(sum_of_entity_type_word_vec, word_vec)
    
    for tup in role_tuple:
        try:
            word_vec = word_vectors.get_vector(tup[0])
            word_vec = word_vec/np.linalg.norm(word_vec)
        except KeyError:
            word_vec = np.zeros(size)
        
        sum_of_entity_type_word_vec = np.add(sum_of_entity_type_word_vec, word_vec)
        
    dot_product = np.dot(sum_of_entity_type_word_vec, sum_of_entity_type_word_vec)
    group_avg = float(dot_product - (total_number_of_words))/(total_number_of_words * (total_number_of_words - 1))
    return group_avg    
    

In [10]:
''' This function removes duplicate entity,tag pair

#Example:
(('Mumbai', 'Event'), 8)
(('Mumbai', 'Others'), 7)
(('Mumbai', 'Event'), 5)
(('Mumbai', 'Others'), 4)
(('Mumbai', 'Others'), 2)
(('Mumbai', 'Event'), 1)

Output

(('Mumbai', 'Event'), 8)
(('Mumbai', 'Others'), 7)
'''
def removeDuplicatesFromResult(retrivedResult):
    newretrievedResult = list()
    retrievedResultSet = set()
    for result in retrivedResult:
        entity_tag_pair = result[0][0][0] +'_'+result[0][1]
        if entity_tag_pair in retrievedResultSet:
            pass
        else:
            retrievedResultSet.add(entity_tag_pair)
            newretrievedResult.append(result)
            #print(entity_tag_pair)
    return newretrievedResult

In [33]:
precision1 = dict()
doc_count_for_tag = dict()
AveP = dict()

for tag in tag_list:
    doc_count_for_tag[tag] = 0
    precision1[tag] = 0
    AveP[tag] = 0
    

# Precision at K
K = 5

for doc_id in entity_rep_context_word_file_dict:
    #print(doc_id)
    num_actual_entities_with_role = dict()
   
    per_entities = list()
    loc_entities = list()
    org_entities = list()
    doc_role_dict = entity_rep_context_word_file_dict[doc_id]
    for role in doc_role_dict.keys():
        entities = doc_role_dict[role]
        num_actual_entities_with_role[role] = len(entities)
        if role in per_tag_list:
            for entity in entities:
                per_entities.append((entity, role))
        if role in org_tag_list:
            for entity in entities:
                org_entities.append((entity, role))
        if role in loc_tag_list:
            for entity in entities:
                loc_entities.append((entity, role))
    
    num_per_entity = len(per_entities)
    num_org_entity = len(org_entities)
    num_loc_entity = len(loc_entities)
    
    for role in per_tag_list:
        role_word = type_word_dict[role]
        retrivedResult = list()
        if num_actual_entities_with_role[role] != 0:
            TP = 0
            avp = 0
            doc_count_for_tag[role] = doc_count_for_tag[role] + 1
            for entity in per_entities:
                sim = 2
                sim = group_average(entity[0][1], role_word)
                retrivedResult.append((entity, sim))
            retrivedResult = sorted(retrivedResult,key=itemgetter(1), reverse=True)
            
            #Remove duplicates and update number of entities
            retrivedResult = removeDuplicatesFromResult(retrivedResult)
            num_per_entity = len(retrivedResult)
            
            for i in range(min(K, num_per_entity)):
                #print(len(retrivedResult))
                #print(role)
                #print(num_actual_entities_with_role)
                if retrivedResult[i][0][1] == role:
                    TP = TP + 1
                    avp = avp + (float(TP)/(i+1))
            #if TP ==  num_actual_entities_with_role[role]:
             #   precision1[role] = precision1[role] + 1
            #else:
            precision1[role] = precision1[role] + float(TP)/min(K, num_per_entity)
            if TP != 0:
                AveP[role] = AveP[role] + float(avp)/TP
                
    for role in loc_tag_list:
        role_word = type_word_dict[role]
        retrivedResult = list()
        if num_actual_entities_with_role[role] != 0:
            TP = 0
            avp = 0
            doc_count_for_tag[role] = doc_count_for_tag[role] + 1
            for entity in loc_entities:
                sim = 2
                sim = group_average(entity[0][1], role_word)
                retrivedResult.append((entity, sim))
            retrivedResult = sorted(retrivedResult,key=itemgetter(1), reverse=True)
            
            #Remove duplicates and update number of entities
            retrivedResult = removeDuplicatesFromResult(retrivedResult)
            num_loc_entity = len(retrivedResult)
            
            for i in range(min(K, num_loc_entity)):
                #print(len(retrivedResult))
                #print(role)
                #print(num_actual_entities_with_role)
                if retrivedResult[i][0][1] == role:
                    TP = TP + 1
                    avp = avp + (float(TP)/(i+1))
            #if TP ==  num_actual_entities_with_role[role]:
             #   precision1[role] = precision1[role] + 1
            #else:
            precision1[role] = precision1[role] + float(TP)/min(K, num_loc_entity)
            if TP != 0:
                AveP[role] = AveP[role] + float(avp)/TP
                
    for role in org_tag_list:
        role_word = type_word_dict[role]
        retrivedResult = list()
        if num_actual_entities_with_role[role] != 0:
            TP = 0
            avp = 0
            doc_count_for_tag[role] = doc_count_for_tag[role] + 1
            for entity in org_entities:
                sim = 2
                sim = group_average(entity[0][1], role_word)
                retrivedResult.append((entity, sim))
            retrivedResult = sorted(retrivedResult,key=itemgetter(1), reverse=True)
            
            #Remove duplicates and update number of entities
            retrivedResult = removeDuplicatesFromResult(retrivedResult)
            num_org_entity = len(retrivedResult)
            
            for i in range(min(K, num_org_entity)):
                
                #print(len(retrivedResult))
                #print(role)
                #print(num_actual_entities_with_role)
                if retrivedResult[i][0][1] == role:
                    TP = TP + 1
                    avp = avp + (float(TP)/(i+1))
           # if TP ==  num_actual_entities_with_role[role]:
               # precision1[role] = precision1[role] + 1
            #else:
            precision1[role] = precision1[role] + float(TP)/min(K, num_org_entity)
            if TP != 0:
                AveP[role] = AveP[role] + float(avp)/TP
    #print(num_actual_entities_with_role)
    #count = 1

avg_pre = 0
mean_avg_pre = 0
for tag in tag_list:
    if tag[4:] != 'Others' and tag!= 'LOC_Victim':
        if doc_count_for_tag[tag] > 0:
            avg_pre = avg_pre + float(precision1[tag] * 100)/doc_count_for_tag[tag]
            mean_avg_pre = mean_avg_pre + float(AveP[tag] * 100)/doc_count_for_tag[tag]
        #print(tag, float(precision1[tag] * 100)/doc_count_for_tag[tag], '%')
            print(tag, float(AveP[tag] * 100)/doc_count_for_tag[tag], '%')
#print('Average', avg_pre/10, '%')

print('MAP not including others', mean_avg_pre/6, '%')

LOC_Event 63.94617074701818 %
LOC_Accused 46.97530864197531 %
ORG_Accused 58.76034278959812 %
ORG_Victim 63.277777777777786 %
PER_Victim 53.16595441595442 %
PER_Accused 63.79357298474945 %
MAP not including others 58.31985455951221 %
