In [2]:
import os, logging, pickle
from operator import itemgetter
import numpy as np
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors

# Emit timestamped INFO-level log records so that library progress messages
# (e.g. gensim's embedding loading) show up in the notebook output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# --- Folder layout below the entity-ranking output root ---------------------
entity_ranking_op_folder = '../../../Data/output/Entity_Ranking'
entity_rep_folder = 'EntityRep'
type_rep_folder = 'TypeRep/type_word'
type_words_folder = 'Type_Words'
type_word_file_name = 'tag_word.p'

# --- Word2vec embeddings in text format (loaded later with binary=False) ----
word_emnbedding_pretrained_trained_on_corpus = '../../../word_embeddings/trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt'

# --- Full paths of the two pickles read by the loading cell -----------------
entity_context_word_file = os.path.join(
    entity_ranking_op_folder,
    entity_rep_folder,
    'doc_entity_context_word.p',
)
type_word_file = os.path.join(
    entity_ranking_op_folder,
    type_rep_folder,
    type_words_folder,
    type_word_file_name,
)

In [4]:
# Role tags grouped by entity type (location / organisation / person).
loc_tag_list = ['LOC_Event', 'LOC_Accused', 'LOC_Victim', 'LOC_Others']
org_tag_list = ['ORG_Accused', 'ORG_Victim', 'ORG_Others']
per_tag_list = ['PER_Victim', 'PER_Others', 'PER_Accused']

# Every role tag, in the order LOC, ORG, PER.
tag_list = loc_tag_list + org_tag_list + per_tag_list

In [8]:
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by this project's own pipeline.
# The `with` blocks make sure both file handles are closed after loading.
with open(entity_context_word_file, 'rb') as entity_fh:
    # Indexed later as dict[doc_id][role] -> collection of entities.
    entity_rep_context_word_file_dict = pickle.load(entity_fh)

with open(type_word_file, 'rb') as type_fh:
    # Indexed by role tag; each value prints as a list of (word, score) tuples.
    type_word_dict = pickle.load(type_fh)

# Show the representative words stored for every role tag.
for tag in tag_list:
    print(tag, type_word_dict[tag], '\n')

LOC_Event [('near', 0.7774462699890137), ('A', 0.776093065738678), ('November', 0.7695729732513428), ('The', 0.7642201781272888), ('at', 0.7540594339370728), ('the', 0.7535964250564575), ('OPEN', 0.7424975633621216), ('attack', 0.7300781607627869), ('Monday', 0.728710412979126), ('on', 0.7264678478240967), ('Nadapuram', 0.7175939083099365), ('a', 0.713057816028595), ('people', 0.7122456431388855), ('city', 0.7111835479736328), ('bomb', 0.7110673189163208), ('in', 0.7069975733757019), ('Friday', 0.6984444856643677), ('case', 0.6968221664428711), ('Court', 0.6948869228363037), ('Masjid', 0.6918230056762695), ('blast', 0.6842758655548096), (',', 0.681684136390686), ('College', 0.6808955669403076), ('AP', 0.6805020570755005), ('Tuesday', 0.680214524269104), ("'s", 0.6764810085296631), ('around', 0.6760590672492981), ('three', 0.6759741306304932), ('and', 0.6748992800712585), ('least', 0.6737691164016724)] 

LOC_Accused [('completing', 0.2542300224304199), ('completion', 0.23934081196784973

In [9]:
# Load the text-format word2vec embeddings (slow: the recorded log shows a
# (410894, 300) matrix taking a couple of minutes to read).
word_vectors = KeyedVectors.load_word2vec_format(word_emnbedding_pretrained_trained_on_corpus, binary=False)

# Embedding dimensionality, read from the model itself rather than by probing
# a hard-coded vocabulary word, which raises KeyError when that word is
# missing from the embedding file.
size = word_vectors.vector_size

2018-05-01 20:31:17,169 : INFO : loading projection weights from ../../../word_embeddings/trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-01 20:33:44,776 : INFO : loaded (410894, 300) matrix from ../../../word_embeddings/trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [12]:
def group_average(entity, role_tuple, vectors=None):
    """Group-average cosine similarity of an entity's words and a role's words.

    Every word is mapped to its embedding and scaled to unit length. With
    ``s`` the sum of those ``n`` unit vectors, the result is
    ``(s . s - n) / (n * (n - 1))``, i.e. the mean cosine similarity over all
    ordered pairs of distinct words in the combined group.

    Parameters
    ----------
    entity : iterable of str
        Words describing the entity (iterated item by item).
    role_tuple : iterable of tuple
        Items whose first element is a word (e.g. ``(word, score)`` pairs);
        only the word is used.
    vectors : object with ``get_vector(word)``, optional
        Embedding lookup. Defaults to the notebook-level ``word_vectors``.

    Returns
    -------
    float
        The group-average similarity.

    Raises
    ------
    ValueError
        If fewer than two words are supplied in total (the average over
        pairs is undefined).
    KeyError
        Propagated from ``get_vector`` for out-of-vocabulary words.
    """
    lookup = word_vectors if vectors is None else vectors

    # Count the words actually passed in. The previous version used
    # ``len(role)``, which read a notebook-global loop variable (a tag string)
    # instead of the ``role_tuple`` argument and so mis-scaled the result.
    words = list(entity) + [tup[0] for tup in role_tuple]
    total_number_of_words = len(words)
    if total_number_of_words < 2:
        raise ValueError('group_average needs at least two words in total')

    sum_of_entity_type_word_vec = None
    for word in words:
        word_vec = lookup.get_vector(word)
        word_vec = word_vec / np.linalg.norm(word_vec)
        if sum_of_entity_type_word_vec is None:
            # Accumulate in float64, sized from the first vector seen, so the
            # function does not depend on a global dimensionality constant.
            sum_of_entity_type_word_vec = np.zeros(len(word_vec))
        sum_of_entity_type_word_vec = np.add(sum_of_entity_type_word_vec, word_vec)

    dot_product = np.dot(sum_of_entity_type_word_vec, sum_of_entity_type_word_vec)
    # s . s contains n self-similarities of 1.0 each; remove them before
    # averaging over the n * (n - 1) ordered pairs of distinct words.
    group_avg = float(dot_product - total_number_of_words) / (total_number_of_words * (total_number_of_words - 1))
    return group_avg
    

In [16]:
# Precision at K
K = 1

# Per role tag: accumulated precision and number of documents that actually
# contain at least one entity with that role.
precision1 = {tag: 0 for tag in tag_list}
doc_count_for_tag = {tag: 0 for tag in tag_list}

# A role is ranked only against entities of its own type, so the three groups
# (person, location, organisation) are scored separately, in this order.
role_groups = (per_tag_list, loc_tag_list, org_tag_list)

for doc_id in entity_rep_context_word_file_dict:
    print(doc_id)
    doc_role_dict = entity_rep_context_word_file_dict[doc_id]

    # Gold-standard number of entities per role in this document.
    num_actual_entities_with_role = {
        gold_role: len(role_entities)
        for gold_role, role_entities in doc_role_dict.items()
    }

    for group_tags in role_groups:
        # Candidate pool: every (entity, gold_role) of this entity type.
        candidates = [
            (entity, gold_role)
            for gold_role in doc_role_dict
            if gold_role in group_tags
            for entity in doc_role_dict[gold_role]
        ]
        cutoff = min(K, len(candidates))

        # NOTE: `role` is intentionally a notebook-level loop variable here.
        for role in group_tags:
            # Skip roles with no gold entity in this document. `.get` also
            # covers documents whose dict lacks the role key entirely.
            num_gold = num_actual_entities_with_role.get(role, 0)
            if num_gold == 0:
                continue

            doc_count_for_tag[role] += 1
            role_word = type_word_dict[role]

            # Rank all candidates by similarity to the role's word list.
            retrivedResult = sorted(
                [(candidate, group_average(candidate[0], role_word)) for candidate in candidates],
                key=itemgetter(1),
                reverse=True,
            )

            # Count top-K candidates whose gold role matches the scored role.
            TP = 0
            for rank in range(cutoff):
                if retrivedResult[rank][0][1] == role:
                    TP += 1

            if TP == num_gold:
                # Every gold entity was retrieved: full credit, even when
                # fewer than K gold entities exist.
                precision1[role] += 1
            else:
                precision1[role] += float(TP) / cutoff

# Report the mean precision@K (in percent) per role tag.
for tag in tag_list:
    if doc_count_for_tag[tag] > 0:
        print(tag, float(precision1[tag] * 100)/doc_count_for_tag[tag], '%')

ev_091_st_005.txt
ev_086_st_003.txt
ev_024_st_001.txt
ev_077_st_007.txt
ev_030_st_001.txt
2010_3_7_st-15.txt
ev_074_st_016.txt
ev_081_st_002.txt
2014_10_6_st-4004.txt
ev_077_st_009.txt
ev_082_st_010.txt
ev_073_st_005.txt
ev_039_st_003.txt
ev_085_st_019.txt
2012_10_18_st-822.txt
ev_019_st_003.txt
ev_066_st_002.txt
ev_071_st_003.txt
2012_2_17_st-51.txt
2013_7_3_st-16.txt
2010_7_14_st-567.txt
2010_6_22_st-450.txt
2011_2_11_st-101.txt
ev_086_st_013.txt
2012_11_27_st-531.txt
2013_4_18_st-778.txt
2011_6_18_st-512.txt
ev_084_st_016.txt
ev_019_st_034.txt
ev_019_st_026.txt
ev_076_st_001.txt
2013_2_24_st-2.txt
ev_072_st_007.txt
2011_6_6_st-21.txt
ev_013_st_001.txt
ev_092_st_006.txt
ev_086_st_037.txt
ev_021_st_005.txt
ev_018_st_001.txt
ev_019_st_028.txt
ev_064_st_015.txt
2013_9_27_st-689.txt
ev_051_st_002.txt
ev_077_st_021.txt
2013_5_23_st-40.txt
ev_019_st_013.txt
ev_082_st_014.txt
2012_5_11_st-576.txt
ev_075_st_019.txt
ev_086_st_031.txt
ev_058_st_005.txt
2010_12_10_st-64.txt
ev_085_st_009.txt
20

2010_4_28_st-370.txt
ev_035_st_011.txt
ev_039_st_015.txt
2010_11_9_st-595.txt
ev_039_st_010.txt
ev_038_st_001.txt
2012_3_14_st-46.txt
2011_1_9_st-454.txt
ev_064_st_027.txt
2010_4_3_st-376.txt
ev_008_st_001.txt
ev_076_st_004.txt
ev_076_st_003.txt
ev_075_st_008.txt
2010_8_21_st-267.txt
ev_035_st_012.txt
ev_037_st_001.txt
ev_094_st_016.txt
2012_12_27_st-7.txt
2014_12_4_st-6003.txt
2011_12_8_st-694.txt
ev_019_st_029.txt
2010_8_25_st-33.txt
ev_090_st_007.txt
ev_086_st_035.txt
2013_4_30_st-96.txt
ev_034_st_002.txt
ev_087_st_009.txt
ev_088_st_005.txt
2010_9_28_st-10.txt
ev_061_st_007.txt
ev_092_st_001.txt
ev_094_st_017.txt
ev_064_st_008.txt
ev_028_st_001.txt
2010_5_23_st-114.txt
2012_2_18_st-81.txt
ev_093_st_006.txt
2012_5_5_st-700.txt
ev_094_st_008.txt
ev_090_st_012.txt
ev_075_st_021.txt
ev_093_st_035.txt
2012_3_15_st-37.txt
ev_083_st_001.txt
2012_7_14_st-909.txt
ev_086_st_036.txt
ev_048_st_009.txt
ev_014_st_010.txt
ev_058_st_008.txt
ev_082_st_007.txt
ev_073_st_007.txt
ev_064_st_001.txt
2010

ev_085_st_013.txt
2013_4_17_st-15.txt
ev_085_st_018.txt
2012_11_30_st-777.txt
ev_015_st_010.txt
ev_093_st_015.txt
ev_060_st_003.txt
ev_077_st_005.txt
2012_2_12_st-650.txt
ev_019_st_020.txt
ev_066_st_022.txt
2014_10_13_st-3.txt
2011_9_23_st-313.txt
2013_4_18_st-52.txt
ev_061_st_017.txt
2011_10_2_st-422.txt
2013_4_19_st-1091.txt
2011_9_9_st-44.txt
ev_062_st_006.txt
ev_059_st_007.txt
2013_6_26_st-36.txt
ev_086_st_007.txt
2010_6_18_st-6.txt
ev_083_st_035.txt
ev_062_st_007.txt
ev_070_st_017.txt
ev_086_st_018.txt
ev_089_st_023.txt
ev_093_st_009.txt
2011_1_21_st-764.txt
2011_8_13_st-46.txt
ev_089_st_008.txt
2013_4_3_st-489.txt
ev_048_st_006.txt
ev_051_st_004.txt
ev_086_st_024.txt
ev_094_st_025.txt
ev_019_st_002.txt
ev_049_st_003.txt
ev_037_st_006.txt
ev_081_st_013.txt
ev_072_st_018.txt
ev_090_st_008.txt
ev_089_st_038.txt
ev_078_st_008.txt
ev_063_st_005.txt
2013_1_11_st-22.txt
ev_073_st_009.txt
ev_011_st_003.txt
ev_019_st_041.txt
ev_083_st_016.txt
ev_093_st_014.txt
ev_081_st_014.txt
2010_5_20_