This implementation performs the following steps:

1. Reads the entity words and context words stored in the pickled dictionary doc_entity_context_word.p

Dictionary format

Element of Dictionary [doc_id: dictionary of roles]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is itself a list of words


2. Reads the word vectors of those words from the trained word embeddings
3. Takes the centroid (average) of the word vectors of each entity
4. Stores the centroids back in a dictionary

Dictionary format

Element of Dictionary [doc_id: dictionary of roles]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is a 300-d vector: the centroid of that entity's word vectors

In [1]:
import os, pickle
import gensim, logging
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
import scipy, numpy


# Emit timestamped INFO-level records so gensim's loading progress shows up in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# Root folder for the data outputs; the per-split sub-folders below are joined onto it when loading.
data_output_folder = '../../Data/output'
# Sub-folders (relative to data_output_folder) holding the train / test entity representations.
train_entity_rep_output_folder = 'Entity_Ranking/EntityRep/train'
test_entity_rep_output_folder = 'Entity_Ranking/EntityRep/test'
# Name of the pickle file that is read from each split folder.
entity_context_word_file = 'doc_entity_context_word.p'


# Word2vec embeddings stored in text format (loaded later with binary=False).
word_emnbedding_pretrained_trained_on_corpus = '../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt'

In [3]:
# Load the pickled {doc_id: {role: [entity word lists]}} dictionaries for the train and test splits.
# Context managers guarantee the file handles are closed even if unpickling fails
# (the bare open(...) calls used before were never closed).
# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
with open(os.path.join(data_output_folder, train_entity_rep_output_folder, entity_context_word_file), 'rb') as train_pickle_file:
    train_doc_role_entity_context_word_dict = pickle.load(train_pickle_file)

with open(os.path.join(data_output_folder, test_entity_rep_output_folder, entity_context_word_file), 'rb') as test_pickle_file:
    test_doc_role_entity_context_word_dict = pickle.load(test_pickle_file)

In [4]:
# Read the text-format (non-binary) word2vec embeddings into a KeyedVectors lookup.
word_vectors = KeyedVectors.load_word2vec_format(
    word_emnbedding_pretrained_trained_on_corpus,
    binary=False,
)

2018-05-03 00:22:05,847 : INFO : loading projection weights from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-03 00:24:11,758 : INFO : loaded (408497, 300) matrix from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [5]:
# Embedding dimensionality, read from the model itself so that `size` is defined
# even if the probe word below is missing from the vocabulary.
size = word_vectors.vector_size

# Sanity check: look up one known word and show a cosine distance between two words.
word_vector = word_vectors.get_vector('Narendra')
#print(type(word_vector))
word_vectors.distance('Jammu', 'Narendra')

0.8915391432210585

In [9]:
def build_doc_role_entity_centroid_dictionary(doc_role_entity_context_word_dict,
                                              vectors=None, vector_size=None,
                                              verbose=True):
    """Replace every entity's word list with the centroid of its word vectors.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        ``{doc_id: {role: [entity, ...]}}`` where each entity is an iterable
        of word strings.
    vectors : object, optional
        Embedding lookup exposing ``get_vector(word)`` that raises ``KeyError``
        for unknown words. Defaults to the notebook-level ``word_vectors``.
    vector_size : int, optional
        Length of the centroid vectors. Defaults to ``vectors.vector_size``
        when that attribute exists, otherwise to the notebook-level ``size``.
    verbose : bool, optional
        When true (the default, matching the original behaviour) print each
        document id as it is processed.

    Returns
    -------
    dict
        ``{doc_id: {role: [centroid, ...]}}`` with one ``numpy.ndarray`` of
        length ``vector_size`` per entity, in the same order as the input.

    Notes
    -----
    Out-of-vocabulary words are ignored. An entity with no in-vocabulary
    words (including an empty entity) gets an all-zero centroid; previously
    this case divided by zero and produced a NaN vector.
    """
    # Fall back to the notebook globals only when the caller did not supply
    # an explicit lookup / dimensionality.
    if vectors is None:
        vectors = word_vectors
    if vector_size is None:
        vector_size = getattr(vectors, 'vector_size', None)
        if vector_size is None:
            vector_size = size

    doc_role_entity_context_word_centroid_dict = dict()

    for doc, tag_dict in doc_role_entity_context_word_dict.items():
        if verbose:
            print(doc)
        tag_centroid_dict = dict()
        for tag, entity_list in tag_dict.items():
            entity_centroid_list = list()
            for entity in entity_list:
                entity_centroid = numpy.zeros(vector_size)
                # Count only the words that actually contributed a vector.
                num_of_known_words = 0
                for word in entity:
                    try:
                        entity_centroid = numpy.add(entity_centroid, vectors.get_vector(word))
                    except KeyError:
                        # Word is not in the embedding vocabulary: skip it.
                        continue
                    num_of_known_words += 1

                # Guard against 0/0: leave the zero vector when nothing was found.
                if num_of_known_words > 0:
                    entity_centroid = entity_centroid / num_of_known_words
                entity_centroid_list.append(entity_centroid)
            tag_centroid_dict[tag] = entity_centroid_list
        doc_role_entity_context_word_centroid_dict[doc] = tag_centroid_dict

    return doc_role_entity_context_word_centroid_dict

In [10]:
# Build the per-entity centroid dictionaries for the train split, then the test split.
train_doc_role_entity_context_word_centroid_dict = (
    build_doc_role_entity_centroid_dictionary(
        train_doc_role_entity_context_word_dict
    )
)
test_doc_role_entity_context_word_centroid_dict = (
    build_doc_role_entity_centroid_dictionary(
        test_doc_role_entity_context_word_dict
    )
)

ev_089_st_030.txt
2012_11_7_st-761.txt
2011_12_26_st-62.txt
2013_2_22_st-438.txt
ev_039_st_010.txt
2011_10_2_st-422.txt
ev_080_st_005.txt
ev_039_st_001.txt
2011_9_10_st-536.txt
2012_2_2_st-85.txt
ev_062_st_008.txt
ev_015_st_001.txt
ev_035_st_013.txt
2010_7_14_st-41.txt
ev_086_st_034.txt
2012_6_28_st-97.txt
ev_094_st_012.txt
2014_10_9_st-4003.txt
ev_003_st_002.txt
2013_4_18_st-375.txt
ev_022_st_002.txt
ev_048_st_006.txt
2011_7_15_st-18.txt
2011_6_21_st-55.txt
ev_035_st_009.txt
ev_011_st_002.txt
ev_075_st_011.txt
ev_087_st_016.txt
ev_015_st_014.txt
ev_076_st_003.txt
2013_5_18_st-88.txt
ev_079_st_001.txt
2013_7_3_st-16.txt
2010_9_23_st-549.txt
ev_085_st_001.txt
ev_034_st_001.txt
ev_094_st_021.txt
2013_5_23_st-349.txt
ev_049_st_003.txt
ev_019_st_027.txt
ev_077_st_010.txt
2012_10_6_st-100.txt
ev_089_st_041.txt
ev_064_st_013.txt
ev_072_st_007.txt
2011_1_9_st-454.txt
ev_042_st_006.txt
ev_080_st_001.txt
ev_081_st_002.txt
ev_040_st_002.txt
ev_019_st_042.txt
ev_084_st_004.txt
ev_082_st_003.txt
2

ev_034_st_003.txt
ev_092_st_022.txt
ev_066_st_011.txt
ev_078_st_010.txt
2013_9_6_st-763.txt
2013_7_2_st-26.txt
ev_035_st_001.txt
ev_039_st_006.txt
ev_072_st_012.txt
ev_030_st_002.txt
ev_019_st_049.txt
ev_087_st_015.txt
ev_066_st_018.txt
2012_2_18_st-81.txt
ev_082_st_016.txt
2011_8_26_st-5.txt
ev_066_st_005.txt
ev_093_st_031.txt
2011_9_8_st-4.txt
ev_082_st_010.txt
2011_3_17_st-317.txt
ev_094_st_024.txt
ev_093_st_025.txt
ev_019_st_016.txt
2013_4_27_st-79.txt
ev_068_st_003.txt
ev_077_st_008.txt
2013_4_16_st-27.txt
ev_083_st_019.txt
2011_10_30_st-99.txt
2010_6_18_st-6.txt
ev_028_st_003.txt
2012_2_19_st-673.txt
ev_087_st_022.txt
ev_087_st_019.txt
2013_9_28_st-648.txt
ev_049_st_002.txt
2010_12_22_st-19.txt
ev_064_st_024.txt
ev_066_st_022.txt
ev_045_st_006.txt
2010_11_26_st-345.txt
ev_083_st_025.txt
ev_094_st_015.txt
ev_081_st_001.txt
ev_077_st_019.txt
ev_092_st_005.txt
2011_2_22_st-442.txt
ev_019_st_032.txt
2011_12_8_st-694.txt
ev_019_st_026.txt
ev_084_st_028.txt
ev_092_st_024.txt
2014_10_13

2010_5_1_st-420.txt
ev_007_st_001.txt
ev_063_st_007.txt
ev_063_st_009.txt
2012_2_15_st-16.txt
ev_068_st_005.txt
2012_10_4_st-750.txt
ev_089_st_003.txt
2013_8_29_st-76.txt
ev_062_st_002.txt
ev_070_st_010.txt
ev_070_st_008.txt
ev_025_st_004.txt
ev_086_st_019.txt
ev_075_st_001.txt
ev_059_st_006.txt
ev_012_st_001.txt
ev_089_st_017.txt
2010_3_2_st-12.txt
ev_019_st_014.txt
2011_9_21_st-21.txt
ev_048_st_004.txt
ev_094_st_019.txt
ev_063_st_001.txt
2010_6_5_st-12.txt
ev_036_st_001.txt
ev_070_st_012.txt
ev_070_st_005.txt
ev_085_st_009.txt
ev_090_st_004.txt
ev_019_st_018.txt
ev_066_st_017.txt
2013_6_12_st-775.txt
ev_074_st_001.txt
ev_037_st_005.txt
2013_3_5_st-713.txt
2010_9_14_st-409.txt
ev_015_st_015.txt
2010_12_14_st-26.txt
ev_039_st_003.txt
ev_084_st_025.txt
ev_094_st_004.txt
ev_067_st_001.txt
ev_077_st_016.txt
2012_5_2_st-55.txt
ev_076_st_017.txt
ev_026_st_002.txt
2010_11_7_st-634.txt
2010_6_15_st-364.txt
2013_3_2_st-498.txt
2012_12_27_st-7.txt
2012_1_15_st-5.txt
ev_080_st_003.txt
ev_083_st_

In [11]:
# Persist the centroid dictionaries for both splits.
# Context managers ensure each file is flushed and closed once the dump finishes
# (the bare open(...) handles used before were never closed explicitly).
# NOTE(review): these output paths omit the 'Entity_Ranking' folder used when the
# input pickles were loaded - confirm this is the intended destination.
with open('../../Data/output/EntityRep/train/doc_role_entity_context_word_centroid.p', 'wb') as train_centroid_file:
    pickle.dump(train_doc_role_entity_context_word_centroid_dict, train_centroid_file)

with open('../../Data/output/EntityRep/test/doc_role_entity_context_word_centroid.p', 'wb') as test_centroid_file:
    pickle.dump(test_doc_role_entity_context_word_centroid_dict, test_centroid_file)