This implementation:

1. Reads the entity words and context words stored in the pickled dictionary `doc_entity_context_word.p` (or `doc_entity_doc_Level_context_word.p` when `doc_level` is set)

Dictionary format

Element of Dictionary [doc_id: dictionary of roles]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

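Each entity in the entity list is a pair `(entity[0], entity[1])`, where `entity[1]` is the list of words for that entity and `entity[0]` is carried through to the output unchanged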


2. Reads the word vectors of those words from the trained word embeddings
3. Takes the centroid (average) of the word vectors of each entity
4. Stores them back in a dictionary

Dictionary format

Element of Dictionary [doc_id: dictionary of roles]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is a pair `(entity[0], centroid)`, where `centroid` is the 300-d average of the word vectors of that entity's words

In [1]:
import os, pickle
import gensim, logging
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
import scipy, numpy


# Emit timestamped INFO-level log lines so that progress messages from the
# libraries used below (e.g. the embedding loader) show up in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [2]:
# --- Input / output locations -------------------------------------------------
# Root folder of the generated data and the train/test sub-folders inside it.
data_output_folder = '../../Data/output'
train_entity_rep_output_folder = 'EntityRep/train'
test_entity_rep_output_folder = 'EntityRep/test'

# Pickled {doc: {role: [entity, ...]}} dictionaries read by the loading cell.
entity_context_word_file = 'doc_entity_context_word.p'
entity_doc_level_context_word_file = 'doc_entity_doc_Level_context_word.p'

# --- Context granularity ------------------------------------------------------
# Edit only `doc_level`. `sent_level` (presumably "sentence level") is derived
# from it so that exactly one of the two flags is ever True; previously both
# could be True (doc-level silently overrode the other) or both False (nothing
# was loaded at all).
doc_level = True
sent_level = not doc_level


# Text-format word2vec embeddings (loaded with binary=False further down).
word_emnbedding_pretrained_trained_on_corpus = '../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt'

In [4]:
# Load the pickled {doc: {role: [entity, ...]}} dictionaries for train and test.

def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Select exactly one input file. Doc-level wins when both flags are set, which
# matches the earlier behaviour where the doc-level load overwrote the
# sentence-level one (but no longer reads the file that would be discarded).
if doc_level:
    context_word_file = entity_doc_level_context_word_file
elif sent_level:
    context_word_file = entity_context_word_file
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError('Neither doc_level nor sent_level is set; nothing to load.')

train_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, train_entity_rep_output_folder, context_word_file)
)
test_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, test_entity_rep_output_folder, context_word_file)
)

In [5]:
# Read the text-format (binary=False) word2vec file into a KeyedVectors lookup.
word_vectors = KeyedVectors.load_word2vec_format(
    word_emnbedding_pretrained_trained_on_corpus,
    binary=False,
)

2018-05-11 04:22:59,637 : INFO : loading projection weights from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-11 04:25:10,499 : INFO : loaded (408497, 300) matrix from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [6]:
# Probe one vocabulary word; the length of its vector is the embedding
# dimensionality, kept in `size` for later cells.
word_vector = word_vectors.get_vector('Narendra')
size = word_vector.shape[0]

# Sanity check shown as the cell output: distance between two vocabulary words.
word_vectors.distance('Jammu', 'Narendra')

0.8915391432210585

In [7]:
def build_doc_role_entity_centroid_dictionary(doc_role_entity_context_word_dict, vectors=None):
    """Replace every entity's word list with the centroid of its word vectors.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        ``{doc: {role: [entity, ...]}}``. Each entity is an indexable pair:
        ``entity[1]`` is an iterable of words and ``entity[0]`` is copied to the
        output unchanged.
    vectors : optional
        Embedding lookup exposing ``get_vector(word)`` (raising ``KeyError`` for
        unknown words) and a ``vector_size`` attribute. When omitted, the
        notebook-level ``word_vectors`` is used, as before.

    Returns
    -------
    dict
        ``{doc: {role: [(entity[0], centroid), ...]}}`` with the same documents,
        roles and entity order as the input. ``centroid`` is the mean of the
        vectors of the entity's in-vocabulary words. Entities with no
        in-vocabulary word (or an empty word list) get an all-zero vector
        instead of the NaN vector that a 0/0 division used to produce.
    """
    if vectors is None:
        # Fall back to the embeddings loaded earlier in the notebook.
        vectors = word_vectors
    dim = vectors.vector_size

    doc_role_entity_context_word_centroid_dict = dict()

    for doc, tag_dict in doc_role_entity_context_word_dict.items():
        tag_centroid_dict = dict()
        for tag, entity_list in tag_dict.items():
            entity_centroid_list = list()
            for entity in entity_list:
                vector_sum = numpy.zeros(dim)
                words_found = 0
                for word in entity[1]:
                    try:
                        vector_sum = numpy.add(vector_sum, vectors.get_vector(word))
                    except KeyError:
                        # Out-of-vocabulary word: leave it out of the average.
                        continue
                    words_found += 1

                if words_found:
                    entity_centroid = vector_sum / words_found
                else:
                    # Nothing to average; keep the zero vector rather than dividing by 0.
                    entity_centroid = vector_sum
                entity_centroid_list.append((entity[0], entity_centroid))
            tag_centroid_dict[tag] = entity_centroid_list
        doc_role_entity_context_word_centroid_dict[doc] = tag_centroid_dict

    return doc_role_entity_context_word_centroid_dict

In [8]:
# Build the centroid dictionaries for both splits (train first, then test).
(train_doc_role_entity_context_word_centroid_dict,
 test_doc_role_entity_context_word_centroid_dict) = map(
    build_doc_role_entity_centroid_dictionary,
    (train_doc_role_entity_context_word_dict, test_doc_role_entity_context_word_dict),
)

In [9]:
def _dump_pickle(obj, path):
    """Pickle `obj` to `path`; the `with` block closes (and so flushes) the file."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


# Select exactly one output file name. Doc-level wins when both flags are set,
# so doc-level centroids are never also written under the sentence-level name.
if doc_level:
    centroid_file = 'doc_role_entity_doc_level_context_word_centroid.p'
elif sent_level:
    centroid_file = 'doc_role_entity_context_word_centroid.p'
else:
    raise ValueError('Neither doc_level nor sent_level is set; nothing to save.')

# The folders come from the configuration cell; with its current values these
# resolve to the same ../../Data/output/EntityRep/{train,test}/ locations that
# were previously hard-coded here.
_dump_pickle(
    train_doc_role_entity_context_word_centroid_dict,
    os.path.join(data_output_folder, train_entity_rep_output_folder, centroid_file),
)
_dump_pickle(
    test_doc_role_entity_context_word_centroid_dict,
    os.path.join(data_output_folder, test_entity_rep_output_folder, centroid_file),
)