This implementation 

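1. Reads the entity words and their context words stored in the doc_entity_context_word.p dictionary (sentence level) or the doc_entity_doc_Level_context_word.p dictionary (document level)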

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles ]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is itself a sequence: the entity string first, followed by the list of its context words


2. Trains the Doc2Vec model and stores the trained model
3. Generates the Doc2Vec representation for entities at sentence level
4. Generates the Doc2Vec representation for entities at document level

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles ]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entry in the entity list pairs the entity string with its 300-d Doc2Vec vector

In [1]:
import os, pickle
import gensim, logging
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors, doc2vec
import scipy, numpy


# Configure the root logger to print timestamped INFO-level messages; the gensim
# log lines captured in the cell outputs below use this "time : level : message" format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# --- Granularity switch -------------------------------------------------------
# Selects which context-word pickle is read and which output file name is written:
# document-level context (doc_level) or sentence-level context (sent_level).
sent_level, doc_level = False, True

# --- Pretrained word2vec vectors (text format, 300-d per the file name) ---------
word_emnbedding_pretrained_trained_on_corpus = '../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt'

# --- Pickled entity / context-word dictionaries ---------------------------------
# File names, one per granularity.
entity_doc_level_context_word_file = 'doc_entity_doc_Level_context_word.p'
entity_context_word_file = 'doc_entity_context_word.p'

# Sub-folders of the data root that hold the train and test splits.
test_entity_rep_output_folder = 'EntityRep/test'
train_entity_rep_output_folder = 'EntityRep/train'

# Root folder of the generated data, relative to this notebook.
data_output_folder = '../../Data/output'

In [3]:
# Load the pickled {doc name: {role: [entity, ...]}} dictionaries for the train and
# test splits, at the granularity selected in the configuration cell.
#
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling; only point
# these paths at files produced by this project.


def _load_pickle(path):
    """Return the object unpickled from `path`, closing the file handle afterwards."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Exactly one input file is chosen. When both flags are set the document-level file
# wins, which matches the previous behaviour (it was loaded last and overwrote the
# sentence-level result).
if doc_level:
    context_word_file = entity_doc_level_context_word_file
elif sent_level:
    context_word_file = entity_context_word_file
else:
    # Previously nothing was loaded in this case and a NameError only surfaced in a
    # later cell; fail here with an actionable message instead.
    raise ValueError('Set doc_level or sent_level to True to choose which context-word pickle to load.')

train_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, train_entity_rep_output_folder, context_word_file))
test_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, test_entity_rep_output_folder, context_word_file))

In [4]:
# Load the pretrained word2vec vectors from their text-format file (binary=False).
# NOTE(review): in the cells shown here `word_vectors` is only referenced from
# commented-out code, while the recorded load took roughly 2.5 minutes (see the log
# timestamps below) — confirm this load is still needed.
word_vectors = KeyedVectors.load_word2vec_format(word_emnbedding_pretrained_trained_on_corpus, binary=False)

2018-05-11 04:29:41,073 : INFO : loading projection weights from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-11 04:32:20,158 : INFO : loaded (408497, 300) matrix from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [5]:
def read_entities_build_corpus(doc_role_entity_context_word_dict, tokens_only=False):
    """Yield one Doc2Vec-ready item per entity of a nested document/role/entity dictionary.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        Maps a document name to a dictionary of roles; each role maps to a list of
        entities. Every entity is indexed as ``entity[0]`` (the entity string, used to
        build the tag) and ``entity[1]`` (its list of context-word tokens).
    tokens_only : bool, default False
        If True, yield only the token list ``entity[1]``. Otherwise yield a
        ``doc2vec.TaggedDocument`` whose words are ``entity[1]`` and whose single tag
        is ``doc + '_' + entity[0]``.

    Yields
    ------
    list or doc2vec.TaggedDocument
        One item per entity, in the iteration order of the input dictionaries.

    Notes
    -----
    Tags are not guaranteed to be unique: two entities with the same ``entity[0]`` in
    the same document produce the same tag (the recorded vocabulary log reports fewer
    unique tags than examples).
    """
    for doc, tag_dict in doc_role_entity_context_word_dict.items():
        # The role name itself is not part of the yielded item, so only the entity
        # lists are needed here.
        for entity_list in tag_dict.values():
            for entity in entity_list:
                if tokens_only:
                    yield entity[1]
                else:
                    yield doc2vec.TaggedDocument(entity[1], [doc + '_' + entity[0]])
                    
                

In [6]:
# Materialise the training corpus as a list of TaggedDocument objects
# (tokens_only defaults to False); a list can be iterated repeatedly, which the
# vocabulary build and the training call below both need.
train_corpus = list(read_entities_build_corpus(train_doc_role_entity_context_word_dict))
# Preview the first two tagged documents.
train_corpus[:2]

[TaggedDocument(words=['used', 'to', 'attach', 'the', 'device', 'to', 'the', 'car', 'in', 'which', 'Tal', 'Yehoshua', 'Koren', ',', 'an', 'Embassy', 'official', 'and', 'wife', 'of', 'Israel', "'s", 'defence'], tags=['2012_4_23_st-588.txt_Tal_Yehoshua_Koren']),
 TaggedDocument(words=['The', 'Delhi', 'Police', 'have', 'received', 'a', 'report', 'from', 'the', 'Central', 'Bureau', 'of', 'Investigation', "'s", 'Central', 'Forensic', 'Science', 'Laboratory', 'stating', 'that', 'trinitrotoluene', '(', 'TNT', ')', 'had', 'been', 'used', 'in'], tags=["2012_4_23_st-588.txt_Central_Bureau_of_Investigation_'s_Central_Forensic_Science_Laboratory"])]

In [7]:
# Build the test corpus as plain token lists (tokens_only=True), i.e. without tags.
# NOTE(review): `test_corpus` is only previewed here; the inference cell further down
# reads test_doc_role_entity_context_word_dict directly — confirm it is still needed.
test_corpus = list(read_entities_build_corpus(test_doc_role_entity_context_word_dict, tokens_only=True))
# Preview the first two token lists.
print(test_corpus[:2])

[['that', 'carried', 'out', 'a', 'blast', 'in', 'the', 'German', 'Bakery', 'in', 'Pune', 'last', 'year', '.', '.', '.', 'He', 'said', 'that', 'while', 'there', 'one', 'religion', ',', "''", 'Chidambaram', 'said', '.', '.', '.', 'The', 'Pune', 'and', 'Mumbai', 'blasts', 'are', '``', 'two', 'major', 'blots', 'on', 'my'], ['Home', 'Minister', 'P', 'Chidambaram', 'on', 'Thursday', 'said', 'there', 'are', 'indications', 'of', 'involvement', 'of', 'Indian', 'module', 'in']]


#### Train the model with pretrained word2vec

In [8]:
# Doc2Vec configuration: 300-dimensional vectors, every word kept (min_count=1),
# 100 training epochs.
#
# NOTE(review): the previous call also passed
# `pretrained_emb=word_emnbedding_pretrained_trained_on_corpus`. `pretrained_emb` is
# not a documented parameter of stock gensim's Doc2Vec. The recorded run below trained
# on the 15561-word corpus vocabulary only, which suggests the keyword was silently
# dropped, and newer gensim releases are expected to reject an unknown keyword with a
# TypeError — confirm against the installed gensim version. The keyword is therefore
# removed so the call states what actually happens.
model = doc2vec.Doc2Vec(vector_size=300, min_count=1, epochs=100)
model.build_vocab(train_corpus)

# TODO: the pretrained word2vec vectors are not injected into the model by the code
# above. The earlier, unfinished attempts are kept below for reference only.
#training_examples_count = model.corpus_count
#model.build_vocab([list(word_vectors.vocab.keys())], update=True)

#model.intersect_word2vec_format(word_emnbedding_pretrained_trained_on_corpus,binary=False, lockf=1.0)

2018-05-11 04:32:20,556 : INFO : collecting all words and their counts
2018-05-11 04:32:20,557 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-11 04:32:20,717 : INFO : PROGRESS: at example #10000, processed 556274 words (3499418/s), 11111 word types, 6913 tags
2018-05-11 04:32:20,899 : INFO : collected 15561 word types and 13383 unique tags from a corpus of 19366 examples and 1305682 words
2018-05-11 04:32:20,901 : INFO : Loading a fresh vocabulary
2018-05-11 04:32:20,960 : INFO : min_count=1 retains 15561 unique words (100% of original 15561, drops 0)
2018-05-11 04:32:20,961 : INFO : min_count=1 leaves 1305682 word corpus (100% of original 1305682, drops 0)
2018-05-11 04:32:21,062 : INFO : deleting the raw counts dictionary of 15561 items
2018-05-11 04:32:21,065 : INFO : sample=0.001 downsamples 37 most-common words
2018-05-11 04:32:21,066 : INFO : downsampling leaves estimated 945671 word corpus (72.4% of prior 1305682)
2018-05-11 04:32:21,169 

In [9]:
# Train on the tagged training corpus, using the example count recorded by
# build_vocab and the epoch count the model was configured with; %time reports the
# cost (about 5.5 minutes wall time in the recorded run).
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2018-05-11 04:32:21,908 : INFO : training model with 3 workers on 15561 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-05-11 04:32:23,036 : INFO : EPOCH 1 - PROGRESS: at 35.71% examples, 267451 words/s, in_qsize 4, out_qsize 1
2018-05-11 04:32:24,072 : INFO : EPOCH 1 - PROGRESS: at 65.04% examples, 289918 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:32:25,077 : INFO : EPOCH 1 - PROGRESS: at 90.88% examples, 295886 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:32:25,249 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:32:25,256 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:32:25,281 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:32:25,282 : INFO : EPOCH - 1 : training on 1305682 raw words (964889 effective words) took 3.3s, 295499 effective words/s
2018-05-11 04:32:26,305 : INFO : EPOCH 2 - PROGRESS: at 36.89% examples, 283570 words/s, in_qsize 5, o

2018-05-11 04:32:57,932 : INFO : EPOCH - 12 : training on 1305682 raw words (965476 effective words) took 2.8s, 349184 effective words/s
2018-05-11 04:32:58,973 : INFO : EPOCH 13 - PROGRESS: at 43.44% examples, 357334 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:00,018 : INFO : EPOCH 13 - PROGRESS: at 66.99% examples, 350002 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:01,024 : INFO : EPOCH 13 - PROGRESS: at 94.45% examples, 301090 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:01,116 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:33:01,142 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:33:01,160 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:33:01,161 : INFO : EPOCH - 13 : training on 1305682 raw words (965328 effective words) took 3.2s, 299537 effective words/s
2018-05-11 04:33:02,210 : INFO : EPOCH 14 - PROGRESS: at 33.19% examples, 239890 words/s, in_qsize 5, out_qsize

2018-05-11 04:33:37,264 : INFO : EPOCH 24 - PROGRESS: at 36.89% examples, 281311 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:38,279 : INFO : EPOCH 24 - PROGRESS: at 65.13% examples, 300845 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:39,280 : INFO : EPOCH 24 - PROGRESS: at 92.94% examples, 303014 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:39,406 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:33:39,423 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:33:39,432 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:33:39,434 : INFO : EPOCH - 24 : training on 1305682 raw words (965437 effective words) took 3.2s, 302199 effective words/s
2018-05-11 04:33:40,444 : INFO : EPOCH 25 - PROGRESS: at 35.71% examples, 270723 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:41,445 : INFO : EPOCH 25 - PROGRESS: at 64.80% examples, 276400 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:33:42,

2018-05-11 04:34:13,268 : INFO : EPOCH 35 - PROGRESS: at 37.05% examples, 292955 words/s, in_qsize 6, out_qsize 0
2018-05-11 04:34:14,274 : INFO : EPOCH 35 - PROGRESS: at 65.30% examples, 309747 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:34:15,280 : INFO : EPOCH 35 - PROGRESS: at 95.80% examples, 307313 words/s, in_qsize 4, out_qsize 0
2018-05-11 04:34:15,342 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:34:15,352 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:34:15,353 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:34:15,353 : INFO : EPOCH - 35 : training on 1305682 raw words (965121 effective words) took 3.1s, 309288 effective words/s
2018-05-11 04:34:16,381 : INFO : EPOCH 36 - PROGRESS: at 35.71% examples, 267271 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:34:17,382 : INFO : EPOCH 36 - PROGRESS: at 64.72% examples, 268683 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:34:18,

2018-05-11 04:34:51,356 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:34:51,372 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:34:51,377 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:34:51,380 : INFO : EPOCH - 46 : training on 1305682 raw words (965437 effective words) took 2.8s, 342291 effective words/s
2018-05-11 04:34:52,397 : INFO : EPOCH 47 - PROGRESS: at 35.71% examples, 270448 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:34:53,400 : INFO : EPOCH 47 - PROGRESS: at 64.62% examples, 266472 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:34:54,410 : INFO : EPOCH 47 - PROGRESS: at 76.61% examples, 267304 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:34:55,037 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:34:55,042 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:34:55,051 : INFO : worker thread finished; awaiting 

2018-05-11 04:35:28,105 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:35:28,115 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:35:28,116 : INFO : EPOCH - 57 : training on 1305682 raw words (964851 effective words) took 3.0s, 321682 effective words/s
2018-05-11 04:35:29,145 : INFO : EPOCH 58 - PROGRESS: at 37.05% examples, 296033 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:35:30,152 : INFO : EPOCH 58 - PROGRESS: at 65.42% examples, 322838 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:35:31,137 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:35:31,148 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:35:31,156 : INFO : EPOCH 58 - PROGRESS: at 100.00% examples, 318019 words/s, in_qsize 0, out_qsize 1
2018-05-11 04:35:31,156 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:35:31,157 : INFO : EPOCH - 58 : training on 1305682

2018-05-11 04:36:03,402 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:36:03,403 : INFO : EPOCH - 68 : training on 1305682 raw words (964699 effective words) took 3.4s, 286610 effective words/s
2018-05-11 04:36:04,426 : INFO : EPOCH 69 - PROGRESS: at 33.19% examples, 247017 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:36:05,432 : INFO : EPOCH 69 - PROGRESS: at 64.83% examples, 277875 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:36:06,436 : INFO : EPOCH 69 - PROGRESS: at 87.86% examples, 292251 words/s, in_qsize 6, out_qsize 0
2018-05-11 04:36:06,727 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:36:06,736 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:36:06,748 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:36:06,749 : INFO : EPOCH - 69 : training on 1305682 raw words (964818 effective words) took 3.3s, 289344 effective words/s
2018-05-11 04:36:07,

2018-05-11 04:36:39,904 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:36:39,905 : INFO : EPOCH - 79 : training on 1305682 raw words (965319 effective words) took 3.1s, 310330 effective words/s
2018-05-11 04:36:40,920 : INFO : EPOCH 80 - PROGRESS: at 37.05% examples, 299976 words/s, in_qsize 6, out_qsize 0
2018-05-11 04:36:41,923 : INFO : EPOCH 80 - PROGRESS: at 65.40% examples, 322883 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:36:42,878 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:36:42,910 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:36:42,911 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:36:42,912 : INFO : EPOCH - 80 : training on 1305682 raw words (964726 effective words) took 3.0s, 321300 effective words/s
2018-05-11 04:36:43,922 : INFO : EPOCH 81 - PROGRESS: at 37.05% examples, 302567 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:36:44,

2018-05-11 04:37:16,235 : INFO : EPOCH 91 - PROGRESS: at 31.85% examples, 241719 words/s, in_qsize 6, out_qsize 0
2018-05-11 04:37:17,239 : INFO : EPOCH 91 - PROGRESS: at 63.34% examples, 250681 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:37:18,260 : INFO : EPOCH 91 - PROGRESS: at 80.37% examples, 274015 words/s, in_qsize 6, out_qsize 0
2018-05-11 04:37:18,693 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 04:37:18,718 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 04:37:18,730 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 04:37:18,731 : INFO : EPOCH - 91 : training on 1305682 raw words (964674 effective words) took 3.5s, 275571 effective words/s
2018-05-11 04:37:19,740 : INFO : EPOCH 92 - PROGRESS: at 35.73% examples, 272206 words/s, in_qsize 5, out_qsize 0
2018-05-11 04:37:20,744 : INFO : EPOCH 92 - PROGRESS: at 64.83% examples, 279261 words/s, in_qsize 4, out_qsize 0
2018-05-11 04:37:21,

CPU times: user 10min 47s, sys: 20.5 s, total: 11min 8s
Wall time: 5min 27s


In [13]:
def build_doc_role_entity_doc2vec_dictionary(doc_role_entity_context_word_dict, doc2vec_model=None):
    """Replace every entity's context words with an inferred Doc2Vec vector.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        Maps a document name to a dictionary of roles; each role maps to a list of
        entities, where ``entity[0]`` is the entity string and ``entity[1]`` is the
        list of context-word tokens.
    doc2vec_model : object, optional
        Any object exposing ``infer_vector(words)``. When omitted, the notebook-level
        ``model`` is used, which keeps existing one-argument calls working.

    Returns
    -------
    dict
        A new dictionary with the same document and role keys; each role maps to a
        list of ``(entity[0], inferred_vector)`` tuples in the original entity order.
        The input dictionary is not modified.

    Notes
    -----
    NOTE(review): gensim's ``infer_vector`` is an iterative procedure, so repeated
    calls on the same words may return slightly different vectors — confirm whether
    exact reproducibility matters for downstream consumers.
    """

    def _infer(words):
        # Resolve the model lazily so an empty input never touches the global `model`,
        # exactly as before.
        active_model = model if doc2vec_model is None else doc2vec_model
        return active_model.infer_vector(words)

    doc2vec_dict = dict()
    for doc, tag_dict in doc_role_entity_context_word_dict.items():
        doc2vec_dict[doc] = {
            tag: [(entity[0], _infer(entity[1])) for entity in entity_list]
            for tag, entity_list in tag_dict.items()
        }
    return doc2vec_dict

In [14]:
# NOTE(review): the commented-out call below refers to
# build_doc_role_entity_centroid_dictionary, which is not defined in the cells shown
# here — it looks like a leftover and is a candidate for removal.
#train_doc_role_entity_context_word_centroid_dict = build_doc_role_entity_centroid_dictionary(train_doc_role_entity_context_word_dict)
# Infer a Doc2Vec vector for every entity of the test split.
test_doc_role_entity_context_word_doc2vec_dict = build_doc_role_entity_doc2vec_dictionary(test_doc_role_entity_context_word_dict)

In [15]:
# Persist the inferred test-set representations under a file name that matches the
# granularity selected in the configuration cell.
#
# Exactly one file is written. Previously, with both flags set, the same dictionary
# was written under both names although only one kind of context had been loaded;
# the document-level name now wins, consistent with the load cell.
if doc_level:
    doc2vec_output_file = 'doc_role_entity_doc_level_context_word_doc2vec_dict.p'
elif sent_level:
    doc2vec_output_file = 'doc_role_entity_context_word_doc2vec_dict.p'
else:
    raise ValueError('Set doc_level or sent_level to True to choose the output file name.')

# Build the path from the shared configuration instead of repeating the folder
# literals; the context manager guarantees the file is flushed and closed.
with open(os.path.join(data_output_folder, test_entity_rep_output_folder, doc2vec_output_file), 'wb') as output_file:
    pickle.dump(test_doc_role_entity_context_word_doc2vec_dict, output_file)