This implementation:

1. Reads the entity words and context words stored in the doc_entity_context_word.p dictionary

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles]

Key: Document names present in the corpus
Values: Dictionary of roles present in a specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the list is itself a list of words


2. Train the Doc2Vec model and store the trained model
3. Generate Doc2Vec representation for Entities at Sentence Level
4. Generate Doc2Vec representation for Entities at Doc Level

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles]

Key: Document names present in the corpus
Values: Dictionary of roles present in a specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is a 300-d Doc2Vec vector

In [1]:
import os, pickle
import gensim, logging
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors, doc2vec
import scipy, numpy


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [4]:
# Root folder for generated artefacts (relative path, resolved against the working directory).
data_output_folder = '../../Data/output'

# Sub-folders of the output root that hold the entity representations for each split.
train_entity_rep_output_folder = 'EntityRep/train'
test_entity_rep_output_folder = 'EntityRep/test'

# Name of the pickle file holding the per-document entity/context-word dictionary.
entity_context_word_file = 'doc_entity_context_word.p'

# Path of the pretrained word-embedding file; the literal is split across two
# adjacent string pieces purely to keep the line length readable.
word_emnbedding_pretrained_trained_on_corpus = (
    '../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/'
    'w2v_pretain_corpus_trained_gensim_300.txt'
)

In [5]:
# Load the pickled entity/context-word dictionaries for the train and test splits.
# NOTE(security): pickle.load can execute arbitrary code while deserialising, so
# only point this at files produced by this project's own pipeline.
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened inside a context manager so the handle is closed as
    soon as loading finishes, instead of lingering until garbage collection.
    """
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


train_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, train_entity_rep_output_folder, entity_context_word_file)
)
test_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, test_entity_rep_output_folder, entity_context_word_file)
)

In [20]:
# Load the pretrained word vectors from their word2vec text-format file (binary=False).
# NOTE(review): no live code in the visible notebook reads `word_vectors`, and the
# recorded log shows this load taking over two minutes — confirm it is still needed.
word_vectors = KeyedVectors.load_word2vec_format(word_emnbedding_pretrained_trained_on_corpus, binary=False)

2018-05-03 18:29:41,838 : INFO : loading projection weights from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-03 18:32:02,578 : INFO : loaded (408497, 300) matrix from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [17]:
def read_entities_build_corpus(doc_role_entity_context_word_dict, tokens_only=False):
    """Lazily yield one corpus item per entity found in the nested dictionary.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        Mapping ``doc_id -> {role: [entity, ...]}``; each entity is passed
        through unchanged (the notebook uses lists of word tokens).
    tokens_only : bool, default False
        When True, yield each entity as-is. When False, wrap each entity in a
        ``doc2vec.TaggedDocument`` tagged ``"<doc_id>_<n>"``, where ``n``
        counts the entities of that document across all of its roles,
        starting at 0.

    Yields
    ------
    The entity itself (``tokens_only=True``) or a ``TaggedDocument``.
    """
    for doc, tag_dict in doc_role_entity_context_word_dict.items():
        # The counter restarts for every document so tags stay unique per doc.
        count = 0
        for entity_list in tag_dict.values():
            for entity in entity_list:
                if tokens_only:
                    yield entity
                else:
                    yield doc2vec.TaggedDocument(entity, [doc + '_' + str(count)])
                    count += 1
                

In [22]:
# Materialise the training corpus as TaggedDocument objects (tokens_only defaults
# to False), then preview the first two entries as the cell output.
train_corpus = list(read_entities_build_corpus(train_doc_role_entity_context_word_dict))
train_corpus[:2]

[TaggedDocument(words=['the', 'CID', '-LRB-', 'Crime', '-RRB-', 'investigating', 'Saturdays', 'serial', 'blasts', 'in', 'Ahmedabad', '.', '.', '.', 'The', 'police', 'believe', 'that', 'the', 'diamond', 'and'], tags=['ev_077_st_022.txt_0']),
 TaggedDocument(words=['police', 'believe', 'that', 'the', 'diamond', 'and', 'art', 'silk', 'city', 'of', 'Surat', ',', 'the', 'second', 'major', 'commercial', 'centre', 'in', 'the', 'State', 'after'], tags=['ev_077_st_022.txt_1'])]

In [19]:
# Materialise the test corpus as plain entities (tokens_only=True, so no tags are
# attached) and print the first two entries as a sanity check.
# NOTE(review): test_corpus is not referenced again in the visible cells.
test_corpus = list(read_entities_build_corpus(test_doc_role_entity_context_word_dict, tokens_only=True))
print(test_corpus[:2])

[['Pune', ':', 'Pune', 'this', 'evening', 'saw', 'four', 'low-intensity', 'blasts', 'in', 'a', 'span', 'of'], ['in', 'a', 'span', 'of', 'nearly', '40', 'minutes', 'on', 'its', 'busy', 'Junglee', 'Maharaj', 'Road', 'in', 'the', 'heart', 'of', 'the', 'city', '.', 'Two', 'other', 'bombs']]


#### Train the model with pretrained word2vec

In [32]:
# Build a 300-dimensional Doc2Vec model and its vocabulary from the tagged training corpus.
#
# The former `pretrained_emb=...` keyword has been dropped: it is not a parameter of
# stock gensim's Doc2Vec. gensim 3.x-era releases appear to discard the unknown keyword
# silently, while gensim 4.x forwards it to Word2Vec and fails with a TypeError.
# The recorded training log is consistent with it having been ignored (the vocabulary
# is the 15561 word types collected from the corpus itself), so removing it should not
# change the trained model — TODO confirm against the installed gensim version.
model = doc2vec.Doc2Vec(vector_size=300, min_count=1, epochs=100)
model.build_vocab(train_corpus)

# NOTE(review): if the word weights really should be seeded from the pretrained
# word2vec file, one candidate is calling intersect_word2vec_format(<path>, binary=False,
# lockf=1.0) after build_vocab; where that method lives differs between gensim
# versions — verify before enabling.

2018-05-03 18:59:18,571 : INFO : collecting all words and their counts
2018-05-03 18:59:18,574 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-03 18:59:18,682 : INFO : PROGRESS: at example #10000, processed 212094 words (1991911/s), 10869 word types, 10000 tags
2018-05-03 18:59:18,762 : INFO : collected 15561 word types and 19366 unique tags from a corpus of 19366 examples and 410293 words
2018-05-03 18:59:18,763 : INFO : Loading a fresh vocabulary
2018-05-03 18:59:18,812 : INFO : min_count=1 retains 15561 unique words (100% of original 15561, drops 0)
2018-05-03 18:59:18,813 : INFO : min_count=1 leaves 410293 word corpus (100% of original 410293, drops 0)
2018-05-03 18:59:18,914 : INFO : deleting the raw counts dictionary of 15561 items
2018-05-03 18:59:18,916 : INFO : sample=0.001 downsamples 39 most-common words
2018-05-03 18:59:18,917 : INFO : downsampling leaves estimated 301757 word corpus (73.5% of prior 410293)
2018-05-03 18:59:18,986 : I

In [33]:
# Train on the tagged training corpus for model.epochs passes over model.corpus_count
# examples; the %time line magic reports CPU and wall-clock time for the call.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2018-05-03 18:59:29,927 : INFO : training model with 3 workers on 15561 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-05-03 18:59:30,947 : INFO : EPOCH 1 - PROGRESS: at 70.55% examples, 226223 words/s, in_qsize 6, out_qsize 0
2018-05-03 18:59:31,271 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 18:59:31,298 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 18:59:31,326 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 18:59:31,327 : INFO : EPOCH - 1 : training on 410293 raw words (321129 effective words) took 1.4s, 232125 effective words/s
2018-05-03 18:59:32,415 : INFO : EPOCH 2 - PROGRESS: at 77.84% examples, 232088 words/s, in_qsize 6, out_qsize 0
2018-05-03 18:59:32,625 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 18:59:32,627 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 18:59:32,659 : INFO : w

2018-05-03 18:59:51,676 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 18:59:51,676 : INFO : EPOCH - 16 : training on 410293 raw words (320925 effective words) took 1.3s, 242614 effective words/s
2018-05-03 18:59:52,736 : INFO : EPOCH 17 - PROGRESS: at 68.10% examples, 208963 words/s, in_qsize 5, out_qsize 0
2018-05-03 18:59:53,081 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 18:59:53,086 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 18:59:53,138 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 18:59:53,140 : INFO : EPOCH - 17 : training on 410293 raw words (320850 effective words) took 1.5s, 220955 effective words/s
2018-05-03 18:59:54,175 : INFO : EPOCH 18 - PROGRESS: at 75.41% examples, 236543 words/s, in_qsize 6, out_qsize 0
2018-05-03 18:59:54,577 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 18:59:54,608 : INFO : worker threa

2018-05-03 19:00:13,684 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:00:13,699 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:00:13,701 : INFO : EPOCH - 32 : training on 410293 raw words (321137 effective words) took 1.5s, 213231 effective words/s
2018-05-03 19:00:14,748 : INFO : EPOCH 33 - PROGRESS: at 70.55% examples, 218595 words/s, in_qsize 5, out_qsize 0
2018-05-03 19:00:15,118 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:00:15,134 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:00:15,159 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:00:15,160 : INFO : EPOCH - 33 : training on 410293 raw words (321012 effective words) took 1.4s, 221541 effective words/s
2018-05-03 19:00:16,194 : INFO : EPOCH 34 - PROGRESS: at 72.98% examples, 228342 words/s, in_qsize 6, out_qsize 0
2018-05-03 19:00:16,527 : INFO : worker threa

2018-05-03 19:00:40,484 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:00:40,537 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:00:40,549 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:00:40,550 : INFO : EPOCH - 47 : training on 410293 raw words (321202 effective words) took 1.6s, 195480 effective words/s
2018-05-03 19:00:41,565 : INFO : EPOCH 48 - PROGRESS: at 46.20% examples, 147680 words/s, in_qsize 5, out_qsize 0
2018-05-03 19:00:42,563 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:00:42,612 : INFO : EPOCH 48 - PROGRESS: at 97.55% examples, 152585 words/s, in_qsize 1, out_qsize 1
2018-05-03 19:00:42,613 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:00:42,642 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:00:42,642 : INFO : EPOCH - 48 : training on 410293 raw words (321145 effecti

2018-05-03 19:01:03,604 : INFO : EPOCH - 62 : training on 410293 raw words (320981 effective words) took 1.5s, 215915 effective words/s
2018-05-03 19:01:04,663 : INFO : EPOCH 63 - PROGRESS: at 60.81% examples, 187362 words/s, in_qsize 6, out_qsize 0
2018-05-03 19:01:05,151 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:01:05,172 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:01:05,187 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:01:05,188 : INFO : EPOCH - 63 : training on 410293 raw words (321299 effective words) took 1.6s, 204748 effective words/s
2018-05-03 19:01:06,223 : INFO : EPOCH 64 - PROGRESS: at 68.12% examples, 213790 words/s, in_qsize 6, out_qsize 0
2018-05-03 19:01:06,614 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:01:06,647 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:01:06,664 : INFO : worker threa

2018-05-03 19:01:27,133 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:01:27,134 : INFO : EPOCH - 78 : training on 410293 raw words (321166 effective words) took 1.4s, 222737 effective words/s
2018-05-03 19:01:28,152 : INFO : EPOCH 79 - PROGRESS: at 55.94% examples, 177914 words/s, in_qsize 5, out_qsize 0
2018-05-03 19:01:28,664 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:01:28,692 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:01:28,701 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:01:28,702 : INFO : EPOCH - 79 : training on 410293 raw words (321640 effective words) took 1.6s, 206141 effective words/s
2018-05-03 19:01:29,786 : INFO : EPOCH 80 - PROGRESS: at 70.55% examples, 211293 words/s, in_qsize 5, out_qsize 0
2018-05-03 19:01:30,118 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:01:30,138 : INFO : worker threa

2018-05-03 19:01:50,209 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:01:50,236 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:01:50,236 : INFO : EPOCH - 94 : training on 410293 raw words (320714 effective words) took 1.4s, 234593 effective words/s
2018-05-03 19:01:51,249 : INFO : EPOCH 95 - PROGRESS: at 68.12% examples, 218201 words/s, in_qsize 6, out_qsize 0
2018-05-03 19:01:51,560 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-03 19:01:51,612 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-03 19:01:51,613 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-03 19:01:51,613 : INFO : EPOCH - 95 : training on 410293 raw words (321226 effective words) took 1.4s, 234829 effective words/s
2018-05-03 19:01:52,625 : INFO : EPOCH 96 - PROGRESS: at 75.41% examples, 241533 words/s, in_qsize 5, out_qsize 0
2018-05-03 19:01:52,867 : INFO : worker threa

CPU times: user 4min 24s, sys: 20.9 s, total: 4min 45s
Wall time: 2min 28s


In [34]:
def build_doc_role_entity_doc2vec_dictionary(doc_role_entity_context_word_dict, doc2vec_model=None):
    """Replace every entity in the nested dictionary with its inferred Doc2Vec vector.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        Mapping ``doc_id -> {role: [entity, ...]}``; each entity is handed
        to ``infer_vector`` unchanged.
    doc2vec_model : object, optional
        Any object exposing ``infer_vector(entity)``. When omitted, the
        notebook-level ``model`` is used, which keeps existing single-argument
        calls working exactly as before.

    Returns
    -------
    dict
        A new dictionary with the same ``doc_id -> role`` structure, where each
        role maps to the list of inferred vectors in the original entity order.
        The input dictionary is not modified.
    """
    if doc2vec_model is None:
        # Backward-compatible default: fall back to the globally trained model.
        doc2vec_model = model

    # NOTE(review): gensim's infer_vector may not be deterministic across runs —
    # confirm whether downstream code depends on repeatable vectors.
    return {
        doc: {
            tag: [doc2vec_model.infer_vector(entity) for entity in entity_list]
            for tag, entity_list in tag_dict.items()
        }
        for doc, tag_dict in doc_role_entity_context_word_dict.items()
    }

In [35]:
# Infer a Doc2Vec vector for every entity of the test split.
# NOTE(review): the train split is not converted in this cell; an earlier, disabled call
# used build_doc_role_entity_centroid_dictionary, which is not defined in the visible notebook.
test_doc_role_entity_context_word_doc2vec_dict = build_doc_role_entity_doc2vec_dictionary(test_doc_role_entity_context_word_dict)

In [36]:
# Persist the test-split Doc2Vec dictionary. A context manager is used so the file is
# flushed and closed deterministically rather than whenever the handle is garbage-collected.
with open('../../Data/output/EntityRep/test/doc_role_entity_context_word_doc2vec_dict.p', 'wb') as output_file:
    pickle.dump(test_doc_role_entity_context_word_doc2vec_dict, output_file)