This implementation 

1. Reads the entity words and context words stored in the `doc_entity_context_word.p` dictionary

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles ]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is itself a pair: the entity name and the list of its context words


2. Train the Doc2Vec model and store the trained model
3. Generate Doc2Vec representation for Entities at Sentence Level
4. Generate Doc2Vec representation for Entities at Doc Level

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles ]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is paired with its 300-d Doc2Vec vector

In [1]:
import os, pickle
import gensim, logging
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors, doc2vec
import scipy, numpy


# Configure the root logger so INFO-level messages (e.g. gensim's loading and
# training progress) are shown in the cell output with a timestamp and level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# Root folder of the pipeline outputs and the per-split sub-folders that hold the entity pickles.
data_output_folder = '../../Data/output'
train_entity_rep_output_folder = 'EntityRep/train'
test_entity_rep_output_folder = 'EntityRep/test'
# File names of the pickled {doc: {role: [entity, ...]}} dictionaries:
# sentence-level context and document-level context respectively.
entity_context_word_file = 'doc_entity_context_word.p'
entity_doc_level_context_word_file = 'doc_entity_doc_Level_context_word.p'

# Select which context granularity to process. The loading cell handles sent_level first
# and doc_level second, so if both are True the document-level data is what ends up loaded.
doc_level = False
sent_level = True

# Path to the pretrained word2vec embeddings (text format; 300-d according to the file name).
word_emnbedding_pretrained_trained_on_corpus = '../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt'

In [3]:
# Load the pickled {doc: {role: [entity, ...]}} dictionaries for the train and test splits.
# NOTE: pickle.load can execute arbitrary code; only point this at files produced by this pipeline.

def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Document-level context takes precedence when both flags are set. This matches the previous
# behaviour, where the doc-level load ran second and overwrote the sentence-level result.
if doc_level:
    context_word_file = entity_doc_level_context_word_file
elif sent_level:
    context_word_file = entity_context_word_file
else:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError('Set sent_level or doc_level to True to choose which context file to load.')

train_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, train_entity_rep_output_folder, context_word_file))
test_doc_role_entity_context_word_dict = _load_pickle(
    os.path.join(data_output_folder, test_entity_rep_output_folder, context_word_file))

In [4]:
# Load the pretrained word vectors from the word2vec text format (binary=False).
word_vectors = KeyedVectors.load_word2vec_format(word_emnbedding_pretrained_trained_on_corpus, binary=False)

2018-05-19 00:23:23,100 : INFO : loading projection weights from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-19 00:25:50,842 : INFO : loaded (408422, 300) matrix from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [5]:
def read_entities_build_corpus(doc_role_entity_context_word_dict, tokens_only=False):
    """Yield one Doc2Vec corpus item per entity in the nested dictionary.

    Args:
        doc_role_entity_context_word_dict: mapping ``{doc: {role: [entity, ...]}}``.
            Each entity must be indexable: ``entity[0]`` is concatenated into the
            tag (the entity name) and ``entity[1]`` is the list of context tokens.
        tokens_only: if True, yield only the token list ``entity[1]``; otherwise
            yield a ``doc2vec.TaggedDocument`` tagged ``"<doc>_<entity[0]>"``.

    Yields:
        Token lists or ``TaggedDocument`` objects, following the iteration order
        of the dictionaries and entity lists.
    """
    for doc, tag_dict in doc_role_entity_context_word_dict.items():
        for entity_list in tag_dict.values():
            for entity in entity_list:
                if tokens_only:
                    yield entity[1]
                else:
                    # Entries with the same doc and the same entity[0] share one tag.
                    yield doc2vec.TaggedDocument(entity[1], [doc + '_' + entity[0]])
                    
                

In [6]:
# Materialise the generator into a list of TaggedDocuments so the corpus can be
# iterated more than once (vocabulary building and training both consume it).
train_corpus = list(read_entities_build_corpus(train_doc_role_entity_context_word_dict))
# Peek at the first two items as a sanity check.
train_corpus[:2]

[TaggedDocument(words=['1993', 'Delhi', 'blast', 'convict', 'Devinderpal', 'Singh', 'Bhullar', "'s", 'wife', 'on', 'Tuesday', 'approached', 'the', 'Supreme', 'Court', 'seeking', 'stay'], tags=['2013_5_8_st-491.txt_Devinderpal_Singh_Bhullar']),
 TaggedDocument(words=['is', 'decided', '.', 'Khalistan', 'Liberation', 'Force', '(', 'KLF', ')', 'terrorist', 'Bhullar', 'was', 'convicted', 'and', 'awarded', 'death', 'penalty', 'for', 'triggering', 'a', 'bomb'], tags=['2013_5_8_st-491.txt_Bhullar'])]

In [7]:
# The test split is kept as plain token lists (tokens_only=True), without tags.
test_corpus = list(read_entities_build_corpus(test_doc_role_entity_context_word_dict, tokens_only=True))
# Peek at the first two token lists as a sanity check.
print(test_corpus[:2])

[['leading', 'to', 'the', 'mall', '.', 'Traffic', 'has', 'been', 'diverted', 'from', 'Hosur', 'Road', 'and', 'Koramangla.Following', 'the', 'bomb', 'scare', ',', 'the', 'police', 'commissioner', 'of'], ['and', 'Koramangla.Following', 'the', 'bomb', 'scare', ',', 'the', 'police', 'commissioner', 'of', 'Bangalore', ',', 'Shankar', 'Bidri', 'also', 'reached', 'the', 'area', 'and', 'requested', 'the']]


#### Train the model with pretrained word2vec

In [8]:
# `pretrained_emb` is not an argument of gensim's Doc2Vec, so it is no longer passed:
# depending on the gensim version it is either silently ignored or rejected.
model = doc2vec.Doc2Vec(vector_size=300, min_count=1, epochs=100)
model.build_vocab(train_corpus)

# Seed every vocabulary word that has a pretrained embedding with that embedding, so that
# training really starts from the pretrained word2vec space. Words without a pretrained
# vector keep the random initialisation done by build_vocab.
if word_vectors.vector_size != model.wv.vector_size:
    raise ValueError('Pretrained embeddings have dimension %d but the model expects %d.'
                     % (word_vectors.vector_size, model.wv.vector_size))

# gensim >= 4 exposes the index-ordered vocabulary as `index_to_key`; gensim 3.x as `index2word`.
vocabulary_words = getattr(model.wv, 'index_to_key', None)
if vocabulary_words is None:
    vocabulary_words = model.wv.index2word

pretrained_hits = 0
for word_index, word in enumerate(vocabulary_words):
    if word in word_vectors:
        model.wv.vectors[word_index] = word_vectors[word]
        pretrained_hits += 1

logging.info('initialised %d of %d vocabulary words from the pretrained embeddings',
             pretrained_hits, len(vocabulary_words))

2018-05-19 00:25:51,894 : INFO : collecting all words and their counts
2018-05-19 00:25:51,895 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-19 00:25:52,018 : INFO : PROGRESS: at example #10000, processed 211893 words (1823259/s), 10621 word types, 6921 tags
2018-05-19 00:25:52,124 : INFO : PROGRESS: at example #20000, processed 424236 words (2025832/s), 15588 word types, 13600 tags
2018-05-19 00:25:52,125 : INFO : collected 15609 word types and 13649 unique tags from a corpus of 20057 examples and 425448 words
2018-05-19 00:25:52,126 : INFO : Loading a fresh vocabulary
2018-05-19 00:25:52,203 : INFO : min_count=1 retains 15609 unique words (100% of original 15609, drops 0)
2018-05-19 00:25:52,204 : INFO : min_count=1 leaves 425448 word corpus (100% of original 425448, drops 0)
2018-05-19 00:25:52,325 : INFO : deleting the raw counts dictionary of 15609 items
2018-05-19 00:25:52,326 : INFO : sample=0.001 downsamples 37 most-common words
2018-05

In [9]:
# Train on the tagged corpus for the number of epochs configured on the model;
# %time reports the CPU and wall-clock cost of this single statement.
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2018-05-19 00:25:53,222 : INFO : training model with 3 workers on 15609 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-05-19 00:25:54,521 : INFO : EPOCH 1 - PROGRESS: at 61.13% examples, 197721 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:25:55,025 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:25:55,030 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:25:55,035 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:25:55,036 : INFO : EPOCH - 1 : training on 425448 raw words (333472 effective words) took 1.5s, 216011 effective words/s
2018-05-19 00:25:56,059 : INFO : EPOCH 2 - PROGRESS: at 68.13% examples, 225078 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:25:56,462 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:25:56,477 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:25:56,499 : INFO : w

2018-05-19 00:26:17,657 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:26:17,658 : INFO : EPOCH - 16 : training on 425448 raw words (333628 effective words) took 1.6s, 205500 effective words/s
2018-05-19 00:26:18,691 : INFO : EPOCH 17 - PROGRESS: at 61.11% examples, 198999 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:26:19,398 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:26:19,418 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:26:19,427 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:26:19,428 : INFO : EPOCH - 17 : training on 425448 raw words (333175 effective words) took 1.8s, 189703 effective words/s
2018-05-19 00:26:20,458 : INFO : EPOCH 18 - PROGRESS: at 63.47% examples, 207227 words/s, in_qsize 6, out_qsize 0
2018-05-19 00:26:20,928 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:26:20,958 : INFO : worker threa

2018-05-19 00:26:45,234 : INFO : EPOCH 32 - PROGRESS: at 58.77% examples, 180290 words/s, in_qsize 6, out_qsize 0
2018-05-19 00:26:45,849 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:26:45,869 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:26:45,891 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:26:45,892 : INFO : EPOCH - 32 : training on 425448 raw words (333730 effective words) took 1.7s, 191624 effective words/s
2018-05-19 00:26:46,998 : INFO : EPOCH 33 - PROGRESS: at 58.77% examples, 178166 words/s, in_qsize 6, out_qsize 1
2018-05-19 00:26:47,531 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:26:47,543 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:26:47,567 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:26:47,568 : INFO : EPOCH - 33 : training on 425448 raw words (333486 effecti

2018-05-19 00:27:09,561 : INFO : EPOCH - 47 : training on 425448 raw words (333597 effective words) took 1.6s, 203966 effective words/s
2018-05-19 00:27:10,602 : INFO : EPOCH 48 - PROGRESS: at 58.77% examples, 189181 words/s, in_qsize 6, out_qsize 0
2018-05-19 00:27:11,279 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:27:11,281 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:27:11,283 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:27:11,284 : INFO : EPOCH - 48 : training on 425448 raw words (333461 effective words) took 1.7s, 194373 effective words/s
2018-05-19 00:27:12,302 : INFO : EPOCH 49 - PROGRESS: at 51.72% examples, 170003 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:27:12,974 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:27:12,980 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:27:12,995 : INFO : worker threa

2018-05-19 00:27:36,712 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:27:36,715 : INFO : EPOCH - 63 : training on 425448 raw words (333657 effective words) took 1.5s, 217979 effective words/s
2018-05-19 00:27:37,759 : INFO : EPOCH 64 - PROGRESS: at 63.47% examples, 203660 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:27:38,239 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:27:38,270 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:27:38,277 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:27:38,278 : INFO : EPOCH - 64 : training on 425448 raw words (333462 effective words) took 1.6s, 214522 effective words/s
2018-05-19 00:27:39,342 : INFO : EPOCH 65 - PROGRESS: at 65.82% examples, 207650 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:27:39,781 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:27:39,792 : INFO : worker threa

2018-05-19 00:28:03,821 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:28:03,825 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:28:03,828 : INFO : EPOCH - 79 : training on 425448 raw words (333637 effective words) took 1.6s, 206852 effective words/s
2018-05-19 00:28:04,907 : INFO : EPOCH 80 - PROGRESS: at 65.80% examples, 205469 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:28:05,437 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:28:05,440 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:28:05,470 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:28:05,471 : INFO : EPOCH - 80 : training on 425448 raw words (333606 effective words) took 1.6s, 204578 effective words/s
2018-05-19 00:28:06,484 : INFO : EPOCH 81 - PROGRESS: at 61.11% examples, 202248 words/s, in_qsize 6, out_qsize 0
2018-05-19 00:28:07,050 : INFO : worker threa

2018-05-19 00:28:28,451 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:28:28,454 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:28:28,480 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:28:28,480 : INFO : EPOCH - 95 : training on 425448 raw words (333384 effective words) took 1.4s, 230848 effective words/s
2018-05-19 00:28:29,492 : INFO : EPOCH 96 - PROGRESS: at 70.47% examples, 233560 words/s, in_qsize 5, out_qsize 0
2018-05-19 00:28:29,881 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-19 00:28:29,892 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-19 00:28:29,914 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-19 00:28:29,915 : INFO : EPOCH - 96 : training on 425448 raw words (333824 effective words) took 1.4s, 233697 effective words/s
2018-05-19 00:28:30,936 : INFO : EPOCH 97 - PROGRESS: at 70.47% exam

CPU times: user 4min 42s, sys: 22.9 s, total: 5min 5s
Wall time: 2min 42s


In [10]:
def build_doc_role_entity_doc2vec_dictionary(doc_role_entity_context_word_dict, doc2vec_model=None):
    """Replace each entity's context words with an inferred Doc2Vec vector.

    Args:
        doc_role_entity_context_word_dict: mapping ``{doc: {role: [entity, ...]}}``
            where ``entity[0]`` is kept as-is and ``entity[1]`` is the token list
            passed to ``infer_vector``.
        doc2vec_model: object providing ``infer_vector(tokens)``. Defaults to the
            notebook-level ``model`` so existing single-argument calls still work.

    Returns:
        A new dictionary with the same doc/role structure whose entity lists hold
        ``(entity[0], inferred_vector)`` tuples. The input is not modified.
    """
    if doc2vec_model is None:
        # Backward-compatible default: the model trained earlier in the notebook.
        doc2vec_model = model

    return {
        doc: {
            tag: [(entity[0], doc2vec_model.infer_vector(entity[1])) for entity in entity_list]
            for tag, entity_list in tag_dict.items()
        }
        for doc, tag_dict in doc_role_entity_context_word_dict.items()
    }

In [11]:
# Infer Doc2Vec vectors for the test split. The train split is not processed in this cell.
test_doc_role_entity_context_word_doc2vec_dict = build_doc_role_entity_doc2vec_dictionary(test_doc_role_entity_context_word_dict)

In [12]:
# Persist the test-split vectors under the test output folder. The file name depends on
# the context granularity. Document-level takes precedence when both flags are set, because
# the loading cell leaves the document-level data in memory in that case; writing it under
# the sentence-level name as well would mislabel it.
if doc_level:
    doc2vec_output_file = 'doc_role_entity_doc_level_context_word_doc2vec_dict.p'
elif sent_level:
    doc2vec_output_file = 'doc_role_entity_context_word_doc2vec_dict.p'
else:
    doc2vec_output_file = None

if doc2vec_output_file is not None:
    # Build the path from the config constants; it resolves to the same location as the
    # previously hard-coded '../../Data/output/EntityRep/test/...' path.
    doc2vec_output_path = os.path.join(data_output_folder, test_entity_rep_output_folder, doc2vec_output_file)
    # The context manager guarantees the pickle is flushed and the handle closed.
    with open(doc2vec_output_path, 'wb') as output_file:
        pickle.dump(test_doc_role_entity_context_word_doc2vec_dict, output_file)