This implementation:

1. Reads the entity word and context word stored in doc_entity_context_word.p dictionary

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles ]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is a pair: the entity name followed by its list of context words


2. Train the Doc2Vec model and store the trained model
3. Generate Doc2Vec representation for Entities at Sentence Level
4. Generate Doc2Vec representation for Entities at Doc Level

Dictionary format

Element of Dictionary [ doc_id: dictionary of roles ]

Key: Document names present in the corpus
Values: Dictionary of roles present in that specific document

Elements of Role Dictionary: 
Keys: Roles present in a specific doc
Value: List of entities having that role

Each entity in the entity list is a pair: the entity name followed by its 300-d Doc2Vec vector

In [1]:
import os, pickle
import gensim, logging
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors, doc2vec
import scipy, numpy


# Emit timestamped INFO-level log records; the gensim progress lines captured in the
# cell outputs below use this "<time> : <level> : <message>" format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [2]:
# --- Granularity switches -------------------------------------------------
# These select which context-word pickle is loaded and which output file is
# written further down in the notebook.
doc_level = True
sent_level = False

# --- Pretrained embeddings ------------------------------------------------
# word2vec vectors in text format (loaded later with binary=False).
word_emnbedding_pretrained_trained_on_corpus = '../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt'

# --- Entity context-word pickles ------------------------------------------
# File names of the sentence-level and document-level inputs.
entity_context_word_file = 'doc_entity_context_word.p'
entity_doc_level_context_word_file = 'doc_entity_doc_Level_context_word.p'

# --- Directory layout -----------------------------------------------------
# Train/test sub-folders are joined onto the shared output root.
data_output_folder = '../../Data/output'
train_entity_rep_output_folder = 'EntityRep/train'
test_entity_rep_output_folder = 'EntityRep/test'

In [3]:
# Load the train/test entity context-word dictionaries for the selected granularity.
#
# Exactly one input file is chosen. `doc_level` takes precedence when both flags
# are set (previously the sentence-level pickles were loaded and then immediately
# overwritten), and an empty selection fails here with a clear message instead of
# surfacing as a NameError in a later cell.
#
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# pickles produced by this project's own pipeline.
if doc_level:
    context_word_file = entity_doc_level_context_word_file
elif sent_level:
    context_word_file = entity_context_word_file
else:
    raise ValueError('Set doc_level or sent_level to True before loading the entity context words')

# `with` closes each file handle as soon as its pickle has been read.
with open(os.path.join(data_output_folder, train_entity_rep_output_folder, context_word_file), 'rb') as train_file:
    train_doc_role_entity_context_word_dict = pickle.load(train_file)

with open(os.path.join(data_output_folder, test_entity_rep_output_folder, context_word_file), 'rb') as test_file:
    test_doc_role_entity_context_word_dict = pickle.load(test_file)

In [4]:
# Load the pretrained word2vec vectors from their text-format file (hence binary=False).
# NOTE(review): in the visible cells `word_vectors` is only referenced from commented-out
# code, and the log output below shows this load taking about two minutes — confirm it
# is still needed before keeping it in the Run-All path.
word_vectors = KeyedVectors.load_word2vec_format(word_emnbedding_pretrained_trained_on_corpus, binary=False)

2018-05-11 03:15:30,175 : INFO : loading projection weights from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt
2018-05-11 03:17:33,571 : INFO : loaded (408497, 300) matrix from ../../trained_word_embeddings/word2vec/word_pretrain_trained_on_corpus/w2v_pretain_corpus_trained_gensim_300.txt


In [5]:
def read_entities_build_corpus(doc_role_entity_context_word_dict, tokens_only=False):
    """Yield one corpus item per entity stored in the nested document/role dictionary.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        Maps a document name to a dictionary ``{role: [entity, ...]}``. Each
        entity is indexed positionally: ``entity[0]`` is the entity name (a
        string, since it is concatenated into the tag) and ``entity[1]`` is the
        list of context-word tokens.
    tokens_only : bool, default False
        When True, yield only the token list ``entity[1]``. When False, yield a
        ``doc2vec.TaggedDocument`` built from those tokens.

    Yields
    ------
    list or doc2vec.TaggedDocument
        Items in dictionary iteration order (documents, then roles, then
        entities within each role).
    """
    for doc, tag_dict in doc_role_entity_context_word_dict.items():
        for entity_list in tag_dict.values():
            for entity in entity_list:
                if tokens_only:
                    yield entity[1]
                else:
                    # The tag depends only on the document and the entity name, so the
                    # same entity appearing more than once in a document shares one tag.
                    yield doc2vec.TaggedDocument(entity[1], [doc + '_' + entity[0]])
                    
                

In [6]:
# Materialise the tagged training corpus as a list: it is consumed more than once
# below (build_vocab and train), which a one-shot generator could not support.
train_corpus = list(read_entities_build_corpus(train_doc_role_entity_context_word_dict))
# Display the first two TaggedDocument items as a sanity check.
train_corpus[:2]

[TaggedDocument(words=[',', 'coming', 'on', 'a', 'bicycle', 'and', 'parking', 'it', 'opposite', 'the', 'Sai', 'Service', 'petrol', 'pump', '.', 'The', 'bomb', ',', 'which', 'was', 'placed', 'in', 'a', 'basket'], tags=['ev_088_st_003.txt_Sai_Service_petrol_pump']),
 TaggedDocument(words=['.', 'His', 'image', 'was', 'also', 'captured', 'on', 'CCTV', 'at', 'the', 'German', 'Bakery', 'on', 'February', '13', ',', '2010', '.', 'Sources', 'said', 'the', 'investigators'], tags=['ev_088_st_003.txt_German_Bakery'])]

In [7]:
# Test entities are kept as plain token lists (tokens_only=True), i.e. without tags.
# NOTE(review): `test_corpus` is not referenced again in the visible cells — confirm it is needed.
test_corpus = list(read_entities_build_corpus(test_doc_role_entity_context_word_dict, tokens_only=True))
print(test_corpus[:2])

[['serial', 'blasts', 'were', 'the', 'handiwork', 'of', 'Indian', 'Mujahideen', 'to', 'avenge', 'Muzaffarnagar', 'riots', '?', 'That', "'s", 'what', 'the', 'security', 'agencies', 'want', 'us'], [',', 'scare', 'people', 'and', 'scuttle', 'the', 'rally', '.', 'Not', 'avenging', 'Muzaffarnagar', 'as', 'the', 'police', 'is', 'spinning', 'it.Bihar', ',', 'Patna', 'or', 'for']]


#### Train the model with pretrained word2vec

In [8]:
# Create a Doc2Vec model with 300-d vectors; min_count=1 keeps every word seen in the
# corpus, and epochs=100 is the number of passes used by the train() call below.
# NOTE(review): `pretrained_emb` does not appear among the parameters in gensim's
# Doc2Vec documentation — confirm the installed gensim actually consumes it. If it is
# ignored, the pretrained word2vec file has no effect and the model is trained from
# scratch on `train_corpus` alone.
# NOTE(review): no seed is set here, so results presumably vary between runs — confirm
# whether reproducibility matters for downstream use.
model = doc2vec.Doc2Vec(vector_size=300, min_count=1, epochs=100, pretrained_emb=word_emnbedding_pretrained_trained_on_corpus)
# Build the vocabulary (words and document tags) from the tagged training corpus.
model.build_vocab(train_corpus)
# The commented-out lines below are earlier attempts to inject the pretrained vectors.
#training_examples_count = model.corpus_count
#model.build_vocab([list(word_vectors.vocab.keys())], update=True)

#model.intersect_word2vec_format(word_emnbedding_pretrained_trained_on_corpus,binary=False, lockf=1.0)

2018-05-11 03:17:48,799 : INFO : collecting all words and their counts
2018-05-11 03:17:48,802 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2018-05-11 03:17:48,944 : INFO : PROGRESS: at example #10000, processed 211600 words (1510428/s), 10807 word types, 6998 tags
2018-05-11 03:17:49,038 : INFO : collected 15561 word types and 13383 unique tags from a corpus of 19366 examples and 410293 words
2018-05-11 03:17:49,040 : INFO : Loading a fresh vocabulary
2018-05-11 03:17:49,088 : INFO : min_count=1 retains 15561 unique words (100% of original 15561, drops 0)
2018-05-11 03:17:49,089 : INFO : min_count=1 leaves 410293 word corpus (100% of original 410293, drops 0)
2018-05-11 03:17:49,172 : INFO : deleting the raw counts dictionary of 15561 items
2018-05-11 03:17:49,174 : INFO : sample=0.001 downsamples 39 most-common words
2018-05-11 03:17:49,175 : INFO : downsampling leaves estimated 301757 word corpus (73.5% of prior 410293)
2018-05-11 03:17:49,251 : IN

In [9]:
# Train for model.epochs passes over the tagged corpus; %time reports the cost
# (roughly two minutes of wall time in the run recorded below).
%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2018-05-11 03:18:21,676 : INFO : training model with 3 workers on 15561 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2018-05-11 03:18:22,841 : INFO : EPOCH 1 - PROGRESS: at 70.67% examples, 224963 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:18:23,153 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:18:23,179 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:18:23,205 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:18:23,206 : INFO : EPOCH - 1 : training on 410293 raw words (321456 effective words) took 1.4s, 233987 effective words/s
2018-05-11 03:18:24,226 : INFO : EPOCH 2 - PROGRESS: at 75.56% examples, 239926 words/s, in_qsize 5, out_qsize 0
2018-05-11 03:18:24,507 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:18:24,512 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:18:24,541 : INFO : w

2018-05-11 03:18:43,517 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:18:43,518 : INFO : EPOCH - 16 : training on 410293 raw words (321085 effective words) took 1.3s, 241966 effective words/s
2018-05-11 03:18:44,528 : INFO : EPOCH 17 - PROGRESS: at 73.11% examples, 233824 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:18:44,804 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:18:44,842 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:18:44,860 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:18:44,861 : INFO : EPOCH - 17 : training on 410293 raw words (320837 effective words) took 1.3s, 240358 effective words/s
2018-05-11 03:18:45,940 : INFO : EPOCH 18 - PROGRESS: at 77.98% examples, 233286 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:18:46,156 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:18:46,163 : INFO : worker threa

2018-05-11 03:19:04,557 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:19:04,575 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:19:04,576 : INFO : EPOCH - 32 : training on 410293 raw words (321221 effective words) took 1.3s, 247949 effective words/s
2018-05-11 03:19:05,630 : INFO : EPOCH 33 - PROGRESS: at 70.67% examples, 216974 words/s, in_qsize 5, out_qsize 0
2018-05-11 03:19:05,969 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:19:05,982 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:19:05,995 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:19:05,995 : INFO : EPOCH - 33 : training on 410293 raw words (321359 effective words) took 1.4s, 227633 effective words/s
2018-05-11 03:19:07,029 : INFO : EPOCH 34 - PROGRESS: at 73.11% examples, 228497 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:19:07,325 : INFO : worker threa

2018-05-11 03:19:26,338 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:19:26,368 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:19:26,387 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:19:26,388 : INFO : EPOCH - 48 : training on 410293 raw words (321042 effective words) took 1.3s, 240502 effective words/s
2018-05-11 03:19:27,408 : INFO : EPOCH 49 - PROGRESS: at 75.56% examples, 239428 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:19:27,641 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:19:27,679 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:19:27,685 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:19:27,686 : INFO : EPOCH - 49 : training on 410293 raw words (321053 effective words) took 1.3s, 249114 effective words/s
2018-05-11 03:19:28,719 : INFO : EPOCH 50 - PROGRESS: at 75.55% exam

2018-05-11 03:19:47,071 : INFO : EPOCH 64 - PROGRESS: at 75.56% examples, 241459 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:19:47,319 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:19:47,348 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:19:47,360 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:19:47,361 : INFO : EPOCH - 64 : training on 410293 raw words (320941 effective words) took 1.3s, 248379 effective words/s
2018-05-11 03:19:48,376 : INFO : EPOCH 65 - PROGRESS: at 75.56% examples, 240380 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:19:48,626 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:19:48,655 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:19:48,663 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:19:48,664 : INFO : EPOCH - 65 : training on 410293 raw words (321021 effecti

2018-05-11 03:20:07,088 : INFO : EPOCH - 79 : training on 410293 raw words (321182 effective words) took 1.3s, 247099 effective words/s
2018-05-11 03:20:08,115 : INFO : EPOCH 80 - PROGRESS: at 75.55% examples, 237870 words/s, in_qsize 5, out_qsize 0
2018-05-11 03:20:08,359 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:20:08,377 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:20:08,403 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:20:08,404 : INFO : EPOCH - 80 : training on 410293 raw words (321126 effective words) took 1.3s, 245529 effective words/s
2018-05-11 03:20:09,413 : INFO : EPOCH 81 - PROGRESS: at 75.56% examples, 242095 words/s, in_qsize 5, out_qsize 0
2018-05-11 03:20:09,675 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:20:09,688 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:20:09,730 : INFO : worker threa

2018-05-11 03:20:28,124 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:20:28,126 : INFO : EPOCH - 95 : training on 410293 raw words (321072 effective words) took 1.3s, 239615 effective words/s
2018-05-11 03:20:29,144 : INFO : EPOCH 96 - PROGRESS: at 75.56% examples, 239991 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:20:29,393 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:20:29,432 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-05-11 03:20:29,449 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-05-11 03:20:29,450 : INFO : EPOCH - 96 : training on 410293 raw words (321115 effective words) took 1.3s, 244047 effective words/s
2018-05-11 03:20:30,514 : INFO : EPOCH 97 - PROGRESS: at 75.55% examples, 229381 words/s, in_qsize 6, out_qsize 0
2018-05-11 03:20:30,738 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-05-11 03:20:30,772 : INFO : worker threa

CPU times: user 4min 3s, sys: 21.2 s, total: 4min 24s
Wall time: 2min 13s


In [10]:
def build_doc_role_entity_doc2vec_dictionary(doc_role_entity_context_word_dict, doc2vec_model=None):
    """Replace each entity's context words with an inferred Doc2Vec vector.

    Parameters
    ----------
    doc_role_entity_context_word_dict : dict
        Maps a document name to ``{role: [entity, ...]}`` where ``entity[0]`` is
        the entity name and ``entity[1]`` is its list of context-word tokens.
    doc2vec_model : optional
        Object exposing ``infer_vector(tokens)``. Defaults to the notebook-level
        ``model`` so existing single-argument calls keep working; passing it
        explicitly avoids depending on hidden kernel state.

    Returns
    -------
    dict
        Same document -> role nesting as the input, with every entity replaced
        by the tuple ``(entity[0], doc2vec_model.infer_vector(entity[1]))``.
        The input dictionary is not modified.

    Notes
    -----
    NOTE(review): gensim's ``infer_vector`` is presumably randomised, so repeated
    runs may yield slightly different vectors — confirm if exact reproducibility
    is required.
    """
    if doc2vec_model is None:
        # Backward-compatible default: fall back to the globally trained model.
        doc2vec_model = model

    # Vectors are inferred in the same document -> role -> entity order as the input.
    return {
        doc: {
            tag: [(entity[0], doc2vec_model.infer_vector(entity[1])) for entity in entity_list]
            for tag, entity_list in tag_dict.items()
        }
        for doc, tag_dict in doc_role_entity_context_word_dict.items()
    }

In [11]:
# Infer Doc2Vec vectors for every test entity; only the test split is converted in this cell.
# NOTE(review): the commented-out call below names build_doc_role_entity_centroid_dictionary,
# which is not defined in the visible cells — it looks like a leftover and can probably be removed.
#train_doc_role_entity_context_word_centroid_dict = build_doc_role_entity_centroid_dictionary(train_doc_role_entity_context_word_dict)
test_doc_role_entity_context_word_doc2vec_dict = build_doc_role_entity_doc2vec_dictionary(test_doc_role_entity_context_word_dict)

In [12]:
# Persist the test-entity Doc2Vec dictionary under the file name that matches the
# granularity it was built from.
#
# `doc_level` takes precedence when both flags are set: in that case the loading
# cell ends up holding the doc-level data, so writing it under the sentence-level
# file name as well would mislabel it. With neither flag set the result would be
# dropped silently, so that case raises instead.
if doc_level:
    doc2vec_output_file = 'doc_role_entity_doc_level_context_word_doc2vec_dict.p'
elif sent_level:
    doc2vec_output_file = 'doc_role_entity_context_word_doc2vec_dict.p'
else:
    raise ValueError('Set doc_level or sent_level to True before saving the Doc2Vec dictionary')

# The path is built from the config constants (same location as the previously
# hard-coded '../../Data/output/EntityRep/test/'); `with` guarantees the file is
# flushed and closed once the dump completes.
with open(os.path.join(data_output_folder, test_entity_rep_output_folder, doc2vec_output_file), 'wb') as doc2vec_output:
    pickle.dump(test_doc_role_entity_context_word_doc2vec_dict, doc2vec_output)