This notebook implements the following steps:

1. Learn type representations from the per-type corpora generated at Data/output/TypeRep/type_word/train/corpus_replaced_entity_with_tag

2. Generate the vector representation of each tag and store it in the type_vector dictionary (tag_vec_dict.p) at Data/output/TypeRep/type_word/train
3. Get the top n (here 30) words most similar to each tag and store them in the type_word dictionary (tag_word_dict.p) at Data/output/TypeRep/type_word/train

In [None]:
import gensim, logging, os
from gensim.models import Word2Vec, Phrases, phrases, KeyedVectors
import scipy
import pickle


# Emit INFO-level (and above) log records, prefixed with timestamp and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [None]:
type_corpus_dir = '../../Data/output/TypeRep/type_word/train/corpus_replaced_entity_with_tag'
_tag_vect_dict = dict()
_tag_word_dict = dict()
_tag_list = ['LOC_Event', 'LOC_Accused', 'LOC_Victim', 'LOC_Others', 'ORG_Accused', 'ORG_Victim', 'ORG_Others', 'PER_Victim', 'PER_Others', 'PER_Accused']

In [None]:
class Corpus_Sentences(object):
    """Re-iterable stream of tokenised sentences read from a text file.

    Every line of ``filename`` is treated as one sentence and split on
    whitespace. The file is reopened on each iteration, so the same object
    can be passed to code that makes several passes over the corpus, and
    only one line is held in memory at a time.
    """

    def __init__(self, filename):
        """Remember the path of the corpus file; nothing is read yet."""
        self.filename = filename

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens."""
        # The context manager closes the handle when the pass finishes (or the
        # generator is discarded) instead of leaking one handle per pass.
        with open(self.filename) as corpus_file:
            for line in corpus_file:
                yield line.split()

In [None]:
# Load the pretrained GloVe vectors that were converted to word2vec text format.
glove_wv = KeyedVectors.load_word2vec_format(
    '../../../word_embeddings/pretrained_word_embeddings/Word2Vec/Glove_Converted/glove_2_word2vec.6B.300d.txt',
    binary=False,
)

In [None]:
# Train one Word2Vec model per tag on that tag's corpus, seeded with the
# pretrained GloVe vectors, then record the tag's vector and nearest words.

# The GloVe vocabulary, wrapped as a single pseudo-sentence, is the same for
# every tag, so build it once instead of on every iteration.
glove_vocab_sentence = [list(glove_wv.vocab.keys())]

for _tag in _tag_list:

    print('Start Processing for ', _tag)
    sentences = Corpus_Sentences(os.path.join(type_corpus_dir, _tag))  # a memory-friendly iterator

    model = Word2Vec(size=300, min_count=1, iter=10)
    model.build_vocab(sentences)
    # Save the corpus size now: the vocabulary update below is fed a single
    # pseudo-sentence and will reset corpus_count to 1.
    training_examples_count = model.corpus_count
    print('Building vocab')
    model.build_vocab(glove_vocab_sentence, update=True)
    print('Updating layer weights')
    # Overwrite the weights of words present in the GloVe file; lockf=1.0 leaves
    # the imported vectors trainable.
    model.intersect_word2vec_format("../../../word_embeddings/pretrained_word_embeddings/Word2Vec/Glove_Converted/glove_2_word2vec.6B.300d.txt", binary=False, lockf=1.0)
    print('Training model')
    model.train(sentences, total_examples=training_examples_count, epochs=100)
    print('Saving model')
    model.wv.save_word2vec_format(os.path.join('../../trained_word_embeddings/word2vec/type_rep_pretrain_trained_on_corpus', _tag), binary=False)
    print('Getting similar words')
    # Query through model.wv: the model-level similar_by_word / __getitem__
    # shortcuts are deprecated and removed in gensim 4.0.
    similar_words = model.wv.similar_by_word(_tag, topn=30)
    print('Updating similar words and vectors dictionary')
    _tag_word_dict[_tag] = similar_words
    _tag_vect_dict[_tag] = model.wv[_tag]

    print('End Processing for ', _tag)

In [None]:
# Persist the per-tag vectors and similar-word lists as pickle files.
print('Writing similar words and vectors dictionary')

# Use context managers so each file is flushed and closed as soon as it is
# written, rather than whenever the interpreter collects the handle.
with open("../../Data/output/TypeRep/type_word/train/tag_vec_dict.p", "wb") as vect_file:
    pickle.dump(_tag_vect_dict, vect_file)
with open("../../Data/output/TypeRep/type_word/train/tag_word_dict.p", "wb") as word_file:
    pickle.dump(_tag_word_dict, word_file)

print('Done Writing similar words and vectors dictionary')