In [16]:
import pickle
import nltk

import gensim

In [2]:
import os
# Switch the working directory to the corpus being processed; the relative
# paths used in the following cells (phrases.txt, ../../models/..., the
# outVectors_*.csv outputs) are resolved against it.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the kernel will look for a nested directory and fail.
# Alternative corpus, currently disabled:
#os.chdir("prepared_corpora/opinosis/")
os.chdir("prepared_corpora/msrp/")


In [3]:
# Load the raw phrases, one per line (trailing newlines are kept; the
# tokenizer below receives the lower-cased lines as-is).
with open("phrases.txt", "r") as phrases_fh:
    phrases = phrases_fh.readlines()

# Lower-case every phrase and split it into tokens with the Treebank
# tokenizer. (nltk.tokenize.SpaceTokenizer was the previously tried
# alternative.)
treebank_tokenizer = nltk.tokenize.TreebankWordTokenizer()
tokenised_phrases = treebank_tokenizer.tokenize_sents(
    phrase.lower() for phrase in phrases
)

# Persist the tokenisation: one phrase per line, tokens separated by spaces.
with open("tokenized_phrases.txt", "w") as tokenised_fh:
    tokenised_fh.writelines(
        " ".join(tokens) + "\n" for tokens in tokenised_phrases
    )
    

In [4]:
def infer_and_save(model, tokenised_phrases, savename):
    """Infer a vector for every phrase and write them to ``savename``.

    Each element of ``tokenised_phrases`` is handed to ``model.infer_vector``.
    The returned vector is converted with ``tolist()``, rendered with ``str``
    and stripped of its enclosing brackets, so every output line contains the
    components separated by ", ".

    model: any object with an ``infer_vector(tokens)`` method whose result
        has a ``tolist()`` method.
    tokenised_phrases: iterable of token sequences, one per phrase.
    savename: path of the text file to write (opened in "w" mode, so an
        existing file is overwritten).
    """
    with open(savename, "w") as sink:
        for tokens in tokenised_phrases:
            components = model.infer_vector(tokens).tolist()
            # str() of a list looks like "[a, b, c]"; drop the two brackets.
            row = str(components)[1:-1]
            sink.write(row + "\n")

In [17]:
# Load the pre-trained Doc2Vec model stored under "wiki_sentence_model_concat_model"
# (presumably the concatenating PV-DM variant, going by the output file name -- confirm)
# and write one inferred vector per tokenised phrase to a CSV-like text file.
# NOTE: this rebinds the global `model`, which later cells rebind again.
model = gensim.models.doc2vec.Doc2Vec.load("../../models/wiki_sentence_model_concat_model")
infer_and_save(model, tokenised_phrases,"outVectors_wiki_sentence_concat_pvdm.csv")

In [26]:
# Display the `count` attribute of the currently loaded model's `docvecs`
# (presumably the number of document vectors the model holds -- confirm).
model.docvecs.count

1209507

In [6]:
# Same procedure for the Doc2Vec model stored under "wiki_sentence_model_dbow_model"
# (presumably the PV-DBOW variant, going by the name -- confirm): load it and
# write one inferred vector per tokenised phrase.
# NOTE: this rebinds the global `model`.
model = gensim.models.doc2vec.Doc2Vec.load("../../models/wiki_sentence_model_dbow_model")
infer_and_save(model, tokenised_phrases,"outVectors_wiki_sentence_dbow.csv")

In [7]:
import numpy as np
import gensim.corpora.dictionary as gensim_bow


class BowModel(object):
    """Bag-of-words vectoriser with the same ``infer_vector`` entry point as
    the other models in this notebook.

    The vocabulary is a gensim ``Dictionary`` built from the phrases given at
    construction time; every inferred vector has one slot per dictionary
    entry.
    """

    def __init__(self, tokenised_phrases):
        """Build the token dictionary from an iterable of token lists."""
        self.bow_dict = gensim_bow.Dictionary(tokenised_phrases)

    def _unsparse(self, bow_kv_list):
        """Expand ``(token_id, count)`` pairs into a dense float vector whose
        length equals the dictionary size; unmentioned ids stay at 0."""
        dense = np.zeros(len(self.bow_dict))
        for token_id, count in bow_kv_list:
            dense[token_id] = count
        return dense

    def infer_vector(self, tokenised_phrase):
        """Return the dense bag-of-words vector for one tokenised phrase."""
        sparse_counts = self.bow_dict.doc2bow(tokenised_phrase)
        return self._unsparse(sparse_counts)
    
        

In [8]:
# Build the bag-of-words vocabulary from the tokenised corpus itself and write
# one dense count vector per phrase. `bow_model` is reused by later cells for
# its token dictionary.
bow_model = BowModel(tokenised_phrases)
infer_and_save(bow_model, tokenised_phrases,"outVectors_bow.csv")

# Word Embedding Models

In [10]:
import codecs
import io

def read_until(until_char,stream):
    """Read from ``stream`` one unit at a time until ``until_char`` is seen.

    The delimiter itself is consumed but not returned; everything read before
    it is returned as a single bytes object.

    Raises EOFError if the stream ends before the delimiter is found.
    """
    collected = b''
    while True:
        piece = stream.read(1)
        # An empty read means the stream is exhausted.
        if not len(piece):
            raise EOFError()
        if piece == until_char:
            break
        collected += piece
    return collected
        
        
def load_word2vec_embeddings(embedding_file, words_to_keep=None):
    """Load vectors from a word2vec-style binary embedding file.

    The file is expected to start with a text header line
    ``"<vocab_size> <vector_size>"`` followed, for every word, by the word,
    a single space, and ``vector_size`` float32 values (4 bytes each).

    embedding_file: path of the binary embedding file.
    words_to_keep: container supporting ``len`` and ``in``. If it is non-empty
        only those words are kept; if it is empty or None (the default) all
        words are kept.

    Returns a dict mapping each kept word (decoded as iso-8859-1) to a
    writable 1-D ``np.float32`` array of length ``vector_size``.

    Raises EOFError if the file ends while a word is being read, and
    ValueError if the data for a kept word is shorter than ``vector_size``
    floats.
    """
    keep_all = words_to_keep is None or len(words_to_keep) == 0
    word_vectors = dict()

    with io.open(embedding_file, "rb") as fh:
        vocab_size, vector_size = [int(s) for s in fh.readline().split()]
        bytes_per_vector = 4 * vector_size  # 4 bytes per float32
        for _ in range(vocab_size):
            word = read_until(b' ', fh).decode(encoding='iso-8859-1')
            # Some word2vec writers emit a newline after each vector; without
            # stripping it every word but the first would be stored as
            # "\nword" and never match words_to_keep.
            word = word.lstrip('\n')

            # Always consume the vector bytes so the stream stays aligned,
            # even when the word is going to be skipped.
            encoded_embedding = fh.read(bytes_per_vector)

            if keep_all or word in words_to_keep:
                # frombuffer replaces the deprecated binary mode of
                # np.fromstring; copy() keeps the result writable and
                # independent of the raw bytes object, as before.
                word_vectors[word] = np.frombuffer(
                    encoded_embedding, dtype=np.float32, count=vector_size
                ).copy()

    return word_vectors
                
            
        
        
        

In [11]:
# Load the pre-trained GoogleNews word2vec vectors, keeping only the words
# that occur in the corpus vocabulary (the keys of the BoW dictionary's
# token2id mapping are passed as `words_to_keep`).
# NOTE(review): the path climbs five directories out of the corpus folder, so
# it only resolves on a machine with the same directory layout.
word_vectors = load_word2vec_embeddings("../../../../../Resources/example_code/word2vec/GoogleNews-vectors-negative300.bin", bow_model.bow_dict.token2id.keys())

In [12]:
class SumOfWordEmbeddingsModel(object):
    """Represent a phrase by the sum (or mean) of its word embeddings.

    Exposes the same ``infer_vector`` entry point as the other models in this
    notebook.
    """

    def __init__(self, tokenised_phrases, word_embeddings, mean=False):
        """
        tokenised_phrases: unused; accepted so all models share a constructor
            shape.
        word_embeddings: dict mapping word -> 1-D embedding vector.
        mean: if True, ``infer_vector`` divides the sum by the number of
            words that were found in ``word_embeddings``.
        """
        self.word_vectors = word_embeddings
        self.mean=mean
        # Take the dimensionality from the embeddings themselves instead of
        # hard-coding it; 300 (the previously hard-coded size) is used only
        # when no embeddings are supplied.
        first_vector = next(iter(word_embeddings.values()), None)
        self.vector_size = 300 if first_vector is None else len(first_vector)

    def infer_vector(self,tokenised_phrase):
        """Return the sum (or mean) of the embeddings of the known words.

        Words missing from the embedding dict are ignored. If no word of the
        phrase is known, a zero vector is returned in both modes (rather than
        the NaN vector a 0/0 mean would produce).
        """
        n_words = 0
        sum_of_word_embeddings = np.zeros(self.vector_size)
        for word in tokenised_phrase:
            if word in self.word_vectors:
                n_words += 1
                sum_of_word_embeddings += self.word_vectors[word]
        if self.mean and n_words > 0:
            return sum_of_word_embeddings / n_words
        return sum_of_word_embeddings


In [13]:
# Mean of word embeddings: with mean=True the summed embeddings are divided by
# the number of phrase words found in `word_vectors`.
# NOTE: this rebinds the global `model`.
model = SumOfWordEmbeddingsModel(tokenised_phrases, word_vectors,mean=True)
infer_and_save(model, tokenised_phrases,"outVectors_mowe.csv")

In [14]:
# Sum of word embeddings: with mean=False the plain sum of the embeddings of
# the known words is written for each phrase.
# NOTE: this rebinds the global `model`.
model = SumOfWordEmbeddingsModel(tokenised_phrases, word_vectors,mean=False)
infer_and_save(model, tokenised_phrases,"outVectors_sowe.csv")

In [15]:
# Corpus tokens for which no pre-trained embedding was loaded.
nonfound = {
    token
    for token in bow_model.bow_dict.token2id
    if token not in model.word_vectors
}
print(len(nonfound))
nonfound

546


{'!',
 "'",
 "''",
 "''fewer",
 "''i",
 "''is",
 "'s",
 '(',
 ')',
 ',',
 '-',
 '--',
 '.',
 '...',
 '.20',
 '.85',
 '.dji',
 '.ixic',
 '.spx',
 '0.02',
 '0.09',
 '0.10',
 '0.15',
 '0.17',
 '0.18',
 '0.2',
 '0.20',
 '0.21',
 '0.3',
 '0.34',
 '0.41',
 '0.42',
 '0.48',
 '0.5',
 '0.52',
 '0.6',
 '0.60',
 '0.7',
 '0.71',
 '0.9',
 '0.96',
 '0.98',
 '1,100',
 '1,200',
 '1,366',
 '1,600,000',
 '1,912.65',
 '1,917.67',
 '1,921.33',
 '1.02',
 '1.15',
 '1.2',
 '1.25',
 '1.36',
 '1.4',
 '1.47',
 '1.6',
 '1.63',
 '1/2',
 '10',
 '10.89',
 '100',
 '107.6',
 '10:33',
 '11',
 '11,000',
 '11,400',
 '11.14',
 '11.64',
 '11.92',
 '114',
 '114.3',
 '117kg',
 '118',
 '11th',
 '12',
 '12,400',
 '12-by-18-inch',
 '12-inch-by-18-inch',
 '12-month',
 '12-nation',
 '120,000',
 '121.51',
 '128-bit',
 '12:01',
 '13',
 '130',
 '14',
 '14-member',
 '14-strong',
 '14.7',
 '140',
 '143-year',
 '15',
 '16',
 '17',
 '17-year-old',
 '17-year-olds',
 '18',
 '18.06',
 '18.34',
 '1975',
 '1980s',
 '1989',
 '1994',
 '1996',

In [None]:
class HistogramOfWordEmbeedingsModel(object):
    """Represent a phrase by per-dimension histograms of its word embeddings.

    Exposes the same ``infer_vector`` entry point as the other models in this
    notebook.
    """

    def __init__(self, tokenised_phrases, word_embeddings, n_bins = 10):
        """
        tokenised_phrases: unused; accepted so all models share a constructor
            shape.
        word_embeddings: non-empty dict mapping word -> 1-D embedding vector
            (all vectors of equal length).
        n_bins: number of histogram bins per embedding dimension.

        Raises ValueError (from numpy) if ``word_embeddings`` is empty.
        """
        self.word_vectors = word_embeddings
        # Use the embeddings passed in (not a notebook global) and stack them
        # only once; dict views are not sequences, hence the list().
        all_vectors = np.vstack(list(word_embeddings.values()))
        self.vector_size = all_vectors.shape[1]
        # n_bins bins need n_bins + 1 edges spanning the global value range.
        self.bins = np.linspace(all_vectors.min(), all_vectors.max(), n_bins + 1)

    def infer_vector(self,tokenised_phrase):
        """Return the flattened histogram counts for one tokenised phrase.

        For every embedding dimension, the values of the phrase's known words
        are counted into the shared bins. The counts form an array of shape
        (n_bins, vector_size), which is flattened row-major, so the result has
        length ``n_bins * vector_size``. Words without an embedding are
        ignored; a phrase with no known word yields an all-zero vector.
        """
        known_vectors = [
            self.word_vectors[word]
            for word in tokenised_phrase
            if word in self.word_vectors
        ]
        if not known_vectors:
            # np.vstack([]) would raise; an empty phrase has empty histograms.
            n_bins = len(self.bins) - 1
            return np.zeros(n_bins * self.vector_size, dtype=np.intp)

        word_embeddings = np.vstack(known_vectors)

        def get_hists(col_slice):
            # [0] selects the counts; the bin edges are discarded.
            return np.histogram(col_slice, bins=self.bins)[0]

        return np.apply_along_axis(get_hists,0,word_embeddings).flatten()


In [None]:
# Histogram-of-word-embeddings vectors, using the class's default n_bins.
# NOTE: this rebinds the global `model`; the cell has no recorded execution count.
model = HistogramOfWordEmbeedingsModel(tokenised_phrases, word_vectors)
infer_and_save(model, tokenised_phrases,"outVectors_howe.csv")

In [None]:
# Sanity check: number of tokenised phrases, i.e. the number of rows each
# outVectors_*.csv file written by infer_and_save contains.
len(tokenised_phrases)