# Embeddings

### Please run this after running tfidf; we need those values to weight the embeddings

In [10]:
import pandas as pd
import numpy as np
# Read the tf-idf values (used further down to weight the embeddings) and the
# inverted index from their pickle files, in that order.
tfidf, inverted_index = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'pickle/tfidf.pkl',
        '../0_Collection_and_Inverted_Index/pickle/inverted_index.pkl',
    )
)

Further information on the embeddings can be found in `Word Embeddings Experiments.ipynb`; here we only use the parts needed for feature generation.

We use two versions of embeddings here: fasttext and fasttext.word2vec.

In [3]:
# Preprocessing
# Gensim expects a list of token lists (UTF-8 strings) as training input. The
# collection is small, so the whole dump is held in memory at once.
import re

with open('../nfcorpus/raw/doc_dump.txt', 'r', encoding='utf-8') as dump_file:
    # Per line: remove the "MED-...<tab>" prefix, lowercase, then split on whitespace.
    # NOTE(review): `.*` is greedy, so everything up to the LAST tab on the line is
    # removed, not just the document id - confirm that this is intended.
    doc_list = [
        re.sub("MED-.*\t", "", raw_line).lower().strip('\n').split()
        for raw_line in dump_file
    ]

# Number of documents read (shown as the cell output)
len(doc_list)

5371

In [4]:
import gensim
# Sanity check shown as the cell output: the expression is True when gensim's
# FastText module reports a FAST_VERSION greater than -1.
gensim.models.fasttext.FAST_VERSION > -1 # make sure that you are using the Cython backend



True

In [5]:
# Train a FastText model on the tokenised documents in `doc_list` and store it on
# disk, so that later sessions can load it instead of retraining.
# (Variant that was tried and left disabled: training on bigram[doc_list].)
fasttext = gensim.models.FastText(
    doc_list,
    min_count=1,
    min_n=3,
    max_n=12,
)
fasttext.save('pickle/our_fasttext')

In [6]:
# Second model, referred to as "word2vec" in this notebook: a FastText model
# trained on the same documents with word_ngrams=0. Either run this cell to
# train it, or load it from disk further down if it was saved before.
word2vec = gensim.models.FastText(
    doc_list,
    min_count=1,
    word_ngrams=0,
)
word2vec.save('pickle/our_fasttextword2vec')

### If you already ran the upper part, you can load the results here

In [None]:
# Time saver: if both models were already trained and saved, restore them from
# disk. FastText.load brings back the whole model, not only the vectors.
fasttext, word2vec = (
    gensim.models.FastText.load(model_path)
    for model_path in ('pickle/our_fasttext', 'pickle/our_fasttextword2vec')
)

In [11]:
# fastText embeddings: one dense vector per term of the inverted index.
fasttext_embeddings_list = []
words_not_covered_in_fasttext = []
for word in inverted_index.index:
    try:
        fasttext_embeddings_list.append(fasttext.wv.get_vector(word))
    except KeyError:
        # Only a failed vector lookup counts as out-of-vocabulary. Any other
        # exception (including KeyboardInterrupt) is no longer swallowed here.
        words_not_covered_in_fasttext.append(word)
        # OOV terms get an all-zero vector; its length is read from the model
        # instead of being hard-coded, so it always matches the real vectors.
        fasttext_embeddings_list.append(np.zeros(fasttext.wv.vector_size))

# Series of vectors, indexed by term exactly like the inverted index
fasttext_embeddings = pd.Series(fasttext_embeddings_list, index=inverted_index.index)
fasttext_embeddings.to_pickle('pickle/fasttext_embeddings.pkl')
fasttext_embeddings.head()

'hort    [0.420471, -1.27513, 1.113, -0.238618, 0.47387...
+        [-0.230507, -0.354534, 0.174181, 0.363291, 0.5...
-        [-0.346358, -0.279879, 0.261858, 0.355061, 0.4...
--a      [-0.0193924, -0.354634, 0.208689, 0.0206046, 0...
--all    [0.458753, -0.815639, -0.0254263, 0.274309, 0....
dtype: object

In [12]:
# Word2Vec embeddings: one dense vector per term of the inverted index.
word2vec_embeddings_list = []
words_not_covered_in_word2vec = []
for word in inverted_index.index:
    try:
        word2vec_embeddings_list.append(word2vec.wv.get_vector(word))
    except KeyError:
        # Only a failed vector lookup counts as out-of-vocabulary. Any other
        # exception (including KeyboardInterrupt) is no longer swallowed here.
        words_not_covered_in_word2vec.append(word)
        # OOV terms get an all-zero vector; its length is read from the model
        # instead of being hard-coded, so it always matches the real vectors.
        word2vec_embeddings_list.append(np.zeros(word2vec.wv.vector_size))

# Series of vectors, indexed by term exactly like the inverted index
word2vec_embeddings = pd.Series(word2vec_embeddings_list, index=inverted_index.index)
word2vec_embeddings.to_pickle('pickle/word2vec_embeddings.pkl')
word2vec_embeddings.head()

'hort    [-2.15808, -0.929237, 0.76162, 0.0194782, -0.8...
+        [-0.319704, -0.0608469, -0.312795, 0.625708, -...
-        [-0.374173, -0.235332, -0.46186, 0.581642, -0....
--a      [-0.252248, -0.0543773, 0.108192, 0.0346592, -...
--all    [-0.0988697, -0.412766, -0.526214, -0.416635, ...
dtype: object

### Another shortcut: load the previously saved embeddings here

In [None]:
# Restore both per-term embedding Series from their pickle files.
fasttext_embeddings, word2vec_embeddings = map(
    pd.read_pickle,
    ('pickle/fasttext_embeddings.pkl', 'pickle/word2vec_embeddings.pkl'),
)

In [15]:
def get_weighted_embeddings(embeddings, tfidf_embed):
    """Compute one weighted average embedding per column of ``tfidf_embed``.

    For every column (a document or query) the result is
    ``sum_t(weight[t] * embeddings[t]) / sum_t(weight[t])`` over all rows ``t``.

    Parameters
    ----------
    embeddings : pandas.Series
        One equally sized 1-d vector per term, indexed by term.
    tfidf_embed : pandas.DataFrame
        Weights with terms as rows and documents/queries as columns.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``tfidf_embed`` (same labels, same order) and
        one row per embedding dimension. A column whose weights sum to zero is
        filled with NaN, so the frame always has a regular shape.

    Raises
    ------
    ValueError
        If the entries of ``embeddings`` cannot be stacked into a 2-d matrix
        (e.g. they are scalars).

    Notes
    -----
    * Terms are matched by label. A term of ``tfidf_embed`` that has no entry
      in ``embeddings`` contributes a zero vector; entries of ``embeddings``
      for unknown terms are ignored. Unless both indexes are identical, the
      row labels of ``tfidf_embed`` have to be unique for this matching.
    * The vectors are stacked into one numeric matrix up front and every
      column is reduced with a single dot product. Only one column of weights
      is materialised at a time, so the extra memory stays small.
    * Computation is done in float64, so values may differ from earlier
      results in the last digits.
    """
    # Stack the per-term vectors into an (n_terms, dim) float matrix.
    if len(embeddings) == 0:
        stacked = np.zeros((0, 0))
    else:
        stacked = np.asarray(embeddings.tolist(), dtype=float)
    if stacked.ndim != 2:
        raise ValueError('embeddings must contain equally sized 1-d vectors')
    dim = stacked.shape[1]

    if embeddings.index.equals(tfidf_embed.index):
        # Common case: both inputs share the same term index, use rows as they are.
        term_matrix = stacked
    else:
        # Align by label; rows of terms without an embedding stay all-zero.
        term_matrix = np.zeros((len(tfidf_embed.index), dim))
        positions = tfidf_embed.index.get_indexer(embeddings.index)
        found = positions >= 0
        term_matrix[positions[found]] = stacked[found]

    # Normalising constant for each document/query.
    sum_of_tfidf_weights = tfidf_embed.sum(axis=0).values

    n_docs = tfidf_embed.shape[1]
    weighted = np.empty((dim, n_docs))
    # A zero weight sum gives 0/0 -> NaN on purpose; silence numpy's warning for it.
    with np.errstate(divide='ignore', invalid='ignore'):
        for col in range(n_docs):
            weights = np.asarray(tfidf_embed.iloc[:, col], dtype=float)
            weighted[:, col] = weights.dot(term_matrix) / sum_of_tfidf_weights[col]

    weighted_embedding = pd.DataFrame(weighted, columns=tfidf_embed.columns)
    return weighted_embedding

In [16]:
# Weighted fastText embedding for every column of `tfidf`
documents_fasttext = get_weighted_embeddings(
    embeddings=fasttext_embeddings,
    tfidf_embed=tfidf,
)

# Persist the result, since recomputing it might take a while
documents_fasttext.to_pickle('pickle/documents_fasttext.pkl')

In [17]:
# Weighted word2vec embedding for every column of `tfidf`
documents_word2vec = get_weighted_embeddings(
    embeddings=word2vec_embeddings,
    tfidf_embed=tfidf,
)

# Persist this result as well
documents_word2vec.to_pickle('pickle/documents_word2vec.pkl')