# Word Embeddings

### Please run this notebook after the tf-idf notebook; its values are needed to weight the embeddings

In [2]:
import pandas as pd
import numpy as np
import pickle
#We need this line to find the collection_vocabulary.py here, else we cannot load the col.pkl object
import sys
sys.path.append('../0_Collection_and_Inverted_Index/')
# Load the artefacts produced by the earlier notebooks.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were written by this project's own notebooks.
# The handle is called `col_file` (not `input`) so that the builtin input()
# stays usable for the rest of the notebook session.
with open('../0_Collection_and_Inverted_Index/pickle/col.pkl', 'rb') as col_file:
    col = pickle.load(col_file)
# The index of `inverted_index` is iterated below as the vocabulary (one word per entry).
inverted_index = pd.read_pickle('../0_Collection_and_Inverted_Index/pickle/inverted_index.pkl')

# tf-idf weights; they are passed to get_weighted_embeddings further down.
tfidf = pd.read_pickle('pickle/tfidf.pkl')


Further information on our approach to derive word embeddings can be found in embeddings_experiments.ipynb in this very same folder. Here, we only cover the two models we finally used for feature generation.

It is important to understand the following first: the authors provide two overlapping datasets. Firstly, 3633 docs represented as preprocessed BoWs, each of which is at least (marginally) relevant for at least one query. Secondly, 5371 raw docs, which are a superset of the 3633 BoW docs. This number is higher because the authors crawled all of these docs before categorizing them as (marginally) relevant.

**Here, we use the 5371 raw docs to derive embeddings.**

We used Gensim's FastText model class to derive the embeddings.

We derived Word2Vec Embeddings with and without subword information, using the CBOW architecture.

The Word2Vec approach with subword information (FastText) is described in this [paper](https://arxiv.org/abs/1607.04606), on which the [Gensim implementation](https://radimrehurek.com/gensim/models/fasttext.html) is also based.

We stuck to Gensim's default parameters, with the following being the most important ones to mention:
- embedding size: 100d dense vector
- window size: 5
- CBOW algorithm
- training with negative sampling (5 negative samples; hierarchical softmax is disabled by default)
- 5 iterations over the corpus

We changed the following parameters:
- minimum count of each word: 1
- Word2Vec with subword information: 
    - min n-gram-size: 3
    - max n-gram-size: 12

We also considered including bigrams, but decided against it: the precomputed BoW representation contains no bigrams, so this would result in a mismatch between the vocabulary derived from the raw texts and the BoW vocabulary.

In [7]:
# Preprocessing
# Gensim expects the corpus as a list of token lists (one list of unicode
# strings per document). The collection is small, so holding everything in
# memory is fine.
import re

# The pattern is greedy: it removes everything from "MED-" up to and including
# the last tab character on the line.
doc_id_prefix = re.compile("MED-.*\t")
doc_list = []
with open('../nfcorpus/raw/doc_dump.txt', 'r', encoding='utf-8') as rf1:
    # One document per line: drop the prefix, lowercase, split on whitespace.
    doc_list.extend(
        doc_id_prefix.sub("", raw_line).lower().strip('\n').split()
        for raw_line in rf1
    )
len(doc_list)

5371

In [8]:
import gensim
# Sanity check before training: the expression below should display True.
# A value of -1 would mean the compiled routines are not available.
gensim.models.fasttext.FAST_VERSION > -1 # make sure that you are using Cython backend

True

In [9]:
# Train a FastText model on our documents and store it on disk.
# "FastText" here basically means Word2Vec with subword information.
# Every word is kept (min_count=1); character n-grams of length 3 to 12 are used.
fasttext = gensim.models.FastText(
    doc_list,
    min_count=1,
    min_n=3,
    max_n=12,
)
fasttext.save('pickle/our_fasttext')

In [10]:
# Same as above, but without subword information (plain Word2Vec behaviour).
# Run this cell to train the model, or skip to the loading cell below if the
# model already exists on disk.
# NOTE(review): `word_ngrams=0` is what switches subword information off in the
# Gensim version this was written for; newer Gensim releases reportedly treat
# this parameter differently - confirm against the installed version.
word2vec = gensim.models.FastText(
    doc_list,
    min_count=1,
    word_ngrams=0,
)
word2vec.save('pickle/our_fasttextword2vec')

### If you already ran the upper part, you can load the results here

In [11]:
# To save time, load the models, if they already exist.
# This loads the whole models (not only the vectors).
# The paths are the ones written by the two training cells above.
fasttext = gensim.models.FastText.load('pickle/our_fasttext') 
word2vec = gensim.models.FastText.load('pickle/our_fasttextword2vec')

In [12]:
# Word2Vec embeddings with subword information (FastText): one dense vector
# per word of the inverted index, stored as a Series indexed by word.
fasttext_dim = fasttext.wv.vector_size  # 100 with the Gensim defaults used for training
fasttext_embeddings_list = []
words_not_covered_in_fasttext = []
for word in inverted_index.index:
    try:
        vector = fasttext.wv.get_vector(word)
    except KeyError:
        # The model has no vector for this word. Only KeyError is caught, so
        # genuine failures (and Ctrl-C) are no longer silently swallowed.
        # Remember the word and fall back to a zero vector so that the Series
        # keeps one entry per vocabulary word.
        words_not_covered_in_fasttext.append(word)
        vector = np.zeros(fasttext_dim)
    fasttext_embeddings_list.append(vector)
fasttext_embeddings = pd.Series(fasttext_embeddings_list, index=inverted_index.index)
fasttext_embeddings.to_pickle('pickle/fasttext_embeddings.pkl')
fasttext_embeddings.head()

'hort    [0.44308105, -1.5351962, 0.956442, 0.268957, 0...
+        [-0.3428175, -0.34225887, 0.22346681, 0.418274...
-        [-0.32701638, -0.20407611, 0.23003644, 0.11978...
--a      [-0.06901108, -0.3414645, 0.15074101, 0.067805...
--all    [0.033670183, -1.1816841, -0.099532396, 0.1932...
dtype: object

In [13]:
# Word2Vec embeddings (no subword information): one dense vector per word of
# the inverted index, stored as a Series indexed by word.
word2vec_dim = word2vec.wv.vector_size  # 100 with the Gensim defaults used for training
word2vec_embeddings_list = []
words_not_covered_in_word2vec = []
for word in inverted_index.index:
    try:
        vector = word2vec.wv.get_vector(word)
    except KeyError:
        # Out-of-vocabulary word. Only KeyError is caught, so genuine failures
        # (and Ctrl-C) are no longer silently swallowed. Remember the word and
        # fall back to a zero vector so that the Series keeps one entry per
        # vocabulary word.
        words_not_covered_in_word2vec.append(word)
        vector = np.zeros(word2vec_dim)
    word2vec_embeddings_list.append(vector)
word2vec_embeddings = pd.Series(word2vec_embeddings_list, index=inverted_index.index)
word2vec_embeddings.to_pickle('pickle/word2vec_embeddings.pkl')
word2vec_embeddings.head()

'hort    [-1.6446841, -0.78930265, 0.58323836, -0.39907...
+        [-0.21700142, -0.18000382, -0.21318354, 0.6638...
-        [-0.3929198, -0.25487423, -0.43005493, 0.54094...
--a      [-0.3087517, -0.07765534, 0.098688155, -0.0462...
--all    [0.09711393, -0.5630381, -0.5637208, -0.174061...
dtype: object

### Another shortcut here

In [14]:
# Reload the per-word embedding Series written by the two cells above, so the
# models do not have to be queried again.
fasttext_embeddings = pd.read_pickle('pickle/fasttext_embeddings.pkl')
word2vec_embeddings = pd.read_pickle('pickle/word2vec_embeddings.pkl')

In [15]:
def get_weighted_embeddings(embeddings, tfidf_embed):
    """Return the tf-idf-weighted average word embedding of each doc/query.

    For every column ``d`` of ``tfidf_embed`` the result is
    ``sum_w tfidf[w, d] * embeddings[w] / sum_w tfidf[w, d]``.

    Parameters
    ----------
    embeddings : pandas.Series
        One 1-d embedding vector per word, indexed by word.
    tfidf_embed : pandas.DataFrame
        tf-idf weights, one row per word and one column per doc/query.

    Returns
    -------
    pandas.DataFrame
        One column per (distinct) column label of ``tfidf_embed`` and one row
        per embedding dimension, computed in float64. A column whose weights
        sum to zero has no defined average and is filled with NaN.

    Notes
    -----
    Words that have a weight but no embedding add nothing to the weighted sum
    while still counting towards the normalising constant.
    """
    # Align the word vectors with the rows of the weight matrix once and stack
    # them into a plain (n_words, dim) float matrix. This replaces per-row
    # arithmetic on an object-dtype Series, which is slow and fragile.
    aligned = embeddings.reindex(tfidf_embed.index)
    vectors = [
        np.asarray(vec, dtype=float) if np.ndim(vec) > 0 else None
        for vec in aligned
    ]
    dim = next((len(vec) for vec in vectors if vec is not None), 0)
    word_matrix = np.zeros((len(vectors), dim))
    for row, vec in enumerate(vectors):
        if vec is not None:
            word_matrix[row] = vec

    # Normalising constant (sum of tf-idf weights) for each doc/query.
    sum_of_tfidf_weights = tfidf_embed.sum(axis=0).to_numpy(dtype=float)

    # Work column by column so that only one column of weights is materialised
    # at a time; the full result (dim x n_docs) is small.
    embeddings_dict = {}
    for pos, doc in enumerate(tfidf_embed.columns):
        if doc in embeddings_dict:
            # Repeated column label: keep the first occurrence only.
            continue
        total = sum_of_tfidf_weights[pos]
        if total == 0:
            # No weighted terms at all: the average is undefined.
            embeddings_dict[doc] = np.full(dim, np.nan)
            continue
        weights = tfidf_embed.iloc[:, pos].to_numpy(dtype=float)
        embeddings_dict[doc] = (weights @ word_matrix) / total
    return pd.DataFrame.from_dict(embeddings_dict)

In [16]:
# tf-idf-weighted FastText embedding for every column of `tfidf`
documents_fasttext = get_weighted_embeddings(fasttext_embeddings, tfidf)

#Let's save those again, as computing them might take a while
documents_fasttext.to_pickle('pickle/documents_fasttext.pkl')

In [17]:
# tf-idf-weighted Word2Vec embedding for every column of `tfidf`
documents_word2vec= get_weighted_embeddings(word2vec_embeddings, tfidf)

#Save them as well
documents_word2vec.to_pickle('pickle/documents_word2vec.pkl')

In [18]:
# Put this in the report: vocabulary words for which the Word2Vec model had no
# vector (they were given a zero vector above).
words_not_covered_in_word2vec

[':{', 'nw', 'rq', 'w']

In [19]:
# Put this in the report: vocabulary words for which the FastText model had no
# vector (they were given a zero vector above).
# The name previously read `words_not_covered_in_fasttexttext`, which is never
# defined and raises NameError.
words_not_covered_in_fasttext

['nw', 'rq', 'w']