# Load the model and corpus data

In [1]:
# Base name of the fastText model: it selects the models/<model_name>.bin file
# that is loaded and names the pickle of aggregated document vectors.
model_name = "fasttext_unsupervised_cbow_dim300_preprocessing"

In [None]:
# Extract the vocabulary and one embedding per word from the trained fastText model
import fasttext
import numpy as np
import gc

model = fasttext.load_model(f"models/{model_name}.bin")

# Row i of word_embeddings is the vector of vocabulary[i]
vocabulary = model.words
word_embeddings = np.array([model.get_word_vector(term) for term in vocabulary])

# The model object is not used after the vectors are extracted; release it
del model
gc.collect()

In [3]:
# Load the preprocessed documents: every line of the file is one entry
input_file_name = 'preprocessed_texts.txt'
with open(input_file_name, 'r', encoding='utf-8') as file:
    # Leading/trailing whitespace (including the newline) is stripped from each line
    preprocessed_texts = [line.strip() for line in file]

# Aggregate words of each document
Since both the documents and the queries are of variable size, we should aggregate the vectors of the words in each document (and, likewise, in the query) into a single fixed-size vector using some strategy. This could be taking the element-wise minimum, maximum, or mean of the word vectors.

In [None]:
# Map each vocabulary word to its embedding vector so that tokens can be
# looked up directly by name (word_embeddings rows follow the vocabulary order)
vector_dict = dict(zip(vocabulary, word_embeddings))

def aggregate_vector_list(vlist, aggfunc):
    """Collapse a list of vectors into a single vector.

    Args:
        vlist: Vectors to combine; expected to form a 2-D array
            (one row per vector) once stacked.
        aggfunc: Name of the reduction applied across the vectors:
            'max', 'min' or 'mean'.

    Returns:
        The element-wise maximum, minimum or mean over the vectors. For any
        other ``aggfunc`` value, a zero vector with one entry per column.
    """
    stacked = np.array(vlist)

    # Try each supported reduction in turn; all of them reduce over the rows
    reducers = (
        ('max', stacked.max),
        ('min', stacked.min),
        ('mean', stacked.mean),
    )
    for name, reducer in reducers:
        if aggfunc == name:
            return reducer(axis=0)

    # Unknown strategy: fall back to a zero vector of the embedding size
    return np.zeros(stacked.shape[1])

# Aggregation strategies to compute for every document; aggregate_vector_list
# also accepts "max" and "min".
possible_aggfuncs = ["mean"]

# One (number of documents, embedding size) matrix per aggregation strategy,
# keyed by the strategy name. Row i holds the aggregated vector of
# preprocessed_texts[i] and stays all zeros when none of the document's tokens
# are in the vocabulary.
aggregated_docs_vectors = {
    aggfunc: np.zeros((len(preprocessed_texts), word_embeddings.shape[1]))
    for aggfunc in possible_aggfuncs
}

# Aggregate vectors of documents. Each document is tokenized and its tokens are
# looked up only once; the resulting vectors are reused for every strategy.
for index, doc in enumerate(preprocessed_texts):
    vlist = [vector_dict[token] for token in fasttext.tokenize(doc) if token in vector_dict]
    if not vlist:
        continue  # no known tokens: keep the zero row
    for aggfunc in possible_aggfuncs:
        aggregated_docs_vectors[aggfunc][index] = aggregate_vector_list(vlist, aggfunc)

# Clean memory: release the large intermediate objects
del vocabulary
del word_embeddings
del vector_dict
del preprocessed_texts
gc.collect()

In [None]:
# Save the aggregated vectors calculated for each document on disk
import os
import pickle

aggregated_docs_vectors_file = model_name

# Build the destination once so the file written and the message printed agree
output_dir = 'aggregated_docs_vectors'
output_path = f'{output_dir}/adv_{aggregated_docs_vectors_file}.pkl'

# Make sure the output directory exists; open() in 'wb' mode does not create
# missing parent directories and would raise FileNotFoundError
os.makedirs(output_dir, exist_ok=True)

# Save aggregated_docs_vectors to disk
with open(output_path, 'wb') as f:
    pickle.dump(aggregated_docs_vectors, f)
print(f"Saved aggregated_docs_vectors to {output_path}")

# Clean memory
del aggregated_docs_vectors
gc.collect()