### 2.1. Generate Embeddings

In [None]:
import numpy as np 
import fasttext
import json
import tempfile
import os
import gc
import psutil
from multiprocessing import cpu_count

# Function to print memory usage
def print_memory_usage():
    """Print the resident set size (RSS) of the current process in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    rss_megabytes = rss_bytes / 1024 ** 2
    print(f"Memory Usage: {rss_megabytes:.2f} MB")


# Load corpus json
print_memory_usage()
print('Load corpus.json')
# Read the JSON explicitly as UTF-8 instead of relying on the platform's default encoding
with open('actual_data/corpus.json/corpus.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)
print_memory_usage()

# Extract text from docs
print('Extract text from docs')
texts = [doc['text'] for doc in documents]
print_memory_usage()

# The parsed JSON is no longer needed once the raw texts have been extracted
del documents
gc.collect()
print_memory_usage()

# fastText trains from a file on disk, so the texts are written to a temporary
# file, one document per line. The file is created with delete=False, which
# means it has to be removed by hand; the try/finally guarantees that happens
# even when writing or training fails.
temp_file = tempfile.NamedTemporaryFile(delete=False)
temp_file_name = temp_file.name
try:
    # Save docs in temp file
    print('Save docs in temp file')
    with temp_file:  # closes the handle before fastText opens the file
        for text in texts:
            temp_file.write((text + '\n').encode('utf-8'))
    print_memory_usage()

    print('Delete texts variable')
    del texts
    gc.collect()
    print_memory_usage()

    print('Start fasttext model training')
    model = fasttext.train_unsupervised(temp_file_name, model='cbow', thread=cpu_count())
    print_memory_usage()
finally:
    # Remove the temporary file
    os.remove(temp_file_name)

# Save the trained model
print('Save the trained model')
model.save_model("model_word_embeddings_fasttext.bin")
print_memory_usage()

print('Delete model variable')
del model
gc.collect()
print_memory_usage()

In [None]:
import fasttext
import numpy as np
import gc

# Load the model written by the training cell; the file name has to match the
# one passed to `model.save_model(...)` there.
model = fasttext.load_model("model_word_embeddings_fasttext.bin")

vocabulary = model.words
# One embedding per vocabulary word, stacked in the same order as `vocabulary`
word_embeddings = np.array([model[word] for word in vocabulary])

# Clean memory
del model
gc.collect()




### 3. Visualize Embeddings

In the third phase of this exercise, we will visualize the generated embeddings using t-SNE (t-distributed Stochastic Neighbor Embedding).

t-SNE is a dimensionality reduction algorithm which is well suited for such visualization tasks.


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
%matplotlib inline

import inspect

# Newer scikit-learn releases name the iteration-count argument `max_iter`,
# older ones `n_iter`. Pick whichever keyword the installed TSNE accepts so the
# cell runs on both.
tsne_iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

tsne = TSNE(n_components=2, verbose=1, perplexity=50, init='pca', **{tsne_iter_kwarg: 1000})
# Project every word embedding to 2 dimensions (n_components=2) for plotting
vis_data = tsne.fit_transform(word_embeddings)

# Clean memory
del tsne
gc.collect()

In [None]:
# Split the t-SNE output into its two coordinate columns
vis_data_x, vis_data_y = vis_data[:, 0], vis_data[:, 1]

plt.rcParams['font.size'] = 8
plt.figure(figsize=(40, 40))
plt.scatter(vis_data_x, vis_data_y)

# Write each vocabulary term directly on top of its point
for label, (x, y) in zip(vocabulary, zip(vis_data_x, vis_data_y)):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
plt.show()

# Clean memory
del vis_data, vis_data_x, vis_data_y
gc.collect()

### Questions:

1. Observe the plot of word embeddings. Do you observe any patterns?

2. Write a python function to find the most similar terms for a given term. The similarity between two terms is defined as the cosine similarity between their corresponding word embeddings. Find the top 5 terms that are most similar to 'la', 'EPFL', '#robot', 'this'


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(input_term, word_embeddings, vocabulary, num_terms=5):
    """Return the terms whose embeddings are most cosine-similar to `input_term`.

    Parameters:
        input_term: the term to look up.
        word_embeddings: 2-D array-like with one embedding per row, aligned
            with `vocabulary` (row i belongs to vocabulary[i]).
        vocabulary: sequence of terms.
        num_terms: number of results to return.

    Returns:
        A list of `[term, similarity]` pairs sorted by decreasing similarity,
        with `similarity` a Python float. The input term itself takes part in
        the ranking, so it normally comes first with similarity 1.0.
        If `input_term` is not in `vocabulary`, the string
        "Term not in the vocabulary" is returned instead.
    """
    # Row index of every term; a repeated term keeps its last row, exactly as
    # a term -> embedding dict built in vocabulary order would.
    term_to_row = {term: row for row, term in enumerate(vocabulary)}
    if input_term not in term_to_row:
        return "Term not in the vocabulary"

    all_embeddings = np.asarray(word_embeddings)
    input_embedding = all_embeddings[term_to_row[input_term]]

    terms = list(term_to_row)
    candidates = all_embeddings
    if len(terms) != len(all_embeddings):
        # Only copy rows when the vocabulary does not map one-to-one onto the matrix
        candidates = all_embeddings[list(term_to_row.values())]

    # Cosine similarity against every term in a single matrix-vector product.
    # Zero-length vectors get a norm of 1 so their similarity is 0 instead of NaN.
    candidate_norms = np.linalg.norm(candidates, axis=1)
    candidate_norms[candidate_norms == 0] = 1.0
    input_norm = np.linalg.norm(input_embedding)
    if input_norm == 0:
        input_norm = 1.0
    similarities = (candidates @ input_embedding) / (candidate_norms * input_norm)

    # Stable sort on the negated scores: decreasing similarity, ties in vocabulary order
    best_rows = np.argsort(-similarities, kind='stable')[0:num_terms]
    return [[terms[row], float(similarities[row])] for row in best_rows]
    

# Example lookup: the 5 highest-ranked terms for 'Canadian' (the cell displays the returned list)
find_most_similar('Canadian', word_embeddings, vocabulary, num_terms=5)

## 📚 Exercise 2: Basic Search Engine Using Word Embeddings

In this exercise, we will put our word embeddings to the test by using them for information retrieval.
The idea is that the documents whose embedding vectors are most similar to the query's embedding vector should rank higher.
The documents may not necessarily include the keywords in the query.


### Goal:
1. Implement a search engine that uses word embeddings to retrieve relevant documents (Data file: `epfldocs.txt`)
2. Compare the results with vector space retrieval model


### What you are learning in this exercise:
- Learning to use word embeddings for a search engine 



### 1. Load the data

In [None]:
# Loading of libraries and documents
import json

# Read the corpus: a JSON list of objects, each carrying a 'text' and a 'docid' key.
# The encoding is given explicitly so the result does not depend on the platform default.
with open("actual_data/corpus.json/corpus.json", encoding="utf-8") as f:
    documents = json.load(f)

original_documents = [doc['text'].strip() for doc in documents]
docids = [doc['docid'] for doc in documents]

# To match document index and corresponding docid
doc_index_to_docid = dict(enumerate(docids))

# Clean memory
del documents
del docids
gc.collect()

### 2. Aggregate words of each document
Since both the documents and the query are of variable size, we should aggregate the vectors of their words using some strategy. This could be taking the element-wise minimum, the element-wise maximum, or the mean of the vectors. Fill in the code below.

In [None]:
# Create a dictionary of vectors for easier search:
# pairs the i-th vocabulary term with the i-th row of word_embeddings
vector_dict = dict(zip(vocabulary, word_embeddings))

def aggregate_vector_list(vlist, aggfunc):
    """Collapse a list of equally sized vectors into a single vector.

    Parameters:
        vlist: non-empty sequence of 1-D vectors of identical length
            (or an equivalent 2-D array with one vector per row).
        aggfunc: 'max', 'min' or 'mean', applied element-wise across the vectors.

    Returns:
        A 1-D array with one value per vector component. For an unrecognised
        `aggfunc` a zero vector of the same length is returned.

    Raises:
        ValueError: if `vlist` is empty, since there is nothing to aggregate.
    """
    vectors = np.asarray(vlist)
    if vectors.size == 0:
        # Without this guard 'mean' would quietly yield NaN and the other
        # modes would fail with an unrelated-looking NumPy error.
        raise ValueError("Cannot aggregate an empty list of vectors.")

    if aggfunc == 'max':
        return vectors.max(axis=0)
    if aggfunc == 'min':
        return vectors.min(axis=0)
    if aggfunc == 'mean':
        return vectors.mean(axis=0)
    return np.zeros(vectors.shape[1])

possible_aggfuncs = ["max", "min", "mean"]

# For each aggregation method, one row per document (same order as original_documents).
# Rows start at zero and stay zero for documents without any in-vocabulary token.
aggregated_doc_vectors = {
    aggfunc: np.zeros((len(original_documents), word_embeddings.shape[1]))
    for aggfunc in possible_aggfuncs
}

# Aggregate vectors of documents beforehand.
# Tokenizing and looking up the word vectors is the expensive part, so it is done
# once per document and the result is reused for all aggregation methods.
for index, doc in enumerate(original_documents):
    vlist = [vector_dict[token] for token in fasttext.tokenize(doc) if token in vector_dict]
    if len(vlist) < 1:
        continue
    doc_token_vectors = np.array(vlist)
    for aggfunc in possible_aggfuncs:
        aggregated_doc_vectors[aggfunc][index] = aggregate_vector_list(doc_token_vectors, aggfunc)

del word_embeddings
gc.collect()

In [None]:
import pickle

aggregated_doc_vectors_file = 'aggregated_doc_vectors.pkl'

# To reuse vectors saved by an earlier run instead of the ones computed above,
# uncomment the following lines:
# with open(aggregated_doc_vectors_file, 'rb') as f:
#     aggregated_doc_vectors = pickle.load(f)
# print("Loaded aggregated_doc_vectors from disk.")

# Write the aggregated document vectors to disk
with open(aggregated_doc_vectors_file, 'wb') as pickle_out:
    pickle.dump(aggregated_doc_vectors, pickle_out)
print("Saved aggregated_doc_vectors to disk.")

### 3. Aggregate the query
Aggregate the query and find the most similar documents using cosine distance between the query's vector and document's aggregated vector.

Do they seem relevant?

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Example query string; the functions below take their query as an argument
query = "EPFL"

def aggregate_query(query, aggfunc):
    """Turn a query string into a single embedding vector.

    The query is tokenized with fastText, tokens without an entry in
    `vector_dict` are dropped, and the remaining word vectors are combined
    with `aggregate_vector_list`.

    Parameters:
        query: the query text.
        aggfunc: aggregation method passed to `aggregate_vector_list`
            ('max', 'min' or 'mean').

    Returns:
        A 1-D vector with one value per embedding dimension, or None (after
        printing a message) when no token of the query has an embedding.
    """
    tokens = fasttext.tokenize(query)

    # Look tokens up in vector_dict rather than testing membership in
    # `vocabulary`: the dict lookup is a hash lookup and holds the same terms.
    vlist = [vector_dict[token] for token in tokens if token in vector_dict]

    if not vlist:
        # Covers an empty query, a single unknown word and several unknown words alike
        print("%s is not in the vocabulary." % (query))
        return None

    return aggregate_vector_list(vlist, aggfunc)
    
def get_most_similar_documents(query_vector, aggfunc, k = 5):
    """Rank every document by cosine similarity to the query vector.

    Parameters:
        query_vector: 1-D query embedding; it is reshaped to a single row.
        aggfunc: selects which matrix of `aggregated_doc_vectors` to compare against.
        k: accepted but not applied here; the complete ranking is returned and
            the caller is expected to slice it.

    Returns:
        Array of document indexes ordered from most to least similar.
    """
    query_row = query_vector.reshape((1, -1))
    # cosine_similarity compares the single query row with all document rows at once
    similarities = cosine_similarity(query_row, aggregated_doc_vectors[aggfunc])

    # argsort is ascending, so reverse it to put the best match first
    ascending = np.argsort(similarities[0])
    return ascending[::-1]


def search_vec_embeddings(query, topk = 10, aggfunc = 'mean'):
    """Print the `topk` documents whose aggregated embedding is closest to the query.

    Parameters:
        query: the query text.
        topk: number of documents to print.
        aggfunc: aggregation method used for both the query and the documents.

    Prints the retrieved document indexes, their docids and the document
    texts. Nothing is returned.
    """
    query_vector = aggregate_query(query, aggfunc)
    if query_vector is None:
        # aggregate_query returns None when the query has no usable vector;
        # stop here instead of failing on `None.reshape` further down.
        print("No documents retrieved: the query could not be mapped to an embedding vector.")
        return

    indexes = get_most_similar_documents(query_vector, aggfunc)
    # Print the top k documents
    indexes = indexes[0:topk]

    print(f"Document indexes retrieved : {indexes}")
    docids_retrieved = [doc_index_to_docid[index] for index in indexes]
    print(f"Docids retrieved : {docids_retrieved}")
    for index in indexes:
        print("---------")
        print(original_documents[index])
        print("---------")


In [None]:
# Embedding-based retrieval for a full natural-language question, using mean aggregation
search_vec_embeddings("What other companies did the FRC investigate KPMG's role in?", aggfunc = 'mean')

### 4. Compare the results with the vector space retrieval

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# VECTOR SPACE RETRIEVAL (From Exercise 1)
# Retrieval oracle 
from operator import itemgetter
# Word-level unigram TF-IDF with English stop words removed
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df = 1, stop_words = 'english')
# TF-IDF matrix with one row per document, in the order of original_documents
features = tf.fit_transform(original_documents)
# NOTE(review): npm_tfidf is not referenced by any cell visible here, and the dense
# copy may need far more memory than `features` on a large corpus - confirm it is needed.
npm_tfidf = features.todense()

# Print the documents whose TF-IDF cosine similarity with the query reaches a threshold
def search_vec_sklearn(query, topk = 10, features = features, threshold=0.1):
    """Print up to `topk` documents ranked by TF-IDF cosine similarity to `query`.

    Parameters:
        query: the query text, vectorized with the fitted `tf` vectorizer.
        topk: maximum number of documents to print.
        features: TF-IDF matrix of the documents (defaults to the matrix
            that exists when the function is defined).
        threshold: documents scoring below this similarity are not printed.

    Nothing is returned; the matching document texts are printed in rank order.
    """
    query_features = tf.transform([query])
    scores = cosine_similarity(query_features, features).flatten()

    # (document index, score) pairs, best score first
    ranked = sorted(enumerate(scores), key=itemgetter(1), reverse=True)

    doc_ids = []
    for rank, (doc_index, score) in enumerate(ranked):
        # The ranking is sorted, so nothing after the first miss can qualify
        if score < threshold or rank >= topk:
            break
        doc_ids.append(doc_index)

    for doc_index in doc_ids:
        print(original_documents[doc_index])

In [None]:
# Run the same query through both retrieval methods so their outputs can be compared
search_vec_embeddings('EPFL', aggfunc = 'mean')
print("---------------------------------")
search_vec_sklearn("EPFL")

In [None]:
def read_document_by_docid(docid, file_path='actual_data/corpus.json/corpus.json'):
    """Look up a single document in the corpus file by its docid.

    Parameters:
        docid: value to match against each document's 'docid' key.
        file_path: path to a JSON file containing a list of document objects.

    Returns:
        The first document whose 'docid' equals `docid`, or None if there is none.

    The whole file is parsed on every call.
    """
    # Explicit encoding so the result does not depend on the platform default
    with open(file_path, 'r', encoding='utf-8') as f:
        documents = json.load(f)

    return next((doc for doc in documents if doc['docid'] == docid), None)

docid_to_find = 'doc-en-792955'
document = read_document_by_docid(docid_to_find)

# A missing (or empty) result is reported as not found; otherwise show the document
if not document:
    print(f"No document found with docid {docid_to_find}")
else:
    print(f"Document with docid {docid_to_find}:")
    print(document)

### Question
You will realize that not all the words in your queries are in the vocabulary, so your queries fail to retrieve any documents. Think of possible solutions to overcome this problem.