# Simple test
Find the most similar terms for a given term. The similarity between two terms is defined as the cosine similarity between their corresponding word embeddings.

In [None]:
# Load model vocabulary and embeddings 
import fasttext
import numpy as np
import gc

# Load the small fastText model; only its vocabulary and vectors are used in this section
model = fasttext.load_model("models/fasttext_unsupervised_cbow_dim100_mini.bin")

# Row i of word_embeddings is the vector obtained by indexing the model with vocabulary[i]
vocabulary = model.words
word_embeddings = np.array([model[term] for term in vocabulary])

# The model object itself is no longer needed: drop it and trigger a collection
del model
gc.collect()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(input_term, word_embeddings, vocabulary, num_terms=5):
    """Return the terms whose embeddings are most cosine-similar to ``input_term``.

    All similarities are computed in one vectorized pass instead of one
    library call per vocabulary term.

    Args:
        input_term: Term to look up in ``vocabulary``.
        word_embeddings: 2D array-like; row ``i`` is the embedding of
            ``vocabulary[i]``.
        vocabulary: Sequence of terms aligned with the rows of
            ``word_embeddings`` (assumed to contain each term once).
        num_terms: Maximum number of results to return.

    Returns:
        A list of ``[term, similarity]`` pairs sorted by decreasing similarity,
        where ``similarity`` is a plain float. The input term itself takes part
        in the ranking. If ``input_term`` is not in ``vocabulary`` the string
        ``"Term not in the vocabulary"`` is returned instead.
    """
    terms = list(vocabulary)
    try:
        term_index = terms.index(input_term)
    except ValueError:
        return "Term not in the vocabulary"

    embeddings = np.asarray(word_embeddings)
    input_term_embedding = embeddings[term_index]

    norms = np.linalg.norm(embeddings, axis=1)
    # A zero vector has no direction: give it norm 1 so its similarity is 0 instead of NaN
    norms[norms == 0] = 1.0

    # cosine(a, b) = a.b / (|a| * |b|), evaluated for every row at once
    similarities = (embeddings @ input_term_embedding) / (norms * norms[term_index])

    # Stable sort on the negated scores: decreasing similarity, ties keep vocabulary order
    ranked = np.argsort(-similarities, kind="stable")[0:num_terms]
    return [[terms[i], float(similarities[i])] for i in ranked]
    

# Example lookup: ask for the 5 entries most similar to 'ireland' in the loaded embeddings
find_most_similar('ireland', word_embeddings, vocabulary, num_terms=5)

In [None]:
# Release the vocabulary and embeddings of the test model, then trigger a collection
del vocabulary, word_embeddings
gc.collect()

# Inference with a given query

## Load the model, corpus data and the aggregated vectors for each document

In [1]:
# Base name of the model: used to build the model file path and the aggregated document vectors file path
model_name = "fasttext_unsupervised_cbow_dim300_preprocessing"

In [None]:
# Load model vocabulary and embeddings 
import fasttext
import numpy as np
import gc

# Load the fastText model selected by model_name
model = fasttext.load_model(f"models/{model_name}.bin")

# One embedding row per vocabulary term, in vocabulary order
vocabulary = model.words
word_embeddings = np.array([model[term] for term in vocabulary])

# Term -> embedding lookup table used when aggregating queries
vector_dict = dict(zip(vocabulary, word_embeddings))

# Drop the temporary names; `model` stays loaded because it is still needed
# to build vectors for out-of-vocabulary tokens
del vocabulary, word_embeddings
gc.collect()

In [None]:
# Load the aggregated vectors for each document from disk
import pickle

# Pickle file holding the aggregated document vectors computed for this model
aggregated_docs_vectors_file = f'aggregated_docs_vectors/adv_{model_name}.pkl'

# NOTE: unpickling can run arbitrary code; only load files produced by this project
with open(aggregated_docs_vectors_file, 'rb') as pickle_file:
    aggregated_docs_vectors = pickle.load(pickle_file)
print("Loaded aggregated_docs_vectors from disk.")

In [4]:
# Load dict to match document index and corresponding docid
import json
# JSON object keys are always strings, so cast them back to int while reading the mapping
with open('doc_index_to_docid.json', 'r') as mapping_file:
    doc_index_to_docid = {int(doc_index): docid for doc_index, docid in json.load(mapping_file).items()}

## Aggregate the query
Aggregate the query and find the most similar documents using the cosine similarity between the query's vector and each document's aggregated vector.

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

def aggregate_vector_list(vlist, aggfunc):
    """Collapse a list of equally sized vectors into a single vector.

    ``aggfunc`` selects the element-wise reduction applied across the vectors:
    ``'max'``, ``'min'`` or ``'mean'``. Any other value yields a zero vector
    with one component per vector dimension.
    """
    stacked = np.array(vlist)  # one row per input vector
    if aggfunc in ('max', 'min', 'mean'):
        # The accepted names match the ndarray reduction methods, so dispatch by name
        return getattr(stacked, aggfunc)(axis=0)
    return np.zeros(stacked.shape[1])

def aggregate_query(query, aggfunc):
    """Turn a query string into one vector by aggregating its token vectors.

    The query is split with ``fasttext.tokenize``. A token present in
    ``vector_dict`` uses the stored vector; any other token is reported on
    stdout and its vector is obtained from ``model.get_word_vector``.

    Args:
        query: Query text (preprocessing, if any, is the caller's job).
        aggfunc: Aggregation name forwarded to ``aggregate_vector_list``.

    Returns:
        The aggregated vector produced by ``aggregate_vector_list``.

    Raises:
        ValueError: If the query yields no token, so there is nothing to
            aggregate.
    """
    tokens = fasttext.tokenize(query)
    if not tokens:
        # Fail early with a clear message: aggregating an empty list would give
        # a NaN vector ('mean') or an obscure reduction error ('max'/'min')
        raise ValueError("Cannot aggregate an empty query: it contains no tokens")

    vlist = []
    for token in tokens:
        if token in vector_dict:
            vlist.append(vector_dict[token])
        else:
            print(f"{token} is not in the vocabulary")
            # use n-grams of word to obtain a vector for this out-of-vocabulary word
            vlist.append(model.get_word_vector(token))

    return aggregate_vector_list(vlist, aggfunc)


In [6]:
def get_most_similar_documents(query_vector, aggfunc, k = 5):
    """Rank every document by cosine similarity to ``query_vector``.

    Args:
        query_vector: Query vector; it is reshaped into a single row.
        aggfunc: Key selecting which entry of ``aggregated_docs_vectors`` the
            query is compared against.
        k: Not used by this function; the full ranking is returned and the
            caller is expected to slice it.

    Returns:
        Document indexes ordered from most to least similar.
    """
    query_row = query_vector.reshape((1, -1))
    scores = cosine_similarity(query_row, aggregated_docs_vectors[aggfunc])

    # argsort is ascending along the last axis: take the only row, then reverse it
    ascending = np.argsort(scores, axis=-1, kind='quicksort', order=None)[0]
    return ascending[::-1]


def search_vec_embeddings(query, topk = 10, aggfunc = 'mean'):
    """Return the docids of the ``topk`` documents most similar to ``query``.

    The query is aggregated with ``aggfunc``, documents are ranked against the
    resulting vector, and the best ``topk`` indexes are translated to docids
    through ``doc_index_to_docid``.
    """
    ranked_indexes = get_most_similar_documents(aggregate_query(query, aggfunc), aggfunc)
    return [doc_index_to_docid[doc_index] for doc_index in ranked_indexes[0:topk]]

In [None]:
from preprocessing import clean_text
# Example: clean one query with clean_text, search with 'mean' aggregation and print the retrieved docids
query = "What is the syntax for the shorthand of the conditional operator in PHP 5.3?"
cleaned_query = clean_text(query) # APPLY PREPROCESSING
docids_retrieved = search_vec_embeddings(query=cleaned_query, aggfunc = 'mean')
print(f"Docids retrieved : {docids_retrieved}")


# Calculate recall on dev set

In [8]:
# Function to calculate recall@10
def calculate_recall_at_k(retrieved_docs, relevant_docs, k=10):
    """Compute recall@k: the share of relevant documents found in the top ``k`` results.

    Args:
        retrieved_docs: Ranked sequence of retrieved document ids.
        relevant_docs: Iterable of document ids considered relevant.
        k: Number of leading retrieved documents taken into account.

    Returns:
        A float between 0 and 1. When ``relevant_docs`` is empty there is
        nothing to recall and 0.0 is returned instead of dividing by zero.
    """
    relevant_set = set(relevant_docs)
    if not relevant_set:
        return 0.0
    hits = relevant_set.intersection(retrieved_docs[:k])
    return len(hits) / len(relevant_set)

In [9]:
import pandas as pd

# Load dev set (relative path); later cells read its 'query', 'lang', 'positive_docs' and 'negative_docs' columns
dev_set_path = '../../data/dev.csv'
dev_set = pd.read_csv(dev_set_path)

In [10]:
# Positive/Negative docs to list
def docs_to_list(docs):
    """Normalize a docs cell to a Python list.

    Args:
        docs: A string holding a list literal such as ``"['id1', 'id2']"``,
            a string holding a single doc id, or any non-string value.

    Returns:
        The parsed list for a bracketed string, a one-element list for any
        other string, and ``docs`` unchanged when it is not a string.

    Raises:
        ValueError, SyntaxError: If a bracketed string is not a valid Python
            literal.
    """
    import ast  # local import: only this function needs it

    if not isinstance(docs, str):
        return docs
    if docs.startswith('[') and docs.endswith(']'):
        # literal_eval only accepts Python literals, so file content cannot
        # execute code the way the previous eval() call allowed
        return ast.literal_eval(docs)
    return [docs]

In [None]:
# OPTIONAL: select only queries whose positive document is in the randomly selected subset

docids_file_path = 'selected_docids.json'

with open(docids_file_path, 'r') as f:
    selected_docids = json.load(f)

# Set for constant-time membership tests in the filter below
# (assumes the docids are hashable values such as strings)
selected_docids_lookup = set(selected_docids)

# Convert the string representation of the doc lists into real Python lists
dev_set['positive_docs'] = dev_set['positive_docs'].apply(docs_to_list)
dev_set['negative_docs'] = dev_set['negative_docs'].apply(docs_to_list)

# Keep only the queries with at least one positive doc in selected_docids
filtered_dev_set = dev_set[dev_set['positive_docs'].apply(lambda docs: any(doc in selected_docids_lookup for doc in docs))]

# Print some examples to check
print(filtered_dev_set.head())

# Replace dev_set by its filtered version; rebinding the name releases the unfiltered frame
dev_set = filtered_dev_set
del filtered_dev_set
gc.collect()

In [None]:
# Evaluate recall@10 for every dev query, overall and grouped by language
recalls = []
lang_recalls = {}
for index, row in dev_set.iterrows():
    query, lang = row['query'], row['lang']
    cleaned_query = clean_text(query) # APPLY PREPROCESSING
    # docs_to_list returns its argument unchanged when it is not a string
    positive_docs = docs_to_list(row['positive_docs'])
    retrieved_docs = search_vec_embeddings(cleaned_query, topk=10, aggfunc='mean')
    recall = calculate_recall_at_k(retrieved_docs, positive_docs, k=10)

    recalls.append(recall)
    # Record the same score under the language of the query
    lang_recalls.setdefault(lang, []).append(recall)

# Overall average
mean_recall_at_10 = np.mean(recalls)
print(f"Mean Recall@10: {mean_recall_at_10:.4f}")

# Average per language
for lang, lang_recall_list in lang_recalls.items():
    mean_lang_recall = np.mean(lang_recall_list)
    print(f"Mean Recall@10 for {lang}: {mean_lang_recall:.4f}")

# MLflow logging

In [None]:
import mlflow

# Use a default run name when model_name has not been defined earlier in the session
try:
    model_name
except NameError:
    model_name = "fasttext_unsupervised_cbow_dim100"

# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Create a new MLflow Experiment
mlflow.set_experiment("word_embedding")

# Start an MLflow run
with mlflow.start_run(run_name=model_name):
    # Fixed description of the training setup
    mlflow.log_param("framework", "fasttext")
    mlflow.log_param("method", "unsupervised")
    mlflow.log_param("model", "cbow")
    mlflow.log_param("input", "full_corpus")

    # NOTE(review): the dimension is hard-coded to 300 and logged as a metric, while the
    # fallback model name above says dim100 - confirm it matches the evaluated model
    mlflow.log_metric("dimension", 300)
    # Overall recall, then one metric per language computed from lang_recalls
    mlflow.log_metric("recall_at10_dev", mean_recall_at_10)
    for lang, lang_recall_list in lang_recalls.items():
        mean_lang_recall = np.mean(lang_recall_list)
        mlflow.log_metric(f"recall_at10_dev_{lang}", mean_lang_recall)


# Predictions on test set

In [50]:
# Load test set (relative path); the prediction loop reads its 'id' and 'query' columns
test_set_path = '../../data/test.csv'
test_set = pd.read_csv(test_set_path)

In [None]:
# Get documents retrieved for each query in test set
predicted_docs = []
for index, row in test_set.iterrows():
    query_id = row['id']
    query = row['query']
    # Clean the query exactly as in the dev evaluation, so that test queries
    # go through the same preprocessing before being embedded
    cleaned_query = clean_text(query) # APPLY PREPROCESSING
    retrieved_docs = search_vec_embeddings(cleaned_query, topk=10, aggfunc='mean')
    predicted_docs.append((query_id, retrieved_docs))

# Create Dataframe with results: one row per query id with its list of retrieved docids
results_df = pd.DataFrame(predicted_docs, columns=['id', 'docids'])

# Save to csv
results_df.to_csv('predicted_docs.csv', index=False)