# 1. Training Unsupervised SentenceTransformer

In [1]:
import os
# Set KMP_DUPLICATE_LIB_OK before any numerical library is imported in the cells below.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort seen when
# several packages bundle their own OpenMP library — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from tqdm import trange

from sentence_transformers import SentenceTransformer, models, InputExample, losses
from sentence_transformers import evaluation
from sentence_transformers.util import cos_sim

from typing import List, Tuple, Dict, Set, Callable

from datasets import Dataset
import torch
from torch.utils.data import DataLoader
from torch import Tensor
import heapq

import warnings
warnings.filterwarnings('ignore')

# Custom libraries
from utils.unsupervised_utils import generate_topic_model_input, generate_content_model_input, read_data
from utils.utils import read_config

In [3]:
# Set tokenizer / transformers flags for this kernel through environment variables.
# NOTE(review): this cell runs after the library imports above — confirm both libraries
# read these variables lazily, otherwise move this cell before the imports.
%env TOKENIZERS_PARALLELISM=true
%env TRANSFORMERS_NO_ADVISORY_WARNINGS=false

env: TOKENIZERS_PARALLELISM=true


In [4]:
# Project configuration; later cells read the "unsupervised_model" section
# (batch_size, base_name, save_name, seq_len, epochs, warmup_ratio).
config = read_config()

In [5]:
# Directory handed to read_data() as `data_path`; relative to the notebook's working directory.
DATA_PATH = "../raw_data/"

In [6]:
# Load the training split: read_data returns (topics, content, correlations).
topics, content, correlations = read_data(data_path=DATA_PATH,
                                          config_obj=config,
                                          read_mode="train")

# Prefix every column so topic and content columns stay distinct after the merges
# (e.g. "id" -> "topic_id" / "content_id"). Rebinding instead of `inplace=True`
# leaves the frames returned by read_data untouched.
topics = topics.rename(columns=lambda x: "topic_" + x)
content = content.rename(columns=lambda x: "content_" + x)

# One row per (topic_id, content_id) pair. `correlations` keeps the list-valued
# "content_id" column because the corpus cell further down explodes it again.
correlations = correlations.assign(content_id=correlations["content_ids"].str.split(" "))
corr = correlations.explode("content_id").drop(columns=["content_ids"])

# Left joins: pairs whose topic (or content) is not in this split get a missing text.
corr = corr.merge(topics, how="left", on="topic_id")
corr = corr.merge(content, how="left", on="content_id")

# Pack each pair as [topic_text, content_text] and round-trip through a Dataset.
corr["set"] = corr[["topic_model_input", "content_model_input"]].values.tolist()
train_df = pd.DataFrame(corr["set"])

dataset = Dataset.from_pandas(train_df)

train_examples = []
train_data = dataset["set"]
n_examples = dataset.num_rows

for i in range(n_examples):
    example = train_data[i]
    # Skip pairs with a missing side: str(None) would otherwise turn into the
    # literal text "None" and be trained on as a positive pair.
    if example[0] is None or example[1] is None:
        continue
    train_examples.append(InputExample(texts=[str(example[0]), str(example[1])]))

100%|█████████████████████████████████████████| 171/171 [00:07<00:00, 23.26it/s]


 
--------------------------------------------------
topics.shape: (55266, 4)
content.shape: (154047, 3)
correlations.shape: (61517, 2)


## Setting up the Evaluation

In [7]:
# Load the held-out split used for retrieval evaluation.
test_topics, test_content, test_correlations = read_data(
    data_path=DATA_PATH,
    config_obj=config,
    read_mode="test",
)

# Keep only correlations whose topic belongs to the test topics, and turn the
# space-separated content ids of each topic into a set.
_is_test_topic = test_correlations["topic_id"].isin(test_topics["id"])
test_correlations = (
    test_correlations.loc[_is_test_topic]
    .assign(content_id=lambda frame: frame["content_ids"].str.split(" ").apply(set))
    .loc[:, ["topic_id", "content_id"]]
    .reset_index(drop=True)
)

100%|█████████████████████████████████████████| 171/171 [00:07<00:00, 23.13it/s]


 
--------------------------------------------------
topics.shape: (6251, 4)
content.shape: (154047, 3)
correlations.shape: (61517, 2)


In [8]:
# qid => set of relevant content ids (the evaluator's `relevant_docs` argument).
ir_relevant_docs = dict(
    tqdm(zip(test_correlations["topic_id"], test_correlations["content_id"]))
)

0it [00:00, ?it/s]

In [9]:
# One row per distinct test topic id, joined with that topic's model input text.
unq_test_topics = (
    test_correlations.explode("topic_id")[["topic_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(test_topics[["id", "model_input"]], how="left", left_on="topic_id", right_on="id")
    # Keyword form: the positional axis argument (`.drop("id", 1)`) is rejected by pandas >= 2.0.
    .drop(columns="id")
)

# qid => query text (the evaluator's `queries` argument).
ir_queries = dict(
    tqdm(zip(unq_test_topics["topic_id"], unq_test_topics["model_input"]),
         total=len(unq_test_topics))
)

0it [00:00, ?it/s]

In [10]:
# Load the full data set so that every content item has its model input text.
all_topics, all_content, _ = read_data(data_path=DATA_PATH,
                                       config_obj=config,
                                       read_mode="all")

# One row per distinct content id referenced by `correlations`, joined with its text.
# Relies on `correlations["content_id"]` holding the split id lists created in the
# training-data cell above.
unq_contents = (
    correlations.explode("content_id")[["content_id"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(all_content[["id", "model_input"]], how="left", left_on="content_id", right_on="id")
    # Keyword form: the positional axis argument (`.drop("id", 1)`) is rejected by pandas >= 2.0.
    .drop(columns="id")
)

# cid => document text (the evaluator's `corpus` argument).
ir_corpus = dict(
    tqdm(zip(unq_contents["content_id"], unq_contents["model_input"]),
         total=len(unq_contents))
)

100%|█████████████████████████████████████████| 171/171 [00:07<00:00, 22.96it/s]


 
--------------------------------------------------
topics.shape: (76972, 4)
content.shape: (154047, 3)


0it [00:00, ?it/s]

In [11]:
class InformationRetrievalEvaluator(evaluation.SentenceEvaluator):
    """
    Evaluates a model in an Information Retrieval (IR) setting.

    Given a set of queries and a corpus, the evaluator retrieves the top-k most
    similar corpus entries for each query and reports Precision@k and Recall@k for
    every configured score function.

    NOTE: ``mrr_at_k``, ``ndcg_at_k``, ``accuracy_at_k`` and ``map_at_k`` are accepted
    so the constructor signature stays unchanged, but they are ignored: MRR, NDCG,
    accuracy and MAP are not computed by this class.
    """

    def __init__(self,
                 queries: Dict[str, str],  #qid => query
                 corpus: Dict[str, str],  #cid => doc
                 relevant_docs: Dict[str, Set[str]],  #qid => Set[cid]
                 corpus_chunk_size: int = 50000,
                 mrr_at_k: List[int] = [10],
                 ndcg_at_k: List[int] = [10],
                 accuracy_at_k: List[int] = [1, 3, 5, 10],
                 precision_recall_at_k: List[int] = [1, 3, 5, 10],
                 map_at_k: List[int] = [100],
                 show_progress_bar: bool = False,
                 batch_size: int = 32,
                 name: str = '',
                 write_csv: bool = True,
                 score_functions: Dict[str, Callable[[Tensor, Tensor], Tensor]] = None,       #Score function, higher=more similar
                 main_score_function: str = None
                 ):
        """
        Args:
            queries: Query id => query text. Queries without any relevant document
                in ``relevant_docs`` are dropped.
            corpus: Corpus id => document text.
            relevant_docs: Query id => set of relevant corpus ids.
            corpus_chunk_size: Number of corpus entries encoded and scored per chunk.
            mrr_at_k, ndcg_at_k, accuracy_at_k, map_at_k: Ignored (see class docstring).
            precision_recall_at_k: Cut-offs k for Precision@k / Recall@k.
            show_progress_bar: Show progress bars for query encoding and corpus chunks.
            batch_size: Batch size passed to ``model.encode``.
            name: Optional suffix for the CSV file name.
            write_csv: Whether ``__call__`` appends its results to a CSV file.
            score_functions: Name => function scoring (queries, documents), higher
                meaning more similar. Defaults to ``{'cos_sim': cos_sim}``.
            main_score_function: Name of the score function whose recall is returned
                by ``__call__``; if None, the best recall over all functions is returned.

        Raises:
            ValueError: If ``precision_recall_at_k`` is empty.
        """
        if not precision_recall_at_k:
            raise ValueError("precision_recall_at_k must contain at least one value of k")

        # A None default avoids sharing one mutable dict between all instances.
        if score_functions is None:
            score_functions = {'cos_sim': cos_sim}

        # Only queries with at least one relevant document can be scored.
        self.queries_ids = [
            qid for qid in queries
            if qid in relevant_docs and len(relevant_docs[qid]) > 0
        ]
        self.queries = [queries[qid] for qid in self.queries_ids]

        self.corpus_ids = list(corpus.keys())
        self.corpus = [corpus[cid] for cid in self.corpus_ids]

        self.relevant_docs = relevant_docs
        self.corpus_chunk_size = corpus_chunk_size
        # Copy so the instance never aliases the (mutable) default list.
        self.precision_recall_at_k = list(precision_recall_at_k)

        self.show_progress_bar = show_progress_bar
        self.batch_size = batch_size
        self.name = name
        self.write_csv = write_csv
        self.score_functions = dict(score_functions)
        self.score_function_names = sorted(self.score_functions)
        self.main_score_function = main_score_function

        if name:
            name = "_" + name

        self.csv_file: str = "Information-Retrieval_evaluation" + name + "_results.csv"
        self.csv_headers = ["epoch", "steps"]

        for score_name in self.score_function_names:
            for k in self.precision_recall_at_k:
                self.csv_headers.append("{}-Precision@{}".format(score_name, k))
                self.csv_headers.append("{}-Recall@{}".format(score_name, k))

    def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1, *args, **kwargs) -> float:
        """
        Run the evaluation, optionally append the results to the CSV file, and
        return Recall@max(k) as the score.

        Args:
            model: Model used to encode the queries (and the corpus, unless a
                ``corpus_model`` / ``corpus_embeddings`` is forwarded via kwargs).
            output_path: Directory of the CSV file; nothing is written when None.
            epoch: Value written to the "epoch" CSV column.
            steps: Value written to the "steps" CSV column.

        Returns:
            Recall at the largest configured k for ``main_score_function``, or the
            maximum of that recall over all score functions when it is None.
        """
        scores = self.compute_metrices(model, *args, **kwargs)
        max_k = max(self.precision_recall_at_k)

        # Write results to disc
        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            is_new_file = not os.path.isfile(csv_path)

            # Build the row before opening the file so a failure here cannot leak a handle.
            output_data = [epoch, steps]
            for name in self.score_function_names:
                for k in self.precision_recall_at_k:
                    output_data.append(scores[name]['precision@k'][k])
                    output_data.append(scores[name]['recall@k'][k])

            with open(csv_path, mode="w" if is_new_file else "a", encoding="utf-8") as fOut:
                if is_new_file:
                    fOut.write(",".join(self.csv_headers))
                    fOut.write("\n")
                fOut.write(",".join(map(str, output_data)))
                fOut.write("\n")

        if self.main_score_function is None:
            return max(scores[name]['recall@k'][max_k] for name in self.score_function_names)
        return scores[self.main_score_function]['recall@k'][max_k]

    def compute_metrices(self, model, corpus_model = None, corpus_embeddings: Tensor = None) -> Dict[str, Dict[str, Dict[int, float]]]:
        """
        Encode queries and corpus, keep the top max(k) hits per query and score them.

        Args:
            model: Model used to encode the queries.
            corpus_model: Model used to encode the corpus; defaults to ``model``.
            corpus_embeddings: Pre-computed corpus embeddings, row-aligned with the
                corpus passed to the constructor. When given, the corpus is not encoded.

        Returns:
            Score function name => result of ``compute_metrics`` for that function.
        """
        if corpus_model is None:
            corpus_model = model

        max_k = max(self.precision_recall_at_k)

        # Compute embedding for the queries
        query_embeddings = model.encode(self.queries, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_tensor=True)
        num_queries = len(query_embeddings)

        # Per score function and query: a min-heap of (score, corpus_id) with at most max_k entries.
        queries_result_list = {
            name: [[] for _ in range(num_queries)] for name in self.score_functions
        }

        #Iterate over chunks of the corpus
        for corpus_start_idx in trange(0, len(self.corpus), self.corpus_chunk_size, desc='Corpus Chunks', disable=not self.show_progress_bar):
            corpus_end_idx = min(corpus_start_idx + self.corpus_chunk_size, len(self.corpus))

            #Encode chunk of corpus
            if corpus_embeddings is None:
                sub_corpus_embeddings = corpus_model.encode(self.corpus[corpus_start_idx:corpus_end_idx], show_progress_bar=False, batch_size=self.batch_size, convert_to_tensor=True)
            else:
                sub_corpus_embeddings = corpus_embeddings[corpus_start_idx:corpus_end_idx]

            #Compute similarities
            for name, score_function in self.score_functions.items():
                pair_scores = score_function(query_embeddings, sub_corpus_embeddings)

                #Get top-k values; shape[1] (chunk size) is also valid when there are no queries
                top_k = min(max_k, pair_scores.shape[1])
                pair_scores_top_k_values, pair_scores_top_k_idx = torch.topk(pair_scores, top_k, dim=1, largest=True, sorted=False)
                pair_scores_top_k_values = pair_scores_top_k_values.cpu().tolist()
                pair_scores_top_k_idx = pair_scores_top_k_idx.cpu().tolist()

                for query_itr in range(num_queries):
                    heap = queries_result_list[name][query_itr]
                    for sub_corpus_id, score in zip(pair_scores_top_k_idx[query_itr], pair_scores_top_k_values[query_itr]):
                        corpus_id = self.corpus_ids[corpus_start_idx+sub_corpus_id]
                        if len(heap) < max_k:
                            heapq.heappush(heap, (score, corpus_id))  # heapq orders by the first tuple element (the score)
                        else:
                            # Heap is full: push the new hit and drop the current lowest score.
                            heapq.heappushpop(heap, (score, corpus_id))

        # Convert the (score, corpus_id) tuples to the dict form compute_metrics expects.
        for name in queries_result_list:
            queries_result_list[name] = [
                [{'corpus_id': corpus_id, 'score': score} for score, corpus_id in hits]
                for hits in queries_result_list[name]
            ]

        #Compute scores
        scores = {name: self.compute_metrics(queries_result_list[name]) for name in self.score_functions}

        #Output
        for name in self.score_function_names:
            self.output_scores(scores[name])

        return scores


    def compute_metrics(self, queries_result_list: List[object]):
        """
        Compute mean Precision@k and Recall@k over all queries.

        Args:
            queries_result_list: One list per query (aligned with ``self.queries_ids``)
                of ``{'corpus_id': ..., 'score': ...}`` hits, in any order.

        Returns:
            ``{'precision@k': {k: mean}, 'recall@k': {k: mean}}`` for every k in
            ``self.precision_recall_at_k``.
        """
        # Init score computation values
        precisions_at_k = {k: [] for k in self.precision_recall_at_k}
        recall_at_k = {k: [] for k in self.precision_recall_at_k}

        # Compute scores on results
        for query_itr in range(len(queries_result_list)):
            query_id = self.queries_ids[query_itr]

            # Sort scores, best hit first
            top_hits = sorted(queries_result_list[query_itr], key=lambda x: x['score'], reverse=True)
            query_relevant_docs = self.relevant_docs[query_id]

            # Precision and Recall@k
            for k_val in self.precision_recall_at_k:
                num_correct = sum(
                    1 for hit in top_hits[0:k_val] if hit['corpus_id'] in query_relevant_docs
                )

                precisions_at_k[k_val].append(num_correct / k_val)
                # Never divides by zero: __init__ only keeps queries with >= 1 relevant doc.
                recall_at_k[k_val].append(num_correct / len(query_relevant_docs))

        for k in precisions_at_k:
            precisions_at_k[k] = np.mean(precisions_at_k[k])

        for k in recall_at_k:
            recall_at_k[k] = np.mean(recall_at_k[k])

        return {'precision@k': precisions_at_k, 'recall@k': recall_at_k}


    def output_scores(self, scores):
        """Hook called once per score function with its metrics; intentionally does nothing."""
        return


    @staticmethod
    def compute_dcg_at_k(relevances, k):
        """
        Discounted cumulative gain of the first ``k`` entries of ``relevances``.

        Not called by any other method of this class.
        """
        dcg = 0
        for i in range(min(len(relevances), k)):
            dcg += relevances[i] / np.log2(i + 2)  #+2 as we start our idx at 0
        return dcg


In [12]:
# Retrieval evaluator over the held-out topics; model.fit uses its return value
# (recall at the largest k below, for cos_sim) as the validation score.
evaluator = InformationRetrievalEvaluator(
    queries=ir_queries,
    corpus=ir_corpus,
    relevant_docs=ir_relevant_docs,
    precision_recall_at_k=[5, 10, 25, 50, 100],
    main_score_function="cos_sim",
    name='K12-local-test-unsupervised',
    show_progress_bar=True,
)

### Training

In [13]:
# Shuffled batches of (topic, content) training pairs.
train_dataloader = DataLoader(
    train_examples,
    batch_size=config["unsupervised_model"]["batch_size"],
    shuffle=True,
)

In [14]:
# Checkpoint to start from and location where the fine-tuned model is saved.
# NOTE(review): the recorded output shows both resolving to the same directory, so
# the saved model would be written over the base checkpoint path — confirm this is intended.
TARGET_MODEL, OUT_MODEL = (
    config["unsupervised_model"]["base_name"],
    config["unsupervised_model"]["save_name"],
)
TARGET_MODEL, OUT_MODEL

('trained_models/unsupervised/paraphrase-multilingual-mpnet-base-v2',
 'trained_models/unsupervised/paraphrase-multilingual-mpnet-base-v2')

In [15]:
# Load the base checkpoint and set the maximum sequence length from the config.
model = SentenceTransformer(TARGET_MODEL)
model.max_seq_length = config["unsupervised_model"]["seq_len"]

# First module of the SentenceTransformer pipeline; it exposes the tokenizer and the
# underlying transformer (`auto_model`). Note that `_first_module` is a private method.
word_embedding_model = model._first_module()

# Field-marker tokens registered as special tokens.
# NOTE(review): presumably these are the markers emitted by generate_topic_model_input /
# generate_content_model_input — confirm the two lists stay in sync.
tokens = [
    "[TOPIC_LANG]", "[TOPIC_LVL]", "[TOPIC_TREE]", "[TOPIC_TITLE]", "[TOPIC_DESC]",
    "[CNTNT_LANG]", "[CNTNT_KIND]", "[CNTNT_TITLE]", "[CNTNT_DESC]", "[CNTNT_TEXT]"
]

# Add the tokens, then resize the embedding matrix to the new tokenizer length.
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

In [16]:
# Loss over (topic, content) pairs built by the training-data cell.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

# Warm-up length = warmup_ratio * number of batches in ONE epoch (len(train_dataloader)),
# not of the whole multi-epoch run.
num_epochs = config["unsupervised_model"]["epochs"]
warmup_steps = int(len(train_dataloader) * config["unsupervised_model"]["warmup_ratio"]) 

In [17]:
# Fine-tune the model on the training pairs with mixed precision (use_amp=True).
# The evaluator and save_best_model=True are passed together with output_path=OUT_MODEL.
# NOTE(review): with evaluation_steps left at its default, evaluation presumably runs only
# at the end of each epoch — confirm against the sentence-transformers `fit` documentation.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
#           evaluation_steps=400,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=OUT_MODEL,
          save_best_model=True,
         use_amp=True)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3981 [00:00<?, ?it/s]

Batches:   0%|          | 0/196 [00:00<?, ?it/s]


Corpus Chunks:   0%|                                      | 0/4 [00:00<?, ?it/s][A
Corpus Chunks:  25%|███████▌                      | 1/4 [00:23<01:09, 23.13s/it][A
Corpus Chunks:  50%|███████████████               | 2/4 [00:46<00:46, 23.39s/it][A
Corpus Chunks:  75%|██████████████████████▌       | 3/4 [01:10<00:23, 23.66s/it][A
Corpus Chunks: 100%|██████████████████████████████| 4/4 [01:12<00:00, 18.23s/it][A


Iteration:   0%|          | 0/3981 [00:00<?, ?it/s]


KeyboardInterrupt



In [None]:
# Histogram of whitespace-separated token counts of the topic model inputs
# (a rough guide for choosing the model's max sequence length).
topics.topic_model_input.str.split().apply(len).hist()

In [None]:
# Histogram of whitespace-separated token counts of the content model inputs.
content.content_model_input.str.split().apply(len).hist()

In [None]:
# Spot-check: display the model input text of the content row at position 5.
content.content_model_input.iloc[5]