# Sentence Transformers + Cross Encoders
## 2-Stage Inference Pipeline (Long Trained)

In [1]:
import os

# Set before the numeric/GPU libraries are imported in the cells below.
# NOTE(review): presumably works around a duplicate OpenMP runtime abort — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

In [2]:
!cp "/kaggle/input/sentence-transformers-lib-for-2023-01-23/sentence-transformers-2.2.2.piplib" "sentence-transformers-2.2.2.tar.gz"
!pip install sentence-transformers-2.2.2.tar.gz --no-dependencies -qqq

In [2]:
# =========================================================================================
# Libraries
# =========================================================================================
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import json

from cuml.neighbors import NearestNeighbors
import cupy as cp

from sentence_transformers import models, losses
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from torch.utils.data import DataLoader
import torch

%env TOKENIZERS_PARALLELISM=true
%env TRANSFORMERS_NO_ADVISORY_WARNINGS=true

env: TOKENIZERS_PARALLELISM=true


In [3]:
def get_neighbors(topic_df,
                  content_df,
                  config_obj):
    """
    Retrieve candidate content ids for every topic via an embedding nearest-neighbour search.

    The "model_input" texts of both frames are embedded with the SentenceTransformer stored at
    config_obj["unsupervised_model"]["save_name"]. A cosine NearestNeighbors index is fitted on
    the content embeddings and queried with the topic embeddings.

    :param topic_df: Topic dataframe with a "model_input" column. Modified in place: a
        "predictions" column holding space-separated content ids is added.
    :param content_df: Content dataframe with "id" and "model_input" columns.
    :param config_obj: Config dict; ["unsupervised_model"]["save_name"] and
        ["unsupervised_model"]["top_n"] are read.
    :return: Tuple (topic_df, content_df).
    """
    # Create unsupervised model to extract embeddings
    model = SentenceTransformer(config_obj["unsupervised_model"]["save_name"])
    model = model.to("cuda")

    # Predict. Plain lists are passed so encoding does not rely on the frames
    # carrying a default 0..n-1 integer index.
    topics_preds = model.encode(topic_df["model_input"].tolist(),
                                show_progress_bar=True,
                                convert_to_tensor=True)
    topics_preds_gpu = cp.asarray(topics_preds)

    content_preds = model.encode(content_df["model_input"].tolist(),
                                 show_progress_bar=True,
                                 convert_to_tensor=True,
                                 batch_size=100)
    content_preds_gpu = cp.asarray(content_preds)

    # Release memory
    torch.cuda.empty_cache()
    gc.collect()

    # KNN model
    print(' ')
    print('Training KNN model...')
    neighbors_model = NearestNeighbors(n_neighbors=config_obj["unsupervised_model"]["top_n"],
                                       metric='cosine')
    neighbors_model.fit(content_preds_gpu)
    indices = neighbors_model.kneighbors(topics_preds_gpu, return_distance=False)

    # The neighbour indices are row positions in the fitted content matrix, so the ids are
    # looked up positionally (not by index label). One device-to-host copy serves all rows.
    neighbor_positions = cp.asnumpy(indices)
    content_id_values = content_df['id'].to_numpy()
    predictions = [' '.join(content_id_values[row]) for row in tqdm(neighbor_positions)]
    topic_df['predictions'] = predictions

    # Release memory
    del topics_preds, content_preds, topics_preds_gpu, content_preds_gpu, neighbors_model
    del predictions, indices, neighbor_positions, content_id_values, model
    gc.collect()
    torch.cuda.empty_cache()
    gc.collect()

    return topic_df, content_df



def build_training_set(topic_df,
                       content_df,
                       mode="local"):
    """
    Expand each topic's retrieved candidates into one row per (topic, content) pair.

    :param topic_df: Topic dataframe with "id", "model_input" and "predictions" columns, where
        "predictions" is a whitespace-separated string of content ids. In "local" mode a
        "content_ids" column (whitespace-separated ground-truth ids) is read as well.
    :param content_df: Content dataframe indexed by content id, with a "model_input" column.
    :param mode: "local" adds a binary "target" column (1 when the candidate is in the topic's
        ground truth, else 0); any other value produces no "target" column.
    :return: Dataframe with columns "topics_ids", "content_ids", "model_input1",
        "model_input2" and, in "local" mode, "target".
    :raises KeyError: if a predicted content id is not in content_df's index.
    """
    with_targets = mode == "local"

    # One id -> text mapping up front instead of a label lookup per candidate pair
    content_inputs = content_df['model_input'].to_dict()

    if with_targets:
        truth_column = topic_df['content_ids'].tolist()
    else:
        truth_column = [None] * len(topic_df)

    # Create lists for training
    topics_ids = []
    content_ids = []
    title1 = []
    title2 = []
    targets = []

    topic_rows = zip(topic_df['id'].tolist(),
                     topic_df['model_input'].tolist(),
                     topic_df['predictions'].tolist(),
                     truth_column)
    # Iterate over each topic
    for topics_id, topics_title, raw_predictions, raw_truth in tqdm(topic_rows, total=len(topic_df)):
        # split() without a separator yields [] for an empty string, so a topic without
        # candidates contributes no rows instead of failing on the id ''
        predictions = raw_predictions.split() if isinstance(raw_predictions, str) else []
        if with_targets:
            # A set gives constant-time membership tests; a missing value means no ground truth
            ground_truth = set(raw_truth.split()) if isinstance(raw_truth, str) else set()

        for pred in predictions:
            title2.append(content_inputs[pred])
            topics_ids.append(topics_id)
            content_ids.append(pred)
            title1.append(topics_title)
            # If pred is in ground truth, 1 else 0
            if with_targets:
                targets.append(int(pred in ground_truth))

    # Build training dataset
    train = pd.DataFrame(
        {'topics_ids': topics_ids,
         'content_ids': content_ids,
         'model_input1': title1,
         'model_input2': title2
         }
    )
    if with_targets:
        train["target"] = targets

    return train

def _sort_by_title_length(frame):
    """Sort ``frame`` in place by the character length of "title" and reset its index to 0..n-1."""
    frame['length'] = frame['title'].apply(len)
    frame.sort_values('length', inplace=True)
    frame.drop(['length'], axis=1, inplace=True)
    frame.reset_index(drop=True, inplace=True)
    return frame


def read_data(data_path,
              config_obj,
              read_mode="all"):
    """
    Load the topic and content CSVs and prepare both frames for embedding.

    Topic breadcrumbs are computed on the full topic table, merged onto the topics, and a
    "model_input" column is generated for both frames. Both frames are then sorted by title
    length and re-indexed from 0.

    :param data_path: Directory holding topics.csv, content.csv and (when read_mode is not
        "all") correlations.csv. A trailing path separator is optional.
    :param config_obj: Config dict; ["unsupervised_model"]["seq_len"] is forwarded to the
        model-input generators.
    :param read_mode: "all" keeps every topic and returns no correlations. Any other value is
        treated as a fold label: correlations.csv is loaded and the topics are restricted to
        the ids carrying that fold in "train_test_splits.csv", which is read from the current
        working directory (not from data_path).
    :return: Tuple (topics, content, correlations); correlations is None when read_mode is "all".
    """
    topics = pd.read_csv(os.path.join(data_path, 'topics.csv'))
    content = pd.read_csv(os.path.join(data_path, 'content.csv'))
    if read_mode != "all":
        correlations = pd.read_csv(os.path.join(data_path, 'correlations.csv'))
    else:
        correlations = None
    # Built before any fold filtering, from the complete topic table
    topic_trees = generate_topic_tree(topics)

    if read_mode != "all":
        splits = pd.read_csv("train_test_splits.csv")
        topics = topics[topics.id.isin(splits[splits.fold == read_mode].id)].reset_index(drop=True)

    topics = topics.merge(topic_trees, how="left", on="id")
    del topic_trees
    gc.collect()

    generate_topic_model_input(input_df=topics,
                               seq_len=config_obj["unsupervised_model"]["seq_len"])
    generate_content_model_input(input_df=content,
                                 seq_len=config_obj["unsupervised_model"]["seq_len"])

    # Sort by title length to make inference faster
    topics = _sort_by_title_length(topics)
    content = _sort_by_title_length(content)

    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")
    if read_mode != "all":
        print(f"correlations.shape: {correlations.shape}")

    return topics, content, correlations


def generate_topic_model_input(input_df,
                               seq_len=128):
    """
    Add a lower-cased "model_input" text column to the topic dataframe, in place.

    The text is the tagged concatenation of language, title, topic tree and description.
    The topic level is not part of the text and no truncation is applied.

    :param input_df: Input topic dataframe. Mutated in place: NaNs are replaced by "",
        "model_input" is added, and the description, channel, category, level, parent and
        has_content columns are dropped.
    :param seq_len: Accepted for signature compatibility; not used by this function.
    :return: None
    """

    input_df.fillna("", inplace=True)

    tagged_fields = (
        (" [<[topic_title]>] ", "title"),
        (" [<[topic_tree]>] ", "topic_tree"),
        (" [<[topic_desc]>] ", "description"),
    )
    text = "[<[topic_lang]>] " + input_df["language"].astype(str)
    for tag, column in tagged_fields:
        text = text + tag + input_df[column].astype(str)
    input_df["model_input"] = text.str.lower()

    unused_columns = ['description', 'channel', 'category',
                      'level', 'parent', 'has_content']
    input_df.drop(columns=unused_columns, inplace=True)
    gc.collect()


def generate_content_model_input(input_df,
                                 seq_len=128,
                                 max_words=512):
    """
    Add a lower-cased "model_input" text column to the content dataframe, in place.

    The text is the tagged concatenation of language, kind, title, description and text. It is
    split on whitespace, cut to the first ``max_words`` tokens (tag markers count as tokens),
    re-joined with single spaces and lower-cased.

    :param input_df: Input content dataframe. Mutated in place: NaNs are replaced by "",
        "model_input" is added, and the description, kind, text, copyright_holder and license
        columns are dropped.
    :param seq_len: Accepted for signature compatibility; not used by this function.
    :param max_words: Maximum number of whitespace-separated tokens kept. Defaults to 512,
        the value that was previously hard-coded.
    :return: None
    """

    input_df.fillna("", inplace=True)

    combined = ("[<[cntnt_lang]>] " + input_df["language"].astype(str) +
                " [<[cntnt_kind]>] " + input_df["kind"].astype(str) +
                " [<[cntnt_title]>] " + input_df["title"].astype(str) +
                " [<[cntnt_desc]>] " + input_df["description"].astype(str) +
                " [<[cntnt_text]>] " + input_df["text"].astype(str))
    input_df["model_input"] = combined.apply(lambda x: " ".join(x.split()[:max_words])).str.lower()

    input_df.drop(['description', 'kind', 'text', 'copyright_holder', 'license'],
                  axis=1,
                  inplace=True)
    gc.collect()


def generate_topic_tree(input_topic_df):
    """
    Build the breadcrumb string ("root title > ... > own title") for every topic.

    Channels are processed independently. Within a channel, level-0 topics are seeded with
    their own title; then, level by level, each topic's breadcrumb is extended to the topics
    one level deeper by joining child "parent" to parent "id".

    :param input_topic_df: Topic dataframe with "id", "title", "parent", "level" and "channel"
        columns. It is not modified.
    :return: Dataframe with columns "id" and "topic_tree" (one row per input topic, grouped by
        channel). Topics that no level-wise join reaches keep a missing "topic_tree". An input
        without rows yields an empty frame with the same two columns.
    """
    channel_frames = []

    for channel in tqdm(input_topic_df['channel'].unique()):
        channel_df = input_topic_df[(input_topic_df['channel'] == channel)].reset_index(drop=True)
        for level in sorted(channel_df.level.unique()):
            # For level 0, it first creates a topic tree column which is the title of that topic.
            if level == 0:
                root_rows = channel_df[channel_df['level'] == level]
                # Built from two 1-D arrays, the same way as the per-level frame below
                topic_tree_df = pd.DataFrame({'child_id': root_rows['id'].values,
                                              'topic_tree': root_rows['title'].astype(str).values})
                channel_df = channel_df.merge(topic_tree_df, left_on='id', right_on='child_id', how='left').drop(
                    ['child_id'], axis=1)

            # Once the topic tree column has been created, the parent node and child node is merged on parent_id = child_id
            topic_df_parent = channel_df[channel_df['level'] == level][['id', 'title', 'parent', 'topic_tree']]
            topic_df_parent.columns = 'parent_' + topic_df_parent.columns

            topic_df_child = channel_df[channel_df['level'] == level + 1][['id', 'title', 'parent', 'topic_tree']]
            topic_df_child.columns = 'child_' + topic_df_child.columns

            topic_df_merged = topic_df_parent.merge(topic_df_child, left_on='parent_id', right_on='child_parent')[
                ['child_id', 'parent_id', 'parent_title', 'child_title', 'parent_topic_tree']]

            # Topic tree is parent topic tree + title of the current child on that level
            topic_tree = topic_df_merged['parent_topic_tree'].astype(str) + ' > ' + topic_df_merged[
                'child_title'].astype(str)

            topic_tree_df = pd.DataFrame({'child_id': topic_df_merged['child_id'].values,
                                          'topic_tree': topic_tree.values})

            channel_df = channel_df.merge(topic_tree_df, left_on='id', right_on='child_id', how='left').drop(
                ['child_id'], axis=1)
            # The merge suffixes the existing and the new breadcrumb columns; keep the existing
            # value where present and fill the rest from the newly computed one.
            if 'topic_tree_y' in list(channel_df.columns):
                channel_df['topic_tree'] = channel_df['topic_tree_x'].combine_first(channel_df['topic_tree_y'])
                channel_df = channel_df.drop(['topic_tree_x', 'topic_tree_y'], axis=1)

        channel_frames.append(channel_df)

    if not channel_frames:
        return pd.DataFrame(columns=["id", "topic_tree"])
    # Single concatenation instead of growing a frame inside the loop
    return pd.concat(channel_frames, ignore_index=True)[["id", "topic_tree"]]

In [4]:
config = dict(
    # Stage 1 (retrieve): embedding model plus the number of neighbours fetched per topic.
    # "seed" is not referenced by the cells shown in this notebook.
    unsupervised_model=dict(
        save_name="trained_models/unsupervised/paraphrase-multilingual-mpnet-base-v2",
        seq_len=128,
        top_n=50,
        seed=42,
    ),
    # Stage 2 (re-rank): cross-encoder, its max input length and the score cut-off
    # applied when building the submission.
    supervised_model=dict(
        save_name="trained_models/supervised/paraphrase-multilingual-mpnet-base-v2",
        threshold=0.0475,
        seq_len=128,
    ),
)

# Directory containing the competition CSV files (keep the trailing slash)
DATA_PATH = "../raw_data/"

In [5]:
# The sample submission's topic_id column lists the topics that need predictions
submission_df = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
submission_df.head(n=5)

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231


## 1. Retrieve

In [6]:
# Load every topic and content row; with read_mode="all" no correlations are read (None)
topics, content, correlations = read_data(DATA_PATH, config, read_mode="all")

  0%|          | 0/171 [00:00<?, ?it/s]

 
--------------------------------------------------
topics.shape: (76972, 5)
content.shape: (154047, 4)


In [7]:
# Keep only the topics listed in the sample submission
topics = topics.loc[topics["id"].isin(submission_df["topic_id"])].reset_index(drop=True)

In [8]:
# Stage 1: attach the nearest content ids to each topic as a "predictions" column
topics, content = get_neighbors(topics, content, config)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1541 [00:00<?, ?it/s]

 
Training KNN model...


  0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# Set id as index for content so candidate ids can be looked up by label.
# Guarded so that re-running this cell is a no-op instead of a KeyError.
if 'id' in content.columns:
    content = content.set_index('id')

In [10]:
# One row per (topic, candidate content) pair; mode="kaggle" produces no target column
df = build_training_set(topics, content, mode="kaggle")

  0%|          | 0/5 [00:00<?, ?it/s]

In [11]:
df.head(n=5)

Unnamed: 0,topics_ids,content_ids,model_input1,model_input2
0,t_00069b63a70a,c_11a1dc0bfb99,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...
1,t_00069b63a70a,c_2593347819cc,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...
2,t_00069b63a70a,c_27c76064baeb,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...
3,t_00069b63a70a,c_7861f30ff74e,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...
4,t_00069b63a70a,c_71d6bae3f656,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...


In [12]:
# The pair frame now carries all texts; drop the stage-1 frames before loading the cross-encoder
del content, topics

## 2. Re-Rank

In [13]:
# Stage 2 model: single-output cross-encoder, inputs capped at the configured length
model = CrossEncoder(
    config["supervised_model"]["save_name"],
    max_length=config["supervised_model"]["seq_len"],
    num_labels=1,
)

In [14]:
# Score every (topic text, content text) pair with the cross-encoder
preds = model.predict(
    df[["model_input1", "model_input2"]].to_numpy(),
    batch_size=96,
    show_progress_bar=True,
)

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

In [15]:
# Attach the cross-encoder scores to their pairs (preds is in the same row order as df)
df["pred_score"] = preds

In [16]:
df.head(n=5)

Unnamed: 0,topics_ids,content_ids,model_input1,model_input2,pred_score
0,t_00069b63a70a,c_11a1dc0bfb99,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...,0.999826
1,t_00069b63a70a,c_2593347819cc,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...,5e-05
2,t_00069b63a70a,c_27c76064baeb,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...,5e-05
3,t_00069b63a70a,c_7861f30ff74e,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...,4.8e-05
4,t_00069b63a70a,c_71d6bae3f656,[<[topic_lang]>] en [<[topic_title]>] transcri...,[<[cntnt_lang]>] en [<[cntnt_kind]>] document ...,4.6e-05


In [17]:
# Rank all pairs by score, best first, so each topic's ids are joined in descending score order
sorted_pred_df = df.sort_values(by="pred_score", ascending=False).reset_index(drop=True)

# Keep pairs scoring at or above the threshold and join the surviving ids per topic.
# Topics with no pair above the threshold do not appear in preds_thr_df.
preds_thr_df = (
    sorted_pred_df
    .loc[sorted_pred_df.pred_score >= config["supervised_model"]["threshold"], ["topics_ids", "content_ids"]]
    .groupby("topics_ids")["content_ids"]
    .apply(" ".join)
    .rename("pred_content_ids")
    .reset_index()
)

## Create Submission

In [18]:
# Left merge keeps every submission topic; topics without surviving predictions get a
# missing content_ids value. The sample's own content_ids column is replaced by the predictions.
submission_df = (
    submission_df
    .merge(preds_thr_df, how="left", left_on="topic_id", right_on="topics_ids")
    .loc[:, ["topic_id", "pred_content_ids"]]
    .rename(columns={"pred_content_ids": "content_ids"})
)

In [19]:
# Display the final submission frame (topic_id, content_ids)
submission_df

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_5bc0e1e2cba0 c_376c5a8eb028 c...
1,t_00068291e9a4,c_ac1672cdcd2c c_89ce9367be10 c_ebb7fdf10a7e c...
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_d7a0d7eaf799 c_5e375cf14c47 c_1c57a1316568 c...
4,t_4054df11a74e,c_3695c5dc1df6 c_f2d184a98231


In [20]:
# Write the submission file to the working directory, without the dataframe index
submission_df.to_csv("submission.csv", index=False)