**The reference:**
* https://huggingface.co/blog/how-to-train-sentence-transformers
* https://www.kaggle.com/code/andtaichi/finetunig-sentencetransformer
* https://www.kaggle.com/code/quincyqiang/download-huggingface-pretrain-for-kaggle/notebook
* https://towardsdatascience.com/easy-kaggle-offline-submission-with-chaining-kernels-30bba5ea5c4d
* https://www.kaggle.com/c/severstal-steel-defect-detection/discussion/109679
* https://www.kaggle.com/code/jamiealexandre/sample-notebook-data-exploration/notebook

The pretrained model we use:
https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2

**The highlights:**
1. Join the correlation with topic and content.
2. Concatenate "topic_title" and "topic_description" for each topic, then concatenate the text of all ancestor topics into the "topic_full" field.
3. Concatenate 'content_title', 'content_description' and "content_text" for each content into the "content_full" field.
4. Feed "topic_full" and "content_full" as an embedding pair into the model and fine-tune the model.
5. Using the fine-tuned model, generate the embeddings for "topic_full" and "content_full", and put them into the embedding dataset.
6. Create the faiss index by calling add_faiss_index.
7. For each topic that needs to be predicted, search the embedding dataset with the faiss index using the topic's "topic_full" text as the query, find the nearest K entries (current value is 20), then filter the result.
8. Further filter the result by calculating the cosine similarity (cosine_sim) between each retrieved entry and the topic_full embedding, keeping only entries whose similarity is at least Cosine_Cutoff (current value is 0.99995).
9. Output the result.

In [None]:
# !pip install sentence-transformers
# !pip install faiss-gpu
# !pip install faiss-cpu
# !pip install tqdm
# !pip install nvidia-ml-py3
# !pip install accelerate
# !pip install scikit-learn

In [None]:
# !curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh |  bash
# !apt-get install -y --allow-unauthenticated git-lfs

In [None]:
# !git lfs install
# !git clone https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2
# # if you want to clone without large files – just their pointers
# # prepend your git clone with the following env var:
# !GIT_LFS_SKIP_SMUDGE=1

In [None]:
# !pip install transformers --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/transformers-4.26.1-py3-none-any.whl --no-index
# !pip install datasets --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/datasets-2.10.1-py3-none-any.whl --no-index
!pip install sentence-transformers --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/sentence-transformers-2.2.2.tar.gz --no-index
!pip install faiss-gpu  --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/faiss_gpu-1.7.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --no-index
!pip install faiss-cpu  --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/faiss_cpu-1.7.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl  --no-index
!pip install tqdm --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/tqdm-4.64.1-py2.py3-none-any.whl --no-index
!pip install nvidia-ml-py3 --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/nvidia-ml-py3-7.352.0.tar.gz --no-index
!pip install accelerate --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/accelerate-0.16.0-py3-none-any.whl --no-index
!pip install scikit-learn --find-links /kaggle/input/using-fine-tuned-sentencetransformer-env/scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --no-index

In [None]:
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import torch 
import transformers
import datasets
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoConfig, AutoTokenizer
from sentence_transformers import SentenceTransformer, models, InputExample, losses, util
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets, IterableDataset
from tqdm.auto import tqdm

In [None]:
# Report the installed library versions so a run can be matched to the
# offline wheels installed above.
print(f"transformers.__version__: {transformers.__version__}")
print(f"datasets.__version__: {datasets.__version__}")

In [None]:
# Turn on debug-level logging for transformers and switch off the datasets
# progress bars.
transformers.logging.set_verbosity_debug()
datasets.disable_progress_bar()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is read by the model-loading and embedding cells further down.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Additional info when using cuda: device name plus allocated/reserved memory
# of device 0, converted from bytes to GB and rounded to one decimal.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

In [None]:
# model = SentenceTransformer("/kaggle/input/using-fine-tuned-sentencetransformer-env/paraphrase-multilingual-mpnet-base-v2")

In [None]:
# !ls -l

In [None]:
# Pipeline switches. Each flag enables one stage in the cells below.
# Refresh_Topic:      rebuild the ancestor-expanded topics frame; when False it
#                     is read back from Topic_Full_Data_File.
# Refresh_Train_Data: rebuild the joined training table; when False it is read
#                     back from Train_Data_File.
# Train_model:        run fine-tuning; the tuned model is loaded from
#                     Tuned_Model_File afterwards either way.
# Build_Embedding:    regenerate the embedding dataset; it is loaded from
#                     Embeddings_File afterwards either way.
# Calculate_Score:    only referenced by the commented-out evaluation code.
# Cal_Submission:     compute and write the submission.
Refresh_Topic = True
Refresh_Train_Data = True
Train_model = True
Build_Embedding = True 
Calculate_Score = True 
Cal_Submission = True
# Minimum cosine similarity a retrieved entry needs to be kept.
Cosine_Cutoff = 0.99995
# Number of nearest neighbours requested from the FAISS index per query.
Nearest_K = 20
# Version tag embedded in every generated file name below.
dver = 508

# Kaggle settings
input_folder = "/kaggle/input"
output_folder = "/kaggle/working"
model_output_folder = "/kaggle/working"
model_input_folder = "/kaggle/input/using-fine-tuned-sentencetransformer-env"

# Local settings (uncomment to run outside Kaggle)
# input_folder = "./data/input"
# output_folder = "./data/output"
# model_output_folder = "./model/output"
# model_input_folder = "./model/input"

# Intermediate artefacts written under output_folder.
Topic_Full_Data_File = f"{output_folder}/df_topics_full_{dver}.csv"
Train_Data_File= f"{output_folder}/df_train_v{dver}.csv"
Embeddings_File = f'{output_folder}/embeddings_topics_dataset_v{dver}'

# Final submission plus the base (input) and fine-tuned (output) model paths.
Submission_File = f"{output_folder}/submission_v{dver}.csv"
Base_Model_File = f"{model_input_folder}/paraphrase-multilingual-mpnet-base-v2"
Tuned_Model_File = f"{model_output_folder}/paraphrase-multilingual-mpnet-base-v{dver}-tuned"

print(f"Topic_Full_Data_File {Topic_Full_Data_File}")
print(f"Train_Data_File {Train_Data_File}")
print(f"Embeddings_File {Embeddings_File}")
print(f"Submission_File {Submission_File}")

In [None]:
# pd.set_option('display.max_columns', 9)
# pd.set_option('display.max_rows', 200)
# pd.set_option('display.min_rows', 10)
# pd.set_option("expand_frame_repr", True)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', 50)
# DATA_PATH = "/kaggle/input/learning-equality-curriculum-recommendations/"
# Competition data folder (note the trailing slash: file names are appended
# with plain string concatenation).
DATA_PATH = f"{input_folder}/learning-equality-curriculum-recommendations/"
# Raw tables: topics, content items, and the topic -> content_ids correlations.
topics = pd.read_csv(DATA_PATH + "topics.csv")
content = pd.read_csv(DATA_PATH + "content.csv")
correlations = pd.read_csv(DATA_PATH + "correlations.csv")
# sample_submission = pd.read_csv(DATA_PATH + "sample_submission.csv")

print(f"DATA_PATH {DATA_PATH}")

In [None]:
# Placeholder; the topic-refresh cell assigns the expanded topics frame.
df_topics = None

# Prefix every column of both tables ("id" -> "topic_id" / "content_id").
# The check on the first topics column keeps a re-run of this cell from
# prefixing the names a second time.
if not topics.columns[0].startswith("topic_"):
    print("renaming topics ...")
    topics.rename(columns="topic_{}".format, inplace=True)
    content.rename(columns="content_{}".format, inplace=True)

In [None]:
def get_topic_full(row):
    """Build the text block for one topic from its title and description.

    Args:
        row: Mapping-like record (for example a DataFrame row) exposing
            "topic_title" and "topic_description".

    Returns:
        "title: <title>", followed by a CR/LF and "description: <description>"
        when the description is present and non-empty. Missing (NaN/None)
        values are treated as empty strings.
    """
    title, description = (
        "" if pd.isna(row[key]) else str(row[key])
        for key in ("topic_title", "topic_description")
    )
    parts = ["title: " + title]
    if description:
        parts.append("description: " + description)
    return "\r\n".join(parts)

def get_parents(df, row):
    """Collect the titles and full texts of a topic and all of its ancestors.

    Walks up the tree through "topic_parent" until a root (null parent) is
    reached, prepending each ancestor so the results read root-first.

    Args:
        df: DataFrame of topics with "topic_id", "topic_parent",
            "topic_title" and "topic_description" columns.
        row: The topic record to start from.

    Returns:
        Tuple ``(topic_title_full, topic_full)``: the ancestor titles joined
        with "." and the ``get_topic_full`` texts joined with CR/LF, each
        ending with the topic's own entry.
    """
    topic_title_full = str(row["topic_title"]) if pd.notna(row["topic_title"]) else ""
    topic_full = get_topic_full(row)
    topic_parent = row["topic_parent"]
    # Ids already walked; guards against a cyclic parent chain.
    visited = {row["topic_id"]}

    while not pd.isnull(topic_parent):
        if topic_parent in visited:
            break
        visited.add(topic_parent)

        matches = df.loc[df["topic_id"] == topic_parent]
        if matches.empty:
            # Parent id not present in df: stop instead of looping forever.
            break
        parent = matches.iloc[0]

        topic_full = get_topic_full(parent) + "\r\n" + topic_full
        # The title must come from the ancestor record, not from the
        # starting row, otherwise the chain just repeats the leaf title.
        parent_title = str(parent["topic_title"]) if pd.notna(parent["topic_title"]) else ""
        topic_title_full = parent_title + "." + topic_title_full
        topic_parent = parent["topic_parent"]

    return topic_title_full, topic_full


def refresh_topic(topics):
    """Add ancestor-expanded text columns to the topics frame and save it.

    For every topic, ``get_parents`` supplies the dotted title chain and the
    concatenated full text; both are stored as the new columns
    "topic_title_full" and "topic_full". The frame is modified in place,
    written to ``Topic_Full_Data_File`` and returned.

    Args:
        topics: DataFrame of topics (columns already prefixed with "topic_").

    Returns:
        The same DataFrame object, with the two extra columns.
    """
    df_topics = topics
    print("Before expand topic full ...")
    print(df_topics.head(100))

    expanded = []
    for index, row in df_topics.iterrows():
        expanded.append(get_parents(df_topics, row))
        # Periodic progress output.
        if index % 10000 == 0:
            print(f"processing df_topics: \n{index}, {row}")

    df_topics['topic_title_full'] = [title_chain for title_chain, _ in expanded]
    df_topics['topic_full'] = [full_text for _, full_text in expanded]

    df_topics.to_csv(Topic_Full_Data_File)
    print(f"Finished processing df_tpocs, and saved to {Topic_Full_Data_File}")
    return df_topics

# Either reuse the expanded topics saved by an earlier run or rebuild them.
if not Refresh_Topic:
    print(f"Load df_topcis from {Topic_Full_Data_File}")
    df_topics = pd.read_csv(Topic_Full_Data_File)
else:
    print("Freshing topic...")
    df_topics = refresh_topic(topics)

In [None]:
# Show the first 100 expanded topics and the number of distinct values per
# column as a quick sanity check.
print(f"After expand topic full ...")
print(df_topics.head(100))
print(f"df_topics value counet for each columns: \n{df_topics.nunique()}")

Load train data by combining correlation table with topic and content tables.

In [None]:
def load_train_data(topics):
    """Join correlations with topics and content into the training table.

    Each row of the module-level ``correlations`` frame is expanded to one
    row per content id, then joined with the topic and content records. The
    result is written to ``Train_Data_File`` and returned.

    Args:
        topics: Topics DataFrame. It is used when it already carries the
            "topic_title_full" and "topic_full" columns; otherwise the
            module-level ``df_topics`` (the expanded frame) is used, which
            is what this function always read before.

    Returns:
        DataFrame with the topic/content text columns plus "content_full",
        the non-empty parts of content title, description and text joined
        with CR/LF.
    """
    train_df_columns = [
        "topic_title", "content_title", "topic_title_full", "topic_full",
        "topic_id", "content_id", "content_description", "content_text",
    ]

    expanded_cols = {"topic_title_full", "topic_full"}
    topics_full = topics if expanded_cols.issubset(topics.columns) else df_topics

    # assign() works on a copy, so the shared correlations frame is untouched.
    corr = (
        correlations.assign(content_id=correlations["content_ids"].str.split(" "))
        .explode("content_id")
        .drop(columns=["content_ids"])
    )
    corr = corr.merge(topics_full, how="left", on="topic_id")
    corr = corr.merge(content, how="left", on="content_id")

    final_train_data = corr[train_df_columns].copy()

    # Missing fields are skipped; stringifying them directly would put the
    # literal text "nan" into the training pairs.
    text_cols = ['content_title', 'content_description', "content_text"]
    content_parts = final_train_data[text_cols].fillna("").astype(str)
    final_train_data["content_full"] = [
        "\r\n".join(part for part in parts if part)
        for parts in content_parts.itertuples(index=False, name=None)
    ]

    print(f"final_train_data value counet for each columns: \n{final_train_data.nunique()}")

    final_train_data.to_csv(Train_Data_File)

    return final_train_data

In [None]:
# Either reuse the training table saved by an earlier run or rebuild it.
if not Refresh_Train_Data:
    print(f"load final_train_data from {Train_Data_File}")
    final_train_data = pd.read_csv(Train_Data_File)
else:
    print("Refresh_Train_Data ==>>>")
    final_train_data = load_train_data(topics)

In [None]:
# Load the saved training table and hold out 20% of the rows; only the
# "train" part is turned into fine-tuning pairs here.
dataset_dict = load_dataset("csv", data_files=Train_Data_File)
print(dataset_dict)

dataset = dataset_dict["train"]

dataset = dataset.train_test_split(test_size=0.2)

train_dataset = dataset["train"]

# Keep only the two text columns that form a training pair. The columns to
# drop are exactly the existing ones not in the keep list (a symmetric
# difference would also try to drop keep-list names missing from the data).
columns_to_keep = ["topic_full", "content_full"]
keeped_columns = columns_to_keep
columns = train_dataset.column_names
columns_to_remove = [name for name in columns if name not in columns_to_keep]
train_dataset = train_dataset.remove_columns(columns_to_remove)

# train_dataset = train_dataset.shuffle(seed=42).select(range(10000))

train_examples = []
eval_examples = []

n_examples = train_dataset.num_rows

for dt in train_dataset:
    first = dt["topic_full"]
    second = dt["content_full"]
    # Empty CSV cells are loaded as None; str(None) would otherwise turn
    # them into the text "None" and train on it.
    if first is None or second is None:
        continue
    first = str(first)
    second = str(second)
    if first != "" and second != "":
        train_examples.append(InputExample(texts=[first, second]))

print(f"train_examples: {len(train_examples)}")

In [None]:
def train_model(num_epochs=1, batch_size=32, warmup_ratio=0.1):
    """Fine-tune the base sentence-transformer on the prepared text pairs.

    Loads the model from ``Base_Model_File``, trains it on the module-level
    ``train_examples`` with ``MultipleNegativesRankingLoss`` and saves the
    result to ``Tuned_Model_File``.

    Args:
        num_epochs: Number of passes over the training pairs.
        batch_size: Pairs per training batch.
        warmup_ratio: Fraction of all training steps used for warmup.

    Returns:
        None. The tuned model is written to disk.
    """
    print(f"load base model from {Base_Model_File}")
    model = SentenceTransformer(Base_Model_File)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
    train_loss = losses.MultipleNegativesRankingLoss(model=model)
    warmup_steps = int(len(train_dataloader) * num_epochs * warmup_ratio)

    print(f"model  to {device}")
    model.to(device)
    # Free memory left over from earlier cells before training starts.
    gc.collect()
    torch.cuda.empty_cache()

    print("start model fine tune")
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=num_epochs,
        warmup_steps=warmup_steps,
    )

    gc.collect()
    torch.cuda.empty_cache()

    model.save(Tuned_Model_File)
    print(f"model saved to {Tuned_Model_File}")

In [None]:
# Fine-tune only when requested; the next cell loads the tuned model from
# Tuned_Model_File, so a model saved there is needed when this is skipped.
if Train_model:
    train_model()

In [None]:
# Load the tokenizer and the model weights saved at Tuned_Model_File through
# the plain transformers Auto* classes.
tokenizer = AutoTokenizer.from_pretrained(Tuned_Model_File)
trained_model = AutoModel.from_pretrained(Tuned_Model_File)

# Move the model to the device selected earlier (GPU when available).
trained_model.to(device)

In [None]:
def cls_pooling(model_output):
    """Return the hidden state at position 0 of every sequence.

    Args:
        model_output: Object exposing ``last_hidden_state``, indexed as
            (sequence, position, feature).

    Returns:
        ``last_hidden_state[:, 0]``: one vector per input sequence.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

def get_embeddings(text_list):
    """Embed one text or a list of texts with the fine-tuned model.

    Args:
        text_list: A string or list of strings; tokenized with padding and
            truncation.

    Returns:
        Tensor on ``device`` with one row per input text, taken from
        ``cls_pooling`` of the model output.

    NOTE(review): fine-tuning goes through SentenceTransformer, whose pooling
    for this base model is presumably mean pooling, while inference here uses
    the position-0 vector. Confirm the two are meant to differ.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {name: tensor.to(device) for name, tensor in encoded_input.items()}
    # Inference only: skip autograd bookkeeping so no graph is kept alive for
    # every embedded text.
    with torch.no_grad():
        model_output = trained_model(**encoded_input)
    return cls_pooling(model_output)

In [None]:
# build embedding based on topic_full and content_full, we also need keep topic_id and content_id.
from datasets import Dataset
def embeddings_gen(row):
    """Build one embedding record for a training row.

    Args:
        row: Mapping with "topic_full" and "content_id".

    Returns:
        Dict with "embeddings" (the vector for the row's topic_full text) and
        "content_id" (as a string). The record is returned rather than pushed
        onto a module-level list so the helper has no hidden dependencies.
    """
    return {
        "embeddings": get_embeddings(str(row["topic_full"])).detach().cpu().numpy()[0],
        "content_id": str(row["content_id"]),
    }



def build_embedding():
    """Embed every training row and save the result as a dataset on disk.

    Streams ``Train_Data_File`` and, for each row, stores two entries that
    both carry the row's content_id: the embedding of "topic_full" and the
    embedding of "content_full". The collected entries are saved to
    ``Embeddings_File``.

    Returns:
        None. The embedding dataset is written to disk.
    """
    dataset_dict = load_dataset("csv", data_files=Train_Data_File, streaming=True)
    print(dataset_dict)
    dataset = dataset_dict["train"]

    def embed(text):
        # Single text in, single vector out.
        return get_embeddings(text).detach().cpu().numpy()[0]

    embedding_data = {"content_id": [], "embeddings": []}
    # Consecutive rows usually share a topic (one row per correlated
    # content), so the previous topic embedding is reused when the text
    # repeats instead of being recomputed.
    last_topic_text = None
    last_topic_embedding = None

    for count, x in enumerate(dataset, start=1):
        if count % 10000 == 0:
            print(f"{count} {x['content_id']}")

        topic_text = str(x["topic_full"])
        if topic_text != last_topic_text:
            last_topic_text = topic_text
            last_topic_embedding = embed(topic_text)

        embedding_data["content_id"].append(x["content_id"])
        embedding_data["embeddings"].append(last_topic_embedding)
        embedding_data["content_id"].append(x["content_id"])
        embedding_data["embeddings"].append(embed(str(x["content_full"])))

    embedding_dataset = Dataset.from_dict(embedding_data)

    print(f"embedding_dataset => {embedding_dataset}")
    embedding_dataset.save_to_disk(Embeddings_File)

In [None]:
print(transformers.__version__)
print(datasets.__version__)
# Regenerate the embeddings only when requested; otherwise the dataset saved
# by an earlier run is reused.
if Build_Embedding:
    print(f"Build Embedding .... from {Train_Data_File}")
    build_embedding()

# Load the saved embeddings and attach a FAISS index on the "embeddings"
# column so nearest-neighbour lookups can be run against it.
embedding_dataset = load_from_disk(Embeddings_File)

embedding_dataset.add_faiss_index(column="embeddings")
print(f"finishing loading embedding_dataset")
print(embedding_dataset)

In [None]:
def get_cosine_sim(ebd, text_embedding):
    """Return the cosine similarity of two embeddings as a Python float.

    Args:
        ebd: Stored embedding of one retrieved entry.
        text_embedding: Embedding of the query text.
    """
    return util.pytorch_cos_sim(text_embedding, ebd).item()

def get_score_topic(text, cosine_cutoff):
    """Retrieve the nearest stored embeddings for a text and filter them.

    Args:
        text: Query text; embedded with ``get_embeddings``.
        cosine_cutoff: Minimum cosine similarity an entry needs to be kept.

    Returns:
        DataFrame of the retrieved entries ("content_id", "embeddings") with
        the added columns "scores" (as returned by the index lookup) and
        "cosine_sim", sorted by "scores" ascending and restricted to rows
        whose cosine_sim is at least ``cosine_cutoff``.
    """
    # The embedding dataset only has two fields: "embeddings" and "content_id".
    text_embedding = get_embeddings(text).cpu().detach().numpy()
    scores, samples = embedding_dataset.get_nearest_examples(
        "embeddings", text_embedding, k=Nearest_K
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    # NOTE(review): ascending order presumably means smaller score = closer
    # (distance-style index metric); confirm against the index settings.
    samples_df = samples_df.sort_values("scores", ascending=True)

    # A list built per row also works when nothing was retrieved, where a
    # row-wise apply would not yield a column.
    samples_df["cosine_sim"] = [
        get_cosine_sim(embedding, text_embedding)
        for embedding in samples_df["embeddings"]
    ]
    return samples_df[samples_df["cosine_sim"] >= cosine_cutoff]

def get_score_topic_json(text, cosine_cutoff):
    """Return the filtered neighbours of a text as a list of row dicts.

    Args:
        text: Query text passed on to ``get_score_topic``.
        cosine_cutoff: Minimum cosine similarity passed on unchanged.

    Returns:
        One dict per remaining row ("records" orientation).
    """
    return get_score_topic(text, cosine_cutoff).to_dict('records')

In [None]:
def calculate_row(row):
    """Fill one evaluation row with its retrieved neighbours.

    Looks up the neighbours of the row's "topic_full" text using
    ``Cosine_Cutoff`` and stores three space-separated strings on the row:
    "scores" and "cosine_sim" (ten decimals each) and "content_ids".

    Args:
        row: Mutable mapping with a "topic_full" entry.

    Returns:
        The same row, updated in place.
    """
    matches = get_score_topic_json(str(row["topic_full"]), Cosine_Cutoff)

    row["scores"] = " ".join(format(m["scores"], ".10f") for m in matches)
    row["content_ids"] = " ".join(f"{m['content_id']}" for m in matches)
    row["cosine_sim"] = " ".join(format(m["cosine_sim"], ".10f") for m in matches)
    return row

def calculate_score(eval_sampled, res_file):
    """Score a sample of rows and write the compact result to CSV.

    Every row is passed through ``calculate_row``; afterwards only
    "topic_id", "scores", "cosine_sim" and "content_ids" are kept and the
    dataset is written to ``res_file``.

    Args:
        eval_sampled: Dataset whose rows carry "topic_id" and "topic_full".
        res_file: Path of the CSV file to write.

    Returns:
        None.
    """
    num_rows = eval_sampled.num_rows
    # Placeholder columns are strings because calculate_row fills all three
    # with space-separated text.
    for column_name in ("scores", "content_ids", "cosine_sim"):
        eval_sampled = eval_sampled.add_column(column_name, [""] * num_rows)

    print("calculating scores mapping ...")
    eval_sampled = eval_sampled.map(calculate_row)

    # Drop everything outside the keep list. This runs after the mapping
    # because calculate_row still needs "topic_full" as input.
    keeped_columns = ["topic_id", "scores", "cosine_sim", "content_ids"]
    columns_to_removed = [
        name for name in eval_sampled.column_names if name not in keeped_columns
    ]
    eval_sampled = eval_sampled.remove_columns(columns_to_removed)

    print(f"result_df value counet for each columns: \n{eval_sampled.shape}")
    eval_sampled.to_csv(res_file)
    

# Result_file = f"./data/result_v{dver}.csv"
# if Calculate_Score:
#     eval_sampled =  dataset["test"].shuffle(seed=42).select(range(2000))
#     calculate_score(eval_sampled, Result_file)
#     Result_file = f"./data/result_train_v{dver}.csv"
#     eval_sampled =  dataset["train"].shuffle(seed=42).select(range(2000))
#     calculate_score(eval_sampled, Result_file)

# calc_df = pd.read_csv(Result_file) 
# print(f"load result file {Result_file}")
# print(calc_df.head(100))

In [None]:
def get_neighbors(topic_id, df_topics):
    """Return the predicted content ids for one topic as a single string.

    Args:
        topic_id: Id of the topic to predict for.
        df_topics: Topics DataFrame with "topic_id" and "topic_full".

    Returns:
        Space-separated content ids, nearest first and without repeats, or
        an empty string when the topic is unknown or has no text.
    """
    subset = df_topics.loc[df_topics['topic_id'] == topic_id]
    if subset.empty:
        return ""

    text = subset.iloc[0]["topic_full"]
    if text is None:
        return ""

    res_dict = get_score_topic_json(text, Cosine_Cutoff)

    # The embedding dataset stores the same content id under several
    # entries, so the same id can be retrieved more than once. dict.fromkeys
    # removes repeats while keeping the retrieval order.
    unique_ids = dict.fromkeys(f"{d['content_id']}" for d in res_dict)
    return " ".join(unique_ids)


def calculate_submission(submission_df):
    """Fill in the predicted content ids and write the submission files.

    Args:
        submission_df: DataFrame with a "topic_id" column; its "content_ids"
            column is (over)written in place.

    Returns:
        None. Writes "submission.csv" and ``Submission_File`` under
        ``output_folder``.
    """
    submission_df['content_ids'] = [
        get_neighbors(topic_id, df_topics) for topic_id in submission_df["topic_id"]
    ]

    # Written to the output folder: the input data folder is not a place for
    # results (and is not writable on Kaggle). index=False keeps the file to
    # the two columns of the sample submission.
    submission_df.to_csv(f"{output_folder}/submission.csv", index=False)
    submission_df.to_csv(Submission_File, index=False)

    print("display submission head")
    print(submission_df.head(100))

In [None]:
# Build the submission from the topics listed in the sample submission file.
if Cal_Submission:
    print("Calculating submission ")
    submission_df = pd.read_csv(f"{DATA_PATH}sample_submission.csv")
    calculate_submission(submission_df)

# from sklearn.metrics import fbeta_score
# fbeta_score(y_true, y_pred, average='macro', beta=2)