# Pip

In [None]:
# ref https://www.sbert.net/examples/unsupervised_learning/SimCSE/README.html, https://www.kaggle.com/code/andtaichi/finetunig-sentencetransformer/notebook
!pip install -U sentence-transformers

# Import

In [None]:
import gc
import math
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from datasets import Dataset
from pytorch_lightning import seed_everything
from sentence_transformers import SentenceTransformer, InputExample
from sentence_transformers import models, losses
from torch.utils.data import DataLoader

from tqdm import tqdm
tqdm.pandas()

# Config

In [None]:
# Notebook-wide settings, grouped by concern.
cfg = dict(
    # Paths, seeding and fold selection used by the data-loading and driver cells.
    general=dict(
        input_path="/kaggle/input/learning-equality-curriculum-recommendations",
        output_path="./output",   # models are saved below this directory by train()
        seed=42,                  # passed to seed_everything() before each fold
        fold_list=[0],            # folds to train when cv is True
        cv=True,                  # True: train per fold; False: train on all topics
    ),
    # Hyper-parameters read by train() and reinit_layer().
    bi_encoder=dict(
        model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        max_length=128,           # max_seq_length of the Transformer module
        batch_size=128,           # DataLoader batch size
        gradient_checkpointing=False,
        pooling="mean",           # pooling mode handed to models.Pooling
        warmup_ratio=0.1,         # warm-up steps = ceil(len(dataloader) * ratio)
        epoch=5,
        lr=5e-5,
        weight_decay=1e-2,
        freeze_layers=6,          # values > 0 enable layer freezing in train()
        reinit_layers=1,          # number of final encoder layers re-initialised
    ),
)

# Function

In [None]:
# define some helper functions and classes to aid with data traversal

def print_markdown(md):
    """Render the Markdown source string `md` as rich notebook output.

    NOTE(review): `Markdown` is not imported in the visible part of this
    notebook (it normally comes from `IPython.display`) -- confirm it is in
    scope before calling this helper.
    """
    rendered = Markdown(md)
    display(rendered)

class Topic:
    """Lightweight handle on one row of ``topics_df``, addressed by topic id.

    Every lookup goes to the module-level ``topics_df`` (and, for ``content``,
    ``correlations_df``) at access time; both frames must be indexed by topic
    id. Columns of the topic row (``title``, ``has_content``, ...) are exposed
    as attributes through ``__getattr__``.

    Instances compare equal when their ids are equal and hash on the id, so
    they can be stored in sets and used as dict keys.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the ``parent`` cell is missing."""
        parent_id = topics_df.loc[self.id]["parent"]
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """Ancestor topics ordered from the direct parent up to the root."""
        chain = []
        node = self.parent
        while node is not None:
            chain.append(node)
            node = node.parent
        return chain

    @property
    def siblings(self):
        """Other children of this topic's parent (empty list for a root)."""
        # Resolve the parent once; every access is a DataFrame lookup.
        parent = self.parent
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    @property
    def content(self):
        """``ContentItem`` objects listed for this topic in ``correlations_df``.

        When the topic has no correlations row, an empty tuple is returned if
        the topic row is flagged ``has_content`` and an empty list otherwise.
        """
        if self.id in correlations_df.index:
            return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
        return tuple([]) if self.has_content else []

    def get_breadcrumbs(self, separator=" | ", include_self=True, include_root=True):
        """Join the titles from the root down to this topic with ``separator``.

        Args:
            separator: string placed between consecutive titles.
            include_self: whether this topic's own title ends the trail.
            include_root: whether the top-most title starts the trail.

        Returns:
            The breadcrumb string (empty when nothing is left to join).
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            # The root is the last element because the chain runs child -> root.
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        """Return a nested Markdown bullet list of this topic's subtree.

        Child topics are listed first (recursively), then this topic's content
        items, each indented two spaces per level below ``depth``.
        """
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; hash on the same key
        # that equality uses so equal topics collapse in sets/dicts.
        return hash(self.id)

    def __getattr__(self, name):
        """Fall back to the topic's row in ``topics_df`` for unknown attributes.

        Raises:
            AttributeError: if the topic id or the column does not exist, so
                that ``hasattr``/``getattr(..., default)`` behave normally.
        """
        # `id` missing means __init__ never ran (e.g. during copy/unpickle);
        # looking it up again here would recurse forever. Dunder probes are
        # protocol checks, never column names.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        try:
            return topics_df.loc[self.id][name]
        except KeyError as err:
            raise AttributeError(
                f"{type(self).__name__} {self.id!r} has no attribute {name!r}"
            ) from err

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    """Lightweight handle on one row of ``content_df``, addressed by content id.

    Columns of the content row (``title``, ``kind``, ...) are exposed as
    attributes through ``__getattr__``; lookups hit the module-level
    ``content_df`` (indexed by content id) at access time.

    Instances compare equal when their ids are equal and hash on the id.
    """

    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        """Topics whose ``content_ids`` list in ``correlations_df`` contains this id.

        The id is compared against each whitespace-separated token, so it is
        matched exactly: it is neither treated as a regular expression nor
        allowed to match as a substring of a longer id. Rows with a
        non-string ``content_ids`` cell are skipped.
        """
        return [
            Topic(topic_id)
            for topic_id, ids in zip(correlations_df.index, correlations_df.content_ids)
            if isinstance(ids, str) and self.id in ids.split()
        ]

    def __getattr__(self, name):
        """Fall back to the item's row in ``content_df`` for unknown attributes.

        Raises:
            AttributeError: if the content id or the column does not exist, so
                that ``hasattr``/``getattr(..., default)`` behave normally.
        """
        # `id` missing means __init__ never ran; looking it up again here
        # would recurse forever. Dunder probes are never column names.
        if name == "id" or (name.startswith("__") and name.endswith("__")):
            raise AttributeError(name)
        try:
            return content_df.loc[self.id][name]
        except KeyError as err:
            raise AttributeError(
                f"{type(self).__name__} {self.id!r} has no attribute {name!r}"
            ) from err

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None; hash on the equality key.
        return hash(self.id)

    def get_all_breadcrumbs(self, separator=" | ", include_root=True):
        """Return one breadcrumb string per topic this item is attached to.

        Each entry is the topic's breadcrumb trail followed by this item's
        title; when the topic trail is empty only the title is used.
        """
        breadcrumbs = []
        for topic in self.topics:
            trail = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if trail:
                trail = trail + separator + self.title
            else:
                trail = self.title
            breadcrumbs.append(trail)
        return breadcrumbs

In [None]:
# Load the competition tables. The first CSV column becomes the index of the
# three lookup frames; the submission template and fold file keep a default index.
content_df = pd.read_csv(f"{cfg['general']['input_path']}/content.csv", index_col=0)
correlations_df = pd.read_csv(f"{cfg['general']['input_path']}/correlations.csv", index_col=0)
topics_df = pd.read_csv(f"{cfg['general']['input_path']}/topics.csv", index_col=0)
sub_df = pd.read_csv(f"{cfg['general']['input_path']}/sample_submission.csv")
fold = pd.read_csv("/kaggle/input/lecr-cv-4/fold.csv")

# Replace missing text fields with empty strings so they can be concatenated
# later. The result is assigned back to the column: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# pandas warns about and which has no effect under Copy-on-Write.
# fillna titles
content_df["title"] = content_df["title"].fillna("")
topics_df["title"] = topics_df["title"].fillna("")
# fillna descriptions
content_df["description"] = content_df["description"].fillna("")
topics_df["description"] = topics_df["description"].fillna("")
# fillna text
content_df["text"] = content_df["text"].fillna("")

In [None]:
def get_breadcrumbs(row):
    """Store the breadcrumb trail of the topic identified by ``row.name``.

    The trail is written to ``row["breadcrumbs"]`` and the same (mutated)
    row is returned, which is the shape a row-wise ``apply`` expects.
    """
    row["breadcrumbs"] = Topic(row.name).get_breadcrumbs()
    return row

def get_topic_inputs(row):
    """Build the topic-side model input and store it in ``row["topic_inputs"]``.

    The text is title, breadcrumbs and description joined by " [T_SEP] ".
    The same (mutated) row is returned.
    """
    parts = [row["topic_title"], row["topic_breadcrumbs"], row["topic_description"]]
    row["topic_inputs"] = " [T_SEP] ".join(parts)
    return row

def get_content_inputs(row):
    """Build the content-side model input and store it in ``row["content_inputs"]``.

    The text is title, description and the first line of the content text
    (everything before the first newline) joined by " [C_SEP] ". The same
    (mutated) row is returned.
    """
    first_line, _, _ = row["content_text"].partition("\n")
    parts = [row["content_title"], row["content_description"], first_line]
    row["content_inputs"] = " [C_SEP] ".join(parts)
    return row

In [None]:
# Add a placeholder "breadcrumbs" column, then fill it row by row with each
# topic's breadcrumb trail (progress_apply shows a tqdm progress bar).
topics_df = topics_df.assign(breadcrumbs=np.nan).progress_apply(get_breadcrumbs, axis=1)

In [None]:
# Move each frame's index into an ordinary column and switch to a positional
# index, so the ids can be used as merge keys further down.
topics_df = topics_df.assign(id=topics_df.index).reset_index(drop=True)

content_df = content_df.assign(id=content_df.index).reset_index(drop=True)

correlations_df = correlations_df.assign(topic_id=correlations_df.index).reset_index(drop=True)

In [None]:
def reinit_layer(model, num_layers=None):
    """Re-initialise the weights of the last encoder layers of ``model`` in place.

    Args:
        model: object exposing ``encoder.layer`` (a sliceable sequence of
            modules) and ``config.initializer_range``.
        num_layers: how many layers, counted from the end of
            ``model.encoder.layer``, to reset. Defaults to
            ``cfg["bi_encoder"]["reinit_layers"]``. Values <= 0 reset nothing.

    Within each selected layer: ``nn.Linear`` and ``nn.Embedding`` weights are
    drawn from N(0, initializer_range), Linear biases and the embedding
    padding row are zeroed, and ``nn.LayerNorm`` is reset to weight 1 / bias 0.
    """
    if num_layers is None:
        num_layers = cfg["bi_encoder"]["reinit_layers"]
    print(f"reinit layer -> {num_layers}")
    if num_layers <= 0:
        # Guard: `layer[-0:]` would select *every* layer, not none.
        return

    init_std = model.config.initializer_range
    for layer in model.encoder.layer[-num_layers:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=init_std)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=init_std)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)

In [None]:
def train(train_sentences, fold_n):
    """Fine-tune the bi-encoder on paired texts and save it to disk.

    Args:
        train_sentences: training examples fed to the ``DataLoader`` (the
            driver cell passes ``InputExample`` objects holding two texts).
        fold_n: suffix used in the output directory name
            ``<output_path>/simcse-model_fold<fold_n>``.

    All hyper-parameters are read from ``cfg["bi_encoder"]``.
    """
    bi_cfg = cfg["bi_encoder"]

    word_embedding_model = models.Transformer(
        bi_cfg["model_name"],
        max_seq_length=bi_cfg["max_length"],
        model_args={"gradient_checkpointing": bi_cfg["gradient_checkpointing"]},
    )
    # Register the separators used when building the input texts and grow the
    # embedding matrix to cover them.
    tokens = ["[T_SEP]", "[C_SEP]"]
    word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
    word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))
    pooling_model = models.Pooling(
        word_embedding_model.get_word_embedding_dimension(), bi_cfg["pooling"]
    )

    # freeze the embeddings and the first `freeze_layers` encoder layers
    num_frozen = bi_cfg["freeze_layers"]
    if num_frozen > 0:
        print(f"freeze layer -> {num_frozen}")
        # NOTE(review): the embeddings are frozen right after two tokens were
        # added, so the new embedding rows are never trained -- confirm that
        # this is intended.
        word_embedding_model.auto_model.embeddings.requires_grad_(False)
        # Honour the configured count instead of a fixed number of layers.
        word_embedding_model.auto_model.encoder.layer[:num_frozen].requires_grad_(False)

    # reinit some layers
    if bi_cfg["reinit_layers"] > 0:
        reinit_layer(word_embedding_model.auto_model)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # DataLoader to batch the data; the last incomplete batch is dropped.
    train_dataloader = DataLoader(
        train_sentences, batch_size=bi_cfg["batch_size"], shuffle=True, drop_last=True
    )

    # MultipleNegativesRankingLoss is the training objective for the paired texts.
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Warm-up is a fraction of the number of batches in one pass of the loader.
    warmup_steps = math.ceil(len(train_dataloader) * bi_cfg["warmup_ratio"])
    # Call the fit method
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=bi_cfg["epoch"],
        warmup_steps=warmup_steps,
        optimizer_params={"lr": bi_cfg["lr"]},
        weight_decay=bi_cfg["weight_decay"],
        show_progress_bar=True,
        use_amp=True,
    )

    model.save(f"{cfg['general']['output_path']}/simcse-model_fold{fold_n}")

In [None]:
def build_training_pairs(topics_prefixed, content_prefixed):
    """Build the (topic text, content text) training pairs.

    Args:
        topics_prefixed: topics frame whose columns carry the "topic_" prefix;
            it must provide "topic_id", "topic_title", "topic_breadcrumbs"
            and "topic_description".
        content_prefixed: content frame whose columns carry the "content_"
            prefix; it must provide "content_id", "content_title",
            "content_description" and "content_text".

    Returns:
        ``(pairs, examples)``: the merged frame with "topic_inputs",
        "content_inputs" and a two-element "set" column, and one
        ``InputExample`` per row holding those two texts.
    """
    pairs = correlations_df.copy()
    # One row per (topic, content) link instead of a space-separated id list.
    pairs.content_ids = pairs.content_ids.str.split()
    pairs = pairs.explode("content_ids").rename(columns={"content_ids": "content_id"})
    # Inner join keeps only the topics handed in (e.g. the training folds).
    pairs = pairs.merge(topics_prefixed, how="inner", on="topic_id")
    pairs = pairs.merge(content_prefixed, how="left", on="content_id")
    pairs["topic_inputs"] = np.nan
    pairs["content_inputs"] = np.nan
    pairs = pairs.progress_apply(get_topic_inputs, axis=1)
    pairs = pairs.progress_apply(get_content_inputs, axis=1)
    pairs["set"] = pairs[["topic_inputs", "content_inputs"]].values.tolist()
    examples = [
        InputExample(texts=[str(topic_text), str(content_text)])
        for topic_text, content_text in pairs["set"]
    ]
    return pairs, examples


# Prefix the content columns exactly once. Renaming inside the fold loop (or
# re-running this cell) would otherwise produce "content_content_id" and break
# the merge on "content_id".
if "content_id" not in content_df.columns:
    content_df = content_df.rename(columns=lambda col: "content_" + col)

if cfg["general"]["cv"]:
    for fold_n in cfg["general"]["fold_list"]:
        seed_everything(cfg["general"]["seed"], workers=True)
        # Keep only the topics that are NOT in the held-out fold.
        topics_df_fold = topics_df.merge(fold, how="inner", left_on="id", right_on="topic_id")
        topics_df_fold = topics_df_fold.drop(["topic_id"], axis=1)
        topics_df_fold = topics_df_fold[topics_df_fold["fold"] != fold_n]
        # Non in-place rename: the filtered frame may be a view of the merge result.
        topics_df_fold = topics_df_fold.rename(columns=lambda col: "topic_" + col)

        correlations, train_sentences = build_training_pairs(topics_df_fold, content_df)
        train_df = pd.DataFrame(correlations[["set"]])

        train(train_sentences, fold_n)
else:
    seed_everything(cfg["general"]["seed"], workers=True)
    fold_n = "_all"
    # Same once-only guard as for the content columns.
    if "topic_id" not in topics_df.columns:
        topics_df = topics_df.rename(columns=lambda col: "topic_" + col)

    correlations, train_sentences = build_training_pairs(topics_df, content_df)
    train_df = pd.DataFrame(correlations[["set"]])

    train(train_sentences, fold_n)

In [None]:
# Display the frame of training pairs built by the driver cell above.
train_df

In [None]:
# Show the first [topic text, content text] pair. `.iloc[0]` selects by
# position, so it works whatever labels the frame's index carries.
train_df["set"].iloc[0]

In [None]:
# Collect unreachable Python objects first, then ask torch to release its
# cached CUDA memory (torch is imported with the other imports at the top).
gc.collect()
torch.cuda.empty_cache()