# Imports

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=0c2466742acdd35b055a7436640558e1e782a7b923c863d936cedc8397d5b4f8
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2
[0m

In [2]:
import gc
import json
import os
import math

import cupy as cp
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import joblib
import sklearn.metrics as metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
from cuml.svm import SVC
from imblearn.ensemble import BalancedBaggingClassifier
from imblearn.under_sampling import RandomUnderSampler
from pytorch_lightning import Trainer
from pytorch_lightning import seed_everything
from pytorch_lightning.loggers import WandbLogger
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.model_selection import train_test_split
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
)
from torch.utils.data import DataLoader
from torch.nn import Sigmoid

from tqdm import tqdm
tqdm.pandas()

from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers import models, util, datasets, evaluation, losses
from sentence_transformers import InputExample
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# Config

In [3]:
# Central configuration for the notebook; every function below reads from this dict.
cfg = {
    "general": {
        #"project_name": "LECR",
        # Root directory holding the competition data and the pre-computed pickles.
        "input_path": "/kaggle/input",
        # Directory where evaluator results and trained models are written.
        "output_path": "./output",
        # Seed passed to seed_everything().
        "seed": 42,
        # Fold indices iterated by the main cell when "cv" is True.
        "fold": [0],
        # True: run one_fold() per fold; False: take the all-data training branch.
        "cv": True,
    },
    
    "task": {
        #"train_select_top_n_pos": 100,
        # Selects which pre-computed training candidate pickle is loaded (train_fold_k{N}_{fold}.pkl).
        "train_select_top_n_neg": 100,
        # Selects which pre-computed validation candidate pickle is loaded (valid_fold_k{N}_{fold}.pkl).
        "valid_select_top_n": 50,
        # True: train with BalancedBatchUnderSampler; False: plain shuffled DataLoader.
        "undersampling": True,
    },
    
    "cross_encoder": {
        # Checkpoint path handed to CrossEncoder().
        "model_name": "/kaggle/input/lecr-retriever-fold0-model/alllang",
        # max_length handed to CrossEncoder().
        "max_length": 128,
        # Batch size for training (sampler / DataLoader) and for model.predict().
        "batch_size": 64,
        # NOTE(review): not read by any code visible in this notebook — confirm whether it should be applied.
        "gradient_checkpointing": False,
        # Warmup fraction of the total training steps; see the warmup_steps computation in train_and_predict.
        "warmup_ratio": 0.1,
        # Number of training epochs passed to model.fit().
        "epoch": 5,
        # NOTE(review): lr and weight_decay are not passed to model.fit() in the visible code,
        # so fit()'s own defaults apply — confirm intent.
        "lr": 5e-5,
        "weight_decay": 1e-2,
        # Number of leading encoder layers (plus embeddings) frozen when > 0.
        "freeze_layers": 0,
        # Number of trailing encoder layers re-initialised when > 0.
        "reinit_layers": 0,
    },
}

## Read Data

In [4]:
# Ground-truth topic/content pairs. The functions below use its "topic_id" column and its
# whitespace-separated "content_ids" column.
correlations_df = pd.read_csv(f"{cfg['general']['input_path']}/learning-equality-curriculum-recommendations/correlations.csv")

# Functions

In [5]:
def get_positive_corr(fold_n):
    """Build the positive (topic, content) pairs for every fold except `fold_n`.

    Reads the content, topics and fold pickles from the configured input path,
    keeps topics whose fold differs from `fold_n`, prefixes topic columns with
    "topic_" and content columns with "content_", and inner-joins both onto the
    exploded rows of the module-level `correlations_df`.

    Returns:
        DataFrame with one row per (topic_id, content_id) pair; the text columns
        are exposed as "topic_inputs" and "content_inputs".
    """
    content_df = pd.read_pickle(f"{cfg['general']['input_path']}/lecr-datapkl/content.pkl")
    topics_df = pd.read_pickle(f"{cfg['general']['input_path']}/lecr-datapkl/topics.pkl")
    fold = pd.read_pickle(f"{cfg['general']['input_path']}/lecr-datapkl/fold.pkl")

    # Topics outside the held-out fold, with every column prefixed by "topic_".
    train_topics = (
        topics_df.merge(fold, how="inner", left_on="id", right_on="topic_id")
        .drop(["topic_id"], axis=1)
        .loc[lambda frame: frame["fold"] != fold_n]
        .rename(columns=lambda name: "topic_" + name)
    )
    prefixed_content = content_df.rename(columns=lambda name: "content_" + name)

    # One row per (topic_id, content_id) pair.
    pairs = correlations_df.copy()
    pairs["content_ids"] = pairs["content_ids"].str.split()
    pairs = (
        pairs.explode("content_ids")
        .rename(columns={"content_ids": "content_id"})
        .merge(train_topics, how="inner", on="topic_id")
        .merge(prefixed_content, how="inner", on="content_id")
    )

    # Undo the double prefix on the two text-input columns.
    return pairs.rename(
        columns={
            "content_content_inputs": "content_inputs",
            "topic_topic_inputs": "topic_inputs",
        }
    )

In [6]:
def _score_flagged_predictions(x_val, correlations):
    """Return the mean F2 of the 0/1 flags currently stored in x_val["predictions"].

    Rows flagged 1 are grouped per topic into a space-joined content-id string;
    topics without any flagged row get an empty prediction string. The result is
    left-joined with `correlations` on "topic_id" and scored with f2_score.
    """
    flagged = x_val[x_val["predictions"] == 1]
    predicted = flagged.groupby(["topic_ids"])["content_ids"].unique().reset_index()
    predicted["content_ids"] = predicted["content_ids"].apply(lambda x: ' '.join(x))
    predicted.columns = ["topic_id", "predictions"]

    # Topics with no positive prediction still have to be scored (as empty sets).
    all_topics = pd.Series(x_val["topic_ids"].unique())
    unpredicted = all_topics[~all_topics.isin(predicted["topic_id"])]
    unpredicted = pd.DataFrame({"topic_id": unpredicted.values, "predictions": ""})

    scored = pd.concat([predicted, unpredicted], axis=0, ignore_index=True)
    scored = scored.merge(correlations, how="left", on="topic_id")
    return f2_score(scored["content_ids"], scored["predictions"])


def get_score_using_threshold(x_val, val_predictions, correlations):
    """Scan thresholds 0.00..0.99 (step 0.01) and return the best F2 and its threshold.

    Args:
        x_val: candidate frame with "topic_ids" and "content_ids" columns. Its
            "predictions" column is overwritten on every iteration (side effect).
        val_predictions: array of scores aligned with the rows of `x_val`.
        correlations: frame with "topic_id" and space-separated "content_ids".

    Returns:
        (best_score, best_threshold). If no threshold scores above 0, the first
        threshold is returned instead of None so callers can always apply it.
    """
    thresholds = np.arange(0.0, 1.0, 0.01)
    best_score = 0
    # Fall back to the first threshold rather than None: the caller feeds the
    # result straight into `val_predictions > threshold`.
    best_threshold = thresholds[0]
    for thres in tqdm(thresholds):
        x_val["predictions"] = np.where(val_predictions > thres, 1, 0)
        score = _score_flagged_predictions(x_val, correlations)
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold


def get_score(x_val, val_predictions, correlations):
    """Return the mean F2 for already-binarised predictions.

    `val_predictions` must contain 0/1 flags aligned with the rows of `x_val`;
    they are stored in x_val["predictions"] (side effect) before scoring.
    """
    x_val["predictions"] = val_predictions
    return _score_flagged_predictions(x_val, correlations)

In [7]:
def reinit_layer(model):
    """Re-initialise the last `cfg["cross_encoder"]["reinit_layers"]` encoder layers in place.

    Linear and Embedding weights are redrawn from N(0, model.config.initializer_range),
    biases (and the embedding padding row) are zeroed, and LayerNorm modules are
    reset to weight 1 / bias 0.

    Args:
        model: a module exposing `encoder.layer` and `config.initializer_range`.
    """
    n_layers = cfg["cross_encoder"]["reinit_layers"]
    print(f"reinit layer -> {cfg['cross_encoder']['reinit_layers']}")
    if n_layers <= 0:
        # `layer[-0:]` would select *every* encoder layer, so bail out explicitly.
        return

    for layer in model.encoder.layer[-n_layers:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(
                    mean=0.0, std=model.config.initializer_range
                )
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(
                    mean=0.0, std=model.config.initializer_range
                )
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)

In [8]:
def make_inputs(df):
    """Convert each row of `df` into an InputExample for the cross-encoder.

    Reads the "content_inputs", "topic_inputs" and "target" columns; every
    example carries the texts in [topic, content] order with `target` as label.
    """
    rows = df[["content_inputs", "topic_inputs", "target"]].values
    return [
        InputExample(texts=[topic_text, content_text], label=label)
        for content_text, topic_text, label in rows
    ]

In [9]:
class BalancedBatchUnderSampler(torch.utils.data.sampler.Sampler):
    """Batch sampler that under-samples the majority class inside every batch.

    Each yielded batch is a shuffled list of `batch_size // 2` minority-class
    indices (consumed sequentially, without repetition within one pass) plus
    `batch_size // 2` majority-class indices drawn at random without
    replacement for that batch. One pass ends when the minority class cannot
    fill another half batch.

    Args:
        labels: array-like of non-negative integer class labels.
        batch_size: number of indices per batch; must be at least 2.

    Raises:
        ValueError: if `batch_size` is smaller than 2.
    """

    def __init__(
        self, labels, batch_size,
    ):
        if batch_size < 2:
            # batch_size // 2 == 0 would make __iter__ spin forever and
            # __len__ divide by zero.
            raise ValueError("batch_size must be at least 2")

        # Accept lists / Series too: `labels == value` must compare element-wise.
        labels = np.asarray(labels)
        self.labels = labels

        label_counts = np.bincount(labels)
        major_label = label_counts.argmax()
        minor_label = label_counts.argmin()
        if major_label == minor_label and label_counts.size > 1:
            # All counts are equal, so argmax and argmin both returned the first
            # label; pick a different label as the "minority" so both classes
            # are actually sampled.
            minor_label = (major_label + 1) % label_counts.size

        self.major_indices = np.where(labels == major_label)[0]
        self.minor_indices = np.where(labels == minor_label)[0]

        np.random.shuffle(self.major_indices)
        np.random.shuffle(self.minor_indices)

        self.used_indices = 0
        self.count = 0
        self.batch_size = batch_size
        self.n_samples = batch_size // 2

    def __iter__(self):
        # Reshuffle on every pass so each epoch sees different batches.
        np.random.shuffle(self.major_indices)
        np.random.shuffle(self.minor_indices)
        self.count = 0
        self.used_indices = 0
        while self.count + self.n_samples <= len(self.minor_indices):
            minor_part = self.minor_indices[
                self.used_indices : self.used_indices + self.n_samples
            ].tolist()
            major_part = np.random.choice(
                self.major_indices, self.n_samples, replace=False
            ).tolist()
            indices = minor_part + major_part
            np.random.shuffle(indices)
            yield indices

            self.used_indices += self.n_samples
            self.count += self.n_samples

    def __len__(self):
        # Number of full half-batches the minority class can supply.
        return len(self.minor_indices) // self.n_samples

In [10]:
def train_and_predict(
    cfg,
    fold,
    train,
    valid,
):
    """Fine-tune a CrossEncoder on `train` and, if given, score `valid`.

    Args:
        cfg: configuration dict (reads the "task", "cross_encoder" and "general" sections).
        fold: fold identifier, used only to build the output paths.
        train: DataFrame with "content_inputs", "topic_inputs" and "target" columns.
        valid: DataFrame with the same columns, or None to train without an
            evaluator and without producing predictions.

    Returns:
        Sigmoid-activated predictions for the rows of `valid`, or None when
        `valid` is None.
    """
    train_inputs = make_inputs(train)
    # `valid` may legitimately be None (training on all data), so only build
    # validation examples when there is something to validate on.
    valid_inputs = make_inputs(valid) if valid is not None else None

    if cfg["task"]["undersampling"]:
        batch_sampler = BalancedBatchUnderSampler(train["target"].values, cfg["cross_encoder"]["batch_size"])
        train_dataloader = DataLoader(train_inputs, batch_sampler=batch_sampler)
    else:
        train_dataloader = DataLoader(train_inputs, shuffle=True, batch_size=cfg["cross_encoder"]["batch_size"])

    evaluator = None
    if valid_inputs is not None:
        evaluator = CEBinaryClassificationEvaluator.from_input_examples(valid_inputs, show_progress_bar=True)

    # Warmup length comes from the config (falls back to the previous literal 0.1).
    warmup_ratio = cfg["cross_encoder"].get("warmup_ratio", 0.1)
    warmup_steps = math.ceil(len(train_dataloader) * cfg["cross_encoder"]["epoch"] * warmup_ratio)

    model = CrossEncoder(cfg["cross_encoder"]["model_name"], num_labels=1, max_length=cfg["cross_encoder"]["max_length"])
    # Register the separator tokens and grow the embedding matrix to match.
    tokens = ["[T_SEP]", "[C_SEP]"]
    model.tokenizer.add_tokens(tokens, special_tokens=True)
    model.model.resize_token_embeddings(len(model.tokenizer))

    # freeze
    if cfg["cross_encoder"]["freeze_layers"] > 0:
        print(f"freeze layer -> {cfg['cross_encoder']['freeze_layers']}")
        model.model.base_model.embeddings.requires_grad_(False)
        model.model.base_model.encoder.layer[:cfg["cross_encoder"]["freeze_layers"]].requires_grad_(False)

    # reinit some layers
    if cfg["cross_encoder"]["reinit_layers"] > 0:
        reinit_layer(model.model.base_model)

    # NOTE(review): cfg["cross_encoder"]["lr"] and ["weight_decay"] are not
    # forwarded here, so fit() uses its own optimizer defaults — confirm intent
    # before wiring them in, since that changes training results.
    model.fit(
        train_dataloader=train_dataloader,
        evaluator=evaluator,
        epochs=cfg["cross_encoder"]["epoch"],
        warmup_steps=warmup_steps,
        output_path=f"{cfg['general']['output_path']}/fold{fold}/",
        use_amp=True,
        save_best_model=False,
    )
    model.save(f"{cfg['general']['output_path']}/crossencoder-model_fold{fold}")

    if valid is None:
        return None

    predictions = model.predict(
        sentences=valid[["topic_inputs", "content_inputs"]].values,
        activation_fct=Sigmoid(),
        batch_size=cfg["cross_encoder"]["batch_size"],
    )
    return predictions

In [11]:
def one_fold(cfg, fold_n):
    """Train and evaluate the cross-encoder for a single fold.

    Loads the pre-computed candidate pickles for `fold_n`, combines the
    ground-truth positives with the target==0 candidates into the training
    frame, trains via train_and_predict, searches for the best F2 threshold on
    the validation candidates and prints row-level classification metrics at
    that threshold.

    Returns:
        (score, threshold, accuracy, recall, specificity, precision)
    """
    print(f"[fold_{fold_n}] start")
    seed_everything(cfg["general"]["seed"], workers=True)

    # Resolve the candidate directory from the configured input root, like the
    # other loaders in this notebook, instead of a hard-coded absolute path.
    knn_dir = f"{cfg['general']['input_path']}/lecr-knn-output-fold{fold_n}"

    train_neg = pd.read_pickle(f"{knn_dir}/train_fold_k{cfg['task']['train_select_top_n_neg']}_{fold_n}.pkl")
    train_pos = get_positive_corr(fold_n)
    train_pos["target"] = 1
    # Positives come from the ground truth; keep only the negatives from the candidates.
    train_neg = train_neg[train_neg["target"]==0].reset_index(drop=True)
    train = pd.concat([train_pos, train_neg])
    del train_pos, train_neg
    gc.collect()
    print("read train data")
    valid = pd.read_pickle(f"{knn_dir}/valid_fold_k{cfg['task']['valid_select_top_n']}_{fold_n}.pkl")
    print("read valid data")

    print("train_shape:", train.shape)
    print(train["target"].value_counts())
    print("valid_shape:", valid.shape)
    print(valid["target"].value_counts())

    # train and valid
    valid_preds = train_and_predict(
        cfg,
        fold_n,
        train,
        valid,
    )
    print(valid_preds)

    # metrics
    score, threshold = get_score_using_threshold(valid, valid_preds, correlations_df)
    print(f"Score: {score}")
    print(f"Threshold: {threshold}")
    valid_preds = np.where(valid_preds > threshold, 1, 0)
    accuracy = accuracy_score(valid["target"].values, valid_preds)
    recall = recall_score(valid["target"].values, valid_preds)
    # Specificity is the recall of the negative class.
    specificity = recall_score(valid["target"].values, valid_preds, pos_label=0)
    precision = precision_score(valid["target"].values, valid_preds)
    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")
    print(f"Specificity: {specificity}")
    print(f"Precision: {precision}")
    # Per-epoch evaluator log written by model.fit() into the fold's output directory.
    results = pd.read_csv(f"{cfg['general']['output_path']}/fold{fold_n}/CEBinaryClassificationEvaluator_results.csv").drop(columns=["steps"])
    display(results)

    torch.cuda.empty_cache()

    return score, threshold, accuracy, recall, specificity, precision

In [12]:
def all_train(cfg, train):
    """Train the cross-encoder on `train` without a validation set.

    Args:
        cfg: configuration dict; cfg["fold_n"] (default "all") names the output
            directories.
        train: training DataFrame forwarded to train_and_predict.
    """
    print("[all_train] start")
    seed_everything(cfg["general"]["seed"], workers=True)

    # train_and_predict requires (cfg, fold, train, valid); valid=None requests
    # training only, with no predictions returned.
    train_and_predict(cfg, cfg.get("fold_n", "all"), train, None)

    return

In [13]:
def f2_score(y_true, y_pred):
    """Mean per-row F2 score between two aligned sequences of id strings.

    Each element of `y_true` / `y_pred` is a whitespace-separated string of ids
    (an empty string means "no ids"). Per row, F2 = tp / (tp + 0.2*fp + 0.8*fn).
    A row where both sets are empty has a zero denominator and yields NaN.

    Returns:
        The mean F2 over all rows, rounded to 4 decimals.
    """
    # Split once and reuse the sets for all three counts.
    pairs = [
        (set(true_ids.split()), set(pred_ids.split()))
        for true_ids, pred_ids in zip(y_true, y_pred)
    ]
    tp = np.array([len(true & pred) for true, pred in pairs])
    fp = np.array([len(pred - true) for true, pred in pairs])
    fn = np.array([len(true - pred) for true, pred in pairs])
    # Precision and recall are intentionally not computed: they were unused and
    # produced 0/0 warnings for every row with an empty prediction.
    f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
    return round(f2.mean(), 4)

def get_pos_score(y_true, y_pred):
    """Mean fraction of each row's true ids that also appear in its prediction.

    Both arguments are Series of whitespace-separated id strings; the result is
    rounded to 5 decimals.
    """
    truth_sets = y_true.apply(lambda ids: set(ids.split()))
    pred_sets = y_pred.apply(lambda ids: set(ids.split()))

    hit_ratios = []
    for truth, pred in zip(truth_sets, pred_sets):
        hit_ratios.append(len(truth & pred) / len(truth))

    return round(np.mean(np.array(hit_ratios)), 5)

## Main

In [14]:
# random seed setting
seed_everything(cfg["general"]["seed"], workers=True)

if cfg["general"]["cv"]:
    # Cross-validation: run every configured fold and collect its metrics.
    # NOTE(review): "predicision" is a misspelling of "precision"; it is also
    # what the summary print below emits.
    score_list = []
    threshold_list = []
    accuracy_list = []
    recall_list = []
    specificity_list = []
    predicision_list = []
    for fold_n in tqdm(cfg["general"]["fold"]):
        # Record the active fold in the shared config.
        cfg["fold_n"] = fold_n
        score, threshold, accuracy, recall, specificity, predicision = one_fold(cfg, fold_n)
        score_list.append(score)
        threshold_list.append(threshold)
        accuracy_list.append(accuracy)
        recall_list.append(recall)
        specificity_list.append(specificity)
        predicision_list.append(predicision)

    # Average each metric over the folds that were run.
    score_mean = np.mean(score_list, axis=0)
    threshold_mean = np.mean(threshold_list, axis=0)
    accuracy_mean = np.mean(accuracy_list, axis=0)
    recall_mean = np.mean(recall_list, axis=0)
    specificity_mean = np.mean(specificity_list, axis=0)
    predicision_mean = np.mean(predicision_list, axis=0)
    print(f"cv mean score:{score_mean}")
    print(f"cv mean threshold:{threshold_mean}")
    print(f"cv mean accuracy:{accuracy_mean}")
    print(f"cv mean recall:{recall_mean}")
    print(f"cv mean specificity:{specificity_mean}")
    print(f"cv mean predicision:{predicision_mean}")
else:
    # train all data
    # NOTE(review): train_X and train_y are not defined anywhere in the visible
    # notebook, and all_train is declared as all_train(cfg, train) — this branch
    # cannot run as written. It needs a full training frame built first.
    cfg["fold_n"] = "all"
    all_train(cfg, train_X, train_y)

  0%|          | 0/1 [00:00<?, ?it/s]

[fold_0] start
read train data
read valid data
train_shape: (5682031, 26)
0    5421475
1     260556
Name: target, dtype: int64
valid_shape: (237450, 7)
0    222564
1     14886
Name: target, dtype: int64


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/lecr-retriever-fold0-model/alllang and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8142 [00:00<?, ?it/s]

Batches:   0%|          | 0/7421 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8142 [00:00<?, ?it/s]

Batches:   0%|          | 0/7421 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8142 [00:00<?, ?it/s]

Batches:   0%|          | 0/7421 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8142 [00:00<?, ?it/s]

Batches:   0%|          | 0/7421 [00:00<?, ?it/s]

Iteration:   0%|          | 0/8142 [00:00<?, ?it/s]

Batches:   0%|          | 0/7421 [00:00<?, ?it/s]

Batches:   0%|          | 0/3711 [00:00<?, ?it/s]

[0.91827816 0.9433855  0.9003012  ... 0.00880148 0.10910992 0.39402628]



  0%|          | 0/100 [00:00<?, ?it/s][A
  import sys

  2%|▏         | 2/100 [00:00<00:44,  2.23it/s][A
  3%|▎         | 3/100 [00:01<00:41,  2.34it/s][A
  4%|▍         | 4/100 [00:01<00:39,  2.44it/s][A
  5%|▌         | 5/100 [00:02<00:37,  2.53it/s][A
  6%|▌         | 6/100 [00:02<00:36,  2.59it/s][A
  7%|▋         | 7/100 [00:02<00:36,  2.57it/s][A
  8%|▊         | 8/100 [00:03<00:36,  2.55it/s][A
  9%|▉         | 9/100 [00:03<00:36,  2.52it/s][A
 10%|█         | 10/100 [00:04<00:37,  2.38it/s][A
 11%|█         | 11/100 [00:04<00:35,  2.50it/s][A
 12%|█▏        | 12/100 [00:04<00:33,  2.60it/s][A
 13%|█▎        | 13/100 [00:05<00:32,  2.67it/s][A
 14%|█▍        | 14/100 [00:05<00:31,  2.72it/s][A
 15%|█▌        | 15/100 [00:05<00:30,  2.78it/s][A
 16%|█▌        | 16/100 [00:06<00:29,  2.82it/s][A
 17%|█▋        | 17/100 [00:06<00:29,  2.86it/s][A
 18%|█▊        | 18/100 [00:06<00:28,  2.89it/s][A
 19%|█▉        | 19/100 [00:07<00:27,  2.91it/s][A
 20%|██       

Score: 0.4608
Threshold: 0.53
Accuracy: 0.8683428090124237
Recall: 0.6741233373639661
Specificity: 0.8813330098308801
Precision: 0.27533885748779013


Unnamed: 0,epoch,Accuracy,Accuracy_Threshold,F1,F1_Threshold,Precision,Recall,Average_Precision
0,0,0.944334,0.943473,0.41141,0.730435,0.40587,0.417103,0.398834
1,1,0.944607,0.961,0.451881,0.790652,0.443267,0.460836,0.44133
2,2,0.945896,0.95732,0.462763,0.785062,0.479605,0.447064,0.453232
3,3,0.945092,0.971654,0.462721,0.808141,0.450809,0.475279,0.456007
4,4,0.945319,0.967783,0.465432,0.827962,0.470121,0.460836,0.459807


100%|██████████| 1/1 [6:34:30<00:00, 23670.49s/it]

cv mean score:0.4608
cv mean threshold:0.53
cv mean accuracy:0.8683428090124237
cv mean recall:0.6741233373639661
cv mean specificity:0.8813330098308801
cv mean predicision:0.27533885748779013





In [15]:
# Final cleanup: release PyTorch's cached CUDA memory and run a garbage-collection pass.
torch.cuda.empty_cache()
gc.collect()

260