In [1]:
# Offline installs: --no-index together with --find-links makes pip resolve packages
# only from the attached dataset directory instead of a package index.
!pip install -q -U peft --no-index --find-links /kaggle/input/llm-detect-pip-rb
!pip install -q -U accelerate --no-index --find-links /kaggle/input/llm-detect-pip-rb
!pip install -q -U bitsandbytes --no-index --find-links /kaggle/input/llm-detect-pip-rb
!pip install -q -U transformers --no-index --find-links /kaggle/input/llm-detect-pip-rb
# sparse_dot_topn is installed straight from a wheel file (cp310 / manylinux x86_64 build).
!pip install -q /kaggle/input/sparse-dot-topn-033/sparse_dot_topn-0.3.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [3]:
import os

# Working directories: prediction files go to ./outputs, YAML configs to ./configs.
for _work_dir in ("./outputs", "./configs"):
    os.makedirs(_work_dir, exist_ok=True)

In [4]:
%%writefile run_llm_inference.py

import sys

sys.path.insert(0, '/kaggle/input/omegaconf')
sys.path.insert(0, '/kaggle/input/utils-ai-v3')

import argparse
import os

import pandas as pd
import torch
from accelerate import Accelerator
from omegaconf import OmegaConf
from peft import PeftModel

from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

from r_detect.ai_dataset import AiDataset
from r_detect.ai_loader import AiCollator, show_batch
from r_detect.ai_model import MistralForDetectAI

import re
import unicodedata


# pre-process -----
# Characters deleted outright by the "light" strategy. Non-ASCII characters are already
# dropped by the ASCII round-trip before this list is applied, so the non-ASCII entries
# here are effectively no-ops.
char_to_remove = ['{', '£', '\x97', '¹', 'å', '\\', '\x85', '<', '\x99', \
                  'é', ']', '+', 'Ö', '\xa0', '>', '|', '\x80', '~', '©', \
                  '/', '\x93', '$', 'Ó', '²', '^', ';', '`', 'á', '*', '(', \
                  '¶', '®', '[', '\x94', '\x91', '#', '-', 'ó', ')', '}', '=']



def preprocess_text(text, strategy='light'):
    """
    Clean an essay before tokenization.

    Args:
        text: raw essay string.
        strategy: one of
            - "none":  return the text untouched.
            - "light": drop non-ASCII characters, strip surrounding whitespace and
              double quotes, delete every character in ``char_to_remove`` and, when the
              result does not end with a period, cut it back to the last period
              (a text that contains no period at all is reduced to ".").
            - "heavy": NFKD-normalise, drop non-ASCII, lowercase, keep only
              ``[a-z0-9]``, whitespace and basic punctuation, collapse whitespace runs.

    Returns:
        The cleaned string. With "light", text that is empty after cleaning is returned
        as an empty string.

    Raises:
        AssertionError: if ``strategy`` is not one of the supported values.
    """
    assert strategy in ["none", "light", "heavy"], "pre-processing strategy must one of: none, light, heavy"

    if strategy == "none":
        return text

    if strategy == "light":
        text = text.encode("ascii", "ignore").decode('ascii')
        text = text.strip()
        text = text.strip("\"")

        for c in char_to_remove:
            text = text.replace(c, "")

        # Cleaning can leave nothing behind (empty essay, or one made only of removed
        # characters); indexing text[-1] on it would raise IndexError, so skip the
        # sentence trimming in that case.
        if text and text[-1] != ".":
            # Drop the trailing incomplete sentence and terminate with a period.
            text = ".".join(text.split(".")[:-1]) + "."
        return text

    # strategy == "heavy"
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s.,;?!:()\'\"%-]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text


def run_inference(accelerator, model, infer_dl, example_ids):
    """
    Score every batch of ``infer_dl`` and return the predictions as a DataFrame.

    The model's logits are flattened, passed through a sigmoid and gathered with
    ``accelerator.gather_for_metrics`` before being accumulated. The returned frame
    has an ``id`` column (``example_ids``) and a ``generated`` column (the scores),
    paired by position.
    """
    model.eval()
    collected_scores = []

    pbar = tqdm(range(len(infer_dl)), disable=not accelerator.is_local_main_process)

    for batch in infer_dl:
        with torch.no_grad():
            model_out = model(**batch)

        flat_logits = model_out.logits.reshape(-1)
        batch_scores = accelerator.gather_for_metrics(torch.sigmoid(flat_logits))
        collected_scores.extend(batch_scores.cpu().numpy().tolist())

        pbar.update(1)
    pbar.close()

    result_df = pd.DataFrame()
    result_df["id"] = example_ids
    result_df["generated"] = collected_scores

    return result_df

def main(cfg, save_dir, model_id):
    """
    Run quantized-backbone + PEFT-adapter inference over the test CSV and save the scores.

    Args:
        cfg: OmegaConf config; this function reads ``preprocess_strategy``,
            ``predict_params.per_device_eval_batch_size``, ``model.backbone_path``,
            ``model.num_labels`` and ``model.lora_path`` (``AiDataset`` receives the
            whole config as well).
        save_dir: directory the parquet file is written to.
        model_id: file stem of the output, i.e. ``<save_dir>/<model_id>.parquet``.
    """
    
    # create accelerator
    accelerator = Accelerator()
    
    # read test data: the competition rerun exposes test_essays.csv, otherwise use test.csv
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        test_df = pd.read_csv("/kaggle/input/textdata/test_essays.csv", sep=',')
    else:
        test_df = pd.read_csv("/kaggle/input/textdata/test.csv", sep=',')
        
    accelerator.print("~~"*40)
    accelerator.print(f"PRE-PROCESSING: {cfg.preprocess_strategy.upper()}")
    accelerator.print("~~"*40)

    # clean every essay with the strategy named in the config
    test_df['text'] = test_df['text'].apply(lambda x: preprocess_text(x, cfg.preprocess_strategy))
    accelerator.print(f'Test csv shape: {test_df.shape}')
    
    # build the dataset on the main process first, then on the others
    with accelerator.main_process_first():
        dataset_creator = AiDataset(cfg)
        infer_ds = dataset_creator.get_dataset(test_df)
    
    tokenizer = dataset_creator.tokenizer
    # tokenizer.pad_token = tokenizer.eos_token
    
    # order examples by their "input_length" column (presumably to limit padding - TODO confirm)
    infer_ds = infer_ds.sort("input_length")
    infer_ds.set_format(
        type=None,
        columns=[
            'id',
            'input_ids',
            'attention_mask',
        ]
    )
    
    # ids are captured AFTER sorting so they line up positionally with the predictions
    infer_ids = infer_ds["id"]  # .tolist()
    
    #--
    data_collator = AiCollator(
        tokenizer=tokenizer,
        pad_to_multiple_of=64
    )

    # shuffle=False keeps the sorted order that infer_ids relies on
    infer_dl = DataLoader(
        infer_ds,
        batch_size=cfg.predict_params.per_device_eval_batch_size,
        shuffle=False,
        collate_fn=data_collator,
    )

    accelerator.print("data preparation done...")
    accelerator.print("~~"*40)
    accelerator.wait_for_everyone()
    
    
    #----------
    # grab the first batch only, to print a sanity-check preview
    # NOTE(review): `b` stays undefined if the dataloader is empty
    for b in infer_dl:
        break
    show_batch(b, tokenizer, task='infer', print_fn=accelerator.print)
    accelerator.print("~~"*40)
    #----------


    ## Load Model
    # 4-bit NF4 quantization with double quantization; compute dtype is fp16
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    
    base_model = MistralForDetectAI.from_pretrained(
        cfg.model.backbone_path,
        num_labels=cfg.model.num_labels,
        quantization_config=bnb_config,
        low_cpu_mem_usage=True
    )
    
    base_model.config.pretraining_tp = 1
    # base_model.config.pad_token_id = tokenizer.pad_token_id
    # wrap the quantized backbone with the PEFT adapter stored at cfg.model.lora_path
    model = PeftModel.from_pretrained(base_model, cfg.model.lora_path)
    accelerator.print("### Loaded Model Weights ###")
    
    model, infer_dl = accelerator.prepare(model, infer_dl)
    
    # run inference ---
    sub_df = run_inference(accelerator, model, infer_dl, infer_ids)
    accelerator.wait_for_everyone()
    
    # only the main process writes the output file
    if accelerator.is_main_process:
        save_path = os.path.join(save_dir, f"{model_id}.parquet")
        sub_df.to_parquet(save_path)
        accelerator.print("done!")
        accelerator.print("~~"*40)
    
if __name__ == "__main__":
    # CLI: three mandatory string options.
    parser = argparse.ArgumentParser()
    for option in ('--config_path', '--save_dir', '--model_id'):
        parser.add_argument(option, type=str, required=True)

    args = parser.parse_args()
    cfg = OmegaConf.load(args.config_path)

    # make sure the output directory exists before inference starts
    os.makedirs(args.save_dir, exist_ok=True)

    # execution
    main(cfg, save_dir=args.save_dir, model_id=args.model_id)


Overwriting run_llm_inference.py


In [5]:
%%writefile ./configs/conf_mistral_mix16.yaml

# Cleaning mode handed to preprocess_text in run_llm_inference.py (none | light | heavy).
preprocess_strategy: light

model:
    # Base checkpoint; the adapter found at lora_path is loaded on top of it.
    backbone_path: "/kaggle/input/mistral-7b-v0-1/Mistral-7B-v0.1"
    lora_path: /kaggle/input/detect-ai-r-detect-v16-r8
    max_length: 1296
    # One output logit; the inference script applies a sigmoid to it.
    num_labels: 1
    tokenizer:
        padding_side: left
        truncation_side: left
        use_fast: true

predict_params:
    # Batch size of the inference DataLoader (per process).
    per_device_eval_batch_size: 1



Writing ./configs/conf_mistral_mix16.yaml


In [6]:
%%writefile ./configs/conf_mistral_v26.yaml

# "none" makes preprocess_text in run_llm_inference.py return the essay text unchanged.
preprocess_strategy: none

model:
    # Base checkpoint; the adapter found at lora_path is loaded on top of it.
    backbone_path: "/kaggle/input/mistral-7b-v0-1/Mistral-7B-v0.1"
    lora_path: /kaggle/input/detect-ai-r-detect-v26
    max_length: 1296
    # One output logit; the inference script applies a sigmoid to it.
    num_labels: 1
    tokenizer:
        padding_side: left
        truncation_side: left
        use_fast: true

predict_params:
    # Batch size of the inference DataLoader (per process).
    per_device_eval_batch_size: 1

Writing ./configs/conf_mistral_v26.yaml


In [7]:
%%time
# LLM inference with the mix16 config on 2 processes (fp16 mixed precision);
# the script writes its scores to ./outputs/m0.parquet.
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_llm_inference.py \
--config_path "./configs/conf_mistral_mix16.yaml" \
--save_dir "./outputs" \
--model_id "m0"

2024-04-11 10:15:39.982959: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:15:39.982970: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:15:39.983026: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:15:39.983081: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:15:40.071965: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory

In [8]:
%%time
# LLM inference with the v26 config on 2 processes (fp16 mixed precision);
# the script writes its scores to ./outputs/m1.parquet.
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_llm_inference.py \
--config_path "./configs/conf_mistral_v26.yaml" \
--save_dir "./outputs" \
--model_id "m1"


2024-04-11 10:24:23.411775: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:24:23.411833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:24:23.413479: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-11 10:24:23.427159: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:24:23.427199: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

In [9]:
%%writefile run_deberta_inference_ub.py

import os
import torch
import argparse
import numpy as np
import pandas as pd
from datasets import Dataset
from scipy.special import expit
from transformers import (
    DataCollatorWithPadding, TrainingArguments,
    AutoTokenizer, AutoModelForSequenceClassification, Trainer
)

import gc
import torch

from accelerate import Accelerator
# Module-level accelerator: main() uses it for rank-aware printing and to prepare the model.
accelerator = Accelerator()

def preprocess_function(examples, max_length, tokenizer):
    """Run ``tokenizer`` on the batch's ``text`` field with truncation enabled at ``max_length``."""
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

def main(args):
    """
    Score the test essays with a sequence-classification checkpoint and save the result.

    Args:
        args: parsed CLI namespace providing ``base_model_path`` (checkpoint used for
            both tokenizer and model), ``max_length`` (tokenizer truncation length),
            ``save_dir`` and ``model_id`` (output is ``<save_dir>/<model_id>.parquet``).

    The saved ``generated`` column holds the NEGATED raw logit of the single output
    unit, not a sigmoid probability.
    """
    # read test data: the competition rerun exposes test_essays.csv, otherwise use test.csv
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        test_df = pd.read_csv("/kaggle/input/textdata/test_essays.csv", sep=',')
    else:
        test_df = pd.read_csv("/kaggle/input/textdata/test.csv", sep=',')

    accelerator.print(f'Test csv shape: {test_df.shape}')
    test_ds = Dataset.from_pandas(test_df)

    ## Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.base_model_path)

    test_tokenized_ds = test_ds.map(preprocess_function, batched=True,
                                    fn_kwargs={"max_length": args.max_length, "tokenizer": tokenizer},
                                    remove_columns=test_ds.column_names)

    # Preview at most two decoded samples; a test file with fewer rows must not crash here.
    for idx in range(min(2, len(test_tokenized_ds))):
        accelerator.print(f"\n--- Sample {idx} ---\n")
        accelerator.print(tokenizer.decode(test_tokenized_ds[idx]["input_ids"]))

    ## Load Model

    model = AutoModelForSequenceClassification.from_pretrained(
        args.base_model_path,
        num_labels=1
    )
    model = accelerator.prepare(model)
    accelerator.print("### Loaded Model Weights ###")

    ## Trainer Setup
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")
    training_args = TrainingArguments(output_dir="tmp",
                                      per_device_eval_batch_size=1,
                                      remove_unused_columns=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    ## predictor
    pred_output = trainer.predict(test_tokenized_ds)
    logits = pred_output.predictions.astype(float)
    # NOTE(review): the score is the negated logit (the expit variant was disabled in the
    # original code). Presumably the consumer only uses the ranking and this checkpoint's
    # logit is oriented the other way round - confirm against the ensembling step.
    scores = -1 * logits[:, 0]

    sub = pd.DataFrame({
        "id": test_df['id'].values,
        "generated": scores
    })

    # The script is launched with several processes; let only the main one write so the
    # processes do not race on the same output file (same policy as run_llm_inference.py).
    if accelerator.is_main_process:
        os.makedirs(args.save_dir, exist_ok=True)
        save_path = os.path.join(args.save_dir, f"{args.model_id}.parquet")
        sub.to_parquet(save_path)

if __name__ == "__main__":
    # CLI definition: save_dir is the only optional argument.
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_model_path', type=str, required=True)
    parser.add_argument('--max_length', type=int, required=True)
    parser.add_argument('--save_dir', type=str, default="./outputs")
    parser.add_argument('--model_id', type=str, required=True)

    main(parser.parse_args())

Writing run_deberta_inference_ub.py


In [10]:
%%time
# DeBERTa inference on 2 processes (fp16 mixed precision), inputs truncated to 1024 tokens;
# the script writes its scores to ./outputs/m2.parquet.
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_deberta_inference_ub.py \
--base_model_path "/kaggle/input/deberta-v3-large-v18-margin/checkpoint-6250" \
--max_length 1024 \
--save_dir "./outputs" \
--model_id "m2"


2024-04-11 10:26:29.026637: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:26:29.026701: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:26:29.028174: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-11 10:26:29.033036: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:26:29.033080: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

In [11]:
%%writefile run_deberta_mlm.py
import sys
sys.path.insert(0, '/kaggle/input/omegaconf')

import argparse
import math
import os
import time
from copy import deepcopy
from dataclasses import dataclass
from itertools import chain
from typing import Optional

import bitsandbytes as bnb
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.checkpoint
from accelerate import Accelerator
from accelerate.utils import set_seed
from datasets import Dataset
from omegaconf import OmegaConf
from tokenizers import (Tokenizer, models, normalizers, pre_tokenizers,
                        processors, trainers)
from torch.utils.data import DataLoader
from tqdm import tqdm
from tqdm.auto import tqdm
from transformers import (DataCollatorForLanguageModeling, DebertaV2Config,
                          DebertaV2ForMaskedLM, PreTrainedTokenizerBase,
                          PreTrainedTokenizerFast, default_data_collator,
                          get_cosine_schedule_with_warmup)
from transformers.models.deberta_v2.modeling_deberta_v2 import \
    DebertaV2OnlyMLMHead
from transformers.trainer_pt_utils import get_parameter_names

# utils ---------------------------------------------------------------------------------#


def as_minutes(s):
    """Format a duration given in seconds as ``'<minutes>m<seconds>s'`` (both parts truncated to ints)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm%ds' % (whole_minutes, leftover_seconds)


def get_lr(optimizer):
    """Return the learning rate of the optimizer's first param group, multiplied by 1e6."""
    first_group = optimizer.param_groups[0]
    return first_group['lr'] * 1e6


class AverageMeter(object):
    """Keeps the most recent value plus a running weighted average.

    Imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the average, the weighted sum and the sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

# tokenizer class -----------------------------------------------------------------------#


class BPETokenizer:
    """
    Thin wrapper around a ``tokenizers`` BPE model: NFC normalization, byte-level
    pre-tokenization and a ``[CLS] ... [SEP]`` post-processing template.

    ``ST`` lists the special tokens passed to the BPE trainer. The template refers to
    [CLS] and [SEP] by ids 1 and 2, which are their positions in ``ST``.
    """
    ST = ["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]"]

    def __init__(self, vocab_size):
        self.vocab_size = vocab_size

        tok = Tokenizer(models.BPE(unk_token="[UNK]"))
        tok.normalizer = normalizers.Sequence([normalizers.NFC()])
        tok.pre_tokenizer = pre_tokenizers.ByteLevel()
        tok.post_processor = processors.TemplateProcessing(
            single="[CLS] $0 [SEP]",
            pair="[CLS] $A [SEP] $B:1 [SEP]:1",
            special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
        )
        self.tok = tok

    @classmethod
    def chunk_dataset(cls, dataset, chunk_size=1_000):
        """Yield the ``text`` column of ``dataset`` in consecutive slices of ``chunk_size`` rows."""
        for offset in range(0, len(dataset), chunk_size):
            rows = dataset[offset: offset + chunk_size]
            yield rows["text"]

    def train(self, data):
        """Train the BPE model on ``data["text"]`` and return ``self`` for chaining."""
        text_ds = Dataset.from_pandas(data[["text"]])
        bpe_trainer = trainers.BpeTrainer(vocab_size=self.vocab_size, special_tokens=self.ST)
        self.tok.train_from_iterator(self.chunk_dataset(text_ds), trainer=bpe_trainer)
        return self

    def tokenize(self, data):
        """Encode every entry of ``data['text']`` and return the encodings as a list."""
        return [self.tok.encode(text) for text in tqdm(data['text'].tolist())]

    def get_fast_tokenizer(self, max_length):
        """Wrap the underlying tokenizer in a ``PreTrainedTokenizerFast`` with the special tokens registered."""
        special_tokens = dict(
            unk_token="[UNK]",
            pad_token="[PAD]",
            cls_token="[CLS]",
            sep_token="[SEP]",
            mask_token="[MASK]",
        )
        return PreTrainedTokenizerFast(
            tokenizer_object=self.tok,
            model_max_length=max_length,
            **special_tokens,
        )


def tokenizer_test(tokenizer):
    """Print the tokenizer's length and how it splits a fixed sample string."""
    separator = "==" * 40
    sample = "This is a test \n\n!!"

    print(separator)
    print(f"tokenizer len: {len(tokenizer)}")
    print(f"tokenizer test: {tokenizer.tokenize(sample)}")
    print(separator)

# collator class ------------------------------------------------------------------------#


@dataclass
class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    """
    MLM collator that overrides ``torch_mask_tokens`` to mask contiguous spans instead
    of independently sampled positions.
    """
    tokenizer: PreTrainedTokenizerBase
    mlm: bool = True
    # fraction of each sequence's positions targeted for masking
    mlm_probability: float = 0.15
    pad_to_multiple_of: Optional[int] = None
    tf_experimental_compile: bool = False
    return_tensors: str = "pt"

    def torch_mask_tokens(self, inputs, special_tokens_mask):
        """
        Prepare masked inputs/labels for masked language modeling with span masking.

        Spans of length 1-6 (truncated geometric distribution, p=0.15) are drawn at
        random anchors until ceil(len * mlm_probability) positions per sequence are
        selected; special tokens are then removed from the selection. Of the selected
        positions ~98% become [MASK], ~1% a random token, ~1% keep the original token.

        Returns the modified ``inputs`` (edited in place) and ``labels``, where every
        non-selected position is set to -100.
        """

        labels = inputs.clone()

        # geometric distribution for spans (renormalised over lengths lower..upper)
        geo_p, lower, upper = 0.15, 1, 6
        len_distrib = [geo_p * (1-geo_p)**(i - lower) for i in range(lower, upper + 1)]
        len_distrib = [x / (sum(len_distrib)) for x in len_distrib]
        lens = list(range(lower, upper + 1))

        masked_indices = []

        for ex_labels in labels:
            # number of positions to select in this sequence
            mask_num = math.ceil(len(ex_labels) * self.mlm_probability)
            ex_mask = set()
            while len(ex_mask) < mask_num:
                span_len = np.random.choice(lens, p=len_distrib)
                anchor = np.random.choice(len(ex_labels))
                # an anchor that is already selected is rejected and redrawn
                if anchor in ex_mask:
                    continue
                else:
                    # span is clipped at the sequence end and at the mask budget
                    left1, right1 = anchor, min(anchor + span_len, len(ex_labels))
                    for i in range(left1, right1):
                        if len(ex_mask) >= mask_num:
                            break
                        ex_mask.add(i)
            ex_mask_bool = [i in ex_mask for i in range(len(ex_labels))]
            masked_indices.append(ex_mask_bool)
        masked_indices = torch.tensor(masked_indices).bool()

        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()

        # special tokens are dropped AFTER span selection, so fewer than mask_num
        # positions may end up masked
        masked_indices = torch.logical_and(masked_indices, ~special_tokens_mask)
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 98% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.98)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 1% of the time (half of the remaining 2%), we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (the remaining 1%) we keep the masked input tokens unchanged
        return inputs, labels


def show_batch(batch, tokenizer, num_examples=8, print_fn=print):
    """Decode and print up to ``num_examples`` entries of ``batch['input_ids']``, special tokens included."""
    divider = '==' * 40
    print_fn(divider)

    limit = min(num_examples, len(batch['input_ids']))
    for idx in range(limit):
        input_text = tokenizer.decode(batch['input_ids'][idx], skip_special_tokens=False)
        print_fn(f"input text:\n {input_text}")
        print_fn(divider)

# dataset class -------------------------------------------------------------------------#


def get_mlm_dataset(cfg, notes_df, tokenizer):
    """
    Build the train/test datasets for masked-language-model training.

    Steps: tokenize ``notes_df['text']``, concatenate all token sequences and cut them
    into chunks of ``cfg.max_length`` tokens (the incomplete tail of each mapped batch
    is dropped), split into train/test, then mask the test split once up front so
    evaluation always sees the same masks.

    Args:
        cfg: config providing ``max_length``, ``test_pct``, ``max_train_examples``,
            ``seed`` and ``mask_probability``.
        notes_df: DataFrame with a ``text`` column.
        tokenizer: tokenizer used both for tokenization and by the masking collator.

    Returns:
        A dataset dict with an unmasked ``train`` split and a pre-masked ``test`` split
        (columns ``input_ids``, ``attention_mask``, ``labels`` and, when the tokenizer
        produces it, ``token_type_ids``).
    """
    notes_df = notes_df[['text']].copy()
    notes_df = notes_df.reset_index(drop=True)

    task_dataset = Dataset.from_pandas(notes_df)

    def tokenize_function(examples):
        return tokenizer(examples['text'])

    tokenized_datasets = task_dataset.map(
        tokenize_function, batched=True, remove_columns=task_dataset.column_names
    )

    chunk_size = cfg.max_length

    def group_texts(examples):
        # Flatten every column of the batch into one long list ...
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # ... and keep only whole chunks (the remainder is discarded).
        total_length = (total_length // chunk_size) * chunk_size

        result = {
            k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
            for k, t in concatenated_examples.items()
        }

        result["labels"] = result["input_ids"].copy()
        return result

    lm_datasets = tokenized_datasets.map(group_texts, batched=True)

    test_pct = cfg.test_pct

    # Cap both splits: train at max_train_examples, test at the matching fraction of it.
    max_train_examples = cfg.max_train_examples
    max_test_examples = int(max_train_examples * test_pct)

    test_size = int(len(lm_datasets) * test_pct)
    train_size = len(lm_datasets) - test_size

    test_size = min(test_size, max_test_examples)
    train_size = min(train_size, max_train_examples)

    downsampled_dataset = lm_datasets.train_test_split(
        train_size=train_size, test_size=test_size, seed=cfg.seed
    )
    data_collator = CustomDataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=cfg.mask_probability
    )

    def insert_random_mask(batch):
        # Turn the column-oriented batch into a list of per-example dicts for the collator.
        features = [dict(zip(batch, t)) for t in zip(*batch.values())]
        masked_inputs = data_collator(features)
        return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

    masked_test = downsampled_dataset["test"].map(
        insert_random_mask,
        batched=True,
        remove_columns=downsampled_dataset["test"].column_names,
    )

    # Rename the masked columns back to the names the model expects. token_type_ids is
    # only present for tokenizers that emit it, so it is added to the mapping explicitly
    # rather than by catching whatever error a failed rename raises.
    column_renames = {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
    if "masked_token_type_ids" in masked_test.column_names:
        column_renames["masked_token_type_ids"] = "token_type_ids"

    downsampled_dataset["test"] = masked_test.rename_columns(column_renames)

    return downsampled_dataset

# main ----------------------------------------------------------------------------------#


def main(cfg):
    """
    Pre-train a small DeBERTa-v2 masked language model from scratch on train + test text.

    Pipeline: seed -> load train/test text -> train a BPE tokenizer on it -> build the
    chunked MLM dataset -> create a 6-layer DeBERTa-v2 MLM model -> train with an 8-bit
    Adam optimizer and a cosine schedule, evaluating and saving every
    ``cfg.eval_frequency`` optimizer updates and once more at the end.

    Args:
        cfg: OmegaConf config; the fields read here include ``seed``, ``model_dir``,
            ``train_data_path``, ``train_frac_for_mlm``, ``train_frac_for_tokenizer``,
            ``debug``, ``vocab_size``, ``max_length``, ``weight_decay``, ``lr``,
            ``mask_probability``, ``per_device_batch_size``, ``num_train_epochs``,
            ``gradient_accumulation_steps``, ``warmup_pct``, ``max_grad_norm`` and
            ``eval_frequency``.
    """
    accelerator = Accelerator(
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        mixed_precision='fp16',
    )

    def print_line():
        prefix, unit, suffix = "#", "~~", "#"
        accelerator.print(prefix + unit*50 + suffix)

    # set seed ----
    print_line()
    accelerator.print(f"setting seed: {cfg.seed}")
    set_seed(cfg.seed)

    if accelerator.is_main_process:
        os.makedirs(cfg.model_dir, exist_ok=True)
    print_line()

    # data ------------------------------------------------------------------------------#
    train_df = pd.read_csv(cfg.train_data_path).rename(columns={'full_text': 'text', 'essay_id_comp': 'id'})
    train_df = train_df[['id', 'text']].copy()
    
    # test_df = pd.read_csv(cfg.test_data_path)
    
    # the competition rerun exposes test_essays.csv, otherwise use test.csv
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        test_df = pd.read_csv("/kaggle/input/textdata/test_essays.csv", sep=',')
    else:
        test_df = pd.read_csv("/kaggle/input/textdata/test.csv", sep=',')
    
    
    test_df = test_df[['id', 'text']].copy()

    # no random_state is passed here; the draw comes after set_seed above
    train_df = train_df.sample(frac=cfg.train_frac_for_mlm).reset_index(drop=True)

    if cfg.debug:
        n_debug = min(1000, len(train_df))
        train_df = train_df.sample(n_debug, random_state=cfg.seed).reset_index(drop=True)

    notes_df = pd.concat([train_df, test_df, test_df], axis=0).reset_index(drop=True)  # oversample test 2x
    accelerator.print(f"shape of input text data: {notes_df.shape}")
    print_line()

    # tokenizer -------------------------------------------------------------------------#
    tok_train_df = train_df.sample(
        frac=cfg.train_frac_for_tokenizer, random_state=cfg.seed
    ).reset_index(drop=True)
    accelerator.print(f"Train tokenizer with {len(tok_train_df)} train df samples")

    # the tokenizer is fitted on the sampled train text plus all of the test text
    bpe_tok = BPETokenizer(cfg.vocab_size).train(
        pd.concat((tok_train_df, test_df)).reset_index(drop=True)
    )
    tokenizer = bpe_tok.get_fast_tokenizer(cfg.max_length)

    # dataset ---------------------------------------------------------------------------#
    with accelerator.main_process_first():
        mlm_dataset = get_mlm_dataset(cfg, notes_df, tokenizer)

    # model------------------------------------------------------------------------------#
    # randomly initialised 6-layer / 768-hidden DeBERTa-v2 configuration (no pretrained weights)
    base_config = DebertaV2Config(
        attention_probs_dropout_prob=0.0,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        hidden_size=768,
        initializer_range=0.02,
        intermediate_size=3072,
        max_position_embeddings=512,
        relative_attention=True,
        position_buckets=256,
        norm_rel_ebd="layer_norm",
        share_att_key=True,
        pos_att_type="p2c|c2p",
        layer_norm_eps=1e-7,
        max_relative_positions=-1,
        position_biased_input=False,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=0,
        vocab_size=tokenizer.vocab_size,
    )

    model = DebertaV2ForMaskedLM(base_config)
    # resize embeddings to the full tokenizer length, then rebuild the MLM head from the config
    # NOTE(review): this presumably relies on the resize updating base_config.vocab_size - confirm
    model.deberta.resize_token_embeddings(len(tokenizer))
    model.cls = DebertaV2OnlyMLMHead(base_config)

    # optimizer -------------------------------------------------------------------------#
    # weight decay applies to everything except LayerNorm parameters and biases
    decay_parameters = get_parameter_names(model, [nn.LayerNorm])
    decay_parameters = [name for name in decay_parameters if "bias" not in name]

    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if n in decay_parameters],
            "weight_decay": cfg.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if n not in decay_parameters],
            "weight_decay": 0.0,
        },
    ]

    accelerator.print("using bnb optimizer....")

    optimizer = bnb.optim.Adam8bit(
        optimizer_grouped_parameters, lr=cfg.lr,
    )

    # collator --------------------------------------------------------------------------#

    # copy kept only for its length, used below to trim the gathered eval losses
    eval_dataset = deepcopy(mlm_dataset['test'])

    data_collator = CustomDataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm_probability=cfg.mask_probability
    )

    batch_size = cfg.per_device_batch_size

    # training batches are masked on the fly by the custom collator
    train_dataloader = DataLoader(
        mlm_dataset["train"],
        shuffle=True,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

    # the test split was masked once in get_mlm_dataset, so the default collator is enough
    eval_dataloader = DataLoader(
        mlm_dataset["test"],
        batch_size=batch_size,
        collate_fn=default_data_collator,
    )

    # show training batch ---
    for batch in train_dataloader:
        break
    show_batch(batch, tokenizer, num_examples=4, print_fn=accelerator.print)

    accelerator.print(f"Train dataset size: {len(mlm_dataset['train'])}")
    accelerator.print(f"Test dataset size: {len(mlm_dataset['test'])}")
    print_line()

    # prepare ---------------------------------------------------------------------------#
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
        model, optimizer, train_dataloader, eval_dataloader
    )
    print_line()

    # scheduler -------------------------------------------------------------------------#
    print_line()
    num_epochs = cfg.num_train_epochs
    grad_accumulation_steps = cfg.gradient_accumulation_steps
    warmup_pct = cfg.warmup_pct

    # step counts are computed from the prepared dataloader's length
    num_update_steps_per_epoch = len(train_dataloader)//grad_accumulation_steps
    num_training_steps = num_epochs * num_update_steps_per_epoch
    num_warmup_steps = int(warmup_pct*num_training_steps)

    accelerator.print(f"# training updates per epoch: {num_update_steps_per_epoch}")
    accelerator.print(f"# training steps: {num_training_steps}")
    accelerator.print(f"# warmup steps: {num_warmup_steps}")

    # the scheduler is not passed through accelerator.prepare; it is stepped manually below
    scheduler = get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps
    )
    accelerator.wait_for_everyone()

    # training --------------------------------------------------------------------------#
    start_time = time.time()
    current_iteration = 0  # counts optimizer updates, not micro-batches

    for epoch in range(num_epochs):
        # close the previous epoch's bar (the last epoch's bar is never closed explicitly)
        if epoch != 0:
            progress_bar.close()

        progress_bar = tqdm(range(num_update_steps_per_epoch), disable=not accelerator.is_local_main_process)
        loss_meter = AverageMeter()
        model.train()

        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                accelerator.backward(loss)
                # clip / step / zero only on steps where gradients are synchronised
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)  # added gradient clip
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

                loss_meter.update(loss.item())
            # --
            if accelerator.sync_gradients:
                progress_bar.set_description(
                    f"STEP: {current_iteration+1:5}/{num_training_steps:5}. "
                    f"LR: {get_lr(optimizer):.4f}. "
                    f"Loss: {loss_meter.avg:.4f}. "
                )

                progress_bar.update(1)
                current_iteration += 1

            # Evaluation ---- runs every cfg.eval_frequency optimizer updates
            if (accelerator.sync_gradients) & (current_iteration % cfg.eval_frequency == 0):
                model.eval()
                losses = []

                # NOTE(review): these counters are accumulated locally and never gathered,
                # so the printed accuracy presumably covers only this process's eval batches
                n_correct = 0
                n_total = 0

                for _, batch in enumerate(eval_dataloader):
                    with torch.no_grad():
                        outputs = model(**batch)

                        # token-level accuracy over positions whose label is not -100
                        tok_preds = torch.max(outputs['logits'], dim=-1)[1]
                        curr = torch.masked_select(tok_preds == batch['labels'], batch['labels'] > -100).sum()
                        tot = torch.masked_select(tok_preds == batch['labels'], batch['labels'] > -100).size(0)
                        n_correct += curr
                        n_total += tot

                    loss = outputs.loss
                    # the batch loss is repeated batch_size times before gathering ...
                    losses.append(accelerator.gather(loss.repeat(batch_size)))

                # ... and the concatenation is cut back to the eval dataset length
                losses = torch.cat(losses)
                losses = losses[: len(eval_dataset)]

                try:
                    perplexity = math.exp(torch.mean(losses))
                except OverflowError:
                    perplexity = float("inf")

                accuracy = round((n_correct*100/n_total).item(), 2)
                et = as_minutes(time.time()-start_time)
                accelerator.print(
                    f">>> Epoch {epoch+1} | Total Step {current_iteration} | Time: {et}"
                )
                accelerator.print(f">>> Epoch {epoch+1}: Perplexity: {round(perplexity, 2)}")
                accelerator.print(f">>> Epoch {epoch+1}: Accuracy: {accuracy}")

                # Save and upload --- (checkpoint in cfg.model_dir is overwritten each time)
                accelerator.wait_for_everyone()
                unwrapped_model = accelerator.unwrap_model(model)
                unwrapped_model.save_pretrained(cfg.model_dir, save_function=accelerator.save)
                if accelerator.is_main_process:
                    tokenizer.save_pretrained(cfg.model_dir)
                torch.cuda.empty_cache()
                model.train()
                print_line()

    # --- save model at the end
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(cfg.model_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(cfg.model_dir)
    torch.cuda.empty_cache()
    model.eval()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_path', type=str, required=True)
    cli_args = parser.parse_args()

    cfg = OmegaConf.load(cli_args.config_path)

    # A competition rerun must never execute in debug mode, whatever the YAML says.
    running_as_rerun = os.getenv('KAGGLE_IS_COMPETITION_RERUN')
    if running_as_rerun:
        print("Setting DEBUG to False")
        cfg.debug = False

    # execution
    main(cfg)

Writing run_deberta_mlm.py


In [12]:
%%writefile run_train_infer_deberta.py

import sys
sys.path.insert(0, '/kaggle/input/omegaconf')

import argparse
import os
import random
from copy import deepcopy

import pandas as pd
import torch
from accelerate import Accelerator
from datasets import Dataset
from omegaconf import OmegaConf
from scipy.special import expit
from sklearn.metrics import roc_auc_score
from transformers import (AutoTokenizer, DataCollatorWithPadding,
                          DebertaV2ForSequenceClassification, Trainer,
                          TrainingArguments)


class AiDataset:
    """
    Builds tokenized `datasets.Dataset` objects from essay dataframes for the
    LLM Detect AI Generated Text competition.

    The dataframe passed to `get_dataset` must have a "text" column; the
    resulting dataset gains the tokenizer outputs plus an "input_length" column.
    """

    def __init__(self, tokenizer, max_length=1296):
        # tokenizer: callable tokenizer applied to batches of raw text
        # max_length: truncation limit (in tokens) applied during tokenization
        self.tokenizer = tokenizer
        self.max_length = max_length

    def tokenize_function(self, examples):
        """Tokenize a batch of examples (truncating, not padding) and return the tokenizer output."""
        tokenizer_options = dict(
            padding=False,
            truncation=True,
            max_length=self.max_length,
            add_special_tokens=True,
            return_token_type_ids=False,
        )
        return self.tokenizer(examples["text"], **tokenizer_options)

    def compute_input_length(self, examples):
        """Return the token count of every example in the batch under "input_length"."""
        lengths = []
        for token_ids in examples["input_ids"]:
            lengths.append(len(token_ids))
        return {"input_length": lengths}

    def get_dataset(self, df):
        """Convert `df` (copied first, so the caller's frame is untouched) into a tokenized dataset."""
        frame = deepcopy(df)
        dataset = Dataset.from_pandas(frame)

        # tokenization first: the length pass needs the "input_ids" column
        for batch_fn in (self.tokenize_function, self.compute_input_length):
            dataset = dataset.map(batch_fn, batched=True)
        return dataset


class BCETrainer(Trainer):
    """Trainer that optimises binary cross-entropy on a single-logit classifier.

    Labels may be hard (0/1) or soft (pseudo-label probabilities).
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute BCE-with-logits loss for one batch.

        Args:
            model: the model being trained/evaluated.
            inputs: batch dict; "labels" is popped before the forward pass so
                the model does not compute its own loss.
            return_outputs: when True, return `(loss, outputs)` instead of `loss`.
            **kwargs: extra keyword arguments some Trainer versions pass
                (e.g. `num_items_in_batch`); accepted and ignored.
        """
        labels = inputs.pop("labels")

        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = torch.nn.BCEWithLogitsLoss()
        # BCEWithLogitsLoss requires floating-point targets. Labels collated
        # from an integer column arrive as int64 and would raise a dtype error,
        # so both operands are cast to float32 explicitly.
        loss = loss_fct(logits.view(-1).float(), labels.view(-1).float())
        return (loss, outputs) if return_outputs else loss


def compute_roc_auc(eval_pred):
    """Metric callback: ROC-AUC of sigmoid(logits) against integer labels.

    `eval_pred` unpacks into `(logits, labels)`. When the labels contain a
    single class ROC-AUC is undefined, so 0.0 is reported instead.
    """
    raw_logits, targets = eval_pred
    targets = targets.astype(int)

    # zero spread in the labels <=> only one class present in the dataset
    label_spread = targets.std()
    if label_spread < 1E-8:
        return {"roc_auc": 0.0}

    probabilities = expit(raw_logits).reshape(-1)
    score = roc_auc_score(targets, probabilities)
    return {"roc_auc": score}


def main(cfg, save_dir, model_id):
    """Fine-tune DeBERTa on labelled plus pseudo-labelled essays and score the test set.

    Steps: load the labelled essays, carve out a small validation split, attach
    pseudo-labels (mean score of the files in `cfg.pl_score_paths`) to the test
    essays and add them to the training data, train with `BCETrainer`, then
    write sigmoid probabilities for the test essays to
    `{save_dir}/{model_id}.parquet`.

    Args:
        cfg: config object; reads `seed`, `train_data_path`, `train_frac`,
            `pl_score_paths`, `debug`, `model_dir`, `max_length_train` and
            `max_length_infer`.
        save_dir: directory receiving the prediction file.
        model_id: file stem of the prediction file.
    """
    accelerator = Accelerator()

    # data ----
    rng = random.Random(cfg.seed)

    try:
        essay_df = pd.read_csv(cfg.train_data_path)
    except Exception:
        # the path could not be read as CSV: fall back to parquet
        essay_df = pd.read_parquet(cfg.train_data_path)

    essay_df = essay_df.rename(columns={"generated": "label"})
    print(essay_df.shape)

    # Every sampling step is seeded: this script runs once per process under
    # `accelerate launch`, and all processes must build identical datasets.
    N = int(cfg.train_frac * len(essay_df))
    essay_df = essay_df.sample(N, random_state=cfg.seed).reset_index(drop=True)

    essay_df['fold'] = essay_df['text'].apply(
        lambda x: 'train' if rng.random() < 0.99 else 'valid'
    )

    train_df = essay_df[essay_df['fold'] == 'train'].copy()
    valid_df = essay_df[essay_df['fold'] == 'valid'].copy()
    valid_df = valid_df.sample(min(1000, len(valid_df)), random_state=cfg.seed).reset_index(drop=True)

    train_df = train_df.reset_index(drop=True)
    valid_df = valid_df.reset_index(drop=True)

    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        test_df = pd.read_csv("/kaggle/input/textdata/test_essays.csv", sep=',')
    else:
        test_df = pd.read_csv("/kaggle/input/textdata/test.csv", sep=',')

    ##########################################################################################
    # PL ---
    accelerator.print("##" + "~~"*40 + "##")
    pl_df = deepcopy(test_df)

    score_dfs = []

    for sp in cfg.pl_score_paths:
        score_dfs.append(pd.read_parquet(sp).rename(columns={"generated": "label"}))
    scores_df = pd.concat(score_dfs).reset_index(drop=True)
    # pseudo-label = mean score per essay across all score files
    scores_df = scores_df.groupby("id")["label"].mean().reset_index()

    accelerator.print(scores_df.head())
    #-----

    pl_df = pd.merge(pl_df, scores_df, on='id', how='inner')
    accelerator.print(f"Shape of PL: {pl_df.shape}")

    accelerator.print(f"Shape of train: {train_df.shape}")
    train_df = train_df[['id', 'text', 'label']].copy()
    pl_df = pl_df[['id', 'text', 'label']].copy()
    train_df = pd.concat([train_df, pl_df, pl_df, pl_df]).reset_index(drop=True)  # 3x PL oversample
    accelerator.print(f"Shape of train after merging PL: {train_df.shape}")
    accelerator.print("##" + "~~"*40 + "##")
    ##########################################################################################

    if cfg.debug:
        # cap the sample sizes so small inputs do not make `sample` raise
        train_df = train_df.sample(min(1000, len(train_df)), random_state=cfg.seed).reset_index(drop=True)
        valid_df = valid_df.sample(min(128, len(valid_df)), random_state=cfg.seed).reset_index(drop=True)

    accelerator.print("##" + "~~"*40 + "##")
    accelerator.print(f"Train df shape: {train_df.shape}")
    accelerator.print(f"Valid df shape: {valid_df.shape}")
    accelerator.print(f"Test df shape: {test_df.shape}")
    accelerator.print("##" + "~~"*40 + "##")

    # tokenizer ---

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_dir)

    # dataset ---
    with accelerator.main_process_first():
        dataset_creator_train = AiDataset(tokenizer, cfg.max_length_train)
        dataset_creator_infer = AiDataset(tokenizer, cfg.max_length_infer)

        train_ds = dataset_creator_train.get_dataset(train_df)
        valid_ds = dataset_creator_train.get_dataset(valid_df)
        test_ds = dataset_creator_infer.get_dataset(test_df)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

    for idx in range(4):
        accelerator.print(f"\n--- Sample {idx} ---\n")
        accelerator.print(repr(tokenizer.decode(train_ds[idx]["input_ids"])))
        accelerator.print("##" + "~~"*40 + "##")

    model = DebertaV2ForSequenceClassification.from_pretrained(cfg.model_dir, num_labels=1)

    # training args ---
    training_args = TrainingArguments(
        output_dir=cfg.model_dir,
        num_train_epochs=1,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        learning_rate=5e-5,
        # 0.05 is a fraction of the training steps, so it belongs to
        # `warmup_ratio`; `warmup_steps` expects an absolute step count.
        warmup_ratio=0.05,
        weight_decay=0.01,
        logging_dir="logs",
        logging_steps=50,
        report_to="none",
        evaluation_strategy="steps",
        eval_steps=500, # 250
        metric_for_best_model="roc_auc",
        greater_is_better=True,
        max_grad_norm=1.0,
        optim="adamw_bnb_8bit"
    )

    trainer = BCETrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=valid_ds,
        compute_metrics=compute_roc_auc,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()

    # prediction ---
    pred_output = trainer.predict(test_ds)
    logits = pred_output.predictions.astype(float)
    probs = expit(logits).reshape(-1)

    sub_df = pd.DataFrame({
        "id": test_ds['id'],
        "generated": probs
    })

    # only the main process writes the prediction file
    if accelerator.is_main_process:
        save_path = os.path.join(save_dir, f"{model_id}.parquet")
        sub_df.to_parquet(save_path)
        accelerator.print("done!")
        accelerator.print("~~"*40)


if __name__ == "__main__":
    # Command-line entry point: three required string options.
    parser = argparse.ArgumentParser()
    for option in ('--config_path', '--save_dir', '--model_id'):
        parser.add_argument(option, type=str, required=True)

    args = parser.parse_args()
    cfg = OmegaConf.load(args.config_path)

    # A competition rerun always executes the full (non-debug) pipeline.
    rerun_flag = os.getenv('KAGGLE_IS_COMPETITION_RERUN')
    if rerun_flag:
        print("Setting DEBUG to False")
        cfg.debug = False

    # make sure both the output and the model directory exist
    for directory in (args.save_dir, cfg.model_dir):
        os.makedirs(directory, exist_ok=True)

    # execution
    main(cfg, save_dir=args.save_dir, model_id=args.model_id)


Writing run_train_infer_deberta.py


In [13]:
%%writefile ./configs/conf_deberta_mlm.yaml

seed: 42
debug: true
    
train_data_path: /kaggle/input/persaude-corpus-2/persuade_2.0_human_scores_demo_id_github.csv

max_length: 1024
vocab_size: 4096
train_frac_for_tokenizer: 0.5
train_frac_for_mlm: 0.8
mask_probability: 0.20

lr: 4e-5
per_device_batch_size: 4
gradient_accumulation_steps: 4
weight_decay: 0.01

num_train_epochs: 4
warmup_pct: 0.05
test_pct: 0.005
max_train_examples: 100_000
eval_frequency: 100 # 1024
max_grad_norm: 1.0

model_dir: "./models/deberta_v3_small_persuade"


Writing ./configs/conf_deberta_mlm.yaml


In [14]:
%%writefile ./configs/conf_deberta.yaml

seed: 42
debug: true
train_data_path: /kaggle/input/traindeberta/train_essays.csv
    
train_frac: 0.5
max_length_train: 512
max_length_infer: 1296
    
pl_score_paths:
    - ./outputs/m0.parquet
    - ./outputs/m1.parquet
model_dir: "./models/deberta_v3_small_persuade"

Writing ./configs/conf_deberta.yaml


In [15]:
%%time
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_deberta_mlm.py \
--config_path "./configs/conf_deberta_mlm.yaml"

2024-04-11 10:31:32.707749: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:31:32.707751: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:31:32.707808: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:31:32.707817: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:31:32.709899: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory

In [16]:
%%time
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_train_infer_deberta.py \
--config_path "./configs/conf_deberta.yaml" \
--save_dir "./outputs" \
--model_id "m4"

2024-04-11 10:53:52.742616: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:53:52.742683: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 10:53:52.744201: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-11 10:53:52.759239: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 10:53:52.759286: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factor

In [17]:
%%writefile run_ahmet_approach.py

import argparse
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn


class TextMatcher:
    """Top-k cosine matching of texts against a ground-truth set.

    Texts are vectorised as binary word uni/bi-gram vectors (sentence
    punctuation counts as tokens). The vectoriser is fitted on the texts being
    matched, then applied to the ground truth.
    """

    def __init__(self, ground_truth, col, topk=5, lower_bound=-1):
        """
        Args:
            ground_truth: dataframe holding the reference texts in column `col`.
            col: name of the text column (in both ground truth and queries).
            topk: number of best matches kept per query text.
            lower_bound: similarity threshold forwarded to `awesome_cossim_topn`.
        """
        self.ground_truth = ground_truth
        self.vec = TfidfVectorizer(ngram_range=(1, 2), analyzer="word", token_pattern=r"(?u)(\b\w\w+\b|[\.,!])",
                                   use_idf=False, min_df=2, binary=True)
        self.topk = topk
        self.lower_bound = lower_bound
        self.col = col

    def get_matches_df(self, sparse_matrix, texts):
        """Expand a sparse (query x ground-truth) score matrix into a long dataframe.

        Args:
            sparse_matrix: scipy sparse matrix of match scores.
            texts: Series of query texts, named `self.col`.

        Returns:
            One row per (query text, matched ground-truth text) with columns
            `self.col`, 'Ground Truth' and 'match_score'. Query texts without
            any match are kept with NaN in the other columns (left merge).
        """
        # Take row, column and value from the same COO view so they are always
        # aligned. Indexing `sparse_matrix.data` with positions obtained from
        # `nonzero()` misaligns scores as soon as the matrix stores an explicit
        # zero, because `nonzero()` drops such entries.
        coo = sparse_matrix.tocoo()
        stored = coo.data != 0  # same filter `nonzero()` applies

        text_indices = coo.row[stored]
        gt_indices = coo.col[stored]

        left_side = texts.values[text_indices]
        right_side = self.ground_truth[self.col].values[gt_indices]
        match_score = coo.data[stored].astype(float)

        res_df = pd.DataFrame({self.col: left_side,
                               'Ground Truth': right_side,
                               'match_score': match_score})

        res_df = pd.DataFrame(texts).merge(res_df, on=self.col, how="left")
        return res_df


    def match(self, texts_to_match, n_threads=16):
        """Match every text in `texts_to_match[self.col]` against the ground truth.

        Args:
            texts_to_match: dataframe with the query texts in column `self.col`.
            n_threads: worker count forwarded to `awesome_cossim_topn`.

        Returns:
            The long match dataframe produced by `get_matches_df`.
        """
        print(f"Matching {texts_to_match.shape[0]} texts to {self.ground_truth.shape[0]} texts...")

        # vocabulary comes from the query texts; ground truth is projected onto it
        X = self.vec.fit_transform(texts_to_match[self.col])
        X_gt = self.vec.transform(self.ground_truth[self.col])

        sparse_sim = awesome_cossim_topn(X, X_gt.T, self.topk, self.lower_bound, use_threads=True, n_jobs=n_threads)

        return self.get_matches_df(sparse_sim, texts_to_match[self.col])

#---------------------------------------------------------------------------------------------------------#
# DATA
#---------------------------------------------------------------------------------------------------------#

if __name__ == "__main__":
    # Weakly-supervised retrieval scoring: per prompt, use an existing model's
    # scores to pick "likely student" and "likely LLM" essays, then score every
    # essay by how closely it matches each group.
    ap = argparse.ArgumentParser()
    ap.add_argument('--score_path', type=str, required=True)
    ap.add_argument('--save_dir', type=str, required=True)
    ap.add_argument('--model_id', type=str, required=True)

    args = ap.parse_args()
    os.makedirs(args.save_dir, exist_ok=True)

    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        df = pd.read_csv("/kaggle/input/textdata/test_essays.csv", sep=',')
    else:
        df = pd.read_csv("/kaggle/input/textdata/test.csv", sep=',')

    HUMAN_TH = 0.1   # score at or below which an essay counts as likely student-written
    LLM_TH = 0.99    # score at or above which an essay counts as likely LLM-written
    MIN_N = min(64, len(df))  # minimum size of each pseudo-labelled group per prompt

    # Weak Supervision ---
    scores_df = pd.read_parquet(args.score_path)
    df = pd.merge(df, scores_df, on='id', how='inner')

    # Collected after the merge: a prompt whose essays all lack a score would
    # otherwise yield an empty frame in the matching stage.
    all_prompts = df["prompt_id"].unique()

    df_list = []
    for pid in all_prompts:
        cdf = df[df['prompt_id']==pid].copy()
        cdf = cdf.sort_values(by='generated').reset_index(drop=True)

        cdf["likely_student"] = cdf["generated"] <= HUMAN_TH
        cdf["likely_llm"] = cdf["generated"] >= LLM_TH

        if cdf["likely_student"].sum() < MIN_N:
            # `.loc` slices are label-inclusive, so stop at MIN_N - 1 to flag
            # exactly the MIN_N lowest-scored essays
            cdf.loc[:MIN_N - 1, "likely_student"] = True

        if cdf["likely_llm"].sum() < MIN_N:
            # the MIN_N highest-scored essays
            cdf.loc[cdf.shape[0] - MIN_N:, "likely_llm"] = True
        print(cdf.head())
        print(cdf.tail())
        print("=="*40)
        df_list.append(cdf)

    df = pd.concat(df_list).reset_index(drop=True)
    df = df.drop(columns=['generated'])

    #---------------------------------------------------------------------------------------------------------#
    # MATCHING
    #---------------------------------------------------------------------------------------------------------#

    TOPK = min(64, len(df))

    def agg_fn(scores, margin=0.5):
        """Mean of the scores that are at least (1 - margin) * max(scores)."""
        max_score = max(scores)
        th = (1.0 - margin) * max_score
        kept_scores = [s for s in scores if s >= th]
        ret = np.mean(kept_scores)
        return ret

    def get_match_score(df, gt_filter_col):
        """Match all rows of `df` against its rows flagged by `gt_filter_col`; one aggregated score per text."""
        tm = TextMatcher(df[df[gt_filter_col]].reset_index(drop=True), "text", topk=TOPK)
        res_df = tm.match(df, n_threads=4)
        df = res_df.groupby("text")["match_score"].agg(agg_fn).reset_index().merge(df, on="text")
        return df


    sub_dfs = [get_match_score(df[df["prompt_id"] == pid], "likely_student").reset_index(drop=True)[["id", "match_score"]]
               for pid in all_prompts] # TODO: may cause exception?
    sub_df = pd.concat(sub_dfs).rename(columns={"match_score": "match_score_student"})


    sub_dfs = [get_match_score(df[df["prompt_id"] == pid], "likely_llm").reset_index(drop=True)[["id", "match_score"]]
               for pid in all_prompts]
    sub_df2 = pd.concat(sub_dfs).rename(columns={"match_score": "match_score_llm"})

    sub_df = sub_df.merge(sub_df2, on="id")

    SMOOTH = 0.15  # keeps the denominator away from zero

    # higher (less negative) value <=> closer to the LLM group than to the student group
    sub_df["generated"] = -sub_df["match_score_student"] / (sub_df["match_score_llm"] + SMOOTH)
    sub_df = sub_df[["id", "generated"]].copy()

    # os.path.join keeps absolute save directories intact ("./" + "/abs" would not)
    sub_df.to_parquet(os.path.join(args.save_dir, f"{args.model_id}.parquet"))

Writing run_ahmet_approach.py


In [18]:
%%writefile run_ahmet_approach_with_train_leverage.py

import argparse
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sparse_dot_topn import awesome_cossim_topn
from copy import deepcopy

class TextMatcher:
    """Top-k cosine matching of texts against a ground-truth set.

    Texts are vectorised as binary word uni/bi-gram vectors (sentence
    punctuation counts as tokens). The vectoriser is fitted on the texts being
    matched, then applied to the ground truth.
    """

    def __init__(self, ground_truth, col, topk=5, lower_bound=-1):
        """
        Args:
            ground_truth: dataframe holding the reference texts in column `col`.
            col: name of the text column (in both ground truth and queries).
            topk: number of best matches kept per query text.
            lower_bound: similarity threshold forwarded to `awesome_cossim_topn`.
        """
        self.ground_truth = ground_truth
        self.vec = TfidfVectorizer(ngram_range=(1, 2), analyzer="word", token_pattern=r"(?u)(\b\w\w+\b|[\.,!])",
                                   use_idf=False, min_df=2, binary=True)
        self.topk = topk
        self.lower_bound = lower_bound
        self.col = col

    def get_matches_df(self, sparse_matrix, texts):
        """Expand a sparse (query x ground-truth) score matrix into a long dataframe.

        Args:
            sparse_matrix: scipy sparse matrix of match scores.
            texts: Series of query texts, named `self.col`.

        Returns:
            One row per (query text, matched ground-truth text) with columns
            `self.col`, 'Ground Truth' and 'match_score'. Query texts without
            any match are kept with NaN in the other columns (left merge).
        """
        # Take row, column and value from the same COO view so they are always
        # aligned. Indexing `sparse_matrix.data` with positions obtained from
        # `nonzero()` misaligns scores as soon as the matrix stores an explicit
        # zero, because `nonzero()` drops such entries.
        coo = sparse_matrix.tocoo()
        stored = coo.data != 0  # same filter `nonzero()` applies

        text_indices = coo.row[stored]
        gt_indices = coo.col[stored]

        left_side = texts.values[text_indices]
        right_side = self.ground_truth[self.col].values[gt_indices]
        match_score = coo.data[stored].astype(float)

        res_df = pd.DataFrame({self.col: left_side,
                               'Ground Truth': right_side,
                               'match_score': match_score})

        res_df = pd.DataFrame(texts).merge(res_df, on=self.col, how="left")
        return res_df


    def match(self, texts_to_match, n_threads=16):
        """Match every text in `texts_to_match[self.col]` against the ground truth.

        Args:
            texts_to_match: dataframe with the query texts in column `self.col`.
            n_threads: worker count forwarded to `awesome_cossim_topn`.

        Returns:
            The long match dataframe produced by `get_matches_df`.
        """
        print(f"Matching {texts_to_match.shape[0]} texts to {self.ground_truth.shape[0]} texts...")

        # vocabulary comes from the query texts; ground truth is projected onto it
        X = self.vec.fit_transform(texts_to_match[self.col])
        X_gt = self.vec.transform(self.ground_truth[self.col])

        sparse_sim = awesome_cossim_topn(X, X_gt.T, self.topk, self.lower_bound, use_threads=True, n_jobs=n_threads)

        return self.get_matches_df(sparse_sim, texts_to_match[self.col])

#---------------------------------------------------------------------------------------------------------#
# DATA
#---------------------------------------------------------------------------------------------------------#

if __name__ == "__main__":
    # Retrieval scoring that leverages labelled training essays: each test
    # essay is matched, within its prompt, against train essays that are
    # confidently human-written and against those that are confidently
    # LLM-written.
    ap = argparse.ArgumentParser()
    ap.add_argument('--save_dir', type=str, required=True)
    ap.add_argument('--model_id', type=str, required=True)

    args = ap.parse_args()
    os.makedirs(args.save_dir, exist_ok=True)

    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        test_df = pd.read_csv("/kaggle/input/textdata/test_essays.csv", sep=',')
    else:
        test_df = pd.read_csv("/kaggle/input/textdata/test.csv", sep=',')

    # add in prompt name ---
    gdf = test_df.groupby("prompt_id")["id"].agg(len).reset_index().rename(columns={"id": "count"})
    gdf = gdf.sort_values(by='count')

    prompt_order = [
        'Facial action coding system',
        'Exploring Venus',
        'A Cowboy Who Rode the Waves',
        'The Face on Mars',
        'Driverless cars',
    ]

    # Prompt ids are paired with names by ascending essay count.
    # NOTE(review): this presumes the count ranking of the test prompts follows
    # `prompt_order` and that there are at most len(prompt_order) prompts — confirm.
    gdf['prompt_name'] = prompt_order[:len(gdf)]
    prompt_id2prompt_name = dict(zip(gdf['prompt_id'], gdf['prompt_name']))
    test_df['prompt_name'] = test_df['prompt_id'].map(prompt_id2prompt_name)
    print(test_df.head())

    #-----------------------

    all_prompt_names = test_df["prompt_name"].unique().tolist()
    # Only prompts that actually occur in the test set are scored (kept in
    # `prompt_order` order); an absent prompt would hand an empty frame to the
    # vectoriser.
    prompts_to_score = [pname for pname in prompt_order if pname in all_prompt_names]

    # prepare labels --
    train_df = pd.read_parquet("/kaggle/input/d402-prepare-train-for-retrieval/train_for_retrieval.parquet")
    train_df = train_df.drop_duplicates(subset=['text']).reset_index(drop=True)

    # keep only confidently labelled train essays on either side
    train_df['likely_student'] = train_df['generated'] <= 0.001
    train_df['likely_llm'] = train_df['generated'] >= 0.999

    train_df = train_df.drop(columns=['generated'])

    #---------------------------------------------------------------------------------------------------------#
    # MATCHING
    #---------------------------------------------------------------------------------------------------------#


    TOPK = min(64, len(test_df))

    def agg_fn(scores, margin=0.5):
        """Mean of the scores that are at least (1 - margin) * max(scores)."""
        max_score = max(scores)
        th = (1.0 - margin) * max_score
        kept_scores = [s for s in scores if s >= th]
        ret = np.mean(kept_scores)
        return ret

    def get_match_score(true_df, infer_df):
        """Match `infer_df` texts against `true_df` texts; one aggregated score per text."""
        true_df = deepcopy(true_df)
        infer_df = deepcopy(infer_df)

        true_df = true_df.reset_index(drop=True)
        tm = TextMatcher(true_df, "text", topk=TOPK)

        res_df = tm.match(infer_df, n_threads=4)
        df = res_df.groupby("text")["match_score"].agg(agg_fn).reset_index().merge(infer_df, on="text")
        return df

    def match_against_train(label_col):
        """Per prompt, match the test essays against train essays flagged by `label_col`."""
        per_prompt = []
        for pname in prompts_to_score:
            pdf = train_df[train_df['prompt_name'] == pname].copy()
            true_df = pdf[pdf[label_col]].copy()

            infer_df = test_df[test_df['prompt_name'] == pname].copy()
            infer_df = infer_df.reset_index(drop=True)

            per_prompt.append(get_match_score(true_df, infer_df))
        return pd.concat(per_prompt)


    # human match --
    sub_df_human = match_against_train('likely_student').rename(columns={"match_score": "match_score_student"})

    # LLM match --
    sub_df_llm = match_against_train('likely_llm').rename(columns={"match_score": "match_score_llm"})
    #----

    sub_df = pd.merge(sub_df_human, sub_df_llm, on="id")
    print(sub_df.head())

    SMOOTH = 0.15  # keeps the denominator away from zero

    # higher (less negative) value <=> closer to LLM train essays than to student ones
    sub_df["generated"] = -sub_df["match_score_student"] / (sub_df["match_score_llm"] + SMOOTH)
    sub_df = sub_df[["id", "generated"]].copy()

    # os.path.join keeps absolute save directories intact ("./" + "/abs" would not)
    sub_df.to_parquet(os.path.join(args.save_dir, f"{args.model_id}.parquet"))

Writing run_ahmet_approach_with_train_leverage.py


In [19]:
!python run_ahmet_approach.py \
--score_path ./outputs/m0.parquet \
--save_dir "./outputs" \
--model_id "m5"

           id  prompt_id  ... likely_student  likely_llm
0  e_lvrfnxan          5  ...           True        True
1  e_9lzxlyp0          5  ...           True        True
2  e_knxffsz4          5  ...           True        True
3  e_1ituvpmv          5  ...           True        True
4  e_cv19bwix          5  ...           True        True

[5 rows x 6 columns]
           id  prompt_id  ... likely_student  likely_llm
5  e_yshmqlkt          5  ...           True        True
6  e_g0adeocf          5  ...           True        True
7  e_c08bnr4e          5  ...           True        True
8  e_4kog18mm          5  ...           True        True
9  e_u2pku451          5  ...           True        True

[5 rows x 6 columns]
           id  prompt_id  ... likely_student  likely_llm
0  e_sbwqyx5l          6  ...           True        True
1  e_371zu85b          6  ...           True        True
2  e_oy0mocld          6  ...           True        True
3  e_y8cupfs6          6  ...           True

In [20]:
!python run_ahmet_approach_with_train_leverage.py \
--save_dir "./outputs" \
--model_id "m6"

           id  ...                  prompt_name
0  e_4kog18mm  ...  Facial action coding system
1  e_9lzxlyp0  ...  Facial action coding system
2  e_c08bnr4e  ...  Facial action coding system
3  e_lvrfnxan  ...  Facial action coding system
4  e_1ituvpmv  ...  Facial action coding system

[5 rows x 4 columns]
Matching 10 texts to 2656 texts...
Matching 20 texts to 2322 texts...
Matching 30 texts to 1972 texts...
Matching 40 texts to 2132 texts...
Matching 50 texts to 2309 texts...
Matching 10 texts to 10621 texts...
Matching 20 texts to 10619 texts...
Matching 30 texts to 9112 texts...
Matching 40 texts to 9847 texts...
Matching 50 texts to 10491 texts...
                                              text_x  ...                prompt_name_y
0  #Technology Has Changed Over Past Decade Essay...  ...  Facial action coding system
1  5 reasons the technology could be valuable are...  ...  Facial action coding system
2  Computers don t need to know if someone is fee...  ...  Facial action cod

In [26]:
%%writefile run_deb_ranking_inference_rb.py

import sys

sys.path.insert(0, '/kaggle/input/omegaconf')
sys.path.insert(0, '/kaggle/input/utils-ai-v10')

import argparse
import os
import gc

import pandas as pd
import torch
from accelerate import Accelerator
from omegaconf import OmegaConf
from peft import PeftModel

from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig

from r_ranking.ai_dataset import AiDataset
from r_ranking.ai_loader import AiCollator, show_batch
from r_ranking.ai_model import AiModel

# Characters deleted from every essay before tokenization.
char_to_remove = ['{', '£', '\x97', '¹', 'å', '\\', '\x85', '<', '\x99',
                  'é', ']', '+', 'Ö', '\xa0', '>', '|', '\x80', '~', '©',
                  '/', '\x93', '$', 'Ó', '²', '^', ';', '`', 'á', '*', '(',
                  '¶', '®', '[', '\x94', '\x91', '#', '-', 'ó', ')', '}', '=']

def preprocess_text(text):
    """Normalise an essay before inference.

    Drops non-ASCII characters, trims surrounding whitespace and double
    quotes, deletes every character in `char_to_remove`, and cuts the text
    back to its last full stop so that it always ends with ".".

    Args:
        text: raw essay text.

    Returns:
        The cleaned text. A text containing no "." at all (including an empty
        one) collapses to ".".
    """
    text = text.encode("ascii", "ignore").decode('ascii')
    text = text.strip()
    text = text.strip("\"")

    for c in char_to_remove:
        text = text.replace(c, "")

    # `endswith` (rather than `text[-1]`) also copes with text that became
    # empty after cleaning, which used to raise IndexError.
    if not text.endswith("."):
        # drop the trailing, unterminated sentence
        text = text.split(".")
        text = ".".join(text[:-1])
        text += "."
    return text


def run_inference(accelerator, model, infer_dl, example_ids):
    """Score every batch of `infer_dl` and return a dataframe of probabilities.

    Args:
        accelerator: accelerate `Accelerator`, used to gather predictions and
            to decide which process shows the progress bar.
        model: model whose call returns `(logits, <ignored>)`.
        infer_dl: inference dataloader.
        example_ids: ids stored in the "id" column, in prediction order.

    Returns:
        DataFrame with columns "id" and "generated" (sigmoid of the logits).
    """
    model.eval()
    gathered_probs = []

    n_batches = len(infer_dl)
    progress_bar = tqdm(range(n_batches), disable=not accelerator.is_local_main_process)

    for batch in infer_dl:
        with torch.no_grad():
            batch_logits, _ = model(**batch)

        # flatten to one score per example, then collect from all processes
        batch_probs = torch.sigmoid(batch_logits.reshape(-1))
        batch_probs = accelerator.gather_for_metrics(batch_probs)
        gathered_probs += batch_probs.cpu().numpy().tolist()

        progress_bar.update(1)
    progress_bar.close()

    result_df = pd.DataFrame()
    result_df["id"] = example_ids
    result_df["generated"] = gathered_probs

    return result_df

def main(cfg, save_dir, model_id):
    """Run the ranking model over the test essays and save its scores.

    Reads the test CSV, cleans the texts with `preprocess_text`, builds a
    length-sorted inference dataloader, restores the checkpoint named by
    `cfg.predict_params.checkpoint_path`, and writes the predictions to
    `{save_dir}/{model_id}.parquet` from the main process.

    Args:
        cfg: config object passed to `AiDataset` and `AiModel`; this function
            itself reads `cfg.predict_params`.
        save_dir: directory receiving the prediction file.
        model_id: file stem of the prediction file.
    """

    # create accelerator
    accelerator = Accelerator()

    # read test data
    if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        test_df = pd.read_csv("/kaggle/input/textdata/test_essays.csv", sep=',')
    else:
        test_df = pd.read_csv("/kaggle/input/textdata/test.csv", sep=',')

    test_df['text'] = test_df['text'].apply(preprocess_text)
    accelerator.print(f'Test csv shape: {test_df.shape}')
    test_df['generated'] = 1 # TODO: NEEDED NOW, FIx it

    with accelerator.main_process_first():
        dataset_creator = AiDataset(cfg)
        infer_ds = dataset_creator.get_dataset(test_df)

    tokenizer = dataset_creator.tokenizer

    # sorting by length groups similarly sized essays into the same batch
    infer_ds = infer_ds.sort("input_length")
    infer_ds.set_format(
        type=None,
        columns=[
            'id',
            'input_ids',
            'attention_mask',
            'generated'
        ]
    )

    # ids in the (sorted) order in which predictions are produced
    infer_ids = infer_ds["id"]  # .tolist()

    #--
    data_collator = AiCollator(
        tokenizer=tokenizer,
        pad_to_multiple_of=64
    )

    infer_dl = DataLoader(
        infer_ds,
        batch_size=cfg.predict_params.per_device_eval_batch_size,
        shuffle=False,
        collate_fn=data_collator,
    )

    accelerator.print("data preparation done...")
    accelerator.print("~~"*40)
    accelerator.wait_for_everyone()


    #----------
    # preview the first batch
    b = next(iter(infer_dl))
    show_batch(b, tokenizer, task='infer', print_fn=accelerator.print)
    accelerator.print("~~"*40)
    #----------
    # model -----------------------------------------------------------------------------#
    model = AiModel(cfg, accelerator.device)

    checkpoint_path = cfg.predict_params.checkpoint_path
    accelerator.print("=="*50)
    accelerator.print(f"loading model from checkpoint: {checkpoint_path}")

    # Load onto the CPU first: without `map_location` every process restores
    # the tensors onto the device they were saved from. `load_state_dict`
    # then copies the weights into the model's own parameters.
    ckpt = torch.load(checkpoint_path, map_location="cpu")
    model.load_state_dict(ckpt['state_dict'])
    del ckpt
    gc.collect()
    print("loaded!")
    accelerator.print("### Loaded Model Weights ###")

    model, infer_dl = accelerator.prepare(model, infer_dl)

    # run inference ---
    sub_df = run_inference(accelerator, model, infer_dl, infer_ids)

    accelerator.wait_for_everyone()

    # only the main process writes the prediction file
    if accelerator.is_main_process:
        save_path = os.path.join(save_dir, f"{model_id}.parquet")
        sub_df.to_parquet(save_path)
        accelerator.print("done!")
        accelerator.print("~~"*40)
    
if __name__ == "__main__":
    # Command-line entry point: three required string options.
    parser = argparse.ArgumentParser()
    for option in ('--config_path', '--save_dir', '--model_id'):
        parser.add_argument(option, type=str, required=True)

    args = parser.parse_args()
    cfg = OmegaConf.load(args.config_path)

    # the output directory must exist before predictions are written
    os.makedirs(args.save_dir, exist_ok=True)

    # execution
    main(cfg, save_dir=args.save_dir, model_id=args.model_id)


Overwriting run_deb_ranking_inference_rb.py


In [27]:
%%writefile ./configs/conf_deb_ranking_rb.yaml

model:
    backbone_path: /kaggle/input/deberta-v3-large
    max_length: 1296 # 128
    dropout_rate: 0.0
    gradient_checkpointing: true

predict_params:
    checkpoint_path: /kaggle/input/deberta-v3-large-ranking/detect_ai_model_last.pth.tar
    per_device_eval_batch_size: 1

Overwriting ./configs/conf_deb_ranking_rb.yaml


In [28]:
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_deb_ranking_inference_rb.py \
--config_path "./configs/conf_deb_ranking_rb.yaml" \
--save_dir "./outputs" \
--model_id "m7"


2024-04-11 11:04:04.498370: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 11:04:04.498385: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-11 11:04:04.498426: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 11:04:04.498437: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-11 11:04:04.500073: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory

In [29]:
# Ghostbuster-style pipeline: work in a scratch directory, run the helper
# scripts copied from an input dataset, then move the resulting CSV into
# ./outputs and return to /kaggle/working.
# NOTE(review): the recorded output of this cell reports the `%cp` source path
# below as missing ("ghosbuster" may be a typo for "ghostbuster") — confirm the
# dataset slug.
%mkdir /kaggle/working/ghostbuster-temp
%cd /kaggle/working/ghostbuster-temp
%cp /kaggle/input/ghosbuster-scripts-v1/*.py .

# Swap pandas for the 1.5.3 wheel shipped in an input dataset.
# NOTE(review): the recorded output shows this wheel path missing, which leaves
# pandas uninstalled after this step — confirm the dataset is attached.
!pip uninstall -y pandas
!pip install /kaggle/input/pip-install-many-whl/pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

# Reinstall `datasets` offline from a local wheel directory.
!pip uninstall -y datasets
!pip install datasets --no-index --find-links=file:///kaggle/input/hf-ds -U -q

# DEBUG toggles the validation directory and, on a competition rerun, which CSV is read.
DEBUG = 0

MODEL_DIR = "/kaggle/input/gb-overfit-rfc-svc"
# "/kaggle/input/custom-gb-model-v1"
#"/kaggle/input/ghostb-100k-m20"
validation_file_dir = MODEL_DIR if DEBUG else "none" 

llama_7b_path = "/kaggle/input/llama-2/pytorch/7b-hf/1"
tinyllama_path = "/kaggle/input/finetuned-tiny-llama"
#"/kaggle/input/tinyllama-tinyllama-1-1b-chat-v1-0"

# directory handed to the scripts below via --output_dir / --path_to_text_files / --text_dir
text_files_dir = "/tmp/text-files"

# Input CSV: competition data on a rerun, a mock test set otherwise.
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    csv_file = "train_essays.csv" if DEBUG else "test_essays.csv"
    csv_path = "/kaggle/input/llm-detect-ai-generated-text/" + csv_file
else:
    csv_path = "/kaggle/input/mock-test/test.csv"



# names passed as --model_name to run_llm.py and as --model1/--model2 to run_custom_gb2.py
model1 = "llama-7b"
model2 = "tinyllama"

num_rows = -1 # 500 if DEBUG else -1

# Step 1: process_text.py reads the CSV and writes into text_files_dir.
!python process_text.py \
    --csv_path $csv_path \
    --output_dir $text_files_dir \
    --num_proc 2 \
    --num_rows $num_rows \
    --validation_dir $validation_file_dir

# Step 2: run_llm.py with the Llama-2 7B weights (single process, --device_map_auto).
!python run_llm.py \
    --model_name_or_path $llama_7b_path \
    --path_to_text_files $text_files_dir \
    --batch_size 4 \
    --model_name $model1 \
    --device_map_auto
    
# Step 3: run_llm.py with the fine-tuned TinyLlama weights, launched on two processes.
# !python run_llm.py --model_name_or_path $tinyllama_path --path_to_text_files $text_files_dir --batch_size 4 --model_name "llama-7b" --device_map_auto
!accelerate launch --num_processes=2 --multi_gpu --mixed_precision=fp16 run_llm.py \
    --model_name_or_path $tinyllama_path \
    --path_to_text_files $text_files_dir \
    --batch_size 8 \
    --model_name $model2 

# Step 4: run_custom_gb2.py writes gb_sub.csv (presumably the final classifier stage — confirm).
!python run_custom_gb2.py \
    --model_dir $MODEL_DIR \
    --model1 $model1 \
    --model2 $model2 \
    --tokenizer_name "/kaggle/input/tinyllama-tinyllama-1-1b-chat-v1-0" \
    --text_dir $text_files_dir \
    --output_path "gb_sub.csv" \
    --debug $DEBUG \
    --num_proc 2

# keep the result next to the other model outputs
!mv gb_sub.csv /kaggle/working/outputs/mgb.csv

%cd /kaggle/working


/kaggle/working/ghostbuster-temp
cp: cannot stat '/kaggle/input/ghosbuster-scripts-v1/*.py': No such file or directory
Found existing installation: pandas 2.1.4
Uninstalling pandas-2.1.4:
  Successfully uninstalled pandas-2.1.4
[0mProcessing /kaggle/input/pip-install-many-whl/pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
[31mERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: '/kaggle/input/pip-install-many-whl/pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl'
[0m[31m
[0mFound existing installation: datasets 2.1.0
Uninstalling datasets-2.1.0:
  Successfully uninstalled datasets-2.1.0
[0m[31mERROR: Could not find a version that satisfies the requirement datasets (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for datasets[0m[31m
[0mpython: can't open file '/kaggle/working/ghostbuster-temp/process_text.py': [Errno 2] No such file or directory
python: can't open f

In [None]:
! pip uninstall pandas 
! pip install pandas
import pandas as pd
sub_df_m0 = pd.read_parquet("./outputs/m0.parquet")  # mistral
sub_df_m1 = pd.read_parquet("./outputs/m1.parquet")  # mistral

sub_df_m2 = pd.read_parquet("./outputs/m2.parquet")  # deberta-ub

# sub_df_m3 = pd.read_parquet("./outputs/m3.parquet")  # tf-idf

sub_df_m4 = pd.read_parquet("./outputs/m4.parquet")  # pl-deberta


sub_df_m5 = pd.read_parquet("./outputs/m5.parquet")  # ahmet
sub_df_m6 = pd.read_parquet("./outputs/m6.parquet")  # ahmet

sub_df_m7 = pd.read_parquet("./outputs/m7.parquet")  # deberta-rb

#sub_df_m8 = pd.read_csv("./outputs/mgb.csv")  # 👻



Found existing installation: pandas 2.2.1
Uninstalling pandas-2.2.1:
  Would remove:
    /opt/conda/lib/python3.10/site-packages/pandas-2.2.1.dist-info/*
    /opt/conda/lib/python3.10/site-packages/pandas/core/_numba/extensions.py
    /opt/conda/lib/python3.10/site-packages/pandas/core/arrays/_utils.py
    /opt/conda/lib/python3.10/site-packages/pandas/core/arrays/arrow/accessors.py
    /opt/conda/lib/python3.10/site-packages/pandas/io/excel/_calamine.py
    /opt/conda/lib/python3.10/site-packages/pandas/tests/apply/test_numba.py
    /opt/conda/lib/python3.10/site-packages/pandas/tests/arrays/interval/test_formats.py
    /opt/conda/lib/python3.10/site-packages/pandas/tests/arrays/interval/test_interval_pyarrow.py
    /opt/conda/lib/python3.10/site-packages/pandas/tests/arrays/interval/test_overlaps.py
    /opt/conda/lib/python3.10/site-packages/pandas/tests/copy_view/test_chained_assignment_deprecation.py
    /opt/conda/lib/python3.10/site-packages/pandas/tests/frame/methods/test_info.

In [None]:
# A bare `Y` used to live in this cell as an attempt to answer pip's
# "Proceed (Y/n)?" prompt. A prompt cannot be answered from a separate cell,
# and evaluating the undefined name `Y` raises NameError on Run All, so the
# cell is intentionally left without executable code.


In [None]:
# Convert each active model's "generated" scores to ranks, in place.
# Ties receive the lowest rank of their group (method='min').
# m3 (tf-idf) and m8 (ghostbuster) are disabled and therefore not listed.
for model_preds in (
    sub_df_m0,
    sub_df_m1,
    sub_df_m2,
    sub_df_m4,
    sub_df_m5,
    sub_df_m6,
    sub_df_m7,
):
    model_preds["generated"] = model_preds["generated"].rank(method='min')

In [None]:
# Weighted blend: a model's weight is the number of times its frame is stacked
# before averaging "generated" per id.
# tf-idf (m3) and ghostbuster (m8) are disabled and contribute nothing.
frames_to_blend = (
    [sub_df_m0, sub_df_m1] * 3   # mistral: m0 and m1 alternating, 3 copies each
    + [sub_df_m2] * 2            # deberta - ub
    + [sub_df_m4] * 2            # pl-mlm-deb
    + [sub_df_m5, sub_df_m6]     # ahmet
    + [sub_df_m7]                # deberta rb
)

stacked = pd.concat(frames_to_blend)

# Mean of the stacked values per id; reset_index turns "id" back into a column.
sub_df = stacked.groupby("id")["generated"].mean().reset_index()

In [None]:
# Write the blended predictions to the submission file, without the index column.
submission_path = "submission.csv"
sub_df.to_csv(submission_path, index=False)