# You are bot competition

In [1]:
import os
import math
import random
import pandas as pd
import numpy as np
import json
import torch
import torch.nn.functional as F
from torch import Tensor
import matplotlib.pyplot as plt
import seaborn as sns

from copy import deepcopy
from collections import Counter, defaultdict
from transformers import AutoModel, AutoTokenizer
from tqdm.auto import tqdm, trange

import optuna
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss

In [2]:
# Seed every random number generator the notebook imports so that reruns
# are repeatable: Python's `random`, NumPy's legacy global RNG and PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [3]:
def load_train_data(data_file: str, labels_file: str):
    """Build a per-participant training frame from a dialog JSON and a label CSV.

    Only the label rows with ``participant_index == 0`` are used: participant
    "0" of a dialog gets that ``is_bot`` value and participant "1" gets the
    complementary one (``1 - label``).

    Args:
        data_file: Path to a JSON file mapping dialog id -> list of messages,
            each message carrying ``"text"`` and ``"participant_index"``.
        labels_file: Path to a CSV with ``dialog_id``, ``participant_index``
            and ``is_bot`` columns.

    Returns:
        DataFrame with columns ``text`` (all messages of one participant
        joined by single spaces) and ``is_bot``; two rows per dialog,
        participant "0" first.
    """
    labels = pd.read_csv(labels_file)
    first_participant = labels[labels["participant_index"] == 0]
    bot_flag_by_dialog = dict(
        zip(first_participant["dialog_id"], first_participant["is_bot"])
    )

    with open(data_file, "r", encoding="utf-8") as f:
        dialogs = json.load(f)

    texts, targets = [], []
    for dialog_id, messages in dialogs.items():
        # Message participant indices are compared as the strings "0" / "1"
        joined = {
            who: " ".join(
                m["text"] for m in messages if m["participant_index"] == who
            )
            for who in ("0", "1")
        }

        label_0 = int(bot_flag_by_dialog[dialog_id])
        texts.extend([joined["0"], joined["1"]])
        targets.extend([label_0, 1 - label_0])

    return pd.DataFrame({"text": texts, "is_bot": targets})


def load_test_data(data_file: str, labels_file: str):
    """Build the scoring frame: one row per (dialog, participant) listed in the CSV.

    Args:
        data_file: Path to a JSON file mapping dialog id -> list of messages,
            each message carrying ``"text"`` and ``"participant_index"``.
        labels_file: Path to a CSV with ``ID``, ``dialog_id`` and
            ``participant_index`` columns.

    Returns:
        DataFrame with columns ``ID`` and ``text``, in CSV row order, where
        ``text`` is the requested participant's messages joined by spaces.
    """
    participants = pd.read_csv(labels_file)

    with open(data_file, "r", encoding="utf-8") as f:
        dialogs = json.load(f)

    ids, texts = [], []
    for _, row in participants.iterrows():
        # Messages store the participant index as a string, so compare as str
        wanted_index = str(row["participant_index"])
        own_messages = (
            m["text"]
            for m in dialogs[row["dialog_id"]]
            if m["participant_index"] == wanted_index
        )
        texts.append(" ".join(own_messages))
        ids.append(row["ID"])

    return pd.DataFrame({"ID": ids, "text": texts})

In [4]:
# Build the per-participant training frame, then pull out features and target
df = load_train_data(
    data_file=os.path.join('data', 'train.json'),
    labels_file=os.path.join('data', 'ytrain.csv'),
)
X, y = df["text"], df["is_bot"]

In [5]:
# Build the frame of (ID, text) rows that will be scored for submission
df_test = load_test_data(
    data_file=os.path.join('data', 'test.json'),
    labels_file=os.path.join('data', 'ytest.csv'),
)

In [6]:
# Preview the first rows of the training frame (joined text + is_bot label)
df.head()

Unnamed: 0,text,is_bot
0,Hello! Отлично! А твои? Расскажи теорему,0
1,Как дела? Это круто!,1
2,Привет никак оооокккееуу оууукккии оооуууллкке...,0
3,Привет! Как я могу помочь тебе сегодня? Хорошо...,1
4,Привет Ты бот?,0


## Training №1 - Baseline

Baseline - копия кода из ноутбука на Kaggle

In [7]:
# Hold out 20% of the texts for validation, stratified on the bot label.
# NOTE: X_train / X_test / y_train / y_test are reassigned further down by the
# embedding-based split, so cells that read them depend on execution order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
# Baseline: TF-IDF features feeding a logistic-regression classifier
pipe = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer()),
        ("model", LogisticRegression(random_state=42)),
    ]
)
pipe.fit(X_train, y_train)

# Hard labels drive accuracy; class probabilities drive ROC AUC and log loss
val_pred = pipe.predict(X_test)
val_proba = pipe.predict_proba(X_test)

val_acc = accuracy_score(y_test, val_pred)
val_roc = roc_auc_score(y_test, val_proba[:, 1])
val_logloss = log_loss(y_test, val_proba)

for metric_label, metric_value in (
    ("Val Accuracy:", val_acc),
    ("Val ROC AUC:", val_roc),
    ("Val Log Loss:", val_logloss),
):
    print(metric_label, metric_value)

Val Accuracy: 0.5904761904761905
Val ROC AUC: 0.6284971377892445
Val Log Loss: 0.6722139357476833


## Baseline using XGBoost

Baseline решение, использующее алгоритмы градиентного бустинга и матрицу TF-IDF.

In [None]:
# Hand-picked XGBoost settings for the untuned gradient-boosting baseline
xgb_params = {
    "n_estimators": 5000,
    "n_jobs": -1,
    "random_state": 25,
    "max_depth": 12,
    "max_leaves": 25,
    "learning_rate": 1e-3,
}

boosting_pipeline = Pipeline(
    steps=[
        ("vectorizer", TfidfVectorizer()),
        ("model", XGBClassifier(**xgb_params)),
    ]
)

boosting_pipeline.fit(X_train, y_train)

Расчёт качества базовой модели без подбора гиперпараметров:

In [None]:
# Validation metrics of the untuned TF-IDF + XGBoost pipeline
val_pred = boosting_pipeline.predict(X_test)
val_proba = boosting_pipeline.predict_proba(X_test)

val_acc = accuracy_score(y_test, val_pred)
val_roc = roc_auc_score(y_test, val_proba[:, 1])
val_logloss = log_loss(y_test, val_proba)

for metric_label, metric_value in (
    ("Val Accuracy:", val_acc),
    ("Val ROC AUC:", val_roc),
    ("Val Log Loss:", val_logloss),
):
    print(metric_label, metric_value)

Подбор гиперпараметров проведём с помощью optuna.

In [None]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a TF-IDF + XGBoost pipeline.

    Samples the upper n-gram bound of the vectorizer and the booster's
    learning rate, number of trees, depth and leaf count, fits the pipeline
    on the global ``X_train`` / ``y_train`` and scores it on the global
    ``X_test`` / ``y_test``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC AUC on the validation split (the study maximises it).
    """
    n_gram = trial.suggest_int('n_gram', 1, 3)
    lr_rate = trial.suggest_float("learning_rate", 1e-5, 5e-3, step=1e-5)
    num_estimators = trial.suggest_int('num_estimators', 200, 10000)
    tree_depth = trial.suggest_int('depth', 3, 15)
    num_leaves = trial.suggest_int('n_leaves', 10, 100)

    boosting_pipeline = Pipeline(
        [
            ("vectorizer", TfidfVectorizer(ngram_range=(1, n_gram))),
            ("model", XGBClassifier(n_estimators = num_estimators,
                                    random_state = 25,
                                    max_depth = tree_depth,
                                    max_leaves = num_leaves,
                                    learning_rate = lr_rate)),
        ]
    )

    boosting_pipeline.fit(X_train, y_train)

    # Only ROC AUC is returned, so the hard-label prediction, accuracy and
    # log loss that used to be computed and discarded here are not needed.
    val_proba = boosting_pipeline.predict_proba(X_test)
    return roc_auc_score(y_test, val_proba[:, 1])

In [None]:
# Search for the hyper-parameters that maximise the value returned by
# `objective` (validation ROC AUC) over 15 trials.
# NOTE(review): no sampler seed is passed and trials run with n_jobs=-1, so
# the search is presumably not repeatable between runs — confirm if that matters.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=15, n_jobs = -1)

In [None]:
# Hyper-parameters of the best trial in the study
study.best_params

In [None]:
print('Сводная таблица оптимизационных параметров\n')
# The objective is maximised, so list the best trials first; this matches the
# ordering used for the embedding-study table later in the notebook.
study.trials_dataframe().sort_values('value', ascending = False)

In [None]:
# Refit the TF-IDF + XGBoost pipeline on the full training frame using the
# hyper-parameters of the best Optuna trial
best = study.best_params

tuned_vectorizer = TfidfVectorizer(ngram_range=(1, best['n_gram']))
tuned_booster = XGBClassifier(
    n_estimators=best['num_estimators'],
    random_state=25,
    max_depth=best['depth'],
    max_leaves=best['n_leaves'],
    learning_rate=best['learning_rate'],
)

boosting_pipeline = Pipeline(
    steps=[("vectorizer", tuned_vectorizer), ("model", tuned_booster)]
)

boosting_pipeline.fit(df['text'], df['is_bot'])

In [None]:
# Probability of the positive (bot) class for every test row
test_proba = boosting_pipeline.predict_proba(df_test["text"])[:, 1]

# Submission frame: the test IDs plus the predicted bot probability
preds_df = df_test[["ID"]].assign(is_bot=test_proba)
preds_df.to_csv("preds_XGBoost.csv", index=False)

## LLM - детектор

In [9]:
def calculate_perplexity(text, model, tokenizer, device):
    """Return the perplexity that `model` assigns to `text`.

    Args:
        text: Input string to score.
        model: Callable model that accepts the tokenizer outputs plus
            ``labels`` and returns an object with a ``loss`` tensor.
        tokenizer: Callable returning a mapping of tensors, including
            ``"input_ids"``.
        device: Device the tokenized tensors are moved to.

    Returns:
        ``exp(loss)`` as a Python float.
    """
    # Tokenize and move every tensor to the target device
    tokenized = tokenizer(text, return_tensors="pt", truncation=True)
    model_inputs = {}
    for field, tensor in tokenized.items():
        model_inputs[field] = tensor.to(device)

    # With the input ids passed as labels, the model reports the mean negative
    # log-likelihood per token (the standard LLM loss in transformers)
    with torch.no_grad():
        result = model(**model_inputs, labels=model_inputs["input_ids"])
        mean_nll = result.loss.item()

    # Perplexity is e ** nll
    return math.exp(mean_nll)

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states along dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are set to 0 before summing,
    and the sum is divided by the per-row count of non-masked positions.
    """
    # True wherever the mask is zero, with a trailing axis for broadcasting
    padding = attention_mask.unsqueeze(-1).logical_not()
    token_sum = last_hidden_states.masked_fill(padding, 0.0).sum(dim=1)
    token_count = attention_mask.sum(dim=1, keepdim=True)
    return token_sum / token_count

In [10]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [11]:
# Sentence-embedding checkpoint used to turn each participant's text into a vector
emb_model = 'intfloat/multilingual-e5-large-instruct'

tokenizer = AutoTokenizer.from_pretrained(emb_model)
# trust_remote_code is deliberately not enabled: it lets code shipped in the
# Hub repository run locally.
# NOTE(review): the model card loads this checkpoint with a plain
# AutoModel.from_pretrained call — confirm it still loads in this environment.
model = AutoModel.from_pretrained(emb_model)

model = model.to(device)
# The model is only used for inference, so put it in evaluation mode explicitly
model.eval()

In [12]:
# Embed every training text with the E5 model, a few rows at a time
batch_size = 4
# Round up so the final, possibly shorter, batch is embedded too; floor
# division would leave the trailing rows of `embeds` as all-zero vectors.
n_batches = math.ceil(len(df) / batch_size)

embeds = np.zeros((len(df), model.config.hidden_size))
model.eval()  # inference only
for i in trange(n_batches):
    start = i * batch_size
    batch_texts = df['text'].values[start:start + batch_size].tolist()

    # Tokenize the input texts
    batch_dict = tokenizer(batch_texts,
                           max_length=512,
                           padding=True,
                           truncation=True,
                           return_tensors='pt').to(device)

    # No gradients are needed here; skipping the autograd graph saves memory
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        # normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)

    # Use the real batch length so a short last batch fills exactly its rows
    embeds[start:start + len(batch_texts)] = embeddings.cpu().numpy()
    del embeddings

torch.cuda.empty_cache()

# One embedding vector per row of the training frame
df['embeds'] = tuple(embeds)

  0%|          | 0/393 [00:00<?, ?it/s]

In [13]:
# Split again, this time on the embedding vectors, with the same test_size,
# random_state and stratification as the earlier text split.
# NOTE: this overwrites X_train / X_test / y_train / y_test from the TF-IDF
# section; `y` is the label series taken from `df` right after loading.
X_train, X_test, y_train, y_test = train_test_split(
    df['embeds'], df['is_bot'], test_size=0.2, random_state=42, stratify=y
)

In [14]:
# Logistic-regression classifier for the embedding features (up to 500 iterations)
clf = LogisticRegression(n_jobs=-1, random_state=42, max_iter=500)

In [15]:
# X_train holds one embedding array per row; np.vstack stacks them into a 2-D matrix
clf.fit(np.vstack(X_train), y_train)

In [16]:
# Stack the held-out embeddings once and reuse the matrix for both predictions
X_val_matrix = np.vstack(X_test)

val_pred = clf.predict(X_val_matrix)
val_proba = clf.predict_proba(X_val_matrix)

val_acc = accuracy_score(y_test, val_pred)
val_roc = roc_auc_score(y_test, val_proba[:, 1])
val_logloss = log_loss(y_test, val_proba)

for metric_label, metric_value in (
    ("Val Accuracy:", val_acc),
    ("Val ROC AUC:", val_roc),
    ("Val Log Loss:", val_logloss),
):
    print(metric_label, metric_value)

Val Accuracy: 0.5968253968253968
Val ROC AUC: 0.6167862613883738
Val Log Loss: 0.6671195625584835


## Scoring

In [17]:
# Embed every test text with the E5 model, a few rows at a time
batch_size = 4
# Round up so the final, possibly shorter, batch is embedded too; floor
# division would leave the trailing rows of `embeds_t` as all-zero vectors,
# and those rows would still be scored and written to the submission.
n_batches = math.ceil(len(df_test) / batch_size)

embeds_t = np.zeros((len(df_test), model.config.hidden_size))
model.eval()  # inference only
for i in trange(n_batches):
    start = i * batch_size
    batch_texts = df_test['text'].values[start:start + batch_size].tolist()

    # Tokenize the input texts
    batch_dict = tokenizer(batch_texts,
                           max_length=512,
                           padding=True,
                           truncation=True,
                           return_tensors='pt').to(device)

    # No gradients are needed here; skipping the autograd graph saves memory
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        # normalize embeddings
        embeddings = F.normalize(embeddings, p=2, dim=1)

    # Use the real batch length so a short last batch fills exactly its rows
    embeds_t[start:start + len(batch_texts)] = embeddings.cpu().numpy()
    del embeddings

  0%|          | 0/169 [00:00<?, ?it/s]

In [18]:
# Drop the reference to the embedding model first: empty_cache() can only
# release cached GPU memory whose tensors are no longer referenced, so calling
# it before `del model` leaves the model's memory allocated.
# NOTE(review): `outputs` / `batch_dict` from the last embedding batch may
# still hold GPU tensors — delete them too if memory is tight.
del model
torch.cuda.empty_cache()

In [19]:
def objective(trial):
    """Optuna objective: validation ROC AUC of a classifier on the embeddings.

    Picks one of three model families ('XGB', 'Catboost', 'LogReg'), samples
    that family's hyper-parameters, fits on the global ``X_train`` /
    ``y_train`` and scores on the global ``X_test`` / ``y_test``.

    NOTE: this reuses the name of the TF-IDF/XGBoost ``objective`` defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC AUC on the validation split (the study maximises it).
    """
    clf_n = trial.suggest_categorical('booster', ['XGB', 'Catboost', 'LogReg'])

    if clf_n == 'XGB':
        lr_rate = trial.suggest_float("learning_rate", 1e-5, 5e-3, step=1e-5)
        num_estimators = trial.suggest_int('num_estimators', 200, 3000)
        tree_depth = trial.suggest_int('depth', 3, 15)
        num_leaves = trial.suggest_int('n_leaves', 10, 100)

        clf_model = XGBClassifier(random_state = 25, verbosity = 0)
        clf_model.set_params(**{'n_estimators': num_estimators,
                                'max_depth': tree_depth,
                                'max_leaves': num_leaves,
                                'learning_rate': lr_rate})

    elif clf_n == 'Catboost':
        lr_rate = trial.suggest_float("learning_rate", 1e-5, 5e-3, step=1e-5)
        num_estimators = trial.suggest_int('num_estimators', 200, 3000)
        clf_model = CatBoostClassifier(random_state = 25, verbose = False)
        clf_model.set_params(**{'n_estimators': num_estimators,
                                'learning_rate': lr_rate})

    else:
        iters = trial.suggest_int("iters", 300, 5000)
        penalty = trial.suggest_categorical('reg', ['l1', 'l2'])
        clf_model = LogisticRegression(n_jobs=-1, random_state=25, max_iter=iters, solver = 'saga', penalty = penalty)

    # Each row holds one embedding array; stack them into 2-D matrices
    clf_model.fit(np.vstack(X_train), y_train)

    # Only ROC AUC is returned, so the hard-label prediction, accuracy and
    # log loss that used to be computed and discarded here are not needed.
    val_proba = clf_model.predict_proba(np.vstack(X_test))
    return roc_auc_score(y_test, val_proba[:, 1])

In [20]:
# Search over model family and hyper-parameters, maximising the value returned
# by `objective` (validation ROC AUC) over 12 trials.
# NOTE(review): no sampler seed is passed and trials run with n_jobs=-1, so
# the search is presumably not repeatable between runs — confirm if that matters.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=12, n_jobs = -1)

[I 2025-04-27 10:55:33,939] A new study created in memory with name: no-name-a17ed9c7-dd09-4279-8da2-33d6e386743c
[I 2025-04-27 10:55:35,759] Trial 5 finished with value: 0.6166250100782069 and parameters: {'booster': 'LogReg', 'iters': 4741, 'reg': 'l2'}. Best is trial 5 with value: 0.6166250100782069.
[I 2025-04-27 10:55:35,784] Trial 9 finished with value: 0.6166250100782069 and parameters: {'booster': 'LogReg', 'iters': 4859, 'reg': 'l2'}. Best is trial 5 with value: 0.6166250100782069.
[I 2025-04-27 10:55:35,813] Trial 11 finished with value: 0.6166250100782069 and parameters: {'booster': 'LogReg', 'iters': 2198, 'reg': 'l2'}. Best is trial 5 with value: 0.6166250100782069.
[I 2025-04-27 10:55:38,014] Trial 6 finished with value: 0.5865516407320809 and parameters: {'booster': 'LogReg', 'iters': 709, 'reg': 'l1'}. Best is trial 5 with value: 0.6166250100782069.
[I 2025-04-27 10:55:39,166] Trial 0 finished with value: 0.5865516407320809 and parameters: {'booster': 'LogReg', 'iters':

In [21]:
# Model family and hyper-parameters of the best trial in the study
study.best_params

{'booster': 'LogReg', 'iters': 4741, 'reg': 'l2'}

In [22]:
# Table of all trials, best objective value (validation ROC AUC) first
print('Сводная таблица оптимизационных параметров\n')
study.trials_dataframe().sort_values('value', ascending = False)

Сводная таблица оптимизационных параметров



Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_booster,params_depth,params_iters,params_learning_rate,params_n_leaves,params_num_estimators,params_reg,state
9,9,0.616625,2025-04-27 10:55:33.966002,2025-04-27 10:55:35.784931,0 days 00:00:01.818929,LogReg,,4859.0,,,,l2,COMPLETE
11,11,0.616625,2025-04-27 10:55:33.971002,2025-04-27 10:55:35.812944,0 days 00:00:01.841942,LogReg,,2198.0,,,,l2,COMPLETE
5,5,0.616625,2025-04-27 10:55:33.954002,2025-04-27 10:55:35.759844,0 days 00:00:01.805842,LogReg,,4741.0,,,,l2,COMPLETE
0,0,0.586552,2025-04-27 10:55:33.941628,2025-04-27 10:55:39.165819,0 days 00:00:05.224191,LogReg,,1035.0,,,,l1,COMPLETE
1,1,0.586552,2025-04-27 10:55:33.942628,2025-04-27 10:55:39.321148,0 days 00:00:05.378520,LogReg,,4535.0,,,,l1,COMPLETE
6,6,0.586552,2025-04-27 10:55:33.957002,2025-04-27 10:55:38.013174,0 days 00:00:04.056172,LogReg,,709.0,,,,l1,COMPLETE
8,8,0.581633,2025-04-27 10:55:33.963004,2025-04-27 10:58:54.343882,0 days 00:03:20.380878,Catboost,,,0.00317,,1912.0,,COMPLETE
4,4,0.579698,2025-04-27 10:55:33.951003,2025-04-27 10:59:01.420168,0 days 00:03:27.469165,Catboost,,,0.00497,,2027.0,,COMPLETE
3,3,0.577159,2025-04-27 10:55:33.949003,2025-04-27 10:58:23.133238,0 days 00:02:49.184235,Catboost,,,0.00277,,1520.0,,COMPLETE
10,10,0.576272,2025-04-27 10:55:33.969003,2025-04-27 11:00:08.287377,0 days 00:04:34.318374,XGB,13.0,,0.00256,31.0,1865.0,,COMPLETE


In [23]:
# Best parameters again (same expression as the cell shown before the trials table)
study.best_params

{'booster': 'LogReg', 'iters': 4741, 'reg': 'l2'}

In [24]:
# Rebuild the winning model from the study. The search covers three model
# families, so pick the constructor from best_params['booster'] rather than
# assuming logistic regression: 'iters' and 'reg' only exist for 'LogReg'
# trials and would raise KeyError for the other two families.
best_params = study.best_params
best_booster = best_params['booster']

if best_booster == 'XGB':
    clf_model = XGBClassifier(random_state = 25, verbosity = 0)
    clf_model.set_params(**{'n_estimators': best_params['num_estimators'],
                            'max_depth': best_params['depth'],
                            'max_leaves': best_params['n_leaves'],
                            'learning_rate': best_params['learning_rate']})
elif best_booster == 'Catboost':
    clf_model = CatBoostClassifier(random_state = 25, verbose = False)
    clf_model.set_params(**{'n_estimators': best_params['num_estimators'],
                            'learning_rate': best_params['learning_rate']})
else:
    clf_model = LogisticRegression(n_jobs=-1, random_state=25, max_iter=best_params['iters'], solver = 'saga', penalty = best_params['reg'])

# Refit on the full training frame (every embedding, no hold-out)
clf_model.fit(np.vstack(df['embeds']), df['is_bot'])

In [25]:
# Probability of the positive (bot) class for every test embedding
test_proba = clf_model.predict_proba(embeds_t)[:, 1]

# Submission frame: the test IDs plus the predicted bot probability
preds_df = df_test[["ID"]].assign(is_bot=test_proba)

preds_df.to_csv("preds_log_reg.csv", index=False)