In [79]:
import pandas as pd
import re

from typing import Literal
import ast
import numpy as np
import sys
# from pympler import asizeof

In [2]:
import tensorflow as tf
from transformers import AutoTokenizer



In [7]:
source_df = pd.read_csv("data/llm-classification-finetuning/train.csv")

In [8]:
source_df = source_df.sample(10000, ignore_index=True)

In [9]:
source_df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,2339905997,alpaca-13b,RWKV-4-Raven-14B,"[""Write code in C to print 'Hello world!\""""]","[""#include<stdio.h>\nint main() {\nprintf(\""He...","[""#include <stdio.h>\nint main() {\n printf...",0,1,0
1,2648237572,claude-1,claude-2.0,"[""Emily's voice quivered with excitement, \""Do...","[""I apologize, but I do not feel comfortable d...","[""I apologize, but I do not feel comfortable r...",0,1,0
2,3443365547,vicuna-13b,gpt-4-0613,"[""what is heavier, 10kg of feathers or 1 kg of...","[""Both are the same weight, as they both have ...","[""10kg of feathers is heavier than 1kg of lead.""]",0,1,0
3,2957381608,gpt-4-0613,llama-2-70b-chat,"[""What is the longest first name in history?"",...","[""The longest first name in history is believe...","[""The longest first name in history is a matte...",0,0,1
4,2673925671,vicuna-7b,chatglm2-6b,"[""for the following reaction, 0.410 moles of c...","[""To determine the formula for the limiting re...","[""The limiting reactant is the substance that ...",1,0,0
...,...,...,...,...,...,...,...,...,...
9995,3749118402,claude-2.1,claude-instant-1,"[""I need your help in survival with little mon...","[""Here are some suggestions for surviving with...","[""Here are some tips for surviving on a very l...",0,0,1
9996,704185085,codellama-34b-instruct,gpt-3.5-turbo-0613,"[""Hi! Please what is the onomatopoeia for the ...","[""The onomatopoeia for the sound of a body eme...","[""The onomatopoeia for the sound of a body eme...",0,1,0
9997,998594060,gpt-4-0613,gpt-3.5-turbo-1106,"[""Can you find me the top five front end build...","[""Sure, here are some of the top front-end bui...","[""| Front End Builder | Drag and Drop | Beginn...",0,0,1
9998,985925841,vicuna-33b,chatglm2-6b,"[""Top measures of goodness of fit for supervis...","[""In supervised regression, goodness of fit me...","[""There are several measures of the goodness o...",1,0,0


In [10]:
round(source_df.memory_usage(deep=True).sum() / 1024 / 1024, 2)

32.98

#### Make each sample as "[role user token] prompt [role model token] answer"

In [11]:
source_df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,2339905997,alpaca-13b,RWKV-4-Raven-14B,"[""Write code in C to print 'Hello world!\""""]","[""#include<stdio.h>\nint main() {\nprintf(\""He...","[""#include <stdio.h>\nint main() {\n printf...",0,1,0
1,2648237572,claude-1,claude-2.0,"[""Emily's voice quivered with excitement, \""Do...","[""I apologize, but I do not feel comfortable d...","[""I apologize, but I do not feel comfortable r...",0,1,0
2,3443365547,vicuna-13b,gpt-4-0613,"[""what is heavier, 10kg of feathers or 1 kg of...","[""Both are the same weight, as they both have ...","[""10kg of feathers is heavier than 1kg of lead.""]",0,1,0
3,2957381608,gpt-4-0613,llama-2-70b-chat,"[""What is the longest first name in history?"",...","[""The longest first name in history is believe...","[""The longest first name in history is a matte...",0,0,1
4,2673925671,vicuna-7b,chatglm2-6b,"[""for the following reaction, 0.410 moles of c...","[""To determine the formula for the limiting re...","[""The limiting reactant is the substance that ...",1,0,0
...,...,...,...,...,...,...,...,...,...
9995,3749118402,claude-2.1,claude-instant-1,"[""I need your help in survival with little mon...","[""Here are some suggestions for surviving with...","[""Here are some tips for surviving on a very l...",0,0,1
9996,704185085,codellama-34b-instruct,gpt-3.5-turbo-0613,"[""Hi! Please what is the onomatopoeia for the ...","[""The onomatopoeia for the sound of a body eme...","[""The onomatopoeia for the sound of a body eme...",0,1,0
9997,998594060,gpt-4-0613,gpt-3.5-turbo-1106,"[""Can you find me the top five front end build...","[""Sure, here are some of the top front-end bui...","[""| Front End Builder | Drag and Drop | Beginn...",0,0,1
9998,985925841,vicuna-33b,chatglm2-6b,"[""Top measures of goodness of fit for supervis...","[""In supervised regression, goodness of fit me...","[""There are several measures of the goodness o...",1,0,0


In [12]:
def split_text_re(text):
    text = text.strip("[]")
    matches = re.findall(r'"([^"]*)"', text)
    return matches

def split_text(text):
    val = ast.literal_eval(text)
    if not isinstance(val, list):
        raise ValueError("text must be a list")
    return val

def add_tech_tokens(text: list, token_type: str):
    if token_type not in ["user", "model"]:
        raise ValueError("token_type must be either user or model")
    minor_token = "<SUBTURN> "
    role = "<USR> " if token_type == "user" else "<RESPONSE> "
    text = " ".join([minor_token+t for t in text])
    text = role+text
    return text

def process_text(text: str, text_type: Literal["user", "model"]):
    try:
        parts = split_text(text)
    except ValueError:
        parts = split_text(text.replace("null", "\"\""))
    except Exception:
        parts = split_text_re(text)
    return add_tech_tokens(parts, token_type=text_type)

def modify_train(df: pd.DataFrame):
    df = df.copy()
    df["prompt"] = df["prompt"].apply(lambda x: process_text(x, "user"))
    df["response_a"] = df["response_a"].apply(lambda x: process_text(x, "model"))
    df["response_b"] = df["response_b"].apply(lambda x: process_text(x, "model"))
    return df


In [13]:
# r = split_text(source_df.loc[12, 'prompt'])
# add_tech_tokens(r, "user")


In [74]:
new_df = modify_train(source_df)

In [75]:
round(new_df.memory_usage(deep=True).sum() / 1024 / 1024, 2)

36.97

In [76]:
new_df["response_a"] = new_df["prompt"] + " " + new_df["response_a"]
new_df["response_a"] = new_df["prompt"] + " " + new_df["response_a"]
new_df.rename(columns={"response_a": "prompt_response_a", "response_b": "prompt_response_b"}, inplace=True)
new_df.drop(columns=["prompt"], inplace=True)


In [77]:
new_df[['winner_model_a', 'winner_model_b', 'winner_tie']].drop_duplicates()

Unnamed: 0,winner_model_a,winner_model_b,winner_tie
0,0,1,0
3,0,0,1
4,1,0,0


In [50]:
# new_df['winner_model_a'] =\
#     new_df.apply(lambda row: row['winner_tie'] if row['winner_tie'] else row['winner_model_a'], axis=1)
#
# new_df['winner_model_b'] =\
#     new_df.apply(lambda row: row['winner_tie'] if row['winner_tie'] else row['winner_model_b'], axis=1)

In [78]:
new_df

Unnamed: 0,id,model_a,model_b,prompt_response_a,prompt_response_b,winner_model_a,winner_model_b,winner_tie
0,2339905997,alpaca-13b,RWKV-4-Raven-14B,<USR> <SUBTURN> Write code in C to print 'Hell...,<RESPONSE> <SUBTURN> #include <stdio.h>\nint m...,0,1,0
1,2648237572,claude-1,claude-2.0,<USR> <SUBTURN> Emily's voice quivered with ex...,"<RESPONSE> <SUBTURN> I apologize, but I do not...",0,1,0
2,3443365547,vicuna-13b,gpt-4-0613,"<USR> <SUBTURN> what is heavier, 10kg of feath...",<RESPONSE> <SUBTURN> 10kg of feathers is heavi...,0,1,0
3,2957381608,gpt-4-0613,llama-2-70b-chat,<USR> <SUBTURN> What is the longest first name...,<RESPONSE> <SUBTURN> The longest first name in...,0,0,1
4,2673925671,vicuna-7b,chatglm2-6b,"<USR> <SUBTURN> for the following reaction, 0....",<RESPONSE> <SUBTURN> The limiting reactant is ...,1,0,0
...,...,...,...,...,...,...,...,...
9995,3749118402,claude-2.1,claude-instant-1,<USR> <SUBTURN> I need your help in survival w...,<RESPONSE> <SUBTURN> Here are some tips for su...,0,0,1
9996,704185085,codellama-34b-instruct,gpt-3.5-turbo-0613,<USR> <SUBTURN> Hi! Please what is the onomato...,<RESPONSE> <SUBTURN> The onomatopoeia for the ...,0,1,0
9997,998594060,gpt-4-0613,gpt-3.5-turbo-1106,<USR> <SUBTURN> Can you find me the top five f...,<RESPONSE> <SUBTURN> | Front End Builder | Dra...,0,0,1
9998,985925841,vicuna-33b,chatglm2-6b,<USR> <SUBTURN> Top measures of goodness of fi...,<RESPONSE> <SUBTURN> There are several measure...,1,0,0


#### Tokenize samples

In [41]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [42]:
special_tokens = ["<SUBTURN>", "<USR>", "<RESPONSE>"]

tokenizer.add_special_tokens({
    "additional_special_tokens": special_tokens
})

3

In [61]:
def tokenize_process_column(col: pd.Series):
    data = (
        col
        .astype(str)
        .apply(lambda s: s.encode("utf-8", "ignore").decode("utf-8"))
    )
    data = data.tolist()

    encoded_tokens = tokenizer(data,
        return_attention_mask=True,
        truncation=True,
        max_length=512,
        padding=True,
        return_tensors="np"
        )
    return encoded_tokens


In [62]:
encA = tokenize_process_column(new_df['prompt_response_a'])
encB = tokenize_process_column(new_df['prompt_response_b'])


In [28]:
tokenizer.vocab_size

30522

In [29]:
tokenizer.get_added_vocab()

{'[PAD]': 0,
 '[UNK]': 100,
 '[CLS]': 101,
 '[SEP]': 102,
 '[MASK]': 103,
 '<SUBTURN>': 30522,
 '<USR>': 30523,
 '<RESPONSE>': 30524}

In [30]:
tokenizer.get_vocab()["<USR>"]

30523

In [31]:
tokenizer.vocab_size

30522

In [31]:
vocab_size = tokenizer.vocab_size
embed_dim=128
embedding = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_dim
)

In [68]:
seq_length = encA["input_ids"].shape[1]

ids_a  = tf.keras.Input(shape=(seq_length,), dtype=tf.int32, name="ids_a")
mask_a = tf.keras.Input(shape=(seq_length,), dtype=tf.int32, name="mask_a")

ids_b  = tf.keras.Input(shape=(seq_length,), dtype=tf.int32, name="ids_b")
mask_b = tf.keras.Input(shape=(seq_length,), dtype=tf.int32, name="mask_b")


In [69]:
vocab_size = tokenizer.vocab_size
embed_dim=128
embedding = tf.keras.layers.Embedding(vocab_size, embed_dim, name="emb")

lstm_units = 128
shared_lstm = tf.keras.layers.LSTM(lstm_units, name="shared_lstm")

2026-02-13 17:52:40.814723: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2026-02-13 17:52:41.022760: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2026-02-13 17:52:41.023143: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [70]:
def shared_pipeline(ids, mask):
    x = embedding(ids)
    m = tf.cast(mask, tf.bool)
    h = shared_lstm(x, mask=m)
    return h

In [71]:
shared_output_a, shared_output_b, = shared_pipeline(ids_a, mask_a), shared_pipeline(ids_b, mask_b)

feat = tf.keras.layers.Concatenate(name="concat")([shared_output_a, shared_output_b])

2026-02-13 18:14:53.555969: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


In [72]:
x = tf.keras.layers.Dense(128, activation="relu")(feat)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(3, activation="softmax", name="y")(x)

In [73]:
model = tf.keras.Model(inputs=[ids_a, mask_a, ids_b, mask_b], outputs=out)

In [80]:
y = new_df[["winner_model_a", "winner_model_b", "winner_tie"]].to_numpy(dtype=np.float32)

In [82]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

X = {
    "ids_a": encA["input_ids"],
    "mask_a": encA["attention_mask"],
    "ids_b": encB["input_ids"],
    "mask_b": encB["attention_mask"],
}


In [83]:
model.fit(X, y, batch_size=32, epochs=8, validation_split=0.1, shuffle=True)

<keras.src.callbacks.History at 0x7fdb4c5214d0>