In [16]:
import pandas as pd
import re

from typing import Literal
import ast
import sys
from pympler import asizeof

In [2]:
import tensorflow as tf
from transformers import AutoTokenizer



In [3]:
source_df = pd.read_csv("data/llm-classification-finetuning/train.csv")

In [4]:
round(source_df.memory_usage(deep=True).sum() / 1024 / 1024, 2)

190.7

#### Make each sample as "[role user token] prompt [role model token] answer"

In [5]:
source_df

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0
...,...,...,...,...,...,...,...,...,...
57472,4294656694,gpt-4-0613,claude-1,"[""A simple mnemonic for \u03c0:\n\""How I wish ...","[""Sure, let's break it down:\n\n1. \""How\"" has...","[""Here is how that mnemonic represents the dig...",1,0,0
57473,4294692063,claude-2.0,llama-2-13b-chat,"[""In python, implement a naive Bayes with gaus...","[""Here is an implementation of a naive Bayes c...","[""Sure! Here's an implementation of a naive Ba...",1,0,0
57474,4294710549,claude-1,alpaca-13b,"[""is it unethical to work on building weapons?...","[""Working on weapons technology raises some et...","[""It depends on the context. Weapons can be us...",1,0,0
57475,4294899228,palm-2,tulu-2-dpo-70b,"[""If a bait contains 0,0025% bromadiolon then ...","[""Bromadiolone is a rodenticide which is most ...","[""As an AI language model, I do not promote or...",0,1,0


In [6]:
def split_text_re(text):
    text = text.strip("[]")
    matches = re.findall(r'"([^"]*)"', text)
    return matches

def split_text(text):
    val = ast.literal_eval(text)
    if not isinstance(val, list):
        raise ValueError("text must be a list")
    return val

def add_tech_tokens(text: list, token_type: str):
    if token_type not in ["user", "model"]:
        raise ValueError("token_type must be either user or model")
    minor_token = "<SUBTURN> "
    role = "<USR> " if token_type == "user" else "<RESPONSE> "
    text = " ".join([minor_token+t for t in text])
    text = role+text
    return text

def process_text(text: str, text_type: Literal["user", "model"]):
    try:
        parts = split_text(text)
    except ValueError:
        parts = split_text(text.replace("null", "\"\""))
    except Exception:
        parts = split_text_re(text)
    return add_tech_tokens(parts, token_type=text_type)

def modify_train(df: pd.DataFrame):
    df = df.copy()
    df["prompt"] = df["prompt"].apply(lambda x: process_text(x, "user"))
    df["response_a"] = df["response_a"].apply(lambda x: process_text(x, "model"))
    df["response_b"] = df["response_b"].apply(lambda x: process_text(x, "model"))
    return df


In [7]:
# r = split_text(source_df.loc[12, 'prompt'])
# add_tech_tokens(r, "user")


In [8]:
new_df = modify_train(source_df)

In [9]:
round(new_df.memory_usage(deep=True).sum() / 1024 / 1024, 2)

213.34

In [10]:
new_df["response_a"] = new_df["prompt"] + " " + new_df["response_a"]
new_df["response_a"] = new_df["prompt"] + " " + new_df["response_a"]
new_df.rename(columns={"response_a": "prompt_response_a", "response_b": "prompt_response_b"}, inplace=True)
new_df.drop(columns=["prompt"], inplace=True)


In [11]:
new_df

Unnamed: 0,id,model_a,model_b,prompt_response_a,prompt_response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,<USR> <SUBTURN> Is it morally right to try to ...,"<RESPONSE> <SUBTURN> As an AI, I don't have pe...",1,0,0
1,53567,koala-13b,gpt-4-0613,<USR> <SUBTURN> What is the difference between...,<RESPONSE> <SUBTURN> A marriage license and a ...,0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,<USR> <SUBTURN> explain function calling. how ...,<RESPONSE> <SUBTURN> Function calling is the p...,0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,<USR> <SUBTURN> How can I create a test set fo...,<RESPONSE> <SUBTURN> When building a classifie...,1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,<USR> <SUBTURN> What is the best way to travel...,<RESPONSE> <SUBTURN> The best way to travel fr...,0,1,0
...,...,...,...,...,...,...,...,...
57472,4294656694,gpt-4-0613,claude-1,"<USR> <SUBTURN> A simple mnemonic for π:\n""How...",<RESPONSE> <SUBTURN> Here is how that mnemonic...,1,0,0
57473,4294692063,claude-2.0,llama-2-13b-chat,"<USR> <SUBTURN> In python, implement a naive B...",<RESPONSE> <SUBTURN> Sure! Here's an implement...,1,0,0
57474,4294710549,claude-1,alpaca-13b,<USR> <SUBTURN> is it unethical to work on bui...,<RESPONSE> <SUBTURN> It depends on the context...,1,0,0
57475,4294899228,palm-2,tulu-2-dpo-70b,"<USR> <SUBTURN> If a bait contains 0,0025% bro...","<RESPONSE> <SUBTURN> As an AI language model, ...",0,1,0


#### Tokenize samples

In [12]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [13]:
special_tokens = ["<SUBTURN>", "<USR>", "<RESPONSE>"]

tokenizer.add_special_tokens({
    "additional_special_tokens": special_tokens
})

3

In [14]:
data_for_tokens = pd.concat([new_df['prompt_response_a'], new_df['prompt_response_b']],
                            ignore_index=True)

data_for_tokens = (
    data_for_tokens
    .astype(str)
    .apply(lambda s: s.encode("utf-8", "ignore").decode("utf-8"))
)

# print(data_for_tokens.memory_usage(deep=True) / 1024 / 1024)
data_for_tokens = data_for_tokens.tolist()

In [28]:
len(data_for_tokens)
test_data_for_tokens = data_for_tokens[:25000]

In [29]:
enc = tokenizer(test_data_for_tokens,
                return_attention_mask=True,
                truncation=True,
                max_length=512,
                padding=True,
                return_tensors="np"
                )



In [30]:
len(test_data_for_tokens)

25000

In [31]:
len("".join(test_data_for_tokens))

52562107

In [34]:
asizeof.asizeof(test_data_for_tokens) / 1024 / 1024

77.64669036865234

In [35]:
enc['input_ids'].nbytes / 1024 / 1024

97.65625

In [36]:
enc['attention_mask'].nbytes / 1024 / 1024


97.65625

In [None]:
print(enc['input_ids'])