# Testing workbook. Finetuning mistral 7B on single line python traces


In [None]:
! pip install -q kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d frasergreenlee/python-state-changes
! unzip "python-state-changes.zip"

In [None]:
# convert txt to json format for training
import json


def process_file(input_file_path, output_file_path):
    """some formatting issues in kaggle dataset. this fixes it."""
    with open(input_file_path, "r") as file:
        lines = file.readlines()

    processed_lines = []
    for i, line in enumerate(lines):
        if not line.startswith("state:") and i > 0:
            processed_lines[-1] = processed_lines[-1].strip() + " "
        processed_lines.append(line)

    with open(output_file_path, "w") as file:
        file.writelines(processed_lines)


def convert_txt_to_json(txt_file_path, json_file_path):
    with open(txt_file_path, "r") as txt_file:
        lines = txt_file.readlines()
        data = []
        example = 0
        for line in lines:
            if (
                line == "\n"
            ):  # first line is a newline, so skip. probably a better way to do this...
                continue
            # print(line.split("; output:"))
            # return
            state_code, output = line.split("; output:")
            output = output.strip()
            input_text = state_code.strip()
            data.append(
                {"input": input_text, "output": output, "example": example},
            )
            example += 1

    with open(json_file_path, "w") as json_file:
        json.dump(data, json_file, indent=4)


# Specify the paths for your TXT and JSON files
# TODO convert this to absolute filepaths if necessary
txt_file_path = "./new_all_states.txt"
txt_file_path2 = "./new_all_states_clean.txt"
json_file_path = "./python_states_singleline.json"

process_file(txt_file_path, txt_file_path2)
convert_txt_to_json(txt_file_path2, json_file_path)

# Tokenizing


In [None]:
! pip install transformers datasets

In [None]:
%%capture
import torch
major_version, minor_version = torch.cuda.get_device_capability()
# Must install separately since Colab has torch 2.2.1, which breaks packages
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
if major_version >= 8:
    # Use this for new GPUs like Ampere, Hopper GPUs (RTX 30xx, RTX 40xx, A100, H100, L40)
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # Use this for older GPUs (V100, Tesla T4, RTX 20xx)
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [1]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-bnb-4bit",  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


  from .autonotebook import tqdm as notebook_tqdm


==((====))==  Unsloth: Fast Mistral patching release 2024.3
   \\   /|    GPU: NVIDIA RTX A6000. Max memory: 47.536 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. Xformers = 0.0.25. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    use_gradient_checkpointing=True,
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth 2024.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
json_file_path = "./python_states_singleline.json"

trace_prompt = """<s> [INST] Below is an input which contains the state of variables and code that acts upon these variables or not. Given the state and the code give the state after the code executes for each variable. Be very careful. You should clearly outline your intermediate steps and your final answer should be a newline with exactly the variables and their values. Here is the State and Code. {}
Now generate the final state for each variable. Generate intermediate outputs. [/INST] {}</s>"""

# maybe add an actual example to the prompt that shows exactly how to outline the traces logically. Maybe not since we are doing math as the final goal.
# worth trying though

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(examples):
    inputs = examples["input"]
    outputs = examples["output"]
    texts = []
    for input, output in zip(inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        # EOS token should be the </s>. If generation bugs check here first
        text = trace_prompt.format(
            input, output
        )  # + EOS_TOKEN # I think that this is extra. try and see, maybe take out maybe leave in
        texts.append(text)
        # print(text) # had this uncommented:
        """<s>[INST] Below is an input which contains the state of variables and code that acts upon these variables or not.
        Given the state and the code give the state after the code executes for each variable. Be very careful.
        You should clearly outline your intermediate steps and your final answer should be a newline with exactly the variables and their values.
        Here is the State and Code. state: p = 1; code: t = set([p])
        Now generate the final state for each variable. Generate intermediate outputs.[/INST]p = 1; t = {1};</s> """
        # looks find I think
    return {
        "text": texts,
    }

from datasets import load_dataset



In [4]:
from trl import SFTTrainer
from transformers import TrainingArguments

dataset = load_dataset("json", data_files=json_file_path, split="train")
dataset = dataset.train_test_split(test_size=0.1)["test"]  # comment out later
dataset = dataset.map(
    formatting_prompts_func,
    num_proc=16,
    batched=True,
)

# from datetime import datetime
# t1 = datetime.now()
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # steps because epochs take forever
        max_steps = 60,
        # num_train_epochs=5,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        evaluation_strategy="steps",
        do_eval=True,
        eval_accumulation_steps=64,
    ),
    # compute_metrics=custom_metrics # add in later cuz of a bug
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
from ast import literal_eval

def str_to_objs(input_str: str):
    """
    converts a string to a list of objects
    input_str: str
        Must be a string of the form 'i = 4; p = [0, 1, 1, 2, 5];'
    returns: list
    """
    items = [x.strip() for x in input_str.split(";")]
    objs = dict()
    for item in items:
        try:
            key, value_str = item.split("=")
            key = key.strip()
        except:
            continue # no key found
        try:
            value_str = value_str.strip()

            # quick and dirty object conversion to make all literals hashable
            # TODO: create a hashable class for dicts and lists
            literal_value = literal_eval(value_str) # safe eval
            value = make_hashable(literal_value)
            objs[key] = value
        except ValueError:
            objs[key] = "NONLITERAL_STRING"
    return objs

def make_hashable(obj):
    if isinstance(obj, dict):
        # Convert dict to a sorted tuple of key-value pairs, making keys/values hashable recursively
        return tuple(sorted((k, make_hashable(v)) for k, v in obj.items()))
    if isinstance(obj, set):
        return frozenset(obj)
    if isinstance(obj, list):
        return tuple(obj)
    # recursion case
    elif isinstance(obj, list):
        # Convert lists to tuples
        return tuple(make_hashable(item) for item in obj)
    else:
        # Assume the object is hashable (e.g., numbers, strings, tuples)
        return obj

In [6]:
def custom_metrics(preds):
    logits = torch.tensor(preds.predictions)
    labels = torch.tensor(preds.label_ids)
    batch_size, seq_length, vocab_size = logits.shape

    # steal from inside llama
    # shift logits by 1 index cuz of causal lm
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # Flatten the tokens
    # loss_fct = CrossEntropyLoss()
    shift_logits = shift_logits.view(batch_size, -1, vocab_size)
    shift_labels = shift_labels.view(batch_size, -1)


    probs = torch.nn.functional.softmax(shift_logits.view(-1, vocab_size), dim=-1)
    p_true_tokens = probs.view(-1, vocab_size)[
        torch.arange(batch_size * (seq_length-1)), shift_labels.view(-1)
    ].view(batch_size, (seq_length-1))

    nll = -torch.log(p_true_tokens)
    mean_nll = nll.mean()
    ppl = torch.exp(mean_nll)

    # compute percentage of correct tokens
    correct_tokens = (shift_logits.view(-1, vocab_size).argmax(-1) == shift_labels.view(-1)).float().mean()

    pred_max_labels = shift_logits.argmax(-1).view(batch_size, -1)
    f1s = []
    for i in range(batch_size):
        unmasked_label_tokens = shift_labels[i][shift_labels[i] != -100][:-1] # drop eos_token
        # find the index where the instruction token ends and the answer begins
        inst_token_seq = tokenizer.encode("[/INST]", return_tensors="pt")[0][1:]
        first_output_idx = None
        for j in range(unmasked_label_tokens.shape[0] - len(inst_token_seq)):
            if torch.equal(unmasked_label_tokens[j:j+len(inst_token_seq)], inst_token_seq):
                first_output_idx = j + len(inst_token_seq) 
                break
        assert first_output_idx is not None, "Could not find the end of the instruction token"

        # get ground truth output tokens
        gt_output_tokens = unmasked_label_tokens[first_output_idx:]
        # get predicted output tokens (including padding)
        pred_output_tokens_masked = pred_max_labels[i][first_output_idx:]
        # drop the pad tokens 
        pred_output_tokens_unmasked = pred_output_tokens_masked[pred_output_tokens_masked != -100]
        first_pred_output_stop_idx = torch.where(pred_output_tokens_unmasked == tokenizer.eos_token_id)[0][0]
        pred_output_tokens = pred_output_tokens_unmasked[:first_pred_output_stop_idx]

        gt_output_str = tokenizer.decode(gt_output_tokens)
        pred_output_str = tokenizer.decode(pred_output_tokens)

        # compare gt/preds interpreted in python
        gt_state = str_to_objs(gt_output_str)
        pred_state = str_to_objs(pred_output_str)
        # compute f1 for values in the two states
        gt_vars = set(gt_state.items())
        pred_vars = set(pred_state.items())
        try:
            precision = len(gt_vars.intersection(pred_vars)) / len(pred_vars)
            recall = len(gt_vars.intersection(pred_vars)) / len(gt_vars)
            f1 = 2 * precision * recall / (precision + recall)
        except ZeroDivisionError:
            f1 = 0
        f1s.append(f1)
    f1_mean = torch.tensor(f1s).mean().item()
    return {"perplexity": ppl, "correct_tokens": correct_tokens.item(), "f1": f1_mean}


model.to("cuda")
trainer.compute_metrics = custom_metrics
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 896,890 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss,Validation Loss


In [None]:
xx=TrainingArguments(output_dir="./")
xx.per_device_eval_batch_size

In [None]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(
    [
        trace_prompt.format(
            "state: p = {'a': 1, 'b': 1, 'c': 1, 'd': 1}; w = 'e'; code: p[w] = 1",  # input
            "",  # output --> "a = [1, 7, 2, 4]; c = {0: 0, 1: 1, 2: 0}; i = 0; k = 3;"
        )
    ],
    return_tensors="pt",
).to("cuda")

# outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# tokenizer.batch_decode(outputs)
from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

In [None]:
# Need to save the model
# TODO