# Supervised Fine Tuning of CodeLlama-13B for Text to Cypher Generation

# Workspace Setup

We install core dependencies with their suggested versions as follows. These are based on working on a single GPU with ampere architecture such as A100 40GB.

In [None]:
#@title Install Core Dependencies

!pip install -U accelerate          # Simplifies running PyTorch models on multi-GPU setups.
!pip install -U bitsandbytes        # Handles model quantization.
!pip install -U datasets            # Utilities to access and handle HF datasets.
!pip install -U einops              # Enables efficient manipulation of tensor operations.
!pip install -U evaluate            # Framework for evaluating model performance.
!pip install -U ninja               # Speeds up building C++ applications.
!pip install -U packaging           # Core utilities for package version management.
!pip install -U peft                # Parameter efficient fine-tuning, used to implement LoRA and QLoRA.
!pip install -U tensorboard         # Package to visualize metrics and monitor model training and performance.
!pip install -U torch               # Core library for deep learning.
!pip install -U transformers        # Tools and utilities to work with transformer based models.
!pip install -U trl                 # Toolkit for reinforcement learning with transformers.

In [2]:
#@title Find torch Version Installed in Google Colab

import torch
print(torch.__version__)

2.3.0+cu121


In [None]:
#@title Install Flash Attention

# Limit the number of jobs to accomodate the compute capabilities
%env MAX_JOBS=2 # for Google Colab

# Install flash attention - for Ampere GPUs
!pip install -U flash-attn --no-build-isolation

In [6]:
#@title Resources Estimation

import torch
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.564 GB.
0.0 GB of memory reserved.


In [7]:
#@title Google Colab Drive Helper

# For Google Colab settings
from google.colab import userdata, drive

# This will prompt for authorization
drive.mount('/content/drive')

# Set the working directory
%cd '/content/drive/MyDrive/finetuneCypher/'

Mounted at /content/drive
/content/drive/MyDrive/finetuneCypher


In [34]:
#@title Path Variables

# Create a path variable for the data folder
data_path = '/content/drive/MyDrive/finetuneCypher/datas/'

# SFT dataset
sft_data_file = 'parametric_trainer_with_repeats.json'

# Create a path variable for the SFT model
sft_model_path = '/content/drive/MyDrive/finetuneCypher/cypherCodeLlama/'

In [33]:
#@title Hugging Face Credentials

# For Hugging Face Hub setting
from huggingface_hub import login

# Upload the HuggingFace token (should have WRITE access) from Colab secrets
HF = userdata.get('HF')

# This is needed to upload the model to HuggingFace
login(token=HF,add_to_git_credential=True)

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Preprocess Datasets

## SFT Dataset

The supervised fine-tuning dataset is expected to have the following format:

`messages: A list of dicts in the form {"role": "{role}", "content": {content}}`.

In [10]:
#@title Load SFT Dataset

import json
with open(data_path+sft_data_file, 'rb') as f:
	sft_sampler = json.load(f)

# Display an entry
sft_sampler[123]

{'Prompt': 'Convert the following question into a Cypher query using the provided graph schema!',
 'Question': 'Fetch the Author nodes and extract their affiliation property!',
 'Schema': 'Graph schema: Relevant node labels and their properties (with datatypes) are:\nAuthor {affiliation: STRING}',
 'Cypher': 'MATCH (n:Author) RETURN n.affiliation'}

In [11]:
#@title Conversational Format Converter

system_message = """
You are a text to Cypher query translator. {prompt}\n{schema}
"""

# Function to transform the data to conversational format {role:, content: }
def create_conversation(sample):
    return {
        "messages": [
            {"role": "system","content": system_message.format(prompt=sample["Prompt"], schema=sample["Schema"])},
            {"role": "user", "content": sample["Question"]},
            {"role": "assistant", "content": sample["Cypher"]}
        ]
    }

In [12]:
#@title Convert Data to HuggingFace Format

from datasets import load_dataset, Dataset

sft_dataset = Dataset.from_list(sft_sampler)
sft_dataset.shuffle()#.select(range(100))

# Transform to the required format
sft_dataset = sft_dataset.map(create_conversation,
                      remove_columns=sft_dataset.features,
                      batched=False)

print(sft_dataset)

Map:   0%|          | 0/30116 [00:00<?, ? examples/s]

Dataset({
    features: ['messages'],
    num_rows: 30116
})


In [13]:
#@title Split Data into Train and Test Sets

sft_dataset = sft_dataset.train_test_split(test_size=10/100)

print(sft_dataset)

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 27104
    })
    test: Dataset({
        features: ['messages'],
        num_rows: 3012
    })
})


In [14]:
#@title Display a sample

sft_dataset["train"][3]["messages"]

[{'content': "\nYou are a text to Cypher query translator. Convert the following question into a Cypher query using the provided graph schema!\nGraph schema: Relevant node labels and their properties  are:\nArticle\nKeyword\n\nRelevant relationships are:\n{'start': Article, 'type': HAS_KEY, 'end': Keyword }\n",
  'role': 'system'},
 {'content': ' Which nodes are connected to Article, but not to Keyword via HAS_KEY?',
  'role': 'user'},
 {'content': 'MATCH (c:Article)-[r]-(n) WHERE NOT (n)-[:HAS_KEY]-(:Keyword) RETURN labels(n)',
  'role': 'assistant'}]

In [15]:
#@title  Save SFT Preprocessed Data

sft_dataset["train"].to_json(data_path+"sft_train_dataset.json", orient="records")
sft_dataset["test"].to_json(data_path+"sft_test_dataset.json", orient="records")

Creating json from Arrow format:   0%|          | 0/28 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

2215660

# Run SFT

In [16]:
#@title Load Preprocessed Data

from datasets import load_dataset

# Load jsonl data from disk
sft_train_dataset = load_dataset("json",
                       data_files=data_path+"sft_train_dataset.json",
                       split="train")
sft_test_dataset = load_dataset("json",
                       data_files=data_path+"sft_test_dataset.json",
                       split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [17]:
#@title The Base Model

model_name_or_path =  "codellama/CodeLlama-13b-hf"

In [18]:
#@title Load Tokenizer

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path,
                                          trust_remode_code=True)
# Choose padding side "right" to prevent warnings
tokenizer.padding_side = 'right'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [19]:
#@title Quantization Parameters

from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)


In [20]:
#@title Device Map

device_map = {"": torch.cuda.current_device()} if torch.cuda.is_available() else None

In [21]:
#@title Load Base Model

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    revision="main",
    device_map=device_map,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/31.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [22]:
#@title Set Chat Template

from trl import setup_chat_format

# Use the chat template ChatML by OpenAi builder from Hugging Face
model, tokenizer = setup_chat_format(model, tokenizer)

In [24]:
#@title LoRA Configuration

from peft import LoraConfig

peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.05,
        r=8,
        bias="none",
        target_modules=["q_proj", "o_proj", "gate_proj", "up_proj", "down_proj",
                        "k_proj", "v_proj"], # apply LoRA to all linear layers
        task_type="CAUSAL_LM",
)

In [27]:
# @title Training Arguments

from transformers import TrainingArguments

# Adapted from  Phil Schmid blogpost
args = TrainingArguments(
    output_dir=sft_model_path,              # directory to save the model and repository id
    num_train_epochs=1,                     # number of training epochs
    per_device_train_batch_size=8,          # batch size per device during training
    gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory, use in distributed training
    optim="adamw_8bit",                     # use paged_adamw_8bit if not enough memory
    logging_steps=100,                      # log every 100 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    bf16=True,                              # use bfloat16 precision
    tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to Hugging Face hub
    hub_model_id="codellama-13b-sft-qlora-cypher",
    report_to="tensorboard",               # report metrics to tensorboard
)

In [29]:
# @title Initialize the SFTTrainer

from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=sft_train_dataset,
    peft_config=peft_config,
    max_seq_length=512,
    tokenizer=tokenizer,
    dataset_kwargs={
        "add_special_tokens": False,  # the template adds the special tokens
        "append_concat_token": False, # no need to add additional separator token
    }
)

Map:   0%|          | 0/27104 [00:00<?, ? examples/s]

In [30]:
#@title Train the Model

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.bfloat16.


Step,Training Loss
100,0.4923
200,0.1519
300,0.0827
400,0.0678
500,0.0638
600,0.0578
700,0.0572


Step,Training Loss
100,0.4923
200,0.1519
300,0.0827
400,0.0678
500,0.0638
600,0.0578
700,0.0572
800,0.0561




TrainOutput(global_step=847, training_loss=0.12463992489270928, metrics={'train_runtime': 8000.4263, 'train_samples_per_second': 3.388, 'train_steps_per_second': 0.106, 'total_flos': 6.629534769905664e+17, 'train_loss': 0.12463992489270928, 'epoch': 1.0})

In [None]:
#@title Save Model Locally

trainer.save_model()

# Inference Tests

In [None]:
#@title Clear Memory
import gc
#del trainer
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

In [None]:
#@title Load Peft Model

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer, pipeline
import torch

# HF model
peft_model_id = "solanaO/codellama-13b-sft-qlora-cypher"
# Local model
#peft_model_id = sft_model_path

# Load Model with PEFT adapter
model = AutoPeftModelForCausalLM.from_pretrained(
  peft_model_id,
  device_map="auto",
  #torch_dtype=torch.float16,
  load_in_4bit=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/31.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/9.90G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.44k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_model.safetensors:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

In [None]:
#@title Load Tokenizer

tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(32018, 5120)

In [None]:
# Text generation pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCa

In [None]:
#@title Load Test Data

from datasets import load_dataset
from random import randint

# Load jsonl data from disk
eval_dataset = load_dataset("json", data_files=data_path+"sft_test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
#@title One Sample Test

prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2],
                                            tokenize=False,
                                            add_generation_prompt=True)
print(prompt)

<|im_start|>system

You are a text to Cypher query translator. Convert the following question into a Cypher query using the provided graph schema!
Graph schema: Node properties are the following:
Article {abstract, article_id, comments, title}
The relationships are the following:
(:Article)-[:HAS_KEY]->(:Keyword),(:Article)-[:HAS_DOI]->(:DOI),(:Article)-[:HAS_CATEGORY]->(:Categories),(:Article)-[:WRITTEN_BY]->(:Author),(:Article)-[:UPDATED]->(:UpdateDate),(:Article)-[:PUBLISHED_IN]->(:Journal),(:Article)-[:HAS_REPORT]->(:Report)
<|im_end|>
<|im_start|>user
Retrieve distinct values of the title and the comments from Article where title is not The Gervais-Neveu-Felder equation for the Jordanian quasi-Hopf
  U_{h;y}(sl(2)) algebra!<|im_end|>
<|im_start|>assistant



In [None]:
outputs = pipe(prompt,
              max_new_tokens=150,
              do_sample=True,
              temperature=0.1,
              top_k=50,
              top_p=0.1,
              eos_token_id=pipe.tokenizer.eos_token_id,
              pad_token_id=pipe.tokenizer.pad_token_id
              )

In [None]:
print(f"Question: {eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Corect Cypher: {eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Cypher: {outputs[0]['generated_text'][len(prompt):].strip()}")

Question: Retrieve distinct values of the title and the comments from Article where title is not The Gervais-Neveu-Felder equation for the Jordanian quasi-Hopf
  U_{h;y}(sl(2)) algebra!
Corect Cypher: MATCH (n:Article) WHERE n.title <> 'The Gervais-Neveu-Felder equation for the Jordanian quasi-Hopf
  U_{h;y}(sl(2)) algebra' RETURN DISTINCT n.title AS title, n.comments AS comments
Generated Cypher: MATCH (n:Article) WHERE n.title <> 'The Gervais-Neveu-Felder equation for the Jordanian quasi-Hopf
  U_{h;y}(sl(2)) algebra' RETURN DISTINCT n.title AS title, n.comments AS comments


In [None]:
#@title Test on 100 Samples

from tqdm import tqdm

def evaluate(sample):
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2],
                                                tokenize=False,
                                                add_generation_prompt=True)
    outputs = pipe(prompt,
                   max_new_tokens=256,
                   do_sample=True,
                   temperature=0.7,
                   top_k=50,
                   top_p=0.95,
                   eos_token_id=pipe.tokenizer.eos_token_id,
                   pad_token_id=pipe.tokenizer.pad_token_id)

    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()

    if predicted_answer == sample["messages"][2]["content"]:
        return 1
    else:
        return 0

success_rate = []
number_of_eval_samples = 100

# Iterate over eval dataset and predict
for s in tqdm(eval_dataset.shuffle().select(range(number_of_eval_samples))):
    success_rate.append(evaluate(s))

# compute accuracy
accuracy = sum(success_rate)/len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")

100%|██████████| 100/100 [14:52<00:00,  8.93s/it]

Accuracy: 86.00%



