In [1]:
import sys, subprocess, os

REPO_URL = 'https://github.com/pelmers/llms-for-code-comment-consistency.git'
WANDB_KEY = 'zzz'

# Folder of train.json, valid.json, test.json
DATA_FOLDER = 'data/deepjit_summary_data'

LLM_PROMPT = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
LLM_INSTRUCTION = 'Does the following Comment correctly and fully summarize the Code? Respond Yes or No.'
LLM_INPUT = 'Comment: {comment}\nCode: {code}'
LLM_OUTPUT_0 = 'Yes.'
LLM_OUTPUT_1 = 'No.'

# Llama model parameters
BASE_MODEL = 'decapoda-research/llama-7b-hf'
# other choices: mosaicml/mpt-7b[-instruct], replit/replit-code-v1-3b
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
# In diff mode, the code is given as a diff from the previous version
USE_DIFF_MODE = False

# Training Parameters
BATCH_SIZE = 4
MAX_STEPS = 1000
EVAL_STEPS = MAX_STEPS // 5
WARMUP_STEPS = 50
MAX_LEN = 768
LEARNING_RATE = 1e-4
# Directory to load model, only used when MAX_STEPS is 0
LOAD_DIR = None

DEBUG = True

# After training on DATA_FOLDER, also evaluate on ADDITIONAL_TEST_FILES
ADDITIONAL_TEST_FILES = []

# If running as a script, allow os.environ to overwrite these options
if __name__ == '__main__':
    import os
    for k, v in os.environ.items():
        if k in globals():
            # First check if v is a boolean or a number and convert to the right type
            if v.lower() == 'true':
                v = True
            elif v.lower() == 'false':
                v = False
            elif v.isnumeric():
                v = int(v)
            # Or a float
            elif '.' in v and v.replace('.', '').isnumeric():
                v = float(v)
            # Or a list
            elif v.startswith('[') and v.endswith(']'):
                v = v[1:-1].split(',')
                v = [a.strip() for a in v]
            globals()[k] = v

# Define function x that given a command string, runs it with subprocess and streams the output
def x(cmd):
    return subprocess.run(cmd.split(" ")).returncode

RUN_LANGUAGE = 'java'
RUN_LANGUAGE = 'go' if 'go' in DATA_FOLDER.lower() else RUN_LANGUAGE
RUN_LANGUAGE = 'py' if 'python' in DATA_FOLDER.lower() else RUN_LANGUAGE
RUN_LANGUAGE = 'js' if 'javascript' in DATA_FOLDER.lower() or 'js' in DATA_FOLDER.lower() else RUN_LANGUAGE
RUN_LANGUAGE = 'all' if 'all' in DATA_FOLDER.lower() else RUN_LANGUAGE

RUN_NAME = f"{BASE_MODEL.split('/')[-1]}" + \
    f"_bs{BATCH_SIZE}_len{MAX_LEN}_lang{RUN_LANGUAGE}" + \
    ("_diff" if USE_DIFF_MODE else "") + \
    ("_debug" if DEBUG else "")

RUN_NOTES = f"""Notes:
Dataset: {DATA_FOLDER}
Max steps: {MAX_STEPS}

Learning rate: {LEARNING_RATE}

LLM parameters:
LORA_R: {LORA_R}
LORA_ALPHA: {LORA_ALPHA}
LORA_DROPOUT: {LORA_DROPOUT}
Prompt: {LLM_PROMPT}
Instruction: {LLM_INSTRUCTION}
Input: {LLM_INPUT}
Output 0: {LLM_OUTPUT_0}
Output 1: {LLM_OUTPUT_1}
"""

try:
    os.environ['LD_LIBRARY_PATH'] = '/opt/conda/lib:' + os.environ['LD_LIBRARY_PATH']
except KeyError:
    pass


import sklearn.metrics

# create list of 10 0's and 10 1's
truths = [0] * 10 + [1] * 10
predictions = [1] * 20
sample_weights = [19] * 10 + [1] * 10
weighted_f1 = sklearn.metrics.f1_score(truths, predictions, sample_weight=sample_weights)
print(weighted_f1)

0.09523809523809523


In [None]:
assert x('pip install -U scikit-learn wandb accelerate transformers[sentencepiece] matplotlib tokenizers bitsandbytes peft datasets') == 0

In [None]:
import os

# clone repo if this is just the notebook file (current folder is not 'rq1'), then cd to the cloned repo
if not os.path.abspath(os.getcwd()).endswith('rq2'):
    # If llms-for-code-comment-consistency exists, then go in and pull any updates
    if os.path.exists('llms-for-code-comment-consistency'):
        os.chdir('llms-for-code-comment-consistency')
        # Update if already exists
        try:
            assert x('git pull origin main --ff-only') == 0
        except AssertionError:
            # old version of git doesn't support --ff-only
            assert x('git pull origin main') == 0
    else:
        assert x(f'git clone {REPO_URL}') == 0
        os.chdir('llms-for-code-comment-consistency')
else:
    # cd to parent of this folder for the root of the repo
    os.chdir(os.path.dirname(os.getcwd()))

sys.path.append('lib')

In [3]:
def ensure_data_folder(folder):
    # If the data folder does not exist, extract it from the .tar.gz file
    if not os.path.exists(folder):
        print('Extracting data...')
        if not os.path.exists(f'{folder}.tar.gz'):
            print('Downloading data from server...')
            assert x(f'wget -O {folder}.tar.gz https://file2.pelmers.com/{folder}.tar.gz') == 0
        assert x(f'tar -xzf {folder}.tar.gz -C data') == 0

ensure_data_folder(DATA_FOLDER)

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (10.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.3/10.3 MB 79.0 MB/s eta 0:00:00
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ecco 0.1.2 requires scikit-learn~=0.23, but you have scikit-learn 1.3.0 which is incompatible.


Successfully installed scikit-learn-1.3.0


In [4]:
import wandb
wandb.login(key=WANDB_KEY)
run = wandb.init(project=f"{BASE_MODEL.split('/')[-1]}-model-master")
if MAX_STEPS == 0:
    run.name = RUN_NAME + "_eval"
else:
    run.name = RUN_NAME
run.notes = RUN_NOTES

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpelmers[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/ubuntu/.netrc


In [6]:
# Adapted hugging face example: https://github.com/huggingface/peft/blob/main/examples/fp4_finetuning/finetune_fp4_opt_bnb_peft.py
# And: https://github.com/tloen/alpaca-lora/blob/main/finetune.py

import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, LlamaForCausalLM, LlamaTokenizer

dm = 'auto'
if DEBUG and not torch.cuda.is_available():
    print("WARNING: CUDA not available, using CPU")
    dm = 'cpu'

if torch.cuda.is_available():
    x('nvidia-smi')

if 'llama' in BASE_MODEL:
    my_model = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map=dm,
    )

    tokenizer = LlamaTokenizer.from_pretrained(BASE_MODEL)
else:
    # use auto classes
    my_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map=dm,
        trust_remote_code=True,
    )

    if 'mosaic' in BASE_MODEL:
        tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', trust_remote_code=True)
    else:
        tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"  # Allow batched inference



Downloading (…)resolve/main/norm.py: 100%|██████████| 2.56k/2.56k [00:00<00:00, 2.75MB/s]
A new version of the following files was downloaded from https://huggingface.co/replit/replit-code-v1-3b:
- norm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/replit/replit-code-v1-3b:
- norm.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading (…)n/adapt_tokenizer.py: 100%|██████████| 1.75k/1.75k [00:00<00:00, 3.15MB/s]
A new version of the following files was downloaded from https://huggingface.co/replit/replit-code-v1-3b:
- adapt_tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading

You are using config.init_device='cpu', but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ubuntu/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
'NoneType' object has no attribute 'cadam32bit_grad_fp32'
CUDA SETUP: Loading binary /home/ubuntu/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...
/home/ubuntu/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: cannot open shared object file: No such file or directory


  warn("The installed version of bitsandbytes was compiled without GPU support. "
Downloading (…)neration_config.json: 100%|██████████| 91.0/91.0 [00:00<00:00, 46.5kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 400/400 [00:00<00:00, 546kB/s]
Downloading (…)plit_lm_tokenizer.py: 100%|██████████| 6.26k/6.26k [00:00<00:00, 4.80MB/s]
A new version of the following files was downloaded from https://huggingface.co/replit/replit-code-v1-3b:
- replit_lm_tokenizer.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
Downloading spiece.model: 100%|██████████| 708k/708k [00:00<00:00, 315MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 87.0/87.0 [00:00<00:00, 90.3kB/s]


In [5]:
for param in my_model.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

try:
  my_model.gradient_checkpointing_enable()  # reduce number of stored activations
  my_model.enable_input_require_grads()
except:
  print("WARNING: gradient checkpointing not available")


# Also for stability, cast the output to fp32
class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float32)
try:
  my_model.lm_head = CastOutputToFloat(my_model.lm_head)
except:
  print("WARNING: could not cast output to float")

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(model)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

from peft import LoraConfig, get_peft_model 

target_modules = ["q_proj", "v_proj"]

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=target_modules,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)

if LOAD_DIR:
    # Load peft adapter from checkpoint
    from peft import PeftModel
    my_model = PeftModel.from_pretrained(my_model, LOAD_DIR)
else:
    my_model = get_peft_model(my_model, config)

print_trainable_parameters(my_model)

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [13]:
# Next prepare the dataset
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=MAX_LEN,
        padding=False,
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < MAX_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    add_eos_token = False
    user_prompt = LLM_PROMPT.format(**data_point)
    full_prompt = user_prompt + data_point["output"]
    tokenized_full_prompt = tokenize(full_prompt)
    tokenized_user_prompt = tokenize(
        user_prompt, add_eos_token=add_eos_token
    )
    user_prompt_len = len(tokenized_user_prompt["input_ids"])

    if add_eos_token:
        user_prompt_len -= 1

    tokenized_full_prompt["labels"] = [
        -100
    ] * user_prompt_len + tokenized_full_prompt["labels"][
        user_prompt_len:
    ]  # could be sped up, probably
    return tokenized_full_prompt

def load_comment_data(json_path):
    import datasets, difflib
    ds = datasets.load_dataset('json', data_files=json_path)
    def create_prompt(x):
        code = x['new_code_raw']
        if USE_DIFF_MODE:
            # Take the diff between old_code_raw and new_code_raw
            code = '\n'.join(difflib.ndiff(x['old_code_raw'].split('\n'), x['new_code_raw'].split('\n')))
            # remove all lines starting with ? since those are for human readability
            code = '\n'.join([line for line in code.split('\n') if not line.startswith('?')])
        return {'instruction': LLM_INSTRUCTION,
                'input': LLM_INPUT.format(comment=x['old_comment_raw'], code=code),
                'output': LLM_OUTPUT_0 if x['label'] == 0 else LLM_OUTPUT_1}
    # Cast the data to keys 'instruction', 'input', 'output'
    ds = ds.shuffle().map(create_prompt).map(generate_and_tokenize_prompt)
    # Remove all columns except for the input_ids, attention_mask, and labels
    ds = ds.select_columns(['input_ids', 'attention_mask', 'labels'])
    return ds['train']

train_ds = load_comment_data(os.path.join(DATA_FOLDER, 'train.json'))
# TODO: use valid
valid_ds = load_comment_data(os.path.join(DATA_FOLDER, 'test.json'))
test_ds = load_comment_data(os.path.join(DATA_FOLDER, 'test.json'))

Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/json/default-96de9ed252c3f5d0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 146.30it/s]
Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/json/default-19c4891d74fd8d4b/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 154.89it/s]
Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/json/default-ef7a1a55740e6c71/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)
100%|██████████| 1/1 [00:00<00:00, 150.51it/s]
                                                                

In [8]:
# Training loop

import transformers

trainer = transformers.Trainer(
    model=my_model, 
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=BATCH_SIZE, 
        per_device_eval_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=max(1, 16 // BATCH_SIZE),
        warmup_steps=WARMUP_STEPS,
        max_steps=MAX_STEPS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=10, 
        optim="adamw_torch",
        report_to="wandb",
        output_dir=f'{RUN_NAME}_ckpt',
        load_best_model_at_end=True,
        evaluation_strategy="steps",
        eval_steps=EVAL_STEPS,
        save_steps=EVAL_STEPS,
        metric_for_best_model="eval_loss",
        eval_accumulation_steps=4,
    ),
    data_collator=transformers.DataCollatorForSeq2Seq(tokenizer, return_tensors='pt', padding=True, pad_to_multiple_of=8)
)

if MAX_STEPS > 0:
    trainer.train()

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA devices.

In [22]:
# Print a random generation from the training set

my_model.eval()

import random
from transformers import GenerationConfig

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

max_response_num_words = max(len(LLM_OUTPUT_0.split()), len(LLM_OUTPUT_1.split()))

def generate(model, tokenizer, data_point, max_length=max_response_num_words + 1):
    input_ids = data_point["input_ids"]
    labels = data_point["labels"]
    # Remove the user prompt from the input by looking for last label = -100
    user_input_ids = input_ids[: len(labels) - list(reversed(labels)).index(-100)]
    answer = tokenizer.decode(input_ids[len(user_input_ids):])
    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=1,
    )
    with torch.no_grad():
        generation_output = model.generate(
            input_ids=torch.tensor(user_input_ids).unsqueeze(0).to(device),
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_length,
        )
    s = generation_output.sequences[0][len(user_input_ids):]
    decoded_s = tokenizer.decode(s)
    return decoded_s, answer

def print_random_generation(model, tokenizer, dataset):
    data_point = random.choice(dataset)
    print(f'Datapoint: {tokenizer.decode(data_point["input_ids"], skip_special_tokens=True)}')
    output, answer = generate(model, tokenizer, data_point)
    print(f'Answer: {answer}')
    print(f'Generated: {output}')
    label = 1 if LLM_OUTPUT_1 in answer else 0
    pred = 1 if LLM_OUTPUT_1 in output else 0
    print(f'Correct: {label == pred}')

print_random_generation(my_model, tokenizer, train_ds)

Input: Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Determine whether the following Comment is consistent with the Code.

### Input:
Comment: Returns the list of operations which are combined in this composite operation.
Code: 	public List<ITransactionalOperation> getOperations() {
		return operations;
	}



### Response:
The comment is not consistent.


AssertionError: Torch not compiled with CUDA enabled

In [None]:
import sklearn.metrics
import numpy as np
from tqdm import tqdm

def evaluate(model, tokenizer, dataset):
    model.eval()
    truths = []
    predictions = []
    bar = tqdm(dataset)
    for data_point in bar:
        output, answer = generate(model, tokenizer, data_point)
        label = 1 if LLM_OUTPUT_1 in answer else 0
        pred = 1 if LLM_OUTPUT_1 in output else 0
        truths.append(label)
        predictions.append(pred)
        # Update bar with running accuracy
        bar.set_description(f'Accuracy: {sklearn.metrics.accuracy_score(truths, predictions):.3f}')

    truths = np.array(truths)
    predictions = np.array(predictions)
    sample_weights = np.ones(len(truths))
    # make the sample_weights array the same size as the number of samples in the dataset, with 1 for positive and weight for negative
    sample_weights[truths == 0] = 19
    acc = sklearn.metrics.accuracy_score(truths, predictions)
    f1 = sklearn.metrics.f1_score(truths, predictions)
    weighted_f1 = sklearn.metrics.f1_score(truths, predictions, sample_weight=sample_weights)
    precision = sklearn.metrics.precision_score(truths, predictions)
    recall = sklearn.metrics.recall_score(truths, predictions)
    return {
        'accuracy': acc,
        'f1': f1,
        'weighted_f1': weighted_f1,
        'precision': precision,
        'recall': recall,
    }

METRICS_KEYS = ['accuracy', 'f1', 'precision', 'recall', 'weighted_f1']

from pprint import pprint

# Print and log to wandb
test_metrics = evaluate(my_model, tokenizer, test_ds)
pprint(test_metrics)
wandb.log({'test_' + k: test_metrics[k] for k in METRICS_KEYS})

In [None]:
if MAX_STEPS > 0:
    save_name = f'{RUN_NAME}_test_{test_metrics["weighted_f1"]:.3f}'
    trainer.save_model(save_name)
    print(f'Saved model to {save_name}')
    # Tar the folder and upload to wandb
    x(f'tar -czf {save_name}.tar.gz {save_name}')
    wandb.save(f'{save_name}.tar.gz')
    print(f'Uploaded model to wandb run id {wandb.run.id}')

In [None]:
# Report results from additional test files
for path in ADDITIONAL_TEST_FILES:
    folder = os.path.dirname(path)
    ensure_data_folder(folder)
    print(f'Loaded test set from {path}')
    additional_ds = load_comment_data(path)
    additional_metrics = evaluate(my_model, tokenizer, additional_ds)
    pprint(additional_metrics)
    log_prefix = path.replace('.json', '').replace('data/', '')
    wandb.log({log_prefix + '_' + k: additional_metrics[k] for k in METRICS_KEYS})

In [None]:
# Finish
wandb.finish()
print('Done')