<a href="https://colab.research.google.com/github/markNZed/GPT-NeoX-Colab/blob/main/notebooks/codecompletion_benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Environment switches; these paths can be edited to "stub" behavior for test/dev
DOCKER = True
workspaceDir = "/content"
GPTNeoXColabDirName = "GPT-NeoX-Colab"
# With DOCKER set the repo is expected at /workspace; otherwise it is
# cloned under workspaceDir by a later cell.
GPTNeoXColabDir = "/workspace" if DOCKER else f"{workspaceDir}/{GPTNeoXColabDirName}"

# Clone CodeXGLUE Repo

In [12]:
%cd {workspaceDir}
!git clone --depth 1 https://github.com/microsoft/CodeXGLUE.git

/content
fatal: destination path 'CodeXGLUE' already exists and is not an empty directory.


In [13]:
%%time
#@title Clone GPT-NeoX-Colab
if DOCKER:
    %cd {GPTNeoXColabDir}
else:
    %cd {workspaceDir}
    # Don't use --depth 1 because that does not play nice with git-annex
    !git clone https://github.com/markNZed/GPT-NeoX-Colab.git
    %cd {GPTNeoXColabDir}
    %pip install -q -r requirements_colab.txt
    %pip install --use-feature=fast-deps -q .
from dotenv import load_dotenv
import os
load_dotenv(f"{GPTNeoXColabDir}/.env")
import GPTNeoXColab
GPTNeoXColab.utils.colab.fetch_data("data/codecompletion/token_completion.tar.gz")
%cd {GPTNeoXColabDir}/data/codecompletion
if not os.path.exists(f"data/codecompletion/token_completion"):
    !tar -xzf token_completion.tar.gz
GPTNeoXColab.utils.colab.fetch_data("models/codecompletion/global_step7000_HF.tar.gz")
%cd {GPTNeoXColabDir}/models/codecompletion
if not os.path.exists(f"latest"):
    !tar -xzf global_step7000_HF.tar.gz
    !mv global_step7000_HF latest

/workspace


Data retrieval successful.
/workspace/data/codecompletion
Data retrieval successful.
/workspace/models/codecompletion
CPU times: user 191 ms, sys: 50.4 ms, total: 242 ms
Wall time: 15.6 s


# Using Byte-Pair Encoding Tokenizer

In [14]:
%cd {GPTNeoXColabDir}/models/codecompletion/latest
if not os.path.exists("gpt2-vocab.json"):
    !wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
    !mv gpt2-vocab.json vocab.json
if not os.path.exists("gpt2-merges.txt"):
    !wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
    !mv gpt2-merges.txt merges.txt

/workspace/models/codecompletion/latest
--2024-11-15 08:00:17--  https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.12.118, 16.15.193.59, 3.5.24.248, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.12.118|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1042301 (1018K) [application/json]
Saving to: ‘gpt2-vocab.json’


2024-11-15 08:00:19 (680 KB/s) - ‘gpt2-vocab.json’ saved [1042301/1042301]

--2024-11-15 08:00:19--  https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 3.5.24.29, 52.216.12.118, 16.15.193.59, ...
Connecting to s3.amazonaws.com (s3.amazonaws.com)|3.5.24.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 456318 (446K) [text/plain]
Saving to: ‘gpt2-merges.txt’


2024-11-15 08:00:21 (330 KB/s) - ‘gpt2-merges.txt’ saved [456318/456318]



# HuggingFace Inference

In [18]:
from transformers import GPTNeoXForCausalLM, GPT2Tokenizer
import torch

%cd {workspaceDir}

# Initialize the tokenizer with your vocabulary and merge files
tokenizer = GPT2Tokenizer(vocab_file=f"{GPTNeoXColabDir}/models/codecompletion/latest/vocab.json", merges_file=f"{GPTNeoXColabDir}/models/codecompletion/latest/merges.txt")

# Load your model
model_path = f"{GPTNeoXColabDir}/models/codecompletion/latest"
model = GPTNeoXForCausalLM.from_pretrained(model_path)

# Set the model to evaluation mode
model.eval()

# Prompt the user for input
input_text = """<s> import sys , os <EOL> import imp <EOL> from optparse import make_option <EOL> from django . conf import settings <EOL> from django"""

# Tokenize and prepare input
input_ids = torch.tensor([tokenizer.encode(input_text)], dtype=torch.long)
attention_mask = torch.ones_like(input_ids)  # Create an attention mask for non-padded input

# Generate text with specified pad_token_id and attention_mask
with torch.no_grad():
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=200,          # Adjust this for desired output length
        temperature=0.7,        # Controls creativity
        top_k=50,               # Controls diversity
        top_p=0.9,              # Nucleus sampling
        num_return_sequences=1, # Number of sequences to return
        pad_token_id=model.config.eos_token_id,  # Set pad_token_id explicitly
        do_sample=True           # Enable sampling mode to use temperature and top_p
    )

# Decode the generated text
generated_text = tokenizer.decode(output[0].tolist())
print("Generated text:", generated_text)

# Function to replace special tokens with original representations
def replace_special_tokens(text):
    """
    Turn the dataset's marker tokens in `text` into readable stand-ins.

    Each marker is substituted with plain `str.replace`, one marker after
    another in the order listed below, and the result is returned with
    leading and trailing whitespace stripped.
    """
    # (marker, stand-in) pairs, applied strictly top to bottom
    substitutions = (
        ("<EOL>", "\n"),                          # real newline for code formatting
        ("<s>", ""),                              # start marker is dropped
        ("</s>", ""),                             # end marker is dropped
        ("<pad>", ""),                            # padding is dropped
        ("<|UNKNOWN|>", "[UNK]"),                 # readable unknown-token marker
        ("<STR_LIT>", "\"STRING_LITERAL\""),      # placeholder for string literals
        ("<NUM_LIT>", "0"),                       # placeholder for numeric literals
        ("<BOOL_LIT>", "True"),                   # placeholder for boolean literals
        ("<COMMENT>", "# COMMENT"),               # placeholder for comments
    )

    cleaned = text
    for marker, stand_in in substitutions:
        cleaned = cleaned.replace(marker, stand_in)

    return cleaned.strip()

# Post-process the sampled text: swap the marker tokens (<EOL>, <s>, ...) for
# their readable stand-ins via replace_special_tokens.
final_text = replace_special_tokens(generated_text)

# Show the cleaned-up completion next to the raw one printed above
print("Final text:", final_text)


/content
Generated text: <s> import sys , os <EOL> import imp <EOL> from optparse import make_option <EOL> from django . conf import settings <EOL> from django . conf . urls import url <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import unittest <EOL> from django . utils import
Final text: import sys , os 
 import imp 
 from optparse import make_option 
 from django . conf import settings 
 from django . conf . urls import url 
 from django . utils import unittest 
 from django . utils import unittest 
 from django . utils import unittest 
 from django . utils import unittest 
 from django . utils import unittest 
 from django . util

In [29]:
import gc
import os
import pickle

import torch
from torch.utils.data import Dataset

class EvalDataset(Dataset):
    """Token-completion evaluation data cut into fixed-length blocks of token ids.

    Every line of ``<data_dir>/<file_type>.txt`` is wrapped in ``<s> ... </s>``
    (unless it already is), tokenized, and concatenated into one id stream.
    The stream is then cut into blocks of ``seq_length`` ids by `split`, and
    the blocks are pickled to ``<output_dir>/<file_type>_blocksize_<seq_length>``
    so a later run can reload them instead of re-tokenizing.

    Args:
        tokenizer: Object providing ``encode``, ``decode``,
            ``convert_ids_to_tokens`` and the ``bos/eos/sep/pad_token_id``
            attributes.
        args: Namespace with ``output_dir``, ``data_dir``, ``overwrite_cache``
            and ``max_eval_length`` (``None`` disables the line limit).
        logger: Logger-like object with ``info`` and ``warning``.
        file_type: Basename (without ``.txt``) of the data file to read.
        seq_length: Number of token ids per block.
    """

    def __init__(self, tokenizer, args, logger, file_type='train', seq_length=1024):
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(args.output_dir, exist_ok=True)
        cached_file = os.path.join(args.output_dir, file_type+"_blocksize_%d"%(seq_length))
        if os.path.exists(cached_file) and not args.overwrite_cache:
            # NOTE: pickle.load can execute arbitrary code; only reuse cache
            # files that were written by this notebook.
            with open(cached_file, 'rb') as handle:
                self.inputs = pickle.load(handle)
            return

        self.inputs = []

        datafile = os.path.join(args.data_dir, f"{file_type}.txt")
        with open(datafile) as f:
            data = f.readlines()

        length = len(data)
        logger.info("Data size: %d"%(length))
        # Report progress roughly every 10% of the file; clamping to 1 keeps
        # files with fewer than 10 lines from dividing by zero.
        progress_step = max(length // 10, 1)
        input_ids = []
        for idx, x in enumerate(data):
            x = x.strip()
            if not (x.startswith("<s>") and x.endswith("</s>")):
                x = "<s> " + x + " </s>"
            try:
                input_ids.extend(tokenizer.encode(x))
            except Exception as exc:
                # Still best effort, but no longer silent: a dropped line
                # means the evaluated text no longer lines up with the file.
                logger.warning("skipping line %d: tokenization failed (%s)" % (idx, exc))
            if idx % progress_step == 0:
                logger.warning("load %d"%(100 * idx // length))
            # The limit is tested after the current line has been added and
            # uses ">", so lines 0 .. max_eval_length + 1 end up in the stream.
            if args.max_eval_length is not None and idx > args.max_eval_length:
                logger.info(f"max eval length reached at {idx}")
                break
        del data
        gc.collect()

        logger.info(f"tokens: {len(input_ids)}")
        self.split(input_ids, tokenizer, logger, seq_length=seq_length)
        del input_ids
        gc.collect()

        with open(cached_file, 'wb') as handle:
            pickle.dump(self.inputs, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def split(self, input_ids, tokenizer, logger, seq_length=1024):
        """Cut `input_ids` into padded blocks and append them to ``self.inputs``.

        A full block is shortened from its end until the cut falls on a
        boundary: just before a token that starts with the ``\u0120`` space
        marker or with ``<NUM_LIT``, just before a BOS token, or just after an
        EOS/SEP token. The next block starts where the previous one was cut.
        Short blocks are right-padded with ``pad_token_id``.

        Raises:
            ValueError: If a full block can only be cut by emptying it.
        """
        boundary_ids = (tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.sep_token_id)
        i = 0
        while i < len(input_ids):
            sample = input_ids[i: i+seq_length]
            if len(sample) == seq_length:
                # Walk backwards from the last token looking for a cut point
                for j in range(seq_length):
                    token_id = sample[seq_length-1-j]
                    token = tokenizer.convert_ids_to_tokens(token_id)
                    if token.startswith('\u0120') or token.startswith("<NUM_LIT"):
                        break
                    if token_id in boundary_ids:
                        # EOS/SEP stay inside the block; BOS moves to the next one
                        if token_id != tokenizer.bos_token_id:
                            j -= 1
                        break
                if j == seq_length-1:
                    # Raise instead of calling exit(), which would take the
                    # whole notebook kernel down with it.
                    raise ValueError(
                        "cannot cut block at token offset %d on a token boundary: %s"
                        % (i, tokenizer.decode(sample))
                    )
                sample = sample[: seq_length-1-j]
            i += len(sample)
            pad_len = seq_length-len(sample)
            sample += [tokenizer.pad_token_id]*pad_len
            self.inputs.append(sample)

            if len(self.inputs) % 10000 == 0:
                logger.info(f"{len(self.inputs)} samples")

    def __len__(self):
        """Return the number of blocks."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return block `item` as a tensor of token ids."""
        return torch.tensor(self.inputs[item])


In [None]:
import logging
import random
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler
from transformers import GPTNeoXForCausalLM, GPT2Tokenizer
from types import SimpleNamespace
import os

# Module-level logger used by eval_acc and main below, and passed into EvalDataset
logger = logging.getLogger(__name__)

def decode_token_ids(token_ids, tokenizer):
    """
    Convert token IDs to a string of code, turning BPE space markers into spaces.

    A token that starts with the ``\u0120`` marker begins a new word: the
    marker is removed and a single separating space is inserted unless the
    text built so far already ends with one. All other tokens (including
    markers such as ``<EOL>``) are appended verbatim with no extra spacing.

    Args:
        token_ids: Iterable of token ids.
        tokenizer: Object whose ``convert_ids_to_tokens(id)`` returns the
            token string for a single id.

    Returns:
        The concatenated text with surrounding whitespace stripped.
    """
    decoded_code = ""
    for token_id in token_ids:
        token = tokenizer.convert_ids_to_tokens(token_id)
        if token.startswith('\u0120'):
            # Always drop the marker. Previously a marker token that followed
            # a trailing space fell through to the verbatim branch and leaked
            # the raw marker character into the decoded text.
            if not decoded_code.endswith(" "):
                decoded_code += " "
            decoded_code += token[1:]
        else:
            decoded_code += token
    return decoded_code.strip()

def eval_acc(args, model, tokenizer, file_type='test'):
    """
    Evaluate the model's token-level code completion accuracy.

    The model is run once over every block of the evaluation dataset and the
    argmax of the logits is taken as the prediction. The prediction made at
    position ``i-1`` is compared against the ground-truth token at position
    ``i``. Sub-word tokens are regrouped into words: a ground-truth token that
    starts with the ``\u0120`` space marker opens a new word, and
    ``<s>``, ``</s>``, ``<EOL>``, ``<pad>`` and ``<NUM_LIT...`` tokens always
    form a word of their own. Only the first whitespace-separated piece of a
    decoded prediction is kept, with ``"<SPACE>"`` standing in when the decoded
    prediction is empty. Words whose ground truth is one of
    ``<s>``, ``</s>``, ``<EOL>``, ``<pad>`` are excluded from the accuracy.

    After the loop the word lists are handed to `post_process`, which writes
    ``predictions.txt`` and ``answers.txt`` into ``args.output_dir``.

    Args:
        args: Namespace providing ``seq_length``, ``eval_batch_size``,
            ``device``, ``logging_steps``, ``output_dir`` and ``data_dir``
            (plus whatever `EvalDataset` reads from it).
        model: Causal LM whose call returns an object with ``.logits``.
        tokenizer: Tokenizer used to build the dataset and decode ids.
        file_type: Basename (without ``.txt``) of the data file to evaluate.

    Returns:
        tuple: ``(total_predictions, total_correct)`` word counts.
    """
    special_tokens = ("<s>", "</s>", "<EOL>", "<pad>")

    def first_word(token_ids):
        # First whitespace-separated piece of the decoded prediction
        words = decode_token_ids(token_ids, tokenizer).strip().split()
        return words[0] if words else "<SPACE>"

    def whole_text(token_ids):
        # Decoded ground-truth word, kept in full
        return decode_token_ids(token_ids, tokenizer).strip()

    # Load evaluation dataset
    eval_dataset = EvalDataset(tokenizer, args, logger, file_type=file_type, seq_length=args.seq_length)
    eval_dataloader = DataLoader(eval_dataset, sampler=SequentialSampler(eval_dataset), batch_size=args.eval_batch_size)
    model.to(args.device)
    model.eval()

    # Initialize counters for accuracy
    total_correct, total_predictions = 0, 0
    total_pred_tokens, total_gt_tokens = [], []

    # Iterate through batches in the evaluation dataset
    for step, batch in enumerate(eval_dataloader):
        inputs = batch.to(args.device)

        with torch.no_grad():
            outputs = model(inputs)
            predicted_token_ids = outputs.logits.argmax(-1)  # Get predicted tokens

        pred_ids = predicted_token_ids.cpu()
        gt_ids = inputs.cpu()

        # Process predictions and ground truths
        all_pred = []
        all_gt = []
        # prev_pred is never reassigned, so the first position of every
        # sequence is always scored against the placeholder token id 0.
        prev_pred = None
        for pred_seq, gt_seq in zip(pred_ids, gt_ids):
            pred_seq = pred_seq.tolist()
            gt_seq = gt_seq.tolist()

            now_pred = []
            now_gt = []
            for i, gt_id in enumerate(gt_seq):
                gt_token = tokenizer.convert_ids_to_tokens(gt_id)

                if i == 0:
                    # Nothing precedes position 0, so there is no real
                    # prediction for it; use the placeholder instead.
                    first_pred = [0] if prev_pred is None else [prev_pred]
                    if gt_token in special_tokens:
                        all_pred.append(first_word(first_pred))
                        all_gt.append(whole_text([gt_id]))
                        now_gt = []
                        now_pred = []
                    else:
                        now_gt = [gt_id]
                        now_pred = first_pred
                else:
                    # A space-marked token starts a new word: flush the word
                    # collected so far before handling the current token.
                    if gt_token.startswith('\u0120') and len(now_gt) > 0:
                        all_pred.append(first_word(now_pred))
                        all_gt.append(whole_text(now_gt))
                        now_gt = []
                        now_pred = []
                    if gt_token in special_tokens or gt_token.startswith("<NUM_LIT"):
                        # Markers are words of their own: flush any pending
                        # word, then emit the marker immediately.
                        if len(now_gt) > 0:
                            all_pred.append(first_word(now_pred))
                            all_gt.append(whole_text(now_gt))
                        all_pred.append(first_word([pred_seq[i-1]]))
                        all_gt.append(whole_text([gt_id]))
                        now_gt = []
                        now_pred = []
                        continue
                    now_gt.append(gt_id)
                    now_pred.append(pred_seq[i-1])

        assert len(all_pred) == len(all_gt)

        total_pred_tokens.extend(all_pred)
        total_gt_tokens.extend(all_gt)

        # Calculate batch accuracy
        for pred_token, gt_token in zip(all_pred, all_gt):
            if gt_token not in special_tokens:
                total_predictions += 1
                if pred_token == gt_token:
                    total_correct += 1

        # Logging progress
        if step % args.logging_steps == 0:
            accuracy = total_correct / total_predictions if total_predictions > 0 else 0
            logger.info(f"Step {step} processed with cumulative accuracy: {accuracy:.2%}")

    # Final accuracy calculation
    accuracy = total_correct / total_predictions if total_predictions > 0 else 0
    logger.info(f"Final Test Accuracy: {accuracy:.2%}")

    # Call post_process to generate predictions.txt and answers.txt
    pred_file = os.path.join(args.output_dir, "predictions.txt")
    gt_file = os.path.join(args.output_dir, "answers.txt")
    # Read the reference text inside a context manager so the handle is closed
    with open(os.path.join(args.data_dir, f"{file_type}.txt")) as true_file:
        true_texts = true_file.readlines()
    total_samples = post_process(args, total_pred_tokens, total_gt_tokens, true_texts, pred_file, gt_file)
    logger.info(f"Evaluated on {total_samples} samples, saved predictions at {pred_file} and ground truths at {gt_file}")

    return total_predictions, total_correct

def post_process(args, preds, gts, true_gts, pred_file_path, gt_file_path):
    """
    Regroup word-level predictions and ground truths into one line per sample
    and write them to disk, checking each ground-truth line against the
    reference text.

    Pairs whose ground truth is ``""`` or ``"<pad>"`` are dropped. Spaces are
    removed from every kept prediction. A ground truth of ``"</s>"`` closes
    the current sample: its words are joined with single spaces, the
    ground-truth line is asserted equal to the stripped reference line with
    the same index, and both lines are written out. Words after the last
    ``"</s>"`` are not written.

    Args:
        args: General arguments or configuration settings (unused here).
        preds: List of predicted words from the model.
        gts: List of ground truth words, parallel to `preds`.
        true_gts: List of reference lines, one per sample, used for verification.
        pred_file_path: Path of the file that receives the prediction lines.
        gt_file_path: Path of the file that receives the ground-truth lines.

    Returns:
        int: The count of samples written.
    """
    ignored_gts = ("", "<pad>")
    written = 0
    gt_words, pred_words = [], []

    with open(pred_file_path, "w") as pred_out, open(gt_file_path, "w") as gt_out:
        for pred_word, gt_word in zip(preds, gts):
            if gt_word in ignored_gts:
                continue
            gt_words.append(gt_word)
            pred_words.append(pred_word.replace(" ", ""))

            # Keep collecting until the end-of-sample marker shows up
            if gt_word != "</s>":
                continue

            gt_line = " ".join(gt_words)
            pred_line = " ".join(pred_words)
            assert gt_line == true_gts[written].strip(), f"Sample {written} mismatch between ground truth and expected text"
            pred_out.write(pred_line + "\n")
            gt_out.write(gt_line + "\n")
            written += 1
            gt_words, pred_words = [], []

    return written


def main():
    """
    Load the checkpoint and tokenizer from the "latest" model directory,
    run `eval_acc` on the 'test' split and log the resulting accuracy.
    """
    pretrained_model_path = f"{GPTNeoXColabDir}/models/codecompletion/latest"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Evaluation settings, exposed with dot notation
    args = SimpleNamespace(
        n_gpu=torch.cuda.device_count(),
        per_gpu_eval_batch_size=1,
        logging_steps=1,
        output_dir=f"{GPTNeoXColabDir}/out",
        data_dir=f"{GPTNeoXColabDir}/data/codecompletion/token_completion",
        device=device,
        no_cuda=False,
        seq_length=2048,
        max_eval_length=10,
        overwrite_cache=True,
        eval_batch_size=1,
        local_rank=-1,
    )

    # Configure logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO)

    # Seed every random number generator in use, for reproducibility
    seed = 42
    for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
        seed_rng(seed)

    # Load model and tokenizer; the embedding matrix is resized to the tokenizer's length
    tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_path, sep_token='<EOL>', bos_token='<s>', eos_token='</s>', pad_token='<pad>', unk_token='<|UNKNOWN|>')
    model = GPTNeoXForCausalLM.from_pretrained(pretrained_model_path)
    model.resize_token_embeddings(len(tokenizer))

    # Human-readable parameter count: plain number, then millions, then billions
    total_params = sum(p.numel() for p in model.parameters())
    if total_params < 1e6:
        readable_params = f"{total_params:,}"
    elif total_params < 1e9:
        readable_params = f"{total_params / 1e6:.2f}M"
    else:
        readable_params = f"{total_params / 1e9:.2f}B"
    logger.info(f"Model has {readable_params} trainable parameters")

    # Evaluate model
    total_predictions, total_correct = eval_acc(args, model, tokenizer, 'test')
    if total_predictions > 0:
        accuracy = total_correct / total_predictions
    else:
        accuracy = 0
    logger.info(f"Test accuracy: {accuracy:.2%}")

# Run the evaluation when the cell is executed; the log output below shows the
# logger name as "__main__", i.e. the guard holds inside the notebook kernel.
if __name__ == "__main__":
    main()


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPTNeoXTokenizerFast'. 
The class this function is called from is 'GPT2Tokenizer'.
11/15/2024 08:45:23 - INFO - __main__ -   Model has 44.65M trainable parameters
11/15/2024 08:45:23 - INFO - __main__ -   Data size: 50000
11/15/2024 08:45:23 - INFO - __main__ -   max eval length reached at 11
11/15/2024 08:45:24 - INFO - __main__ -   tokens: 23540
11/15/2024 08:45:27 - INFO - __main__ -   Step 0 processed with cumulative accuracy: 30.84%
11/15/2024 08:45:30 - INFO - __main__ -   Step 1 processed with cumulative accuracy: 33.42%
11/15/2024 08:45:32 - INFO - __main__ -   Step 2 processed with cumulative accuracy: 33.31%
11/15/2024 08:45:35 - INFO - __main__ -   Step 3 processed with cumulative accuracy: 30.62%
11/15/2024 08:45:39 - INFO - __main__ -   Step 4 processed with cumulat

In [45]:
import subprocess
import sys

# Run evaluator.py on the generated files
evaluator_script = f"{GPTNeoXColabDir}/scripts/evaluator.py"
answers_file = f"{GPTNeoXColabDir}/out/answers.txt"
predictions_file = f"{GPTNeoXColabDir}/out/predictions.txt"
try:
    subprocess.run(
        # sys.executable runs the script with the interpreter this kernel uses;
        # a bare "python" is looked up on PATH and may be a different install.
        [sys.executable, evaluator_script, "--answers", answers_file, "--predictions", predictions_file],
        check=True,
    )
except subprocess.CalledProcessError as e:
    # check=True turns a non-zero exit status into this exception
    logger.error(f"Evaluator script failed with error: {e}")

INFO:__main__:Total 10600 tokens, accuracy: 34.69
