# Model Evaluation

This notebook will evaluate the following models:

- StarEncoder
- CodeBert
- CodeGen
- FLAN-T5
- CodeTrans

The architecture, dataset, and training approaches of each model are compared in [model_comparisons.md](model_comparisons.md).

Metrics will also be generated for each model:

- Perplexity (MLM, CLM)
- Accuracy, F1 Score, Precision, Recall (Text Classification)


In [1]:
import math

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    PreTrainedTokenizer,
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    IntervalStrategy,
    Trainer,
)

import constants
from utils import tokenize_dataset_example, prepare_starncoder_tokenizer

In [2]:
BATCH_SIZE = 64  # Sequences per batch
EVAL_STEPS = 20  # Number of batches for evaluation
EVAL_SIZE = EVAL_STEPS * BATCH_SIZE

In [None]:
# Load Datasets

raw_datasets = load_dataset("csv", data_files="./data/commits.csv").shuffle(seed=420)


def tokenize_function(tokenizer: PreTrainedTokenizer, text_column: str = "commit_msg"):
    def apply(example: dict):
        result = tokenizer(example[text_column])
        return result

    return apply


def concatenate_texts(max_input_length: int):
    def apply(examples: dict):
        concatenated_texts = {k: sum(examples[k], []) for k, v in examples.items()}
        total_length = len(concatenated_texts["input_ids"])
        # Remove excess texts
        cut_length = (total_length // max_input_length) * max_input_length
        # Split texts from cut_length based on max_input_length
        result = {
            k: [
                t[i : i + max_input_length]
                for i in range(0, cut_length, max_input_length)
            ]
            for k, t in concatenated_texts.items()
        }
        return result

    return apply

## StarEncoder

In [21]:
TOKENIZER_CHECKPOINT = "bigcode/starencoder"
MODEL_CHECKPOINT = "bigcode/starencoder"

MAX_INPUT_LENGTH = 128  # max 1024 - higher value requires more VRAM

In [None]:
# Prepare Model
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

# Prepare Tokenizer
tokenizer = prepare_starncoder_tokenizer(TOKENIZER_CHECKPOINT)

# Prepare Datasets
tokenized_datasets = raw_datasets.map(
    tokenize_function(tokenizer, text_column="commit_msg"),
    batched=True,
    remove_columns=["commit_msg", "remote_url", "date", "sha", "labels"],
)
concatenated_datasets = tokenized_datasets.map(
    concatenate_texts(MAX_INPUT_LENGTH), batched=True
)

# Prepare Evaluator

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

eval_dataset = concatenated_datasets["train"]
if EVAL_SIZE:
    eval_dataset = eval_dataset.select(range(EVAL_SIZE))

training_args = TrainingArguments(
    output_dir="./models/eval",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
)

In [None]:
# Run Evaluation

eval_results = trainer.evaluate()

In [26]:
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 12.10


## CodeBert


In [14]:
TOKENIZER_CHECKPOINT = "microsoft/codebert-base"
MODEL_CHECKPOINT = "microsoft/codebert-base"

MAX_INPUT_LENGTH = 128  # max 512 - higher value requires more VRAM

In [None]:
# Prepare Model
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

# Prepare Tokenizer
tokenizer = prepare_starncoder_tokenizer(TOKENIZER_CHECKPOINT)

# Prepare Datasets
raw_datasets = load_dataset("csv", data_files="./data/commits.csv")
tokenized_datasets = raw_datasets.map(
    tokenize_function(tokenizer, text_column="commit_msg"),
    batched=True,
    remove_columns=["commit_msg", "remote_url", "date", "sha", "labels"],
)
concatenated_datasets = tokenized_datasets.map(
    concatenate_texts(MAX_INPUT_LENGTH), batched=True
)

# Prepare Evaluator

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

eval_dataset = concatenated_datasets["train"]
if EVAL_SIZE:
    eval_dataset = eval_dataset.select(range(EVAL_SIZE))

training_args = TrainingArguments(
    output_dir="./models/eval",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
)

In [None]:
# Run Evaluation

eval_results = trainer.evaluate()

In [19]:
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 35125948.32


## CodeGen

In [4]:
TOKENIZER_CHECKPOINT = "Salesforce/codegen-350M-multi"
MODEL_CHECKPOINT = "Salesforce/codegen-350M-multi"

MAX_INPUT_LENGTH = 64  # max 2048 - higher value requires more VRAM

In [None]:
# Prepare Model
model = AutoModelForCausalLM.from_pretrained(MODEL_CHECKPOINT)

# Prepare Tokenizer
tokenizer = prepare_starncoder_tokenizer(TOKENIZER_CHECKPOINT)

# Prepare Datasets
raw_datasets = load_dataset("csv", data_files="./data/commits.csv")
tokenized_datasets = raw_datasets.map(
    tokenize_function(tokenizer, text_column="commit_msg"),
    batched=True,
    remove_columns=["commit_msg", "remote_url", "date", "sha", "labels"],
)
concatenated_datasets = tokenized_datasets.map(
    concatenate_texts(MAX_INPUT_LENGTH), batched=True
)

# Prepare Evaluator

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=0.15
)

eval_dataset = concatenated_datasets["train"]
if EVAL_SIZE:
    eval_dataset = eval_dataset.select(range(EVAL_SIZE))

training_args = TrainingArguments(
    output_dir="./models/eval",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    fp16=True # can comment out if enough VRAM
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    eval_dataset=eval_dataset,
)

In [None]:
# Run Evaluation

eval_results = trainer.evaluate()

In [7]:
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

>>> Perplexity: 213.49


## FLAN-T5


## CodeTrans