# Transfer Learning


In [1]:
import math

import torch
import numpy as np
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from transformers.trainer_utils import EvalPrediction
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from lib.trainer import get_trainer
from utils import (
    prepare_starncoder_tokenizer,
    get_latest_checkpoint,
    concat_tokens_to_chunks,
)
from config import BaseConfig, MLMConfig

## Domain Modelling (MLM)


In [None]:
# Prepare Datasets

# Load the "train" split of the unlabelled git-commits corpus used for MLM.
git_commits = load_dataset("neuralsentry/git-commits", split="train")


def prepare_datasets(git_commits: Dataset, tokenizer, config: BaseConfig):
    """Tokenize commit messages, regroup the tokens into chunks, and split.

    Args:
        git_commits: Dataset exposing the ``commit_msg``, ``sha``,
            ``remote_url``, ``date`` and ``labels`` columns (all of them are
            dropped once the messages are tokenized).
        tokenizer: Callable applied to each batch of ``commit_msg`` values.
        config: Supplies ``MAX_INPUT_LENGTH``, ``NUM_CPU_WORKERS``,
            ``TRAIN_SIZE`` and ``EVAL_SIZE``.

    Returns:
        A ``(train_dataset, eval_dataset)`` tuple taken from the ``"train"``
        and ``"test"`` parts of the split.
    """
    source_columns = ["commit_msg", "sha", "remote_url", "date", "labels"]

    def tokenize_batch(batch):
        return tokenizer(batch["commit_msg"])

    # Stage 1: tokenize, keeping only the tokenizer's output columns.
    tokenized = git_commits.map(
        tokenize_batch,
        batched=True,
        remove_columns=source_columns,
    )

    # Stage 2: regroup tokens into chunks of MAX_INPUT_LENGTH.
    chunked = tokenized.map(
        concat_tokens_to_chunks(chunk_size=config.MAX_INPUT_LENGTH),
        batched=True,
        num_proc=config.NUM_CPU_WORKERS,
    )

    # Stage 3: seeded split so repeated runs produce the same partition.
    splits = chunked.train_test_split(
        train_size=config.TRAIN_SIZE, test_size=config.EVAL_SIZE, seed=420
    )
    return (splits["train"], splits["test"])

### StarEncoder


In [None]:
# Checkpoints
MODEL_CHECKPOINT = "bigcode/starencoder"
TOKENIZER_CHECKPOINT = "bigcode/starencoder"

# Prepare Models
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)
tokenizer = prepare_starncoder_tokenizer(TOKENIZER_CHECKPOINT)

# Prepare Config
model_name = "starencoder-finetuned-git-commits"
config = MLMConfig(
    NUM_TRAIN_EPOCHS=20,
    BATCH_SIZE=64,
    MAX_INPUT_LENGTH=256,
    PUSH_TO_HUB=True,
    HUB_MODEL_ID=f"neuralsentry/{model_name}",
)

# Prepare Datasets
train_dataset, eval_dataset = prepare_datasets(git_commits, tokenizer, config)

# Prepare Training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm_probability=config.MLM_PROB
)

# Logging interval = (full batches in the training split) * config.LOGGING_STEPS,
# floored. Clamp to 1: on a small split the product floors to 0, which
# step-based logging in TrainingArguments rejects.
full_batches = len(train_dataset) // config.BATCH_SIZE
logging_steps = max(1, math.floor(full_batches * config.LOGGING_STEPS))

training_args = TrainingArguments(
    output_dir=f"./models/{model_name}",
    overwrite_output_dir=True,
    learning_rate=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    fp16=config.FP16,
    evaluation_strategy="epoch",
    num_train_epochs=config.NUM_TRAIN_EPOCHS,
    logging_steps=logging_steps,
    save_strategy="epoch",
    push_to_hub=config.PUSH_TO_HUB,
    hub_model_id=config.HUB_MODEL_ID,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Evaluate on eval_dataset and keep the metrics as the "before" reference
# for the perplexity printed in the next cell.
results_before = trainer.evaluate()

In [12]:
# Perplexity is the exponential of the evaluation loss.
perplexity_before = math.exp(results_before["eval_loss"])
print(f">>> Perplexity: {perplexity_before}")

>>> Perplexity: 8.480668247075146


In [None]:
# Fine-tune before re-evaluating. Without this call a fresh
# "Restart & Run All" would evaluate the untouched checkpoint twice, so the
# "after" metrics would just repeat the "before" ones.
trainer.train()

# Re-evaluate on the same eval_dataset to measure the effect of training.
results_after = trainer.evaluate()

In [None]:
# Perplexity is the exponential of the evaluation loss.
perplexity_after = math.exp(results_after["eval_loss"])
print(f">>> Perplexity: {perplexity_after}")

>>> Perplexity: 3.2840768831826765


## Text Classification


### StarEncoder


In [None]:
# Checkpoints
MODEL_CHECKPOINT = "neuralsentry/starencoder-finetuned-git-commits"
TOKENIZER_CHECKPOINT = "neuralsentry/starencoder-finetuned-git-commits"

# Prepare Models
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT)
tokenizer = prepare_starncoder_tokenizer(TOKENIZER_CHECKPOINT)

# Prepare Config
# NOTE(review): MLMConfig is reused for this classification run — confirm that
# a dedicated config is not intended here.
model_name = "starencoder-finetuned-class"
config = MLMConfig(
    NUM_TRAIN_EPOCHS=20,
    BATCH_SIZE=32,
    MAX_INPUT_LENGTH=512,
    PUSH_TO_HUB=True,
    HUB_MODEL_ID=f"neuralsentry/{model_name}",
)

# Prepare Datasets
raw_dataset = load_dataset("neuralsentry/git-commits-labelled", split="train")
# Truncate each message to MAX_INPUT_LENGTH tokens; only the listed metadata
# columns are removed, so any other column (e.g. the labels) stays available.
tokenized_dataset = raw_dataset.map(
    lambda x: tokenizer(
        x["commit_msg"], truncation=True, max_length=config.MAX_INPUT_LENGTH
    ),
    batched=True,
    remove_columns=["commit_msg", "sha", "remote_url", "date"],
)
split_dataset = tokenized_dataset.train_test_split(
    train_size=config.TRAIN_SIZE, test_size=config.EVAL_SIZE, seed=420
)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

# Prepare Training
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Logging interval = (full batches in the training split) * config.LOGGING_STEPS,
# floored. Clamp to 1: on a small split the product floors to 0, which
# step-based logging in TrainingArguments rejects.
full_batches = len(train_dataset) // config.BATCH_SIZE
logging_steps = max(1, math.floor(full_batches * config.LOGGING_STEPS))

training_args = TrainingArguments(
    output_dir=f"./models/{model_name}",
    overwrite_output_dir=True,
    learning_rate=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    fp16=config.FP16,
    evaluation_strategy="epoch",
    num_train_epochs=config.NUM_TRAIN_EPOCHS,
    logging_steps=logging_steps,
    save_strategy="epoch",
    push_to_hub=config.PUSH_TO_HUB,
    hub_model_id=config.HUB_MODEL_ID,
)


def compute_metrics(pred: EvalPrediction):
    """Compute accuracy, precision, recall and F1 from an evaluation pass.

    Args:
        pred: Evaluation output carrying ``predictions`` (model outputs) and
            ``label_ids`` (reference labels).

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``.
    """
    # ``predictions`` is a bare array when the model emits a single output and
    # a tuple when it emits several. Indexing a bare array with [0] would pick
    # the first sample's row instead of the whole logits matrix, so only
    # unwrap when it is actually a sequence of outputs.
    # NOTE(review): assumes the logits are the first element of such a tuple,
    # as the original indexing did — confirm for this model.
    logits = pred.predictions
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    # Predicted class = index of the highest score along the last axis.
    preds = np.argmax(logits, axis=-1)
    labels = pred.label_ids

    return {
        "accuracy": accuracy_score(y_true=labels, y_pred=preds),
        "precision": precision_score(y_true=labels, y_pred=preds),
        "recall": recall_score(y_true=labels, y_pred=preds),
        "f1": f1_score(y_true=labels, y_pred=preds),
    }


# Build the classification trainer; arguments are grouped as run setup,
# batching/tokenization, data splits, then the metrics callback.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Evaluate on eval_dataset; as the cell's last expression the returned
# metrics are displayed as the cell output.
trainer.evaluate()

In [None]:
# Upload the trainer's model to the Hub repository set via hub_model_id.
trainer.push_to_hub()