In [1]:
import os
import sys
# Location of the repository root relative to the notebook's working directory.
# Later cells reuse it to find token files and the tokenized dataset.
DIR_PREFIX = ".."

# Search the repository root first when importing modules.
sys.path.insert(0, DIR_PREFIX)

# GPU selection is configured here, in the first cell, ahead of the cell that
# imports torch.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0,1",
    }
)

In [2]:
import re
import torch
import torch.nn as nn
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, pipeline
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import pandas as pd
import numpy as np
import datasets
from peft import PeftModel, PeftConfig
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from transformers import TrainingArguments, Trainer
import wandb
from transformers import TrainerCallback
from accelerate import Accelerator
from transformers import AutoModelForSeq2SeqLM
from peft import PrefixTuningConfig
from datasets import load_dataset
from pprint import pprint
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import evaluate

#import custom scripts
from CommitChronicle_preprocessing.DatasetParser import DatasetParser

ModuleNotFoundError: No module named 'CommitChronicle_preprocessing'

In [3]:
# Authenticate the Hugging Face CLI with the tokens stored under
# DIR_PREFIX/tokens. NOTE(review): the cell output shows both logins saving to
# the same token cache file, so presumably only the second (write) token stays
# active -- confirm whether the first login is still needed.
!huggingface-cli login --token $(cat $DIR_PREFIX"/tokens/hf.token")
!huggingface-cli login --token $(cat $DIR_PREFIX"/tokens/hf_write_token.txt")
# Authenticate Weights & Biases with the token stored next to the HF tokens.
!wandb login --relogin $(cat $DIR_PREFIX"/tokens/wandb_token.txt")
# Environment variable naming the W&B project for this notebook's runs.
%env WANDB_PROJECT=commit_message_generation

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful
Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
env: WANDB_PROJECT=commit_message_generation


In [3]:
# Hub identifier of the pretrained model that is loaded and fine-tuned below.
checkpoint = "Salesforce/codet5p-220m"

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [5]:
# Load the tokenizer and the seq2seq model from the same checkpoint.
# NOTE(review): only the model load passes local_files_only=True; confirm
# whether the tokenizer should be restricted to local files as well.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map="auto",
    local_files_only=True,
)

# Project helper; its add_special_tokens() receives both tokenizer and model.
parser = DatasetParser(tokenizer)
parser.add_special_tokens(tokenizer, model)

# Display the tokenizer size after the special tokens were added.
len(tokenizer)

32108

In [6]:
# Load the tokenized dataset stored under the repository root and pull out
# the two splits used in this notebook.
dataset = load_dataset(f"{DIR_PREFIX}/tokenized_CommitChronicle")
train_data, val_data = dataset['train'], dataset['validation']

Resolving data files:   0%|          | 0/182 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/38 [00:00<?, ?it/s]

In [7]:
# Apply the parser's add_tokens_to_msg transform to both splits
# (validation first, then train), using 16 worker processes each.
val_data, train_data = (
    split.map(parser.add_tokens_to_msg, num_proc=16)
    for split in (val_data, train_data)
)

In [8]:
def tokenize_msgs(example):
    """Tokenize ``example["message"]`` into a ``labels`` entry for training.

    The message is encoded with the notebook-level ``tokenizer`` (padded to
    the maximum length, truncated, returned as a PyTorch tensor). Every
    position holding the pad token id is then overwritten with -100.

    Args:
        example: Mapping with a ``"message"`` entry.

    Returns:
        The same ``example`` object with ``"labels"`` added.
    """
    encoded = tokenizer(
        example["message"],
        return_tensors="pt",
        padding='max_length',
        truncation=True,
    )
    label_ids = encoded['input_ids']
    # Mask out padding positions in place.
    is_padding = label_ids == tokenizer.pad_token_id
    label_ids[is_padding] = -100
    example["labels"] = label_ids
    return example

# train_data = train_data.map(tokenize_msgs, num_proc=12)
# val_data = val_data.map(tokenize_msgs, num_proc=12)

In [9]:
# Build the `labels` column for both splits (validation first, then train)
# with 8 worker processes each.
val_data, train_data = (
    split.map(tokenize_msgs, num_proc=8)
    for split in (val_data, train_data)
)

In [10]:
# Resolve the id of the custom end-of-message marker. Encoding without the
# tokenizer's own special tokens avoids depending on exactly one prefix token
# sitting at index 0, and the assertion catches the case where the marker is
# not a single vocabulary entry (which would otherwise silently select the id
# of its first sub-token).
_eos_ids = tokenizer.encode('</commit_msg>', add_special_tokens=False)
assert len(_eos_ids) == 1, (
    f"'</commit_msg>' must map to exactly one token id, got {_eos_ids}"
)
eos_token_id = _eos_ids[0]

In [11]:
# Point the top-level, decoder and encoder configs at the custom EOS id.
for _cfg in (model.config, model.decoder.config, model.encoder.config):
    _cfg.eos_token_id = eos_token_id

In [12]:
def squeeze(example):
    """Drop size-1 dimensions from the tensor columns of ``example``.

    Args:
        example: Mapping holding tensors under ``'labels'``, ``'input_ids'``
            and ``'attention_mask'``.

    Returns:
        The same ``example`` object with those three entries squeezed.
    """
    for column in ('labels', 'input_ids', 'attention_mask'):
        example[column] = torch.squeeze(example[column])
    return example

# Have both splits return torch tensors so that squeeze() receives tensors.
train_data.set_format('torch')
val_data.set_format('torch')

# Squeeze the tensor columns of both splits (train first, then validation).
train_data, val_data = (
    split.map(squeeze, num_proc=8)
    for split in (train_data, val_data)
)

In [13]:
def model_inference(model, tokenizer, text, return_dict=False, seqs=5):
    """Generate ``seqs`` candidate outputs for ``text`` and print or return them.

    Args:
        model: Model exposing ``eval()`` and ``generate(**kwargs)``; its
            ``device`` attribute (or the device of its first parameter)
            decides where the encoded inputs are placed.
        tokenizer: Callable tokenizer that also provides ``decode``.
        text: Prompt passed to the tokenizer.
        return_dict: When false, the candidates are printed with special
            tokens stripped and nothing is returned. When true, the decoded
            candidates are returned with special tokens kept (despite the
            name, the return value is a list).
        seqs: Number of sequences requested from ``generate``.

    Returns:
        ``None`` when ``return_dict`` is false, otherwise a list of strings.
    """
    # Place the inputs on the model's own device instead of depending on a
    # notebook-level `device` variable being defined and in sync.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = next(model.parameters()).device

    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, padding='max_length'
    ).to(target_device)

    model.eval()
    with torch.no_grad():
        sample_outputs = model.generate(
            **encoded,
            max_new_tokens=200,
            top_k=100,
            num_return_sequences=seqs,
            # Beam search cannot return more sequences than it has beams, so
            # widen the beam when more than the default 5 are requested.
            num_beams=max(5, seqs),
            no_repeat_ngram_size=2,
            do_sample=True,
            early_stopping=True,
            top_p=0.95,
        )

    if return_dict:
        return [
            tokenizer.decode(sample_output, skip_special_tokens=False)
            for sample_output in sample_outputs
        ]

    for i, sample_output in enumerate(sample_outputs):
        print(
            "{}: {}".format(
                i, tokenizer.decode(sample_output, skip_special_tokens=True)
            )
        )
        print("-" * 80)

In [14]:
def random_sample_prediction(model, tokenizer, data):
    """Print a random example from ``data`` next to the model's generations.

    Args:
        model: Model forwarded to ``model_inference``.
        tokenizer: Tokenizer forwarded to ``model_inference``.
        data: Indexable collection whose items provide ``"model_input"`` and
            ``"message"`` entries.

    Returns:
        None. Everything is written to standard output.
    """
    # Plain int so the index works with containers that reject NumPy integers.
    n = int(np.random.randint(0, len(data)))
    # Fetch the row once and reuse it for every field.
    sample = data[n]

    print(f"Index - {n}")
    print(f"{'CODE_DIFFS':=^75}")
    print(sample["model_input"])
    print(f"{'MESSAGE':=^75}")
    print(sample["message"], '\n')
    print(f"{'GENERATED_MESSAGE':=^75}")

    generated = model_inference(model, tokenizer, sample["model_input"], return_dict=True)
    for elem in generated:
        print(elem)
        print("="*75)

In [15]:
# Generation settings handed to the trainer for generation-based evaluation.
gen_config = transformers.GenerationConfig(
    # Special token ids; the EOS id is the custom marker resolved earlier.
    bos_token_id=1,
    pad_token_id=0,
    decoder_start_token_id=0,
    eos_token_id=eos_token_id,
    # Output length budget.
    max_new_tokens=128,
    # Beam settings.
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=2,
    # Sampling settings.
    do_sample=True,
    top_k=100,
    top_p=0.95,
)

In [16]:
# Metric objects used by compute_metrics().
bleu = evaluate.load('sacrebleu')
bertscore = evaluate.load("bertscore")

train_args = transformers.Seq2SeqTrainingArguments(
    output_dir="checkpoints_v3",
    # Schedule: a single epoch; evaluation and logging are step-based. No
    # eval_steps is given -- the results table shows an evaluation every
    # 10000 steps, matching logging_steps.
    num_train_epochs=1,
    evaluation_strategy="steps",
    logging_strategy='steps',
    logging_steps=10000,
    save_steps=75000,
    save_total_limit=4,
    # Optimisation.
    optim="adamw_torch",
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=100,
    bf16=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluation generates text using gen_config.
    predict_with_generate=True,
    generation_config=gen_config,
    # Experiment tracking.
    report_to='wandb',
    run_name='codeT5_V2_commit_msg_added',
)

# Collator built from the tokenizer and the model being trained.
data_collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)

def compute_metrics(eval_preds):
    """Compute BERTScore F1, sacreBLEU and mean generated length.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. Entries equal
            to -100 are replaced by the pad token id before decoding. When
            ``preds`` is a tuple, its first element is used as the ids.

    Returns:
        Dict with keys ``"BERTscore"`` (mean F1), ``"BLEU"`` (sacreBLEU score)
        and ``"gen_len"`` (mean count of non-pad tokens per prediction), each
        rounded to 4 decimals.

    Raises:
        Whatever ``tokenizer.batch_decode`` raises. The offending ids are
        printed first and the original exception is re-raised unchanged.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded, so substitute the pad token id for it.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    try:
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    except Exception:
        # Dump the ids for debugging, then let the real error propagate
        # instead of masking it behind an unrelated exception type.
        print(preds)
        print("="*100)
        print(labels)
        raise

    bert_f1 = bertscore.compute(
        predictions=decoded_preds, references=decoded_labels, lang='en', device=device
    )["f1"]
    # sacreBLEU takes a list of references per prediction; pass that explicitly.
    bleu_score = bleu.compute(
        predictions=decoded_preds,
        references=[[reference] for reference in decoded_labels],
    )
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {
        "BERTscore": np.array(bert_f1).mean(),
        "BLEU": bleu_score['score'],
        "gen_len": np.mean(prediction_lens),
    }
    return {k: round(v, 4) for k, v in result.items()}

In [17]:
# Evaluate on a fixed random subset of the validation split.
# - A seeded generator makes the subset identical across kernel restarts.
# - choice(len(val_data), ...) draws from every valid index; the previous
#   randint(0, len(val_data)-1) call had an exclusive upper bound and could
#   never pick the last row.
# - replace=False keeps the subset free of duplicate rows.
EVAL_SUBSET_SIZE = 1500
EVAL_SUBSET_SEED = 42

_eval_rng = np.random.default_rng(EVAL_SUBSET_SEED)
random_sample = _eval_rng.choice(
    len(val_data),
    size=min(EVAL_SUBSET_SIZE, len(val_data)),
    replace=False,
)
val_samples = val_data.select(random_sample)

In [18]:
# Wire the model, arguments, data and metrics into the trainer. Evaluation
# uses the sampled validation subset rather than the full split.
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=train_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_samples,
)

In [None]:
# Run fine-tuning with the trainer configured above; the training arguments
# set report_to='wandb' and step-based evaluation.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mnikitasergeev692[0m ([33mnary[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Bertscore,Bleu,Gen Len
10000,3.0752,2.743871,0.8696,5.059,42.0347
20000,2.831,2.692919,0.8722,5.1806,40.1947
30000,2.7579,2.659077,0.8731,5.5281,36.136
40000,2.7171,2.633897,0.8733,3.9699,34.56
50000,2.6813,2.620397,0.8742,4.8042,30.072
60000,2.6494,2.612131,0.8742,4.7148,33.4187
70000,2.6264,2.602598,0.8757,4.805,33.8747


In [21]:
# Finish the Weights & Biases run; the run history and summary are shown in
# the cell output.
wandb.finish()

0,1
eval/BERTscore,▁▄▅▅▆▆▇▇▇▇▇▇▇▇▇▇▇██▇███
eval/BLEU,▆▆█▁▅▄▅▅▄▆▃▄▆▂▄▃▄▂▄▃▄▃▅
eval/gen_len,█▇▅▄▂▄▄▄▃▃▄▄▅▁▂▄▄▂▃▄▄▂▃
eval/loss,█▆▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
eval/runtime,▄▂▂▁▄▇▆▂▅▆▆▇█▄▅▅▇▅▆█▇▆▆
eval/samples_per_second,▄▇▇█▄▂▂▆▃▂▂▂▁▄▃▃▁▃▂▁▂▂▃
eval/steps_per_second,▄▆▇█▄▂▃▆▃▃▃▂▁▄▄▃▂▃▃▂▂▂▃
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇██
train/learning_rate,██▇▇▇▆▆▆▅▅▅▅▄▄▄▃▃▃▂▂▂▁▁

0,1
eval/BERTscore,0.8765
eval/BLEU,4.9595
eval/gen_len,31.4667
eval/loss,2.51152
eval/runtime,2035.3471
eval/samples_per_second,0.737
eval/steps_per_second,0.023
train/epoch,1.0
train/global_step,239359.0
train/learning_rate,0.0
