In [None]:
import getpass
import heapq
import os
import types

import pandas as pd
import torch
from datasets import Dataset
from IPython.display import display, HTML
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq

In [None]:
!nvidia-smi

display(HTML("<script>Jupyter.notebook.kernel.execute('config NotebookApp.iopub_msg_rate_limit=10000000000')</script>"))

In [None]:
def get_folders_in_directory(directory_path):
    """Return the names of the immediate sub-directories of ``directory_path``.

    Args:
        directory_path: Path of the directory to inspect.

    Returns:
        A list of entry names (not full paths) that are directories, in the
        order reported by ``os.listdir``. Regular files are skipped.
    """
    subdirectories = []
    for entry_name in os.listdir(directory_path):
        entry_path = os.path.join(directory_path, entry_name)
        if os.path.isdir(entry_path):
            subdirectories.append(entry_name)
    return subdirectories


# Scratch directory that is later used as the training output directory.
# exist_ok=True keeps this cell re-runnable: without it a second execution
# raises FileExistsError because the directory is already there.
new_directory_path = "/kaggle/tmp"
os.makedirs(new_directory_path, exist_ok=True)

# List the sub-directories of /kaggle/ as a quick look at what is available.
directory_path = "/kaggle/"
folders = get_folders_in_directory(directory_path)

print(folders)

In [None]:
# --- Interactive configuration ------------------------------------------------
model_path = input("Model Path: ")
data_path = input("Data Path: ")
# Read the token with getpass so the secret is not echoed into the cell output
# (and therefore not stored in the saved notebook).
huggingface_token = getpass.getpass("HuggingFace Token: ")
repo_name = input("Repository Name: ")

# --- Model and tokenizer ------------------------------------------------------
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

# --- Data ---------------------------------------------------------------------
# The CSV is expected to provide an "input" and a "target" column.
data = pd.read_csv(data_path)
inputs = data["input"].tolist()
targets = data["target"].tolist()

# Token lengths are measured with tokenizer.encode, which includes the special
# tokens the tokenizer adds. The largest value is later used as max_length for
# tokenizer(...), which adds those same special tokens; counting only the plain
# tokens would make max_length too small and truncate the longest examples.
input_lengths = [len(tokenizer.encode(question)) for question in inputs]
top_5_input_max_lengths = heapq.nlargest(5, input_lengths)

target_lengths = [len(tokenizer.encode(code)) for code in targets]
top_5_target_max_lengths = heapq.nlargest(5, target_lengths)

# Hold out 5% of the pairs for validation; fixed random_state for a repeatable split.
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    inputs, targets, test_size=0.05, random_state=1)

train_df = pd.DataFrame({"input": train_inputs, "target": train_targets})
val_df = pd.DataFrame({"input": val_inputs, "target": val_targets})

train_data = Dataset.from_dict(
    {"input": train_df["input"], "target": train_df["target"]})
val_data = Dataset.from_dict(
    {"input": val_df["input"], "target": val_df["target"]})

# Hyper-parameters consumed when the TrainingArguments are built.
args = types.SimpleNamespace(
    learning_rate=3e-4,
    train_batch_size=8,
    eval_batch_size=8,
    output_dir='/kaggle/tmp',
    num_train_epochs=10,
)

# Show the five longest input/target lengths (index 0 becomes max_length).
print(top_5_input_max_lengths)
print(top_5_target_max_lengths)

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of input/target text pairs into seq2seq features.

    Inputs and targets are truncated and padded to the longest lengths found
    in the dataset (``top_5_input_max_lengths[0]`` and
    ``top_5_target_max_lengths[0]``), using the module-level ``tokenizer``.

    Args:
        example_batch: Mapping with an ``"input"`` and a ``"target"`` entry,
            each holding a list of strings (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        A dict with ``input_ids`` and ``attention_mask`` for the inputs and
        ``labels`` for the targets. Padded label positions are set to -100.
    """
    input_texts = example_batch["input"]
    target_texts = example_batch["target"]

    input_encodings = tokenizer(
        input_texts, padding="max_length", truncation=True, max_length=top_5_input_max_lengths[0])
    target_encodings = tokenizer(
        target_texts, padding="max_length", truncation=True, max_length=top_5_target_max_lengths[0])

    # Labels are padded to a fixed length here, so the data collator has nothing
    # left to pad and will not insert its own ignore value. Mark the padded
    # positions with -100 (the index the loss ignores) so the model is not
    # trained to predict padding. The attention mask is used rather than the
    # pad token id so that only real padding positions are masked.
    labels = [
        [token_id if is_real_token else -100
         for token_id, is_real_token in zip(label_ids, label_mask)]
        for label_ids, label_mask in zip(
            target_encodings['input_ids'], target_encodings['attention_mask'])
    ]

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels
    }


# Tokenize both splits; batched=True passes lists of rows to the function.
train_pt = train_data.map(convert_examples_to_features, batched=True)
val_pt = val_data.map(convert_examples_to_features, batched=True)

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer_args = TrainingArguments(
    output_dir=args.output_dir,
    num_train_epochs=args.num_train_epochs,
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.train_batch_size,
    per_device_eval_batch_size=args.eval_batch_size,
    # Evaluate and log once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # fp16 mixed precision requires a CUDA device. The model is placed on the
    # CPU when no GPU is found, so only enable fp16 when CUDA is available
    # instead of failing at argument construction.
    fp16=torch.cuda.is_available(),
    seed=1
)

trainer = Trainer(
    model=model,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=train_pt,
    eval_dataset=val_pt
)

# Release cached GPU memory before the training run starts.
torch.cuda.empty_cache()

trainer.train()

# Persist the fine-tuned model and its tokenizer side by side in /kaggle/working/.
trainer.save_model("/kaggle/working/")

tokenizer.save_pretrained("/kaggle/working/")

In [None]:
# Log the Hugging Face CLI in with the token entered earlier in the notebook.
# NOTE(review): the token is interpolated into a shell command line, where it
# may be exposed (process list, captured output) — consider a safer login path.
!huggingface-cli login --token {huggingface_token}
# Create the target model repository (-y presumably auto-confirms the prompt — verify).
!huggingface-cli repo create {repo_name} --type model -y
# Reload the saved model from /kaggle/working/ and upload it, then upload the
# tokenizer to the same repository.
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained('/kaggle/working/')
finetuned_model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)