In [None]:
import importlib
import os
import sys

import numpy as np
import pandas as pd
from datasets import Dataset
# Put the parent directory on the import path so the local `src` package can be found.
sys.path.append("../")

import src  # noqa: E402

importlib.reload(src)

# Reload each module BEFORE binding names from it. `importlib.reload` re-executes the module
# but does not rebind names that were already copied out with `from ... import ...`, so
# importing first and reloading afterwards would leave this notebook holding the stale,
# pre-reload objects.
import src.data_prep_utils  # noqa: E402

importlib.reload(src.data_prep_utils)

from src.data_prep_utils import (  # noqa: E402
    conala_to_time_batches,
    load_time_sorted_conala,
)

import src.training  # noqa: E402

importlib.reload(src.training)

from src.training import nd_inference, retraining, continual  # noqa: E402

: 

In [4]:
import os

In [7]:
# Sets CUDA_VISIBLE_DEVICES to "0" for this process.
# NOTE(review): a cell further down assigns "1" to the same variable, so the effective value
# depends on which cell ran last — keep only one. Presumably this has to run before CUDA is
# first initialised to have any effect; confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
# Process-wide environment flags, set as strings.
# TOKENIZERS_PARALLELISM="false": presumably disables parallelism in the tokenizers library — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# WANDB_DISABLED="true": presumably switches off Weights & Biases logging — confirm.
os.environ["WANDB_DISABLED"] = "true"
# NOTE(review): an earlier cell in this file assigns "0" to the same variable; whichever cell
# runs last wins, so settle on a single value.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [22]:
# Top-level experiment settings; each one is also stored inside TRAIN_ARGS below.
MODEL = "CodeT5"
BATCH_SIZE = 15
DECODER_LENGTH = 20
ENCODER_LENGTH = 15

# Single configuration bundle. The nested "SEQ_TRAINER_ARGS" mapping holds keyword-style
# trainer options (presumably forwarded to Seq2SeqTrainingArguments — confirm against the
# code in src.training that consumes it).
TRAIN_ARGS = dict(
    BATCH_SIZE=BATCH_SIZE,
    DECODER_LENGTH=DECODER_LENGTH,
    ENCODER_LENGTH=ENCODER_LENGTH,
    MODEL=MODEL,
    SEQ_TRAINER_ARGS=dict(
        overwrite_output_dir=True,
        num_train_epochs=2,
        do_train=True,
        do_eval=True,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-4,
        warmup_steps=100,
        weight_decay=0.1,
        label_smoothing_factor=0.1,
        predict_with_generate=True,
        logging_steps=100,
        save_total_limit=1,
        save_strategy="epoch",
        logging_strategy="epoch",
        evaluation_strategy="epoch",
        load_best_model_at_end=False,
    ),
)

In [23]:
def load_ts_batch(DATE_STR, MODE, ts_batch_id):
    """Read the CSV file of one processed CoNaLa batch into a DataFrame.

    The file is looked up at
    ``../data/processed/conala/{DATE_STR}/{MODE}/conala_batch_{ts_batch_id}.csv``
    (relative to the current working directory).

    Args:
        DATE_STR: Name of the date directory, e.g. ``"20240326"``.
        MODE: One of ``"gradual"``, ``"temporal"``, ``"sudden"``, ``"recurring"``.
        ts_batch_id: Batch identifier used in the file name.

    Returns:
        pandas.DataFrame with the contents of the CSV file.

    Raises:
        ValueError: If ``MODE`` is not one of the supported modes.
    """
    # Validate before touching the filesystem: otherwise an unsupported mode surfaces as a
    # confusing "file not found" error from read_csv and this check is never reached.
    if MODE not in ("gradual", "temporal", "sudden", "recurring"):
        raise ValueError("Invalid mode")
    return pd.read_csv(f"../data/processed/conala/{DATE_STR}/{MODE}/conala_batch_{ts_batch_id}.csv")

In [27]:
# Selects which processed data directory to read: ../data/processed/conala/<DATE_STR>/<MODE>/
DATE_STR = "20240326"
# Must be one of the modes accepted by load_ts_batch: "gradual", "temporal", "sudden", "recurring".
MODE = "gradual"

In [28]:
# Load batches 0 and 1 for the configured date and mode as separate DataFrames.
ds_0 = load_ts_batch(DATE_STR, MODE, 0)
ds_1 = load_ts_batch(DATE_STR, MODE, 1)

In [42]:
# Bind the PyTorch base class under an unambiguous name. This notebook imports a `Dataset`
# from both `datasets` and `torch.utils.data`, so the bare name depends on which cell ran
# last; the class below is meant to be consumed by a torch DataLoader.
from torch.utils.data import Dataset as _TorchDataset


class C2TDataset(_TorchDataset):
    """Map-style dataset that tokenizes (code snippet, target text) pairs on access.

    Args:
        input_code: Sequence of source strings (list, NumPy array, pandas Series, ...).
        target_texts: Sequence of target strings, aligned position-by-position with
            ``input_code``.
        tokenizer: Object exposing ``encode_plus(text, max_length=..., padding=...,
            truncation=..., return_tensors="pt")`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_input_length: Length the encoded input is padded/truncated to.
        max_target_length: Length the encoded target is padded/truncated to.
    """

    def __init__(self,
                 input_code,
                 target_texts,
                 tokenizer,
                 max_input_length=15,
                 max_target_length=20):
        # Materialise both inputs as lists so that `self.x[idx]` is always positional.
        # A pandas Series would otherwise be indexed by label, which breaks as soon as
        # its index is not 0..n-1 (e.g. after filtering rows).
        self.input_code = list(input_code)
        self.target_texts = list(target_texts)
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of examples (length of ``input_code``)."""
        return len(self.input_code)

    def __getitem__(self, idx):
        """Tokenize and return the example at position ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask``, ``labels`` and
            ``decoder_attention_mask``. Padded positions of ``labels`` are set to -100.
        """
        input_encoding = self._encode(self.input_code[idx], self.max_input_length)
        target_encoding = self._encode(self.target_texts[idx], self.max_target_length)

        target_ids = target_encoding["input_ids"].flatten()
        target_mask = target_encoding["attention_mask"].flatten()
        # Padding must not count as a prediction target: -100 is the index the
        # cross-entropy loss of Hugging Face seq2seq models ignores. masked_fill returns
        # a new tensor, so the tokenizer output itself is left untouched.
        labels = target_ids.masked_fill(target_mask == 0, -100)

        return {
            "input_ids": input_encoding["input_ids"].flatten(),
            "attention_mask": input_encoding["attention_mask"].flatten(),
            "labels": labels,
            "decoder_attention_mask": target_mask,
        }

    def _encode(self, text, max_length):
        """Encode ``text`` padded/truncated to ``max_length`` as PyTorch tensors."""
        return self.tokenizer.encode_plus(
            text,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

In [33]:
from transformers import (
    DataCollatorForSeq2Seq,
    RobertaTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Config
)
import torch

In [35]:
# Identifier passed to every from_pretrained call below, so tokenizer, config and weights
# all come from the same checkpoint.
model_name = "Salesforce/codet5-base-multi-sum"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
config = T5Config.from_pretrained(model_name)
# Instantiate the T5 conditional-generation model with the config loaded above.
model = T5ForConditionalGeneration.from_pretrained(model_name, config=config)

In [37]:
from torch.utils.data import DataLoader, Dataset

In [43]:
# Training set built from batch 0: code snippets are the model input, rewritten intents the
# target. Both columns are passed as plain NumPy arrays so they are indexed the same way
# (positionally), and the sequence lengths come from the notebook's config constants instead
# of relying on the class defaults.
dataset = C2TDataset(
    input_code=ds_0["snippet"].values,
    target_texts=ds_0["rewritten_intent"].values,
    tokenizer=tokenizer,
    max_input_length=ENCODER_LENGTH,
    max_target_length=DECODER_LENGTH,
)
# Mini-batches of 4 examples, reshuffled each time the loader is iterated.
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [44]:
# Define training parameters
num_epochs = 2
learning_rate = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


  return torch._C._cuda_getDeviceCount() > 0


In [1]:
import torch

In [2]:
# Sanity check: does PyTorch see a usable CUDA device? (The recorded output below is False.)
torch.cuda.is_available()

  return torch._C._cuda_getDeviceCount() > 0


False

In [10]:
# Display the selected device.
# NOTE(review): the recorded output is a NameError, i.e. this cell was executed in a kernel
# where `device` had not been assigned yet — run the cell that defines `device` first.
device

NameError: name 'device' is not defined

In [None]:
# Place the model on the selected device and switch it to training mode.
model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch in dataloader:
        # Pull the four tensors out of the batch and move each one to the device.
        input_ids, attention_mask, labels, decoder_attention_mask = (
            batch[field].to(device)
            for field in ("input_ids", "attention_mask", "labels", "decoder_attention_mask")
        )

        # Forward pass; the model computes the loss itself because labels are supplied.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            decoder_attention_mask=decoder_attention_mask,
        )
        loss = outputs.loss

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the mean per-batch loss, then advance the LR schedule once per epoch.
    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}, Loss: {average_loss:.4f}")
    scheduler.step()

# Save the trained model, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("t5_finetuned_seq2seq_model")