# In [None]:
# CodeBERT-RNN defect-detection classifier trained on the code_x_glue_cc_defect_detection dataset

import numpy as np
from datasets import load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler

# Fetch the defect-detection corpus and pull out its three splits
dataset = load_dataset("code_x_glue_cc_defect_detection")
train_dataset, val_dataset, test_dataset = (
    dataset[split_name] for split_name in ("train", "validation", "test")
)

# Pretrained checkpoint: tokenizer plus the bare encoder (no task head)
model_name = "microsoft/codebert-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
encoder_model = RobertaModel.from_pretrained(model_name, num_labels=2)

# Define the custom model
class RoBERTaWithGRU(nn.Module):
    """Sequence classifier: an encoder followed by a GRU and a linear head.

    The encoder's per-token hidden states are run through a single-layer,
    unidirectional GRU; the GRU's final hidden state is projected to
    ``num_labels`` logits.

    Args:
        encoder_model: Module called as ``encoder_model(input_ids=...,
            attention_mask=...)`` whose result exposes ``last_hidden_state``
            and whose ``config.hidden_size`` gives the per-token feature width.
        hidden_dim: Size of the GRU hidden state.
        num_labels: Number of output classes.
    """

    def __init__(self, encoder_model, hidden_dim, num_labels):
        super().__init__()
        self.encoder = encoder_model
        # Use nn.GRU, matching the class and attribute names. An nn.LSTM
        # returns ``(h_n, c_n)`` as its second output, so ``hn[-1]`` would
        # select the cell state with a leading layer dimension and the
        # logits would no longer be ``(batch, num_labels)``.
        self.gru = nn.GRU(
            input_size=encoder_model.config.hidden_size,
            hidden_size=hidden_dim,
            bidirectional=False,
            batch_first=True,
        )
        self.classifier = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Classify a batch of token sequences.

        Args:
            input_ids: Token ids, passed straight to the encoder.
            attention_mask: Mask passed to the encoder; when given it is also
                used to find each sequence's length so the GRU stops at the
                last real token. May be ``None`` to run over every position.
            labels: Optional class indices; when given, cross-entropy loss is
                computed against the logits.

        Returns:
            dict with ``"loss"`` (``None`` when ``labels`` is ``None``) and
            ``"logits"`` of shape ``(batch, num_labels)``.
        """
        encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoder_outputs.last_hidden_state

        if attention_mask is not None:
            # Pack by true length so the final hidden state comes from the
            # last non-padded token rather than from trailing padding.
            # Assumes right-padding (mask is a run of ones followed by
            # zeros) -- TODO confirm for the tokenizer in use.
            # clamp(min=1) guards against an all-zero mask row, which
            # pack_padded_sequence rejects; lengths must live on the CPU.
            lengths = attention_mask.sum(dim=1).clamp(min=1).long().cpu()
            token_states = nn.utils.rnn.pack_padded_sequence(
                token_states, lengths, batch_first=True, enforce_sorted=False
            )

        # hn: (num_layers, batch, hidden_dim); hn[-1] is the last layer's state.
        _, hn = self.gru(token_states)
        logits = self.classifier(hn[-1])

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}

# Classifier configuration
num_labels = 2  # two classes assumed: defect / no defect
hidden_dim = 125  # example width for the recurrent layer's hidden state
model = RoBERTaWithGRU(
    encoder_model=encoder_model,
    hidden_dim=hidden_dim,
    num_labels=num_labels,
)


# Preprocess the data
def preprocess_function(examples):
    """Tokenize a batch of functions and attach their labels.

    Args:
        examples: Batch mapping with a ``'func'`` entry (source texts handed
            to the tokenizer) and a ``'target'`` entry (the labels).

    Returns:
        The tokenizer output, truncated/padded to 512 tokens, with an added
        ``"labels"`` entry holding the targets as a ``torch.long`` tensor.
    """
    encoded = tokenizer(
        examples['func'],
        max_length=512,
        truncation=True,
        padding='max_length',
    )
    # The loss expects integer class indices, so cast the targets to long.
    encoded["labels"] = torch.tensor(examples["target"], dtype=torch.long)
    return encoded

# Tokenize every split (train, then validation, then test)
train_dataset, val_dataset, test_dataset = (
    raw_split.map(preprocess_function, batched=True)
    for raw_split in (train_dataset, val_dataset, test_dataset)
)

# Expose only the model inputs and labels, as torch tensors
for _tokenized_split in (train_dataset, val_dataset, test_dataset):
    _tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])



# Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    # Optimisation settings
    learning_rate=1e-5,
    num_train_epochs=20,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # reloaded when training finishes
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Batch collation with tokenizer-aware padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Define metrics
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` along the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Fourth element (support) is not reported.
    prf_scores = precision_recall_fscore_support(
        labels,
        predicted_classes,
        average='macro',
        warn_for=('precision', 'recall', 'f-score'),
        sample_weight=None,
        zero_division=0,
    )
    macro_precision, macro_recall, macro_f1 = prf_scores[0], prf_scores[1], prf_scores[2]

    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

# Custom Trainer to ensure loss function is handled correctly
class CustomTrainer(Trainer):
    """Trainer that computes cross-entropy loss from the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and compute cross-entropy on its logits.

        Args:
            model: The model to call; its output must support
                ``.get("logits")``.
            inputs: Batch mapping that contains ``"labels"`` plus the keyword
                arguments for the model's forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted and ignored, so that callers passing
                this extra keyword do not raise ``TypeError``.
                NOTE(review): newer ``Trainer`` releases appear to pass it --
                confirm against the installed transformers version.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs["labels"]
        # Build a label-free copy instead of popping, so the caller's batch
        # dict is left untouched.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        loss = nn.CrossEntropyLoss()(logits, labels)
        return (loss, outputs) if return_outputs else loss

    



# Alternative optimizer setup (currently disabled):
#optimizer = Adafactor(model.parameters(), relative_step=False, lr=1e-5, weight_decay=0.01)
#lr_scheduler = AdafactorSchedule(optimizer)


# NAdam with a plateau-based learning-rate schedule. The scheduler instance
# gets its own name so it does not shadow the imported ``lr_scheduler`` module.
optimizer = torch.optim.NAdam(model.parameters(), lr=1e-5)
plateau_scheduler = lr_scheduler.ReduceLROnPlateau(optimizer)

# Initialize Trainer
# ``optimizers`` hands the optimizer/scheduler pair above to the trainer;
# without it the pair is never used for training.
# NOTE(review): driving ReduceLROnPlateau from the Trainer requires a
# transformers release that supports plateau schedulers -- confirm the
# installed version.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, plateau_scheduler),
)

# Train the model
trainer.train()

# Evaluate the model on the validation split
eval_results = trainer.evaluate()
print("Validation Results:",eval_results)

# Evaluate the model on the test split
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Testing Results:",test_results)


