In [None]:
# CodeT5 encoder + LSTM classifier trained on the SearchSortAlg dataset


import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5EncoderModel, AutoTokenizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import os

# Dataset class
class CodeDataset(Dataset):
    """Map-style dataset that tokenizes one code sample per item.

    Args:
        codes: Indexable collection of code samples. Each sample is either a
            single value (stringified with ``str``) or a list whose elements
            are stringified one by one.
        labels: Indexable collection of labels aligned with ``codes``.
        tokenizer: Callable invoked as ``tokenizer(list_of_str,
            truncation=True, padding='max_length', max_length=...,
            return_tensors="pt")``; its result must expose ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, codes, labels, tokenizer, max_length):
        self.codes = codes
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.codes)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for sample ``idx``."""
        code = self.codes[idx]
        label = self.labels[idx]
        # The tokenizer is always handed a list of strings.
        # NOTE(review): a multi-element list sample is presumably encoded as
        # several rows, which would not collate with single-string samples --
        # confirm whether list samples are expected at all.
        if isinstance(code, list):
            texts = [str(part) for part in code]
        else:
            texts = [str(code)]
        encodings = self.tokenizer(
            texts,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop only the leading batch dimension. A bare ``squeeze()`` would
        # also collapse the sequence dimension when ``max_length == 1`` and
        # hand the collate function 0-d tensors.
        input_ids = encodings['input_ids'].squeeze(0)
        attention_mask = encodings['attention_mask'].squeeze(0)
        return input_ids, attention_mask, label

# Model class
class CodeClassifier(nn.Module):
    """Sequence classifier: encoder -> LSTM -> max-pool over time -> linear.

    Args:
        encoder: Callable accepting ``input_ids=`` and ``attention_mask=`` and
            returning an object with a ``last_hidden_state`` tensor.
        num_classes: Number of output logits.
        hidden_dim: Hidden size of the LSTM (and input size of the final
            linear layer).
    """

    def __init__(self, encoder, num_classes, hidden_dim):
        super().__init__()
        self.encoder = encoder
        # Take the feature width from the encoder's config when it exposes
        # one, so encoders of other sizes work; 768 remains the fallback.
        encoder_dim = getattr(getattr(encoder, 'config', None), 'hidden_size', 768)
        # NOTE: the attribute is called ``gru`` although it holds an LSTM. The
        # name is kept because it is part of the state_dict keys of saved
        # checkpoints.
        self.gru = nn.LSTM(input_size=encoder_dim, hidden_size=hidden_dim, batch_first=True, bidirectional=False)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder.
            attention_mask: Tensor forwarded to the encoder; non-zero entries
                mark real tokens. May be ``None``, in which case every
                position takes part in the pooling.

        Returns:
            Tensor of logits with one row per batch element and
            ``num_classes`` columns.
        """
        encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        gru_outputs, _ = self.gru(encoder_outputs)

        if attention_mask is not None:
            # Exclude padded timesteps from the max so the result does not
            # depend on how much padding a sample received.
            valid = attention_mask.unsqueeze(-1).bool()
            gru_outputs = gru_outputs.masked_fill(~valid, float('-inf'))

        pooled_output = torch.max(gru_outputs, 1)[0]
        # A row whose mask is entirely zero would pool to -inf; use zeros so
        # the linear layer never receives infinities.
        pooled_output = torch.where(torch.isinf(pooled_output), torch.zeros_like(pooled_output), pooled_output)

        pooled_output = self.dropout(pooled_output)
        logits = self.fc(pooled_output)
        return logits

# Function to load and split the custom dataset
def load_and_split_dataset(file_path, test_size=0.2, val_size=0.1):
    """Read a CSV and split it into train, validation and test portions.

    The CSV must have a ``Code`` column (samples) and a ``Class`` column
    (labels). ``test_size`` is applied to the whole dataset; ``val_size`` is
    then applied to what remains after the test portion is removed. Both
    splits use ``random_state=42``.

    Returns:
        Three ``(codes, labels)`` pairs of lists, in the order
        train, validation, test.
    """
    frame = pd.read_csv(file_path)
    samples, targets = frame['Code'].tolist(), frame['Class'].tolist()

    # Stage 1: hold out the test portion.
    remaining_x, test_x, remaining_y, test_y = train_test_split(
        samples, targets, test_size=test_size, random_state=42
    )
    # Stage 2: carve the validation portion out of the remainder.
    train_x, valid_x, train_y, valid_y = train_test_split(
        remaining_x, remaining_y, test_size=val_size, random_state=42
    )

    train_split = (train_x, train_y)
    valid_split = (valid_x, valid_y)
    test_split = (test_x, test_y)
    return train_split, valid_split, test_split

# Encode labels
def encode_labels(labels):
    """Fit a ``LabelEncoder`` on ``labels`` and encode them.

    Returns:
        A pair ``(encoded_labels, encoder)`` where ``encoder`` is the fitted
        ``LabelEncoder`` used to produce ``encoded_labels``.
    """
    label_encoder = LabelEncoder()
    return label_encoder.fit_transform(labels), label_encoder

# Prepare DataLoader
def create_dataloaders(train_data, valid_data, batch_size, tokenizer, max_length):
    """Build the training and validation ``DataLoader`` objects.

    ``train_data`` and ``valid_data`` are indexable, with the samples at
    position 0 and the labels at position 1. Only the training loader
    shuffles.

    Returns:
        ``(train_loader, val_loader)``.
    """
    def _loader_for(split, shuffle):
        # Wrap one (codes, labels) split in a dataset and a batching loader.
        dataset = CodeDataset(split[0], split[1], tokenizer, max_length)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    return _loader_for(train_data, True), _loader_for(valid_data, False)

# Train the model
def train_model(model, train_loader, val_loader, device, num_epochs, learning_rate=2e-5):
    """Train ``model`` in place and print per-epoch progress.

    Uses cross-entropy loss and the NAdam optimizer. After every epoch the
    mean training loss over batches and the validation accuracy are printed.
    Both loaders must yield ``(input_ids, attention_mask, labels)`` batches.
    Nothing is returned.
    """
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.NAdam(model.parameters(), lr=learning_rate)
    model.to(device)

    for epoch_idx in range(num_epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0
        for batch in train_loader:
            ids, mask, targets = (tensor.to(device) for tensor in batch)
            optimizer.zero_grad()
            batch_loss = loss_fn(model(ids, mask), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        mean_loss = running_loss / len(train_loader)
        print(f'Epoch {epoch_idx + 1}, Loss: {mean_loss:.4f}')

        # ---- validation pass (no gradients) ----
        model.eval()
        predicted, expected = [], []
        with torch.no_grad():
            for batch in val_loader:
                ids, mask, targets = (tensor.to(device) for tensor in batch)
                batch_preds = torch.argmax(model(ids, mask), dim=1)
                predicted.extend(batch_preds.cpu().numpy())
                expected.extend(targets.cpu().numpy())

        val_accuracy = accuracy_score(expected, predicted)
        print(f'Validation Accuracy: {val_accuracy:.4f}')

# Evaluate the model and print metrics
def evaluate_model(model, test_data, device, encoder, batch_size, tokenizer, max_length):
    """Evaluate ``model`` on one data split, print a report and plot a confusion matrix.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)``. It
            is not moved here; the caller must already have placed it on
            ``device``.
        test_data: Indexable with the samples at position 0 and the encoded
            labels at position 1.
        device: Device the batches are moved to.
        encoder: Fitted label encoder; ``encoder.classes_`` supplies the class
            names and defines the full label set.
        batch_size: Batch size for the evaluation loader.
        tokenizer: Tokenizer forwarded to ``CodeDataset``.
        max_length: Sequence length forwarded to ``CodeDataset``.

    Returns:
        Dict with keys ``'loss'`` (mean of the per-batch losses),
        ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'`` (the last
        three weighted-averaged).
    """
    test_dataset = CodeDataset(test_data[0], test_data[1], tokenizer, max_length)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)
    model.eval()
    test_preds = []
    test_labels = []
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for input_ids, attention_mask, labels in test_loader:
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            test_preds.extend(preds.cpu().numpy())
            test_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(test_labels, test_preds)
    precision = precision_score(test_labels, test_preds, average='weighted')
    recall = recall_score(test_labels, test_preds, average='weighted')
    f1 = f1_score(test_labels, test_preds, average='weighted')

    metrics = {
        'loss': total_loss / len(test_loader),
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

    # Pin the report and the matrix to the encoder's full label set. Without
    # this, a split that lacks some class (in both truth and predictions)
    # makes classification_report reject target_names for having the wrong
    # length, and the confusion matrix comes out smaller than its tick labels.
    class_ids = np.arange(len(encoder.classes_))
    # The report formats names as text, so non-string classes are stringified.
    class_names = [str(name) for name in encoder.classes_]

    # Generate classification report
    report = classification_report(test_labels, test_preds, labels=class_ids, target_names=class_names)
    print("Classification Report: \n", report)

    # Compute confusion matrix
    conf_matrix = confusion_matrix(test_labels, test_preds, labels=class_ids)

    # Plot confusion matrix
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()

    return metrics

# Save the model
def save_model(model, path='code_classifier.pth'):
    """Save the model's ``state_dict`` to ``path``.

    The parent directory of ``path`` is created first when it is missing.
    """
    create_directory_if_not_exists(path)
    weights = model.state_dict()
    torch.save(weights, path)

# Load the model
def load_model(path, encoder, num_classes, hidden_dim):
    """Rebuild a ``CodeClassifier`` and restore its weights from ``path``.

    Args:
        path: File previously written with ``torch.save(model.state_dict(), ...)``.
        encoder: Encoder instance wrapped by the new classifier; its weights
            are overwritten by the checkpoint.
        num_classes: Number of output classes the checkpoint was trained with.
        hidden_dim: LSTM hidden size the checkpoint was trained with.

    Returns:
        The classifier with the checkpoint weights loaded. It is not moved to
        any device here.
    """
    model = CodeClassifier(encoder, num_classes, hidden_dim)
    # Deserialize onto the CPU so a checkpoint saved from a GPU run also loads
    # on a machine without CUDA; load_state_dict then copies the values into
    # the parameters wherever they currently live.
    state_dict = torch.load(path, map_location='cpu')
    model.load_state_dict(state_dict)
    return model

# Ensure directory creation for saving models
def create_directory_if_not_exists(path):
    """Create the parent directory of the file ``path`` if it is missing.

    Does nothing when ``path`` has no directory component.
    """
    directory = os.path.dirname(path)
    if directory:
        # exist_ok avoids the check-then-create race: another process creating
        # the directory in between no longer raises FileExistsError.
        os.makedirs(directory, exist_ok=True)

# Main script
if __name__ == "__main__":
    # Load and split the dataset
    dataset_file = 'sort_search_row_converted_filtered.csv'
    (train_codes, train_labels), (valid_codes, valid_labels), (test_codes, test_labels) = load_and_split_dataset(dataset_file)

    # Encode labels. The encoder is fitted on the labels of all three splits:
    # the split is not stratified, so a rare class can be absent from the
    # training portion, and transforming a label the encoder has never seen
    # raises ValueError.
    _, label_encoder = encode_labels(list(train_labels) + list(valid_labels) + list(test_labels))
    train_labels = label_encoder.transform(train_labels)
    valid_labels = label_encoder.transform(valid_labels)
    test_labels = label_encoder.transform(test_labels)

    # Load tokenizer and preprocess data
    pretrained_model_name = "Salesforce/codet5-base"  # Replace with your pre-trained model name
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name)
    max_length = 512

    # Prepare dataloaders
    batch_size = 8
    train_loader, val_loader = create_dataloaders((train_codes, train_labels), (valid_codes, valid_labels), batch_size, tokenizer, max_length)

    # Load the encoder model
    encoder_model = T5EncoderModel.from_pretrained(pretrained_model_name)

    # Build and train the model
    num_classes = len(label_encoder.classes_)
    hidden_dim = 512
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = CodeClassifier(encoder_model, num_classes, hidden_dim)

    train_model(model, train_loader, val_loader, device, num_epochs=5)

    # Save the trained model
    save_model(model, 'model_saves/code_classifier.pth')

    # Load the model for prediction (round-trips the checkpoint just written)
    loaded_model = load_model('model_saves/code_classifier.pth', encoder_model, num_classes, hidden_dim)
    loaded_model.to(device)

    # Evaluate the model on training, validation, and test datasets
    print("Train Dataset:")
    train_metrics = evaluate_model(loaded_model, (train_codes, train_labels), device, label_encoder, batch_size, tokenizer, max_length)

    print("Validation Dataset:")
    val_metrics = evaluate_model(loaded_model, (valid_codes, valid_labels), device, label_encoder, batch_size, tokenizer, max_length)

    print("Test Dataset:")
    test_metrics = evaluate_model(loaded_model, (test_codes, test_labels), device, label_encoder, batch_size, tokenizer, max_length)

    # Print metrics in the desired format: one tab-separated row per split
    print("\neval_loss\teval_accuracy\teval_f1\teval_precision\teval_recall")
    for split_name, split_metrics in (("train", train_metrics), ("val", val_metrics), ("test", test_metrics)):
        print(f"{split_name}\t{split_metrics['loss']:.6f}\t{split_metrics['accuracy']:.6f}\t{split_metrics['f1']:.6f}\t{split_metrics['precision']:.6f}\t{split_metrics['recall']:.6f}")

