In [3]:
from transformers import BertForSequenceClassification, BertTokenizer, RobertaTokenizerFast, RobertaForSequenceClassification, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification, XLNetForSequenceClassification, XLNetTokenizerFast
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import pandas as pd
import os

from sklearn.preprocessing import LabelEncoder

In [4]:
def fine_tune_plm(gpu_numbers: str, seed: int, feature: str, save_model_path: str):
    """
    Fine-tunes a pre-trained language model (roberta-large) for sequence classification on a specific feature.

    Reads "data.csv" from the working directory, label-encodes the `feature` column, splits the rows
    80/10/10 into train/validation/test, trains with early stopping on the validation loss, restores
    the best checkpoint and evaluates it on the test split.

    Args:
        gpu_numbers (str): Comma-separated string of GPU numbers to use. None hides all GPUs (CPU run).
        seed (int): Random seed for reproducibility (used for the data split and PyTorch's RNG).
        feature (str): The target feature/label column in the dataset (e.g., 'hawkish', 'forward_looking').
        save_model_path (str): Path to save the fine-tuned model and tokenizer. Nothing is saved if falsy.
    Returns:
        list: [seed, feature, test_loss, test_accuracy, test_f1, test_precision, test_recall].
    Raises:
        ValueError: If `feature` is not a column of the dataset.
    """
    # GPU setup. An empty CUDA_VISIBLE_DEVICES hides every GPU, so gpu_numbers=None selects the CPU
    # (this only takes effect if CUDA has not been initialised yet in this process).
    os.environ["CUDA_VISIBLE_DEVICES"] = "" if gpu_numbers is None else str(gpu_numbers)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Seed PyTorch so the classifier-head initialisation and batch shuffling are repeatable
    torch.manual_seed(seed)

    # Load the dataset
    data = pd.read_csv("data.csv")  # Adjust file path as needed

    # Ensure the feature exists
    if feature not in data.columns:
        raise ValueError(f"Feature '{feature}' not found in dataset columns: {data.columns}")

    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])

    # Split data: 80% train, 10% validation, 10% test
    train_data, temp_data = train_test_split(data, test_size=0.2, random_state=seed)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=seed)

    # Select a pre-trained model and tokenizer
    num_labels = len(data[feature].unique())
    model = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained("roberta-large")
    model.to(device)

    # Preprocessing
    def preprocess_data(dataset):
        """Tokenizes the "sentences" column and pairs it with the encoded labels."""
        sentences = dataset["sentences"].tolist()  # Replace "sentences" with the text column
        labels = dataset[feature].tolist()
        tokens = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
        return TensorDataset(tokens['input_ids'], tokens['attention_mask'], torch.LongTensor(labels))

    def evaluate(dataloader):
        """
        Runs the model over `dataloader` without gradients.

        Returns (loss, accuracy, f1, precision, recall) computed over the whole split at once;
        averaging per-batch scores would not equal the dataset-level weighted metrics.
        """
        model.eval()
        total_loss = 0.0
        all_labels, all_preds = [], []
        with torch.no_grad():
            for batch in dataloader:
                inputs, masks, labels = (b.to(device) for b in batch)
                outputs = model(inputs, attention_mask=masks, labels=labels)
                # outputs.loss is a per-batch mean; weight it by batch size so a short
                # final batch does not skew the average
                total_loss += outputs.loss.item() * labels.size(0)
                all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
                all_labels.extend(labels.cpu().tolist())
        return (
            total_loss / len(all_labels),
            accuracy_score(all_labels, all_preds),
            f1_score(all_labels, all_preds, average='weighted', zero_division=0),
            precision_score(all_labels, all_preds, average='weighted', zero_division=0),
            recall_score(all_labels, all_preds, average='weighted', zero_division=0),
        )

    # Create DataLoaders
    train_dataloader = DataLoader(preprocess_data(train_data), batch_size=8, shuffle=True)
    val_dataloader = DataLoader(preprocess_data(val_data), batch_size=8, shuffle=False)
    test_dataloader = DataLoader(preprocess_data(test_data), batch_size=8, shuffle=False)

    # Optimizer: smaller learning rate for the pre-trained encoder than for the new classifier head
    optimizer = optim.AdamW([
        {'params': model.roberta.parameters(), 'lr': 1e-5},
        {'params': model.classifier.parameters(), 'lr': 1e-4}
    ])

    # Training and Validation Loop
    max_num_epochs = 20
    patience = 5  # epochs without validation-loss improvement before stopping
    early_stopping_count = 0
    best_loss = float('inf')
    checkpoint_path = 'best_model.pt'
    checkpoint_saved = False  # guards against loading a stale file left by an earlier run

    for epoch in range(max_num_epochs):
        model.train()
        train_loss_sum, train_examples = 0.0, 0
        for batch in train_dataloader:
            inputs, masks, labels = (b.to(device) for b in batch)
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss_sum += loss.item() * labels.size(0)
            train_examples += labels.size(0)
        print(f"Epoch {epoch + 1}/{max_num_epochs} - Training Loss: {train_loss_sum / train_examples}")

        val_loss, _, val_f1, val_precision, val_recall = evaluate(val_dataloader)
        print(f"Validation Loss: {val_loss}, F1: {val_f1}, Precision: {val_precision}, Recall: {val_recall}")

        # Early stopping bookkeeping: checkpoint on improvement, otherwise count the stale epoch
        if val_loss < best_loss:
            best_loss = val_loss
            early_stopping_count = 0
            torch.save({'model_state_dict': model.state_dict()}, checkpoint_path)
            checkpoint_saved = True
        else:
            early_stopping_count += 1

        # Break if early stopping condition is met
        if early_stopping_count >= patience:
            break

    # Load the best model (map_location keeps this working when the device differs from save time)
    if checkpoint_saved:
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        print("Warning: validation loss never improved; evaluating the final-epoch weights.")

    # Testing
    test_loss, test_accuracy, test_f1, test_precision, test_recall = evaluate(test_dataloader)

    print(f"Test Metrics: Loss: {test_loss}, Accuracy: {test_accuracy}, F1: {test_f1}, Precision: {test_precision}, Recall: {test_recall}")

    # Save model
    if save_model_path:
        model.save_pretrained(save_model_path)
        tokenizer.save_pretrained(save_model_path)

    return [seed, feature, test_loss, test_accuracy, test_f1, test_precision, test_recall]

In [7]:
def fine_tune_deberta(gpu_numbers: str, seed: int, feature: str, save_model_path: str):
    """
    Fine-tunes the DeBERTa language model (microsoft/deberta-v3-large) for sequence classification on a specific feature.

    Reads "data.csv" from the working directory, label-encodes the `feature` column, splits the rows
    80/10/10 into train/validation/test, trains with early stopping on the validation loss, restores
    the best checkpoint and evaluates it on the test split.

    Args:
        gpu_numbers (str): Comma-separated string of GPU numbers to use. None hides all GPUs (CPU run).
        seed (int): Random seed for reproducibility (used for the data split and PyTorch's RNG).
        feature (str): The target feature/label column in the dataset (e.g., 'hawkish', 'forward_looking').
        save_model_path (str): Path to save the fine-tuned model and tokenizer. Nothing is saved if falsy.
    Returns:
        list: [seed, feature, test_loss, test_accuracy, test_f1, test_precision, test_recall].
    Raises:
        ValueError: If `feature` is not a column of the dataset.
    """
    # GPU setup. An empty CUDA_VISIBLE_DEVICES hides every GPU, so gpu_numbers=None selects the CPU
    # (this only takes effect if CUDA has not been initialised yet in this process).
    os.environ["CUDA_VISIBLE_DEVICES"] = "" if gpu_numbers is None else str(gpu_numbers)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Seed PyTorch so the classifier-head initialisation and batch shuffling are repeatable
    torch.manual_seed(seed)

    # Load the dataset
    data = pd.read_csv("data.csv")  # Adjust file path as needed

    # Ensure the feature exists
    if feature not in data.columns:
        raise ValueError(f"Feature '{feature}' not found in dataset columns: {data.columns}")

    label_encoder = LabelEncoder()
    data[feature] = label_encoder.fit_transform(data[feature])

    # Split data: 80% train, 10% validation, 10% test
    train_data, temp_data = train_test_split(data, test_size=0.2, random_state=seed)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=seed)

    # Select a pre-trained model and tokenizer
    num_labels = len(data[feature].unique())
    model = AutoModelForSequenceClassification.from_pretrained("microsoft/deberta-v3-large", num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")
    model.to(device)

    # Preprocessing
    def preprocess_data(dataset):
        """Tokenizes the "sentences" column and pairs it with the encoded labels."""
        sentences = dataset["sentences"].tolist()  # Replace "sentences" with the text column
        labels = dataset[feature].tolist()
        tokens = tokenizer(sentences, return_tensors='pt', padding=True, truncation=True, max_length=512)
        return TensorDataset(tokens['input_ids'], tokens['attention_mask'], torch.LongTensor(labels))

    def evaluate(dataloader):
        """
        Runs the model over `dataloader` without gradients.

        Returns (loss, accuracy, f1, precision, recall) computed over the whole split at once;
        averaging per-batch scores would not equal the dataset-level weighted metrics.
        """
        model.eval()
        total_loss = 0.0
        all_labels, all_preds = [], []
        with torch.no_grad():
            for batch in dataloader:
                inputs, masks, labels = (b.to(device) for b in batch)
                outputs = model(inputs, attention_mask=masks, labels=labels)
                # outputs.loss is a per-batch mean; weight it by batch size so a short
                # final batch does not skew the average
                total_loss += outputs.loss.item() * labels.size(0)
                all_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().tolist())
                all_labels.extend(labels.cpu().tolist())
        return (
            total_loss / len(all_labels),
            accuracy_score(all_labels, all_preds),
            f1_score(all_labels, all_preds, average='weighted', zero_division=0),
            precision_score(all_labels, all_preds, average='weighted', zero_division=0),
            recall_score(all_labels, all_preds, average='weighted', zero_division=0),
        )

    # Create DataLoaders
    train_dataloader = DataLoader(preprocess_data(train_data), batch_size=8, shuffle=True)
    val_dataloader = DataLoader(preprocess_data(val_data), batch_size=8, shuffle=False)
    test_dataloader = DataLoader(preprocess_data(test_data), batch_size=8, shuffle=False)

    # Optimizer: smaller learning rate for the pre-trained encoder than for the new classifier head
    optimizer = optim.AdamW([
        {'params': model.deberta.parameters(), 'lr': 1e-5},
        {'params': model.classifier.parameters(), 'lr': 1e-4}
    ])

    # Training and Validation Loop
    max_num_epochs = 20
    patience = 5  # epochs without validation-loss improvement before stopping
    early_stopping_count = 0
    best_loss = float('inf')
    checkpoint_path = 'deberta_best_model.pt'
    checkpoint_saved = False  # guards against loading a stale file left by an earlier run

    for epoch in range(max_num_epochs):
        # Training logic
        model.train()
        train_loss_sum, train_examples = 0.0, 0
        for batch in train_dataloader:
            inputs, masks, labels = (b.to(device) for b in batch)
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=masks, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss_sum += loss.item() * labels.size(0)
            train_examples += labels.size(0)
        print(f"Epoch {epoch + 1}/{max_num_epochs} - Training Loss: {train_loss_sum / train_examples}")

        # Validation logic
        val_loss, _, val_f1, val_precision, val_recall = evaluate(val_dataloader)
        print(f"Validation Loss: {val_loss}, F1: {val_f1}, Precision: {val_precision}, Recall: {val_recall}")

        # Early stopping logic: checkpoint on improvement, otherwise count the stale epoch
        if val_loss < best_loss:
            best_loss = val_loss
            early_stopping_count = 0
            torch.save({'model_state_dict': model.state_dict()}, checkpoint_path)
            checkpoint_saved = True
        else:
            early_stopping_count += 1

        if early_stopping_count >= patience:
            break

    # Load the best model (map_location keeps this working when the device differs from save time)
    if checkpoint_saved:
        checkpoint = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        print("Warning: validation loss never improved; evaluating the final-epoch weights.")

    # Testing
    test_loss, test_accuracy, test_f1, test_precision, test_recall = evaluate(test_dataloader)

    print(f"Test Metrics: Loss: {test_loss}, Accuracy: {test_accuracy}, F1: {test_f1}, Precision: {test_precision}, Recall: {test_recall}")

    if save_model_path:
        model.save_pretrained(save_model_path)
        tokenizer.save_pretrained(save_model_path)

    return [seed, feature, test_loss, test_accuracy, test_f1, test_precision, test_recall]


In [8]:
def save_metrics_to_csv(metrics: list, output_file: str):
    """
    Write a collection of evaluation-metric records to disk as CSV.

    Args:
        metrics (list): One dictionary per experiment run; keys become the CSV columns.
        output_file (str): Destination path of the CSV file.
    """
    # Build the table in one shot, then write it without the positional index column
    metrics_table = pd.DataFrame(metrics)
    metrics_table.to_csv(output_file, index=False)

    confirmation = f"Metrics saved to {output_file}"
    print(confirmation)

In [None]:
# Experiment configuration
gpu_numbers = "1"  # Depends on the GPUs available: "0" for a single GPU, "0,1" for multiple GPUs, None for CPU
seed = 42  # Seed for reproducibility
save_model_path1 = "./models/"  # Each feature's model is saved under this prefix + the feature name
features = ["sentiment_label", "time_label", "certain_label"]  # Label columns to fine-tune for
all_results = []


print('roberta-large results:')
for feature in features:
    print(f"Fine-tuning for feature: {feature}")
    results = fine_tune_plm(gpu_numbers=gpu_numbers, seed=seed, feature=feature, save_model_path=save_model_path1 + feature)
    print(f"Results for {feature}: {results}")

    # Record this feature's metrics inside the loop so every feature ends up in the CSV.
    # `results` is [seed, feature, test_loss, test_accuracy, test_f1, test_precision, test_recall].
    all_results.append({
        "model": "roberta-large",
        "feature": feature,
        "seed": seed,
        "test_loss": results[2],
        "test_accuracy": results[3],
        "test_f1": results[4],
        "test_precision": results[5],
        "test_recall": results[6],
    })

output_csv = "overall_metrics.csv"
save_metrics_to_csv(all_results, output_csv)
