In [1]:
import os
import os 
import sys
import numpy as np 
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import roc_curve, roc_auc_score, auc
import matplotlib.pyplot as plt
from typing import Tuple

#update constants in task_tracker.training.utils.constants and task_tracker.config.models
from task_tracker.training.utils.constants import TEST_ACTIVATIONS_DIR_PER_MODEL,TEST_CLEAN_FILES_PER_MODEL,TEST_POISONED_FILES_PER_MODEL
from task_tracker.training.dataset import ActivationsDatasetDynamicPrimaryText

#update paths of trained models in task_tracker.experiments_outputs
from task_tracker.experiments_outputs import LINEAR_PROBES_PATHS_PER_MODEL


# Which data split and which model's probes this notebook evaluates.
FILES = 'test'
MODEL = 'llama3_70b'

# Output locations, one sub-directory per split and model:
#   - output_dir receives the ROC curve figures,
#   - metrics_output_dir receives the metrics log appended to by `evaluation`.
output_dir = f'./images/{FILES}/{MODEL}'
metrics_output_dir = f'./metrics/{FILES}/{MODEL}'
metrics_file_path = os.path.join(
    metrics_output_dir,
    f'{MODEL}_{FILES}_model_metrics.csv',
)

# exist_ok=True keeps the cell idempotent when it is re-run.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(metrics_output_dir, exist_ok=True)

# Report how many clean / poisoned files are configured for this model.
print(f'{len(TEST_CLEAN_FILES_PER_MODEL[MODEL])} clean files processed.')
print(f'{len(TEST_POISONED_FILES_PER_MODEL[MODEL])} poisoned files processed.')




32 clean files processed.
32 poisoned files processed.


In [2]:
def _collect_activation_diffs(dataset, upcast_to_float32):
    """Return one flattened NumPy difference vector per item of ``dataset``.

    Each item is unpacked as ``(primary, with_text)`` and contributes
    ``(with_text - primary).flatten()`` converted with ``.numpy()``.
    When ``upcast_to_float32`` is true, ``.float()`` is applied first.
    """
    diffs = []
    for primary, with_text in tqdm(dataset):
        delta = (with_text - primary).flatten()
        if upcast_to_float32:
            # Presumably needed because bfloat16 values cannot be converted
            # to NumPy directly -- TODO confirm against the dataset dtype.
            delta = delta.float()
        diffs.append(delta.numpy())
    return diffs


def load_evaluation_data(val_files_clean, val_files_poisoned, num_layers):
    """Build the probe evaluation matrix and labels from activation files.

    Two ``ActivationsDatasetDynamicPrimaryText`` datasets are created (clean
    and poisoned), both rooted at ``TEST_ACTIVATIONS_DIR_PER_MODEL[MODEL]``.
    For every item, the difference between the "with text" activations and
    the primary activations is flattened into one feature vector.

    Args:
        val_files_clean: Files passed to the clean dataset.
        val_files_poisoned: Files passed to the poisoned dataset.
        num_layers: Forwarded unchanged to the dataset constructor as
            ``num_layers`` (the caller in this notebook passes a
            ``(layer, layer)`` tuple).

    Returns:
        ``(X_validation, y_validation)``: a NumPy array holding the clean
        vectors followed by the poisoned vectors, and a list of labels in the
        same order (0 for clean, 1 for poisoned).
    """
    print("Loading validation datasets.")
    print(num_layers)
    activations_root = TEST_ACTIVATIONS_DIR_PER_MODEL[MODEL]
    clean_dataset = ActivationsDatasetDynamicPrimaryText(
        val_files_clean, num_layers=num_layers, root_dir=activations_root
    )
    poisoned_dataset = ActivationsDatasetDynamicPrimaryText(
        val_files_poisoned, num_layers=num_layers, root_dir=activations_root
    )

    print("Processing validation datasets.")
    # Per the original notes, activations for these models were bfloat16.
    # The check does not depend on the sample, so it is evaluated once.
    upcast_to_float32 = MODEL in ('phi3', 'llama3_70b')
    clean_diff = _collect_activation_diffs(clean_dataset, upcast_to_float32)
    poisoned_diff = _collect_activation_diffs(poisoned_dataset, upcast_to_float32)

    X_validation = np.array(clean_diff + poisoned_diff)
    y_validation = [0] * len(clean_diff) + [1] * len(poisoned_diff)

    return X_validation, y_validation

In [3]:
def create_roc(fpr, tpr, roc_auc, num_layers):
    """Draw one ROC curve and save it as a PNG under ``output_dir``.

    Args:
        fpr: False-positive rates (x values of the curve).
        tpr: True-positive rates (y values of the curve).
        roc_auc: Area under the curve, shown in the legend with two decimals.
        num_layers: Layer identifier used in the title and the file name
            (``roc_curve_layer_<num_layers>.png``).

    Returns:
        None. The figure is closed after it has been written to disk.
    """
    fig, ax = plt.subplots()

    # The curve itself, plus the diagonal as a visual reference line.
    ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'Receiver Operating Characteristic for Layer {num_layers}')
    ax.legend(loc="lower right")

    # Write the image, then release the figure so repeated calls do not
    # accumulate open figures.
    fig.savefig(os.path.join(output_dir, f'roc_curve_layer_{num_layers}.png'))
    plt.close(fig)

In [4]:
def evaluation(model, model_name: str,  X_test, y_test, num_layers: Tuple[int, int]):
    """Score a fitted probe, plot its ROC curve and append metrics to the log.

    Args:
        model: Object exposing ``predict_proba``; column 1 of its output is
            used as the score.
        model_name: Label written to the metrics log.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: Ground-truth labels aligned with ``X_test``.
        num_layers: Layer identifier used in the plot, the log line and the
            console message. NOTE(review): the caller in this notebook passes
            the bare ``layer`` value rather than a tuple -- confirm the
            annotation.

    Returns:
        ``(fpr, tpr, roc_auc, roc_auc_score_value)`` where ``roc_auc`` comes
        from ``auc(fpr, tpr)`` and ``roc_auc_score_value`` from
        ``roc_auc_score(y_test, y_pred_prob)``.

    Side effects:
        Saves a ROC image via ``create_roc`` and appends one record to
        ``metrics_file_path``.
    """
    print("Evaluating model predictions.")
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    roc_auc = auc(fpr, tpr)

    create_roc(fpr, tpr, roc_auc, num_layers)

    roc_auc_score_value = roc_auc_score(y_test, y_pred_prob)

    # The default str() of a NumPy array elides the middle with "..." once the
    # array exceeds the print threshold, which would drop ROC points from the
    # log. Formatting with an effectively unlimited threshold keeps every
    # point; short arrays are formatted exactly as before.
    tpr_text = np.array2string(np.asarray(tpr), threshold=sys.maxsize)
    fpr_text = np.array2string(np.asarray(fpr), threshold=sys.maxsize)
    with open(metrics_file_path, 'a') as f:
        f.write(f"Model: {model_name}, Layer: {num_layers}, ROC AUC Score: {roc_auc_score_value}, ROC AUC: {roc_auc}, TPR: {tpr_text}, FPR: {fpr_text}\n")

    print(f"ROC AUC Score for Layer {num_layers}: {roc_auc_score_value}")
    return fpr, tpr, roc_auc, roc_auc_score_value


In [5]:
import pickle
model_name = f'{MODEL}_Logistic_Regression'

# One (fpr, tpr, roc_auc, roc_auc_score_value, layer) tuple per evaluated probe.
roc_data = []

for model_path, layer in LINEAR_PROBES_PATHS_PER_MODEL[MODEL].items():
    print(f"Processing model: {model_path}")

    # Load the trained probe. The context manager closes the file handle
    # deterministically instead of leaving it to garbage collection.
    # NOTE: unpickling can execute arbitrary code -- only load probe files
    # that come from a trusted location.
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Load the evaluation data for this probe's layer.
    X_test, y_test = load_evaluation_data(
        TEST_CLEAN_FILES_PER_MODEL[MODEL],
        TEST_POISONED_FILES_PER_MODEL[MODEL],
        num_layers=(layer, layer),
    )

    # Compute the ROC curve and AUC, and keep them for the combined plot.
    fpr, tpr, roc_auc, roc_auc_score_value = evaluation(model, model_name, X_test, y_test, layer)
    roc_data.append((fpr, tpr, roc_auc, roc_auc_score_value, layer))

Processing model: /share/projects/jailbreak-activations/linear_probing/training/llama3_70b/0/model.pickle
Loading validation datasets.
(0, 0)


In [None]:
import matplotlib

# Global figure styling. Font type 42 presumably selects TrueType embedding
# for PDF/PS output -- see the matplotlib rcParams documentation.
plt.rcParams.update({'font.size': 16})
matplotlib.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

# Order the curves by layer (last element of each tuple) so the legend
# entries appear in layer order.
roc_data.sort(key=lambda entry: entry[4])

fig, ax = plt.subplots(figsize=(8, 5))
for fpr, tpr, roc_auc, _roc_auc_score, n_layer in roc_data:
    ax.plot(fpr, tpr, lw=2, label=f'Layer {n_layer} (area = {roc_auc:.4f})')

# Diagonal reference line.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate (FPR)', fontsize=14)
ax.set_ylabel('True Positive Rate (TPR)', fontsize=14)
ax.legend(loc="lower right", fontsize=12)
ax.grid(True)

# Save the combined ROC curve plot before showing it.
roc_curve_combined_path = os.path.join(output_dir, 'roc_curve_combined.pdf')
fig.savefig(roc_curve_combined_path, bbox_inches='tight')
plt.show()
plt.close(fig)

print(f"Saved combined ROC curve plot to {roc_curve_combined_path}")