In [None]:
import pandas as pd
import glob
import os
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Okabe–Ito palette, keyed by colour name; insertion order is the plotting order.
_OKABE_ITO = {
    "Orange": "#E69F00",
    "Sky Blue": "#56B4E9",
    "Bluish Green": "#009E73",
    "Yellow": "#F0E442",
    "Blue": "#0072B2",
    "Vermillion": "#D55E00",
    "Reddish Purple": "#CC79A7",
}

# Flat list of hex codes consumed by the plotting cells.
cb_palette = list(_OKABE_ITO.values())



In [None]:
# Directory that holds the per-model prediction files (<model>_<split>.csv).
directory = "../outputs"

# Gather the prediction files split by split. Each glob result is sorted so the
# load order (and therefore the iteration order of `dfs`) does not depend on
# the order in which the filesystem happens to list the files.
csv_files = []
for split_suffix in ("test", "val", "external"):
    csv_files += sorted(glob.glob(os.path.join(directory, f"*_{split_suffix}.csv")))

# Map "<model>_<split>" (file name without extension) -> predictions DataFrame.
dfs = {}
for file_path in csv_files:
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    dfs[file_name] = pd.read_csv(file_path)

# The reasoner frames carry string labels; convert them to 0/1 like the rest.
# Missing files are reported instead of aborting the cell with a KeyError.
label_to_int = {"unapproved": 0, "approved": 1}
for reasoner_key in ("reasoner_val", "reasoner_test", "reasoner_external"):
    if reasoner_key in dfs:
        dfs[reasoner_key] = dfs[reasoner_key].replace(label_to_int)
    else:
        print(f"Warning: '{reasoner_key}.csv' not found in {directory}; skipping label conversion")


print(f"Loaded {len(dfs)} DataFrames: {list(dfs.keys())}")

In [None]:

# One list of metric rows per evaluation split; each row describes one model.
test_results = []
val_results = []
external_results = []

for name, df in dfs.items():
    y_true = df["y_true"]
    y_pred = df["y_pred"]

    # NOTE(review): y_pred is also passed to precision/recall/F1 below, so it
    # holds class labels rather than scores; the AUC is then computed from a
    # single operating point — confirm this is intended.
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        # roc_auc_score could not be computed for this frame; record it as missing.
        auc = np.nan

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # labels=[0, 1] forces a 2x2 matrix so the 4-way unpacking also works when
    # one of the classes is absent from both y_true and y_pred.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else np.nan

    row = {
        "Model": name,
        "AUC": auc,
        "F1-score": f1,
        "Recall (Sensitivity)": recall,
        "Specificity": specificity,
        "Precision": precision,
    }

    # Route the row to its split based on the file-name suffix.
    if name.endswith("_test"):
        test_results.append(row)
    elif name.endswith("_val"):
        val_results.append(row)
    elif name.endswith("_external"):
        external_results.append(row)

# Make sure the target directory exists before writing the tables.
os.makedirs("../outputs/tables", exist_ok=True)

test_df = pd.DataFrame(test_results).round(3)
test_df.to_csv("../outputs/tables/test_results.csv")
val_df = pd.DataFrame(val_results).round(3)
val_df.to_csv("../outputs/tables/validation_results.csv")
external_df = pd.DataFrame(external_results).round(3)
external_df.to_csv("../outputs/tables/external_results.csv")


In [None]:
# Rank of every model in the tables and plots (lower rank is listed first).
# Models that are not in a mapping get NaN as rank and end up last.

# --- test split ---
order_mapping = {
    'logestic_test': 0,
    'knn_test': 1,
    'svm_test': 2,
    'xgboost_test': 3,
    'reasoner_test': 4,
}
test_df = (
    test_df
    .assign(sort_order=test_df['Model'].map(order_mapping))
    .sort_values('sort_order')
    .drop('sort_order', axis=1)
)

# --- validation split ---
order_mapping = {
    'logestic_val': 0,
    'knn_val': 1,
    'svm_val': 2,
    'xgboost_val': 3,
    'reasoner_val': 4,
}
val_df = (
    val_df
    .assign(sort_order=val_df['Model'].map(order_mapping))
    .sort_values('sort_order')
    .drop('sort_order', axis=1)
)

# --- external split (has the additional ChemAp model before the reasoner) ---
order_mapping = {
    'logestic_external': 0,
    'knn_external': 1,
    'svm_external': 2,
    'xgboost_external': 3,
    'ChemAp_external': 4,
    'reasoner_external': 5,
}
external_df = (
    external_df
    .assign(sort_order=external_df['Model'].map(order_mapping))
    .sort_values('sort_order')
    .drop('sort_order', axis=1)
)


In [None]:
sns.set_theme(style="whitegrid", font_scale=1.2)

def plot_metrics(df, title):
    """Draw a grouped bar chart of all metrics for every model and save it.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per model with a "Model" column; every other column is
        treated as a metric and melted into long form.
    title : str
        Figure title; also used as the file name (without extension) of the
        .eps and .png files written to ../outputs/figures.
    """
    # Long form: one (Model, Metric, Score) row per bar.
    df_melted = df.melt(id_vars="Model", var_name="Metric", value_name="Score")

    # Fix the order of the metrics on the x-axis.
    metric_order = ["AUC", "F1-score", "Recall (Sensitivity)", "Specificity", "Precision"]
    df_melted["Metric"] = pd.Categorical(df_melted["Metric"], categories=metric_order, ordered=True)

    # Exactly one colour per model, cycling through the Okabe–Ito palette, so
    # the palette length always matches the number of hue levels.
    n_models = df["Model"].nunique()
    model_colors = [cb_palette[i % len(cb_palette)] for i in range(n_models)]

    fig = plt.figure(figsize=(12, 6))
    sns.barplot(
        data=df_melted,
        x="Metric",
        y="Score",
        hue="Model",
        palette=model_colors,
    )

    plt.title(title, fontsize=16)
    plt.ylabel("Score", fontsize=12)
    plt.xlabel("Metric", fontsize=12)
    plt.xticks(rotation=45, ha="right")
    plt.ylim(0, 1)
    # Legend sits to the right of the axes.
    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()

    # Make sure the figure directory exists before saving.
    os.makedirs("../outputs/figures", exist_ok=True)
    plt.savefig(f"../outputs/figures/{title}.eps")
    plt.savefig(f"../outputs/figures/{title}.png")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)




# Drop the split suffix from the model names so the legends show bare names.
for metrics_frame, suffix_regex in (
    (test_df, r"(_test|_val)$"),
    (val_df, r"(_test|_val)$"),
    (external_df, r"(_external)$"),
):
    metrics_frame["Model"] = metrics_frame["Model"].str.replace(suffix_regex, "", regex=True)

# One grouped bar chart per evaluation split.
for metrics_frame, figure_title in (
    (test_df, "Test Set Metrics"),
    (val_df, "Validation Set Metrics"),
    (external_df, "External Set Metrics"),
):
    plot_metrics(metrics_frame, figure_title)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(ax, y_true, y_pred, title, cmap):
    """Draw the binary confusion matrix of y_true vs. y_pred as a heatmap on `ax`.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    y_true, y_pred : true and predicted labels (0/1).
    title : str, title of the subplot.
    cmap : colormap passed to seaborn's heatmap.
    """
    # labels=[0, 1] guarantees a 2x2 matrix that matches the two tick labels
    # below, even when one class does not occur in the data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap, cbar=False,
                xticklabels=['Pred 0', 'Pred 1'],
                yticklabels=['True 0', 'True 1'],
                ax=ax)
    ax.set_title(title, fontsize=14)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

# Partition the loaded prediction frames by evaluation split, using the
# suffix of their key ("_test", "_val" or "_external").
test_dfs, val_dfs, external_dfs = {}, {}, {}
for pred_name, pred_frame in dfs.items():
    if pred_name.endswith("_test"):
        test_dfs[pred_name] = pred_frame
    elif pred_name.endswith("_val"):
        val_dfs[pred_name] = pred_frame
    elif pred_name.endswith("_external"):
        external_dfs[pred_name] = pred_frame


# Function to plot and save confusion matrices
def plot_and_save(dfs, dataset_type, cmap, output_file):
    """Plot one confusion matrix per model in a 2-column grid and save the figure.

    Parameters
    ----------
    dfs : dict
        Maps "<model>_<split>" to a DataFrame with "y_true" and "y_pred" columns.
    dataset_type : str
        Split name used in the figure title (e.g. "Test").
    cmap : colormap used for every heatmap.
    output_file : str
        Output path without extension; "<output_file>.eps" and
        "<output_file>.png" are written.
    """
    n = len(dfs)
    if n == 0:
        # plt.subplots cannot build a grid with zero rows; nothing to draw.
        print(f"No predictions available for the {dataset_type} set; skipping.")
        return

    cols = 2  # number of columns in the grid
    rows = (n + cols - 1) // cols  # ceiling division

    # squeeze=False always returns a 2-D array of axes, so flatten() is safe.
    fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 5 * rows), squeeze=False)
    axes = axes.flatten()

    for ax, (name, df) in zip(axes, dfs.items()):
        # Strip the trailing split suffix (including "_external") for the title.
        clean_name = name
        for suffix in ("_test", "_val", "_external"):
            if clean_name.endswith(suffix):
                clean_name = clean_name[:-len(suffix)]
                break

        plot_confusion_matrix(ax, df["y_true"], df["y_pred"], clean_name, cmap=cmap)

    # Hide any unused subplots
    for ax in axes[n:]:
        ax.axis('off')

    # Main title, placed slightly above the figure area (y > 1).
    fig.suptitle(f"Confusion Matrices for {dataset_type} Set", fontsize=16, y=1.05)
    fig.tight_layout()

    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # bbox_inches="tight" keeps the suptitle, which lies outside the canvas,
    # in the saved files. Both a vector (.eps) and a raster (.png) copy are written.
    fig.savefig(f"{output_file}.eps", bbox_inches="tight")
    fig.savefig(f"{output_file}.png", bbox_inches="tight")
    plt.show()
    plt.close(fig)

# One confusion-matrix grid per split, each with its own sequential colormap:
# test -> Greens, validation -> Blues, external -> Reds.
for split_frames, split_label, split_cmap, split_output in (
    (test_dfs, "Test", "Greens", "../outputs/figures/Test_ConfusionMatrices"),
    (val_dfs, "Validation", "Blues", "../outputs/figures/Validation_ConfusionMatrices"),
    (external_dfs, "External", "Reds", "../outputs/figures/External_ConfusionMatrices"),
):
    plot_and_save(split_frames, split_label, split_cmap, split_output)

In [None]:
def evaluate(y_true, y_pred):
    """Compute binary classification metrics for 0/1 labels.

    Parameters
    ----------
    y_true, y_pred : true and predicted labels (0/1).

    Returns
    -------
    dict with the keys "F1 Score", "Precision", "Recall", "AUC" and
    "Specificity". AUC and specificity are NaN when they are undefined
    (roc_auc_score raises ValueError / there are no true negatives plus
    false positives), matching the per-model metrics computed earlier in
    this notebook.
    """
    f1 = f1_score(y_true, y_pred, zero_division=0)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)

    # labels=[0, 1] always yields a 2x2 matrix, so the real counts are kept
    # even when only one class occurs in this group.
    TN, FP, _FN, _TP = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    specificity = TN / (TN + FP) if (TN + FP) > 0 else np.nan

    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        # Undefined AUC is reported as missing rather than as a score of 0.
        auc = np.nan

    results = {
        "F1 Score": f1,
        "Precision": precision,
        "Recall": recall,
        "AUC": auc,
        "Specificity": specificity,
    }
    return results


# Denominator used to express the number of parseable predictions as a percentage.
# NOTE(review): presumably the number of samples scored per checkpoint — TODO confirm.
N_EVAL_SAMPLES = 451

# Collect the predictions of every checkpoint in a dedicated list. The global
# `dfs` dict from the loading cell is deliberately NOT reused, so the earlier
# cells can still be re-run after this one.
checkpoint_frames = []

# Checkpoint 0 is the zero-shot base model.
df_0 = pd.read_csv('../outputs/checkpoins_outputs/zeroshot_base.csv')
df_0['checkpoint'] = 0
checkpoint_frames.append(df_0)

# Checkpoints 500, 1000, ..., 14500.
for ckpt in range(500, 15000, 500):
    filename = f'../outputs/checkpoins_outputs/prediction_df_1.0_0.9_9_{ckpt}_final_setting.csv'
    df = pd.read_csv(filename)
    df['checkpoint'] = ckpt
    checkpoint_frames.append(df)

all_df = pd.concat(checkpoint_frames, ignore_index=True)

# Keep only rows whose prediction is one of the two expected labels.
# .copy() makes this an independent frame so the columns added below do not
# trigger SettingWithCopyWarning.
all_df_valid = all_df[all_df['y_pred'].isin(['approved', 'unapproved'])].copy()

# Share of valid predictions per checkpoint, in percent.
valid_percent = (
    all_df_valid.groupby('checkpoint').size() / N_EVAL_SAMPLES * 100
).reset_index(name='valid_y_pred')

all_df_valid['y_pred_bin'] = all_df_valid['y_pred'].map({'unapproved': 0, 'approved': 1})
all_df_valid['y_true_bin'] = all_df_valid['y_true'].map({'unapproved': 0, 'approved': 1})
results_by_checkpoint = []

for checkpoint, group in all_df_valid.groupby('checkpoint'):
    y_true = group['y_true_bin']
    y_pred = group['y_pred_bin']
    try:
        metrics = evaluate(y_true, y_pred)
        metrics['Checkpoint'] = checkpoint
        metrics['n_samples'] = len(group)
        results_by_checkpoint.append(metrics)
    except ValueError as e:
        # A checkpoint whose metrics cannot be computed is reported and left out.
        print(f"Skipped checkpoint {checkpoint} due to error: {e}")
results_df = pd.DataFrame(results_by_checkpoint).sort_values('Checkpoint')


In [None]:
# Persist the per-checkpoint metrics table (row index is not written).
checkpoint_metrics_path = "../outputs/checkpoints_evaluations.csv"
results_df.to_csv(checkpoint_metrics_path, index=False)

In [None]:

sns.set_theme(style="whitegrid", font_scale=1.5)

# Valid predictions per checkpoint as a percentage.
# NOTE(review): 451 is presumably the number of samples scored per checkpoint — TODO confirm.
results_df['Normalized Samples'] = results_df['n_samples'] / 451 * 100

fig, ax1 = plt.subplots(figsize=(40, 10))

# Left axis: one line per metric, coloured from the Okabe–Ito palette.
metrics_to_plot = ['F1 Score', 'Precision', 'Recall', 'AUC', 'Specificity']
for idx, metric in enumerate(metrics_to_plot):
    sns.lineplot(
        data=results_df,
        x='Checkpoint',
        y=metric,
        ax=ax1,
        label=metric,
        marker='o',
        linewidth=1.5,
        color=cb_palette[idx % len(cb_palette)]
    )

ax1.set_xticks(np.arange(0, 14501, 500))
ax1.set_xlim([0, 14500])
ax1.set_xlabel("Checkpoint")
ax1.set_ylabel("Evaluation Metrics")
ax1.set_title("Model Evaluation Metrics Across Checkpoints")

# Right axis: percentage of valid predictions.
ax2 = ax1.twinx()
sns.lineplot(
    data=results_df,
    x='Checkpoint',
    y='Normalized Samples',
    ax=ax2,
    color='black',
    linestyle='--',
    marker='o',
    label='Valid Samples (%)'
)
ax2.set_ylabel("Valid Samples (%)")

# Combine the entries of both axes into a single legend.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()

# Drop the per-axis legends if they were created; get_legend() returns None
# when an axis has no legend, so this cannot fail with AttributeError.
for axis in (ax1, ax2):
    axis_legend = axis.get_legend()
    if axis_legend is not None:
        axis_legend.remove()

ax1.legend(
    handles1 + handles2,
    labels1 + labels2,
    loc='center left',
    bbox_to_anchor=(1.05, 0.5),
    title="Metrics & Samples"
)
plt.tight_layout()
plt.savefig("../outputs/figures/Checkpoints_Evaluations.eps")
plt.savefig("../outputs/figures/Checkpoints_Evaluations.png")

plt.show()


In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

with open("trainer_state.json", "r") as json_file:
    json_data = json.load(json_file)

# Keys every log entry must provide to be plotted.
required_keys = (
    'step',
    'reward',
    'rewards/confident_score_func',
    'rewards/correctness_rewards',
    'rewards/int_reward_func',
    'rewards/soft_format_reward_func',
    'rewards/xmlcount_reward_func',
)

# Keep only the log entries that carry all reward values; entries without
# them are skipped instead of aborting the cell with a KeyError.
data = [
    entry for entry in json_data["log_history"]
    if all(key in entry for key in required_keys)
]

# One list per plotted series, all aligned with `steps`.
steps = [entry['step'] for entry in data]
reward = [entry['reward'] for entry in data]
confident_score = [entry['rewards/confident_score_func'] for entry in data]
correctness = [entry['rewards/correctness_rewards'] for entry in data]
int_reward = [entry['rewards/int_reward_func'] for entry in data]
soft_format = [entry['rewards/soft_format_reward_func'] for entry in data]
xmlcount = [entry['rewards/xmlcount_reward_func'] for entry in data]

# Define exponential moving average function
def exponential_moving_average(data, alpha=0.1):
    """
    Calculate exponential moving average

    data: sequence of numbers (list or array)
    alpha: smoothing factor (0 < alpha <= 1)
    Lower alpha = more smoothing

    Returns a float NumPy array with the same length as `data`; an empty
    input gives an empty array.
    """
    # Always compute in floating point: an output buffer that inherits an
    # integer dtype from integer input would truncate every smoothed value.
    values = np.asarray(data, dtype=float)
    ema = np.empty_like(values)
    if values.size == 0:
        return ema

    ema[0] = values[0]  # Initialize with first value
    for i in range(1, len(values)):
        ema[i] = alpha * values[i] + (1 - alpha) * ema[i - 1]
    return ema

# EMA smoothing factor; smaller values smooth more strongly.
alpha = 0.05

# Colorblind-friendly palette (orange, sky blue, bluish green, yellow, blue,
# vermillion, reddish purple).
cb_palette = [
    "#E69F00", "#56B4E9", "#009E73", "#F0E442",
    "#0072B2", "#D55E00", "#CC79A7",
]

# (series, subplot title, colour) per reward component; zip stops after the
# six series, so the first six palette colours are used in order.
metrics = list(zip(
    (reward, confident_score, correctness, int_reward, soft_format, xmlcount),
    ('Total Reward', 'Confidence-alignment', 'Correctness',
     'Interpretability', 'Soft format compliance', 'XML format'),
    cb_palette,
))

# One stacked subplot per reward component, sharing the step axis.
n_panels = len(metrics)
fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels), sharex=True)

# plt.subplots returns a bare Axes for a single panel; wrap it for uniform handling.
if n_panels == 1:
    axes = [axes]

for panel, (series, panel_title, line_color) in zip(axes, metrics):
    # Faint raw curve underneath, thick smoothed curve on top.
    panel.plot(steps, series, color=line_color, alpha=0.3, label='Raw')
    smoothed = exponential_moving_average(series, alpha)
    panel.plot(steps, smoothed, color=line_color, linewidth=4, label='EMA')

    panel.set_ylabel('Reward Values')
    panel.set_title(panel_title)
    panel.legend(loc='lower right')
    panel.grid(True)

# Only the bottom subplot carries the x-axis label; ticks every 500 steps.
bottom_panel = axes[-1]
bottom_panel.set_xlabel('Steps')
max_step = max(steps)
tick_interval = 500
bottom_panel.set_xticks(range(0, max_step + 1, tick_interval))
bottom_panel.tick_params(axis='x', rotation=90)

# Tighten the layout, write both file formats, then display.
plt.tight_layout()
plt.savefig("../outputs/figures/rewards.eps")
plt.savefig("../outputs/figures/rewards.png")
plt.show()

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

with open("trainer_state.json", "r") as json_file:
    json_data = json.load(json_file)

# Keys every log entry must provide to be plotted.
required_keys = (
    'step',
    'reward',
    'rewards/confident_score_func',
    'rewards/correctness_rewards',
    'rewards/int_reward_func',
    'rewards/soft_format_reward_func',
    'rewards/xmlcount_reward_func',
)

# Keep only the log entries that carry all reward values; entries without
# them are skipped instead of aborting the cell with a KeyError.
data = [
    entry for entry in json_data["log_history"]
    if all(key in entry for key in required_keys)
]

# One list per plotted series, all aligned with `steps`.
steps = [entry['step'] for entry in data]
reward = [entry['reward'] for entry in data]
confident_score = [entry['rewards/confident_score_func'] for entry in data]
correctness = [entry['rewards/correctness_rewards'] for entry in data]
int_reward = [entry['rewards/int_reward_func'] for entry in data]
soft_format = [entry['rewards/soft_format_reward_func'] for entry in data]
xmlcount = [entry['rewards/xmlcount_reward_func'] for entry in data]

# Define exponential moving average function
def exponential_moving_average(data, alpha=0.1):
    """
    Calculate exponential moving average

    data: sequence of numbers (list or array)
    alpha: smoothing factor (0 < alpha <= 1)
    Lower alpha = more smoothing

    Returns a float NumPy array with the same length as `data`; an empty
    input gives an empty array.
    """
    # Always compute in floating point: an output buffer that inherits an
    # integer dtype from integer input would truncate every smoothed value.
    values = np.asarray(data, dtype=float)
    ema = np.empty_like(values)
    if values.size == 0:
        return ema

    ema[0] = values[0]  # Initialize with first value
    for i in range(1, len(values)):
        ema[i] = alpha * values[i] + (1 - alpha) * ema[i - 1]
    return ema

# EMA smoothing factor; smaller values smooth more strongly.
alpha = 0.05

# Colorblind-friendly palette (orange, sky blue, bluish green, yellow, blue,
# vermillion, reddish purple).
cb_palette = [
    "#E69F00", "#56B4E9", "#009E73", "#F0E442",
    "#0072B2", "#D55E00", "#CC79A7",
]

# (series, subplot title, colour) per reward component; zip stops after the
# six series, so the first six palette colours are used in order.
metrics = list(zip(
    (reward, confident_score, correctness, int_reward, soft_format, xmlcount),
    ('Total Reward', 'Confidence-alignment', 'Correctness',
     'Interpretability', 'Soft format compliance', 'XML format'),
    cb_palette,
))

# One stacked subplot per reward component, sharing the step axis.
n_panels = len(metrics)
fig, axes = plt.subplots(n_panels, 1, figsize=(12, 4 * n_panels), sharex=True)

# plt.subplots returns a bare Axes for a single panel; wrap it for uniform handling.
if n_panels == 1:
    axes = [axes]

for panel, (series, panel_title, line_color) in zip(axes, metrics):
    # Faint raw curve with a dot at every logged step, thick smoothed curve on top.
    panel.plot(steps, series, color=line_color, alpha=0.3, label='Raw', marker='o')
    smoothed = exponential_moving_average(series, alpha)
    panel.plot(steps, smoothed, color=line_color, linewidth=4, label='EMA')

    panel.set_ylabel('Reward Values')
    panel.set_title(panel_title)
    panel.legend(loc='lower right')
    panel.grid(True)

# Only the bottom subplot carries the x-axis label; ticks every 500 steps.
bottom_panel = axes[-1]
bottom_panel.set_xlabel('Steps')
max_step = max(steps)
tick_interval = 500
bottom_panel.set_xticks(range(0, max_step + 1, tick_interval))
bottom_panel.tick_params(axis='x', rotation=90)

# Tighten the layout, write both file formats, then display.
plt.tight_layout()
plt.savefig("../outputs/figures/rewards_with_dos.eps")
plt.savefig("../outputs/figures/rewards_with_dos.png")
plt.show()