In [8]:
"""
Feature normalization and aggregation for RNA structure prediction method comparison.
Data processing pipeline for benchmark analysis.
"""

import pandas as pd
import numpy as np
from typing import Dict, List

# Load dataset
df = pd.read_csv("../results/metrics/dataset3_merge.csv")

"""
Feature normalization and aggregation for RNA structure prediction benchmarks.
Calculates median performance metrics per method with directional normalization.
"""

import pandas as pd
import numpy as np
from pathlib import Path

# ------------------------------------------------------------------------------
# Configuration
# ------------------------------------------------------------------------------

# Feature optimization direction: 1 = lower values better, -1 = higher values better
FEATURE_DIRECTION = {
    # Structural accuracy metrics
    "RMSD": 1,                # Root Mean Square Deviation (lower better)
    "TM-score": -1,           # Template Modeling score (higher better)
    "lddt": -1,               # Local Distance Difference Test (higher better)
    "GDT-TS": -1,             # Global Distance Test Total Score (higher better)
    "clash": 1,               # Steric clashes (lower better)
    "BARNABA-eSCORE": -1,     # eSCORE from BARNABA (higher better)
    
    # Base-pairing accuracy metrics
    
    "INF-ALL": -1,            # INF score (higher better)
    "f1": -1,                 # F1 score for base pairs (higher better)
    "precision": -1,          # Precision for base pairs (higher better)
    "recall": -1,             # Recall for base pairs (higher better)
    
    # Energy-based scoring functions
    "RASP-ENERGY": 1,         # RASP energy (lower better)
    "cgRNASP": 1,             # Coarse-grained RNA statistical potential (lower better)
    "3drnascore": 1,          # 3DRNAscore (lower better)
    "RNA-BRiQ": 1,            # RNA-BRiQ score (lower better)
    "DFIRE": 1,               # DFIRE energy (lower better)
    
    
    # Machine learning based scores
    "ARES": 1,                # ARES score (lower better)
    "LociPARSE": -1,          # LociPARSE score (higher better)
    "RNArank": -1,            # RNArank score (higher better)
    "RNA3DCNN": 1,            # RNA3DCNN score (lower better)
    "tb_mcq": 1,              # tb_mcq score (lower better)
    "PAMnet": 1               # PAMnet score (lower better)
}

FEATURES = list(FEATURE_DIRECTION.keys())

# ------------------------------------------------------------------------------
# Core Functions
# ------------------------------------------------------------------------------

def calculate_median_metrics(df, features, direction_dict):
    """
    Calculate median performance metrics for each method.
    
    Parameters
    ----------
    df : DataFrame
        Input data with 'method' column and feature columns
    features : list
        List of feature names to analyze
    direction_dict : dict
        Dictionary mapping features to optimization direction
        
    Returns
    -------
    DataFrame
        Median metrics per method, with rows=methods, columns=features
    """
    median_data = {}
    
    for method in df["method"].unique():
        method_df = df[df["method"] == method]
        method_metrics = {}
        
        for feature in features:
            if feature in method_df.columns:
                values = method_df[feature] * direction_dict[feature]
                method_metrics[feature] = values.median()
            else:
                method_metrics[feature] = np.nan
        
        median_data[method] = method_metrics
    
    return pd.DataFrame(median_data).T


def normalize_for_visualization(df, direction_dict):
    """
    Normalize data to [0,1] range for visualization.
    
    Parameters
    ----------
    df : DataFrame
        Median metrics per method
    direction_dict : dict
        Dictionary mapping features to optimization direction
        
    Returns
    -------
    DataFrame
        Normalized data where 1 represents optimal performance
    """
    normalized = df.copy()
    
    for col in normalized.columns:
        if col in direction_dict:
            col_min, col_max = df[col].min(), df[col].max()
            
            if col_max - col_min == 0:
                normalized[col] = 0.5
                continue
            
            # Normalize so 1 = optimal performance
            normalized[col] = 1-(df[col] - col_min) / (col_max - col_min)
            
            if direction_dict[col] == -1:  # Lower values are better
                normalized[col] = normalized[col]
    
    return normalized


def create_feature_ranks(df, direction_dict):
    """
    Create ranking table for each feature.
    
    Parameters
    ----------
    df : DataFrame
        Median metrics per method
    direction_dict : dict
        Dictionary mapping features to optimization direction
        
    Returns
    -------
    DataFrame
        Ranking where 1 is best performance
    """
    ranks = {}
    
    for feature in df.columns:
        if feature in direction_dict:
            if direction_dict[feature] == 1:  # Lower values better
                ranks[feature] = df[feature].rank(ascending=True, method='min')
            else:  # Higher values better
                ranks[feature] = df[feature].rank(ascending=False, method='min')
    
    return pd.DataFrame(ranks, index=df.index)

# ------------------------------------------------------------------------------
# Main Analysis Pipeline
# ------------------------------------------------------------------------------

def main():
    """Execute complete analysis pipeline."""
    
    # Load data
    data_path = Path("../results/metrics/dataset3_merge.csv")
    df = pd.read_csv(data_path)
    
    print(f"Dataset loaded: {len(df)} structures, {df['method'].nunique()} methods")
    
    # Calculate median metrics
    median_df = calculate_median_metrics(df, FEATURES, FEATURE_DIRECTION)
    print(f"\nMedian metrics calculated for {len(median_df)} methods")
    

    # Normalize for visualization
    norm_df = normalize_for_visualization(median_df, FEATURE_DIRECTION)
    norm_path = Path("../results/metrics/dataset3_normalized.csv")
    norm_df.to_csv(norm_path)
    print(f"Normalized data saved: {norm_path}")
    

    # Summary statistics
    print("\n" + "="*50)
    print("ANALYSIS SUMMARY")
    print("="*50)
    print(f"Methods analyzed: {', '.join(median_df.index.tolist())}")
    print(f"Metrics analyzed: {len(FEATURES)}")
    print(f"Optimization directions:")
    print(f"  Lower values better: {sum(v == 1 for v in FEATURE_DIRECTION.values())} metrics")
    print(f"  Higher values better: {sum(v == -1 for v in FEATURE_DIRECTION.values())} metrics")
    
    return median_df, norm_df


if __name__ == "__main__":
    median_df, norm_df = main()

Dataset loaded: 222 structures, 14 methods

Median metrics calculated for 14 methods
Normalized data saved: ../results/metrics/dataset3_normalized.csv

ANALYSIS SUMMARY
Methods analyzed: Alphafold3, 3dRNA, SimRNA, RNAJP, Vfold, DRfold, RNAComposer, NuFold, trRosettaRNA, DeepFoldRNA, RhoFold, DRfold2, RoseTTAFold2NA, FARFAR2
Metrics analyzed: 21
Optimization directions:
  Lower values better: 11 metrics
  Higher values better: 10 metrics


In [7]:
norm_df

Unnamed: 0,RMSD,TM-score,lddt,GDT-TS,clash,BARNABA-eSCORE,INF-ALL,f1,precision,recall,...,cgRNASP,3drnascore,RNA-BRiQ,DFIRE,ARES,LociPARSE,RNArank,RNA3DCNN,tb_mcq,PAMnet
Alphafold3,,,,,0.079282,,,,,,...,0.0,0.45809,0.0,0.0,0.0,0.0,0.459596,0.0,0.0,0.0
3dRNA,,,,,0.331753,,,,,,...,0.712754,0.263158,0.264594,0.781104,0.573333,0.541667,0.085859,0.813569,0.392916,0.666667
SimRNA,,,,,0.777117,,,,,,...,0.505231,0.531514,0.447525,0.543519,0.613333,0.75,1.0,0.371273,0.154586,0.395981
RNAJP,,,,,0.353464,,,,,,...,0.532815,0.092917,0.061844,0.391756,0.106667,0.333333,0.338384,0.352887,0.101351,0.399527
Vfold,,,,,0.232627,,,,,,...,0.733607,0.574399,0.229091,0.623497,0.295556,0.354167,0.068182,0.503159,0.124898,0.546099
DRfold,,,,,1.0,,,,,,...,1.0,0.584146,1.0,0.641189,0.644444,0.583333,0.492424,0.45375,0.321867,0.347518
RNAComposer,,,,,0.389968,,,,,,...,0.718414,0.168291,0.116703,0.701992,0.471111,0.375,0.0,0.452769,0.176699,0.767139
NuFold,,,,,0.037094,,,,,,...,0.723759,0.348928,0.181888,0.533243,0.262222,0.375,0.833333,0.215478,0.033784,0.565012
trRosettaRNA,,,,,0.107925,,,,,,...,0.713298,0.062378,0.117637,1.0,0.577778,0.666667,0.689394,0.712245,0.199836,0.98227
DeepFoldRNA,,,,,0.172386,,,,,,...,0.874828,0.0,0.087487,0.71817,0.311111,0.625,0.75,0.246883,0.070229,0.320331


In [5]:
radar_data

NameError: name 'radar_data' is not defined

In [1]:
# ==============================================================================
# Radar Plot: Energy-Based Scoring Function Comparison
# 
# Data: Normalized performance metrics for RNA structure prediction methods
# Normalization: 0=worst, 1=best performance for each metric
# Metrics: RASP-ENERGY, cgRNASP, 3drnascore, RNA-BRiQ, DFIRE
#          (all energy-based scores where lower values indicate better quality)
# ==============================================================================

library(ggradar)
library(dplyr)
library(ggplot2)

# ------------------------------------------------------------------------------
# 1. Load normalized performance data
# ------------------------------------------------------------------------------
radar_data <- read.table("../results/metrics/dataset3_normalized.csv", 
                         sep = ",", header = TRUE, row.names = 1)

cat("Data dimensions:", dim(radar_data), "\n")
cat("Methods:", paste(rownames(radar_data), collapse = ", "), "\n")

# ------------------------------------------------------------------------------
# 2. Define energy-based scoring metrics for comparison
# ------------------------------------------------------------------------------
energy_metrics <- c("RASP.ENERGY", "cgRNASP", "X3drnascore", "RNA.BRiQ", "DFIRE")
available_metrics <- energy_metrics[energy_metrics %in% colnames(radar_data)]

# Prepare data for ggradar (requires 'Method' as first column)
plot_data <- radar_data[, available_metrics, drop = FALSE]
plot_data$Method <- rownames(plot_data)
plot_data <- plot_data[, c("Method", available_metrics)]

# ------------------------------------------------------------------------------
# 3. Define consistent color scheme for methods
# ------------------------------------------------------------------------------
method_colors <- c(
  '3dRNA' = "#F8766D",           # Red
  'Alphafold3' = "#E38900",      # Orange
  'DRfold' = "#C49A00",          # Yellow-orange
  'DRfold2' = "#99A800",         # Olive green
  'DeepFoldRNA' = "#53B400",     # Green
  'FARFAR2' = "#00BC56",         # Teal
  'NuFold' = "#00C094",          # Turquoise
  'RNAComposer' = "#00BFC4",     # Cyan
  'RNAJP' = "#00B6EB",           # Light blue
  'RhoFold' = "#06A4FF",         # Blue
  'RoseTTAFold2NA' = "#A58AFF",  # Purple
  'SimRNA' = "#DF70F8",          # Magenta
  'Vfold' = "#FB61D7",           # Pink
  'trRosettaRNA' = "#FF66A8"     # Rose
)

# Assign colors to existing methods
plot_colors <- method_colors[plot_data$Method]
plot_colors[is.na(plot_colors)] <- rainbow(sum(is.na(plot_colors)))

# ------------------------------------------------------------------------------
# 4. Generate publication-quality radar plot
# ------------------------------------------------------------------------------
pdf("../figures/dataset3_radar_energy_metrics.pdf", width = 6, height = 6)

ggradar(plot_data,
        values.radar = c("0", "0.5", "1"),
        grid.min = 0,
        grid.mid = 0.5,
        grid.max = 1,
        group.point.size = 1.5,
        group.line.width = 0.8,
        grid.label.size = 4,
        axis.label.size = 3.5,
        legend.position = "none",
        group.colours = plot_colors) +
  
  labs(title = "Energy-Based Scoring Functions") +
  theme_void(base_size = 9) +
  theme(
    plot.title = element_text(size = 10, face = "bold", hjust = 0.5,
                              margin = margin(b = 10)),
    text = element_text(family = "Arial")
  )

dev.off()
cat("Figure saved to: ../figures/dataset3_radar_energy_metrics.pdf\n")

# ------------------------------------------------------------------------------
# 5. Create separate legend file
# ------------------------------------------------------------------------------
pdf("../figures/method_legend.pdf", width = 3, height = 4)
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", 
     xlim = c(0, 1), ylim = c(0, nrow(plot_data)))
for (i in 1:nrow(plot_data)) {
  rect(0.1, i*0.25, 0.2, i*0.25 + 0.15, 
       col = plot_colors[i], border = NA)
  text(0.25, i*0.25 + 0.075, plot_data$Method[i], 
       adj = 0, cex = 0.7)
}
dev.off()
cat("Legend saved to: ../figures/method_legend.pdf\n")

“package ‘dplyr’ was built under R version 4.2.3”

Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


“package ‘ggplot2’ was built under R version 4.2.3”


Data dimensions: 14 21 
Methods: Alphafold3, 3dRNA, SimRNA, RNAJP, Vfold, DRfold, RNAComposer, NuFold, trRosettaRNA, DeepFoldRNA, RhoFold, DRfold2, RoseTTAFold2NA, FARFAR2 


“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostSc

Figure saved to: ../figures/dataset3_radar_energy_metrics.pdf


Legend saved to: ../figures/method_legend.pdf


In [2]:
# ==============================================================================
# Radar Plot: Energy-Based Scoring Function Comparison
# 
# Data: Normalized performance metrics for RNA structure prediction methods
# Normalization: 0=worst, 1=best performance for each metric
# Metrics:"ARES", "LociPARSE", "RNArank", "RNA3DCNN","tb_mcq","PAMnet"
#         
# ==============================================================================

library(ggradar)
library(dplyr)
library(ggplot2)

# ------------------------------------------------------------------------------
# 1. Load normalized performance data
# ------------------------------------------------------------------------------
radar_data <- read.table("../results/metrics/dataset3_normalized.csv", 
                         sep = ",", header = TRUE, row.names = 1)

cat("Data dimensions:", dim(radar_data), "\n")
cat("Methods:", paste(rownames(radar_data), collapse = ", "), "\n")

# ------------------------------------------------------------------------------
# 2. Define deep learning scoring metrics for comparison
# ------------------------------------------------------------------------------
energy_metrics <- c("ARES", "LociPARSE", "RNArank", "RNA3DCNN","tb_mcq","PAMnet") 
available_metrics <- energy_metrics[energy_metrics %in% colnames(radar_data)]

# Prepare data for ggradar (requires 'Method' as first column)
plot_data <- radar_data[, available_metrics, drop = FALSE]
plot_data$Method <- rownames(plot_data)
plot_data <- plot_data[, c("Method", available_metrics)]

# ------------------------------------------------------------------------------
# 3. Define consistent color scheme for methods
# ------------------------------------------------------------------------------
method_colors <- c(
  '3dRNA' = "#F8766D",           # Red
  'Alphafold3' = "#E38900",      # Orange
  'DRfold' = "#C49A00",          # Yellow-orange
  'DRfold2' = "#99A800",         # Olive green
  'DeepFoldRNA' = "#53B400",     # Green
  'FARFAR2' = "#00BC56",         # Teal
  'NuFold' = "#00C094",          # Turquoise
  'RNAComposer' = "#00BFC4",     # Cyan
  'RNAJP' = "#00B6EB",           # Light blue
  'RhoFold' = "#06A4FF",         # Blue
  'RoseTTAFold2NA' = "#A58AFF",  # Purple
  'SimRNA' = "#DF70F8",          # Magenta
  'Vfold' = "#FB61D7",           # Pink
  'trRosettaRNA' = "#FF66A8"     # Rose
)

# Assign colors to existing methods
plot_colors <- method_colors[plot_data$Method]
plot_colors[is.na(plot_colors)] <- rainbow(sum(is.na(plot_colors)))

# ------------------------------------------------------------------------------
# 4. Generate publication-quality radar plot
# ------------------------------------------------------------------------------
pdf("../figures/dataset3_radar_deep_learning.pdf", width = 6, height = 6)

ggradar(plot_data,
        values.radar = c("0", "0.5", "1"),
        grid.min = 0,
        grid.mid = 0.5,
        grid.max = 1,
        group.point.size = 1.5,
        group.line.width = 0.8,
        grid.label.size = 4,
        axis.label.size = 3.5,
        legend.position = "none",
        group.colours = plot_colors) +
  
  labs(title = "Energy-Based Scoring Functions") +
  theme_void(base_size = 9) +
  theme(
    plot.title = element_text(size = 10, face = "bold", hjust = 0.5,
                              margin = margin(b = 10)),
    text = element_text(family = "Arial")
  )

dev.off()
cat("Figure saved to: ../figures/dataset3_radar_deep_learning_metrics.pdf\n")

# ------------------------------------------------------------------------------
# 5. Create separate legend file
# ------------------------------------------------------------------------------
pdf("../figures/method_legend.pdf", width = 3, height = 4)
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", 
     xlim = c(0, 1), ylim = c(0, nrow(plot_data)))
for (i in 1:nrow(plot_data)) {
  rect(0.1, i*0.25, 0.2, i*0.25 + 0.15, 
       col = plot_colors[i], border = NA)
  text(0.25, i*0.25 + 0.075, plot_data$Method[i], 
       adj = 0, cex = 0.7)
}
dev.off()
cat("Legend saved to: ../figures/method_legend.pdf\n")

Data dimensions: 14 21 
Methods: Alphafold3, 3dRNA, SimRNA, RNAJP, Vfold, DRfold, RNAComposer, NuFold, trRosettaRNA, DeepFoldRNA, RhoFold, DRfold2, RoseTTAFold2NA, FARFAR2 


“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostScript font database”
“font family 'Arial' not found in PostSc

Figure saved to: ../figures/dataset3_radar_deep_learning_metrics.pdf


Legend saved to: ../figures/method_legend.pdf
