# Stanford RNA 3D Folding - Submission Preparation

**Author**: Mauro Risonho de Paula Assumpção <mauro.risonho@gmail.com>  
**Created**: October 18, 2025 at 14:30:00  
**License**: MIT License  
**Kaggle Competition**: https://www.kaggle.com/competitions/stanford-rna-3d-folding  

---

**MIT License**

Copyright (c) 2025 Mauro Risonho de Paula Assumpção <mauro.risonho@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

---

Final model preparation and submission file generation for the Stanford RNA 3D Folding competition, implementing enterprise-grade deployment protocols.

In [1]:
# Import essential libraries for submission preparation
import os
import warnings
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, cast
import pickle
import json
from datetime import datetime
import sys
PROJECT_ROOT = Path('..').resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.models import SimpleRNAPredictor, ModelConfig
from src.data_processing import RNADataProcessor

print('Essential libraries successfully imported.')


Essential libraries successfully imported.


## 1. Optimal Model Loading

Loading the highest-performing model from validation results, ensuring production-ready deployment with comprehensive performance verification.

In [2]:
# Load the best trained model
checkpoints_dir = Path('../checkpoints')
checkpoints_dir.mkdir(exist_ok=True)

MODEL_METADATA: Dict[str, Any] = {}
INFERENCE_DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _load_registry(checkpoint_dir: Path) -> Dict[str, Any]:
    registry_path = checkpoint_dir / 'model_registry.json'
    if registry_path.exists():
        with open(registry_path, 'r') as fp:
            try:
                registry = json.load(fp)
                if isinstance(registry, list):
                    return {entry.get('name', f'model_{idx}'): entry for idx, entry in enumerate(registry)}
                if isinstance(registry, dict):
                    return registry
            except json.JSONDecodeError:
                warnings.warn('Unable to parse model_registry.json; falling back to default checkpoint selection.')
    return {}


def _resolve_checkpoint_path(checkpoint_dir: Path, default_name: str = 'best_model.pth') -> Tuple[Path, Dict[str, Any]]:
    registry = _load_registry(checkpoint_dir)
    if registry:
        ranked = sorted(
            registry.values(),
            key=lambda item: item.get('validation_score', float('-inf')),
            reverse=True
        )
        top = ranked[0]
        candidate_path = checkpoint_dir / top.get('checkpoint', default_name)
        if candidate_path.exists():
            return candidate_path, top
    candidate_path = checkpoint_dir / default_name
    if candidate_path.exists():
        return candidate_path, {'checkpoint': default_name}
    candidates = sorted(checkpoint_dir.glob('*.pth')) + sorted(checkpoint_dir.glob('*.ckpt'))
    if not candidates:
        raise FileNotFoundError('No checkpoint files found in the checkpoints directory.')
    latest = max(candidates, key=lambda path: path.stat().st_mtime)
    return latest, {'checkpoint': latest.name}


def load_best_model(checkpoint_dir: Path) -> Tuple[nn.Module, Dict[str, Any]]:
    checkpoint_path, metadata = _resolve_checkpoint_path(checkpoint_dir)
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    model_config = metadata.get('model_config') or checkpoint.get('model_config') or {}
    model_class_name = metadata.get('model_class') or checkpoint.get('model_class') or 'SimpleRNAPredictor'

    if isinstance(model_config, dict):
        try:
            config = ModelConfig(**model_config)
        except TypeError:
            warnings.warn('Model config in checkpoint is incompatible with ModelConfig; using defaults.')
            config = ModelConfig()
    else:
        config = ModelConfig()
    model_cls = SimpleRNAPredictor
    model = model_cls(config)

    state_dict = checkpoint.get('state_dict', checkpoint)
    cleaned_state_dict = {}
    for key, value in state_dict.items():
        cleaned_key = key
        if cleaned_key.startswith('model.'):
            cleaned_key = cleaned_key[len('model.') :]
        cleaned_state_dict[cleaned_key] = value
    model.load_state_dict(cleaned_state_dict, strict=False)
    model.to(INFERENCE_DEVICE)
    model.eval()

    loaded_metadata = {
        'checkpoint_path': str(checkpoint_path),
        'model_class': model_class_name,
        'validation_score': metadata.get('validation_score') or checkpoint.get('validation_score'),
        'epoch': metadata.get('epoch') or checkpoint.get('epoch'),
        'config': config.__dict__,
    }
    return model, loaded_metadata


try:
    model, MODEL_METADATA = load_best_model(checkpoints_dir)
    print(f"Loaded model '{MODEL_METADATA['model_class']}' from {MODEL_METADATA['checkpoint_path']}")
    if MODEL_METADATA.get('validation_score') is not None:
        print(f"Validation score: {MODEL_METADATA['validation_score']}")
except FileNotFoundError as exc:
    warnings.warn(str(exc) + ' — creating a fresh SimpleRNAPredictor with random weights.')
    model = SimpleRNAPredictor()
    model.to(INFERENCE_DEVICE)
    model.eval()
    MODEL_METADATA = {
        'checkpoint_path': 'N/A',
        'model_class': 'SimpleRNAPredictor',
        'validation_score': None,
        'epoch': None,
        'config': model.config.__dict__,
    }

print('Best model ready for inference.')


    Found GPU0 NVIDIA GeForce GTX 1060 which is of cuda capability 6.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (7.0) - (12.0)
    
    Please install PyTorch with a following CUDA
    configurations:  12.6 following instructions at
    https://pytorch.org/get-started/locally/
    
NVIDIA GeForce GTX 1060 with CUDA capability sm_61 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_70 sm_75 sm_80 sm_86 sm_90 sm_100 sm_120.
If you want to use the NVIDIA GeForce GTX 1060 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



AcceleratorError: CUDA error: no kernel image is available for execution on the device
Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [3]:
# Load test datasets
data_dir = Path('../data/raw')

def _load_optional_csv(path: Path, fallback_name: str | None = None) -> pd.DataFrame:
    if path.exists():
        return pd.read_csv(path)
    if fallback_name is not None:
        alt_path = path.with_name(fallback_name)
        if alt_path.exists():
            return pd.read_csv(alt_path)
    raise FileNotFoundError(f'Unable to locate required dataset near {path}.')

try:
    test_sequences_df = _load_optional_csv(data_dir / 'test_sequences.csv', fallback_name='test.csv')
    sample_submission = _load_optional_csv(data_dir / 'sample_submission.csv')
except FileNotFoundError as exc:
    raise RuntimeError('Submission preparation requires test sequences and sample submission files.') from exc

sequence_lookup = dict(zip(test_sequences_df['target_id'], test_sequences_df['sequence']))
print(f"Test sequences loaded: {len(test_sequences_df)} entries")
print('Sample submission template loaded.')


Test sequences loaded: 12 entries
Sample submission template loaded.


## 2. Test Data Preprocessing

Application of standardized preprocessing pipelines to test datasets, ensuring consistency with training data transformations for optimal model performance.

In [4]:

# Load preprocessor and assemble inference dataset
processed_dir = Path('../data/processed')
processed_dir.mkdir(exist_ok=True)

preprocessor_path = processed_dir / 'preprocessor.pkl'
data_processor = RNADataProcessor(data_dir)

class IdentityPreprocessor:
    """Fallback preprocessor that performs identity transformation."""

    def transform(self, data):
        return data

if preprocessor_path.exists():
    with open(preprocessor_path, 'rb') as f:
        preprocessor = pickle.load(f)
    print(f'Loaded preprocessing pipeline from {preprocessor_path.name}')
else:
    preprocessor = IdentityPreprocessor()
    warnings.warn('Preprocessor not found; using identity transformation.')

transformed_sequences = preprocessor.transform(test_sequences_df)
if isinstance(transformed_sequences, pd.DataFrame):
    transformed_sequences = transformed_sequences['sequence']
if isinstance(transformed_sequences, pd.Series):
    transformed_sequences = transformed_sequences.tolist()
else:
    transformed_sequences = list(transformed_sequences)

encoded_tokens = [
    torch.tensor(data_processor.encode_sequence(seq), dtype=torch.long)
    for seq in transformed_sequences
]

class RNATestDataset(Dataset):
    """Dataset returning padded-ready tensors for inference."""

    def __init__(self, sequence_ids: Iterable[str], token_tensors: Iterable[torch.Tensor]):
        self.sequence_ids: List[str] = list(sequence_ids)
        self.token_tensors: List[torch.Tensor] = list(token_tensors)

    def __len__(self) -> int:
        return len(self.sequence_ids)

    def __getitem__(self, idx: int):
        return self.sequence_ids[idx], self.token_tensors[idx]

def inference_collate(batch):
    sequence_ids, tensors = zip(*batch)
    tensors = list(tensors)
    lengths = torch.tensor([tensor.size(0) for tensor in tensors], dtype=torch.long)
    padded = pad_sequence(tensors, batch_first=True, padding_value=4)
    return list(sequence_ids), padded, lengths

inference_dataset = RNATestDataset(test_sequences_df['target_id'], encoded_tokens)
print('Inference dataset prepared.')


Inference dataset prepared.




## 3. Prediction Generation

Generation of model predictions for test datasets using validated inference pipelines and production-grade error handling protocols.

In [5]:

# Function to generate predictions
def generate_predictions(model: nn.Module, test_data, batch_size: int = 32, device: Optional[torch.device] = None) -> Dict[str, np.ndarray]:
    """Generate predictions for test datasets and return a mapping id -> coordinates."""

    if isinstance(test_data, DataLoader):
        loader = test_data
    else:
        loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=inference_collate)

    device = device or next(model.parameters()).device
    predictions: Dict[str, np.ndarray] = {}

    model.eval()
    with torch.no_grad():
        for sequence_ids, tokens, lengths in loader:
            tokens = tokens.to(device)
            outputs = model(tokens).cpu().numpy()
            lengths_np = lengths.numpy()
            for idx, seq_id in enumerate(sequence_ids):
                valid_len = lengths_np[idx]
                predictions[seq_id] = outputs[idx, :valid_len]

    return predictions

# Generate predictions
prediction_dict = generate_predictions(model, inference_dataset, batch_size=64, device=INFERENCE_DEVICE)
print(f'Predictions generated for {len(prediction_dict)} sequences.')


AcceleratorError: CUDA error: no kernel image is available for execution on the device
Search for `cudaErrorNoKernelImageForDevice' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## 4. Post-processing and Validation

Implementation of post-processing algorithms and comprehensive validation procedures to ensure prediction quality and compliance with competition requirements.

In [6]:

def _smooth_sequence(coords: np.ndarray, window: int = 5) -> np.ndarray:
    if coords.shape[0] < 2 or window <= 1:
        return coords
    half = window // 2
    smoothed = np.empty_like(coords)
    for idx in range(coords.shape[0]):
        start = max(0, idx - half)
        end = min(coords.shape[0], idx + half + 1)
        smoothed[idx] = coords[start:end].mean(axis=0)
    return smoothed


def _normalize_sequence(coords: np.ndarray) -> np.ndarray:
    centered = coords - coords.mean(axis=0, keepdims=True)
    norms = np.linalg.norm(centered, axis=1)
    max_norm = float(norms.max()) if norms.size > 0 else 0.0
    if max_norm > 0:
        centered = centered / max_norm * min(max_norm, 30.0)
    return centered


def _postprocess_single(coords: np.ndarray) -> np.ndarray:
    coords = np.clip(coords, -50, 50)
    coords = _smooth_sequence(coords, window=5)
    coords = _normalize_sequence(coords)
    return coords


def postprocess_predictions(predictions):
    """Apply post-processing to predictions.

    Supports numpy arrays or dict[str, np.ndarray].
    """
    if isinstance(predictions, dict):
        processed: Dict[str, np.ndarray] = {}
        for key, value in predictions.items():
            processed[key] = _postprocess_single(np.asarray(value, dtype=np.float32))
        return processed
    return _postprocess_single(np.asarray(predictions, dtype=np.float32))


def validate_predictions(predictions, sequences):
    """Validate predictions against known constraints."""
    issues = []

    if isinstance(predictions, dict):
        iterator = predictions.items()
    else:
        iterator = enumerate(predictions)

    for key, pred in iterator:
        pred = np.asarray(pred, dtype=np.float32)
        if pred.shape[0] < 2:
            continue
        distances = np.linalg.norm(pred[1:] - pred[:-1], axis=1)
        if np.any(distances < 0.5) or np.any(distances > 3.5):
            issues.append(f'{key}: suspicious bond distances')

    if isinstance(predictions, dict):
        if predictions:
            stacked = np.concatenate([pred for pred in predictions.values()], axis=0)
        else:
            stacked = np.empty((0, 3), dtype=np.float32)
    else:
        stacked = np.asarray(predictions)

    if np.any(np.isnan(stacked)) or np.any(np.isinf(stacked)):
        issues.append('Invalid coordinates found (NaN or Inf).')

    return issues


predictions_processed = postprocess_predictions(prediction_dict)
validation_issues = validate_predictions(predictions_processed, sequence_lookup)
if validation_issues:
    warnings.warn(f'Validation issues detected: {validation_issues[:5]}')
else:
    print('Predictions passed validation checks.')


NameError: name 'prediction_dict' is not defined

## 5. Submission File Formatting

Formatting predictions according to competition specifications, implementing data validation protocols and submission file generation procedures.

In [None]:

def format_submission(predictions: Dict[str, np.ndarray], sample_submission: pd.DataFrame) -> pd.DataFrame:
    """Format predictions for submission."""
    if not isinstance(predictions, dict):
        raise TypeError('Predictions must be provided as a dictionary mapping IDs to coordinate arrays.')

    submission = sample_submission.copy()
    coordinate_triplets = [
        ('x_1', 'y_1', 'z_1'),
        ('x_2', 'y_2', 'z_2'),
        ('x_3', 'y_3', 'z_3'),
        ('x_4', 'y_4', 'z_4'),
        ('x_5', 'y_5', 'z_5'),
    ]
    column_positions: Dict[str, int] = {}
    for pos, name in enumerate(submission.columns):
        if name not in column_positions:
            column_positions[name] = pos

    for row_idx in range(len(submission)):
        row = submission.iloc[row_idx]
        seq_id = row['ID'].rsplit('_', 1)[0]
        residue_index = int(row['resid']) - 1
        coords = predictions.get(seq_id)
        if coords is None or residue_index >= coords.shape[0]:
            base_coord = np.zeros(3, dtype=np.float32)
        else:
            base_coord = coords[residue_index]
        for triplet in coordinate_triplets:
            for col_name, value in zip(triplet, base_coord):
                submission.iat[row_idx, column_positions[col_name]] = float(value)

    return submission


if not isinstance(predictions_processed, dict):
    raise TypeError('Postprocessed predictions must be a dictionary for submission formatting.')
predictions_for_submission = cast(Dict[str, np.ndarray], predictions_processed)

submission = format_submission(predictions_for_submission, sample_submission)
print('Submission DataFrame assembled.')


In [None]:
# Save submission file
submissions_dir = Path('../submissions')
submissions_dir.mkdir(exist_ok=True)

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_filename = f'submission_{timestamp}.csv'

submission_path = submissions_dir / submission_filename
submission.to_csv(submission_path, index=False)

print(f'Submission file saved: {submission_path}')


## 6. Final Validation and Metadata

Comprehensive final validation procedures and submission metadata generation for complete audit trail and reproducibility documentation.

In [None]:
# Final validation of submission file
def final_validation(submission_path: Path) -> Dict[str, bool]:
    """Final validation of submission file."""
    submission = pd.read_csv(submission_path)
    required_columns = ['ID', 'resname', 'resid']
    column_check = all(col in submission.columns for col in required_columns)
    numeric_check = submission.select_dtypes(include=[np.number]).shape[1] >= 3
    return {
        'correct_format': column_check,
        'no_null_values': not submission.isnull().any().any(),
        'correct_size': len(submission) == len(sample_submission),
        'numeric_values': numeric_check,
    }

validation_results = final_validation(submission_path)
print('Final validation checks:')
for key, value in validation_results.items():
    print(f'  - {key}: {value}')


In [None]:

# Create submission metadata
submission_metadata = {
    'timestamp': datetime.now().isoformat(),
    'model_type': MODEL_METADATA.get('model_class', 'Unknown'),
    'preprocessing': preprocessor_path.name if preprocessor_path.exists() else 'identity',
    'postprocessing': 'clipping + smoothing + normalization',
    'validation_score': MODEL_METADATA.get('validation_score'),
    'training_epochs': MODEL_METADATA.get('epoch'),
    'notes': 'Auto-generated submission pipeline execution.',
    'files': {
        'submission': submission_filename,
        'model': MODEL_METADATA.get('checkpoint_path'),
        'preprocessor': preprocessor_path.name if preprocessor_path.exists() else None,
    }
}

# Save metadata
metadata_filename = f'submission_metadata_{timestamp}.json'
metadata_path = submissions_dir / metadata_filename
with open(metadata_path, 'w') as f:
    json.dump(submission_metadata, f, indent=2)

print(f'Metadata saved: {metadata_path}')


## 7. Submission Summary and Next Steps

Comprehensive submission summary with performance metrics and strategic recommendations for future development iterations.

In [None]:
print('=== SUBMISSION SUMMARY ===')
print(f'File: {submission_filename}')
print(f"Model: {MODEL_METADATA.get('model_class', 'Unknown')}")
print(f"Validation score: {MODEL_METADATA.get('validation_score')}")
print(f'Timestamp: {timestamp}')
print(f'Metadata file: {metadata_filename}')
