# Stanford RNA 3D Folding - Full Pipeline Submission

**Implements the 14-Day Hybrid Expert Plan:**
1. Template Search (MMseqs2)
2. RhoFold+ Predictions (with multiple seeds)
3. DRfold2 Ab Initio Predictions
4. Energy Minimization (OpenMM)
5. Diverse Ensemble Selection

**Requirements:** Kaggle GPU (T4/P100 minimum)

In [None]:
# Install dependencies
# Python packages; tqdm and scipy are imported by later cells, the others are
# presumably needed by the cloned model repositories (verify against their
# requirements files).
!pip install -q einops biopython ml-collections dm-tree tqdm pyyaml scipy
# OpenMM and PDBFixer are imported by the energy-minimization cell.
!pip install -q openmm pdbfixer
# Provides the `mmseqs` binary invoked by the template-search cell.
# NOTE(review): stdout and stderr are both sent to /dev/null, so a failed
# install is silent here and only surfaces when `mmseqs` is run.
!apt-get install -qq mmseqs2 > /dev/null 2>&1
# Printed unconditionally; it does not check that the installs succeeded.
print("Dependencies installed")

In [None]:
# Clone repositories
import os
# Working directories used by the later phases; exist_ok makes the cell
# safe to re-run.
os.makedirs('models', exist_ok=True)
os.makedirs('predictions', exist_ok=True)
os.makedirs('templates', exist_ok=True)
os.makedirs('relaxed', exist_ok=True)

# RhoFold+
# Clone and install only when the checkout is absent, so re-running the cell
# does not repeat the work.
if not os.path.exists('models/RhoFold'):
    !git clone --quiet https://github.com/ml4bio/RhoFold.git models/RhoFold
    # Editable install is run from inside the checkout, then the working
    # directory is restored two levels up.
    %cd models/RhoFold
    !pip install -q -e .
    %cd ../..

# DRfold2
# Only cloned here; its setup script is run by the weights-download cell.
if not os.path.exists('models/DRfold2'):
    !git clone --quiet https://github.com/leeyang/DRfold2.git models/DRfold2

# Printed unconditionally; clone/install failures are not detected here.
print("Repositories ready")

In [None]:
# Download model weights
!mkdir -p models/RhoFold/pretrained
if not os.path.exists('models/RhoFold/pretrained/RhoFold_pretrained.pt'):
    !wget -q https://huggingface.co/cuhkaih/rhofold/resolve/main/rhofold_pretrained_params.pt \
        -O models/RhoFold/pretrained/RhoFold_pretrained.pt
    print("RhoFold+ weights downloaded")

# DRfold2 weights
%cd models/DRfold2
!bash install.sh 2>/dev/null || echo "Note: DRfold2 setup may need adjustments"
%cd ../..

In [None]:
# Setup
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

# Decide the compute device once and report what is available.
has_gpu = torch.cuda.is_available()
device = 'cuda' if has_gpu else 'cpu'
print(f"Device: {device}")
if has_gpu:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"Memory: {gpu_props.total_memory / 1e9:.1f} GB")

# Load data
# Competition inputs: the sequences to fold and the submission template
# whose rows the final cell fills in.
test_seqs = pd.read_csv('/kaggle/input/stanford-rna-3d-folding-2/test_sequences.csv')
sample_sub = pd.read_csv('/kaggle/input/stanford-rna-3d-folding-2/sample_submission.csv')
n_targets = len(test_seqs)
n_rows = len(sample_sub)
print(f"\nTest sequences: {n_targets}")
print(f"Submission rows: {n_rows}")

## Phase 1: Template Search (MMseqs2)

In [None]:
import re

# Create FASTA for test sequences
# One two-line record per target: the header is the target ID, the body is
# the sequence.
fasta_records = (
    f">{tid}\n{target_seq}\n"
    for tid, target_seq in zip(test_seqs['target_id'], test_seqs['sequence'])
)
with open('templates/test.fasta', 'w') as f:
    f.writelines(fasta_records)

# Download PDB sequences
# Fetches the RCSB sequence dump and decompresses it in place to
# templates/pdb.txt (-f overwrites the output of a previous run).
# NOTE(review): wget runs with -q, so a failed download is not reported here.
!wget -q https://files.rcsb.org/pub/pdb/derived_data/pdb_seqres.txt.gz -O templates/pdb.txt.gz
!gunzip -f templates/pdb.txt.gz

# Extract RNA
def _iter_fasta(handle):
    """Yield (header, sequence) pairs from an open FASTA file, one at a time.

    Records are delimited by lines that *start* with '>', so a '>' inside a
    header description does not split the record, and the file is streamed
    rather than loaded into memory as a whole.
    """
    header, chunks = None, []
    for raw_line in handle:
        line = raw_line.strip()
        if line.startswith('>'):
            if header is not None:
                yield header, ''.join(chunks)
            header, chunks = line[1:], []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        yield header, ''.join(chunks)


rna_pattern = re.compile(r'^[ACGU]+$')
rna_count = 0
with open('templates/pdb.txt', 'r') as src, open('templates/pdb_rna.fasta', 'w') as out:
    for header, raw_seq in _iter_fasta(src):
        fields = header.split()
        if not fields or not raw_seq:
            continue
        # pdb_seqres headers carry a molecule-type field; skip chains that are
        # explicitly proteins, otherwise short A/C/G/T peptides would pass the
        # nucleotide regex below.
        if 'mol:protein' in fields:
            continue
        raw_seq = raw_seq.upper()
        # A chain with T but no U is DNA; converting it to U would make it
        # look like RNA. Chains mixing T and U are kept and normalised.
        # Limitation: DNA chains without any T cannot be told apart here.
        if 'T' in raw_seq and 'U' not in raw_seq:
            continue
        seq = raw_seq.replace('T', 'U')
        if len(seq) >= 10 and rna_pattern.match(seq):
            # Keep only the first header token (the chain identifier).
            out.write(f'>{fields[0]}\n{seq}\n')
            rna_count += 1

print(f"Extracted {rna_count} RNA sequences")

# Run MMseqs2
# Searches the test FASTA against the extracted RNA FASTA and writes a
# tab-separated table with the four columns named in --format-output;
# templates/tmp is the scratch directory.
# NOTE(review): `--search-type 3` is presumably MMseqs2's nucleotide mode and
# `-s 7.5` its sensitivity setting; confirm against the MMseqs2 user guide.
# stderr is discarded, so a failing search only shows up as a missing or empty
# hits file in the parsing step below.
!mmseqs easy-search templates/test.fasta templates/pdb_rna.fasta \
    templates/hits.m8 templates/tmp \
    --search-type 3 -e 1e-3 -s 7.5 --threads 4 \
    --format-output "query,target,pident,evalue" 2>/dev/null

# Parse results
# A missing or zero-byte hits file means the search produced nothing; in that
# case downstream code sees an empty DataFrame.
hits_available = (
    os.path.exists('templates/hits.m8')
    and os.path.getsize('templates/hits.m8') > 0
)
if not hits_available:
    best_templates = pd.DataFrame()
    print("No templates found")
else:
    hit_columns = ['query', 'target', 'pident', 'evalue']
    hits = pd.read_csv('templates/hits.m8', sep='\t', header=None, names=hit_columns)
    # Keep, per query, the row with the highest percent identity.
    top_hit_rows = hits.groupby('query')['pident'].idxmax()
    best_templates = hits.loc[top_hit_rows]
    print(f"Found templates for {len(best_templates)} targets")
    print(best_templates.head())

## Phase 2: RhoFold+ Predictions

In [None]:
import sys
sys.path.insert(0, 'models/RhoFold')

# Fix OpenMM imports
# The replacement below targets a four-line `simtk`-based import block that
# relax.py is expected to contain. It is rewritten on disk, before `rhofold`
# is imported further down, so those imports fall back to the top-level
# `openmm` package when `simtk` is unavailable.
rhofold_relax = Path('models/RhoFold/rhofold/relax/relax.py')
if rhofold_relax.exists():
    content = rhofold_relax.read_text()
    # Patch once only: skip when the first 300 characters already contain a
    # `try:` (taken as the marker of an earlier patch).
    if 'from simtk.openmm' in content and 'try:' not in content[:300]:
        # If the exact four-line block is not present, replace() is a no-op
        # and the file is written back unchanged.
        content = content.replace(
            'from simtk.openmm.app import *\nfrom simtk.openmm import *\nfrom simtk.unit import *\nimport simtk.openmm as mm',
            '''try:\n    from simtk.openmm.app import *\n    from simtk.openmm import *\n    from simtk.unit import *\n    import simtk.openmm as mm\nexcept ImportError:\n    from openmm.app import *\n    from openmm import *\n    from openmm.unit import *\n    import openmm as mm'''
        )
        rhofold_relax.write_text(content)

# These imports rely on 'models/RhoFold' having been put on sys.path above.
from rhofold.rhofold import RhoFold
from rhofold.config import rhofold_config
from rhofold.utils.alphabet import Alphabet

# Load model
config = rhofold_config()
rhofold_model = RhoFold(config)
# The checkpoint is deserialised on CPU first and moved to `device` afterwards.
# NOTE(review): weights_only=False permits full unpickling of the checkpoint;
# acceptable only as long as the downloaded file is trusted.
ckpt = torch.load('models/RhoFold/pretrained/RhoFold_pretrained.pt', map_location='cpu', weights_only=False)
# The state dict is stored under the 'model' key of the checkpoint.
rhofold_model.load_state_dict(ckpt['model'])
# eval() switches the module to inference mode for all later predictions.
rhofold_model = rhofold_model.to(device).eval()
print("RhoFold+ loaded")

In [None]:
def run_rhofold(target_id, sequence, seeds=(42, 123, 456, 789, 1234)):
    """Run RhoFold+ once per seed and collect the predicted C1' traces.

    Args:
        target_id: Identifier used only in log messages.
        sequence: Nucleotide sequence as a string.
        seeds: Iterable of integer seeds; one forward pass is run per seed.
            The default is an immutable tuple so it cannot be altered
            between calls.

    Returns:
        A list of dicts with keys 'seed', 'coords' and 'source'
        ('rhofold'). The list is empty when the sequence is longer than
        1000 nt or when every seed failed.
    """
    predictions = []

    # Guard first so that over-long targets cost nothing (no alphabet lookup,
    # no tensor allocation).
    if len(sequence) > 1000:
        print(f"  Skipping {target_id} - too long ({len(sequence)} nt)")
        return predictions

    alphabet = Alphabet.get_default()
    tokens = torch.tensor([[alphabet.get_idx(c) for c in sequence]]).to(device)

    # NOTE(review): the model is put in eval() mode when it is loaded, so the
    # seeds may not change the output; confirm they actually add diversity.
    for seed in seeds:
        torch.manual_seed(seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(seed)
            torch.cuda.empty_cache()

        try:
            with torch.no_grad():
                outputs = rhofold_model(
                    tokens=tokens,
                    rna_fm_tokens=tokens.clone(),
                    seq=sequence
                )

            coords = outputs['cord_tns_pred'][-1][0].cpu().numpy()
            # Atom index 1 is taken to be C1'; TODO confirm against
            # RhoFold's atom ordering.
            c1_prime = coords[:, 1, :]
        except Exception as e:
            if 'out of memory' in str(e).lower():
                # Further seeds would hit the same limit, so stop here.
                print(f"  OOM at seed {seed}")
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                break
            # Still best-effort (move on to the next seed), but no longer
            # silent: the failure reason is reported.
            print(f"  RhoFold+ failed for {target_id} (seed {seed}): "
                  f"{type(e).__name__}: {e}")
            continue

        predictions.append({
            'seed': seed,
            'coords': c1_prime,
            'source': 'rhofold'
        })

    return predictions

# Run predictions
# Maps target ID -> list of per-seed predictions; targets for which nothing
# was produced are left out of the dict.
rhofold_predictions = {}
target_rows = test_seqs.iterrows()
for _, target in tqdm(target_rows, total=len(test_seqs), desc="RhoFold+"):
    seed_preds = run_rhofold(target['target_id'], target['sequence'])
    if not seed_preds:
        continue
    rhofold_predictions[target['target_id']] = seed_preds

print(f"\nRhoFold+ completed: {len(rhofold_predictions)} targets")

## Phase 3: DRfold2 Ab Initio (for difficult targets)

In [None]:
import subprocess

def run_drfold2(target_id, sequence):
    """Run DRfold2 ab initio prediction for one target.

    Writes the sequence to `predictions/drfold2/<target_id>/input.fasta`,
    launches the DRfold2 script as a subprocess (5-minute timeout) and looks
    for `pred.pdb` in the same directory.

    Args:
        target_id: Target identifier; also used as the output directory name.
        sequence: Nucleotide sequence as a string.

    Returns:
        Path to the predicted PDB file, or None when the run failed, timed
        out, or produced no `pred.pdb`. Failures are reported on stdout
        instead of being swallowed silently.
    """
    out_dir = f'predictions/drfold2/{target_id}'
    os.makedirs(out_dir, exist_ok=True)
    fasta_path = f'{out_dir}/input.fasta'

    with open(fasta_path, 'w') as f:
        f.write(f'>{target_id}\n{sequence}\n')

    result = None
    try:
        result = subprocess.run(
            ['python', 'models/DRfold2/DRfold2.py',
             '-i', fasta_path,
             '-o', out_dir,
             '--device', device],
            capture_output=True, text=True, timeout=300
        )
    except subprocess.TimeoutExpired:
        print(f"  DRfold2 timed out for {target_id}")
    except Exception as e:
        # Deliberately best-effort: one failing target must not stop the
        # notebook, but the reason is now visible.
        print(f"  DRfold2 could not be run for {target_id}: "
              f"{type(e).__name__}: {e}")

    if result is None:
        return None

    pdb_path = f'{out_dir}/pred.pdb'
    if os.path.exists(pdb_path):
        # NOTE(review): a pred.pdb left over from an earlier run is also
        # accepted here; clear the directory first if that matters.
        return pdb_path

    # No model was written: surface the exit code and the end of stderr so
    # the cause can be diagnosed.
    stderr_tail = (result.stderr or '').strip()[-300:]
    print(f"  DRfold2 produced no model for {target_id} "
          f"(exit code {result.returncode}) {stderr_tail}")
    return None

# Run DRfold2 on targets that have no RhoFold+ prediction.
# (No confidence-based selection happens here: only missing targets are used.)
drfold2_predictions = {}

# Build the ID -> sequence lookup in one pass instead of re-filtering the
# whole DataFrame for every target. setdefault keeps the first sequence seen
# for an ID, matching the previous `.values[0]` behaviour.
sequence_by_target = {}
for tid, target_seq in zip(test_seqs['target_id'], test_seqs['sequence']):
    sequence_by_target.setdefault(tid, target_seq)

missing_targets = [t for t in test_seqs['target_id'] if t not in rhofold_predictions]
# DRfold2 is only attempted on sequences of at most 500 nt.
short_missing = [t for t in missing_targets if len(sequence_by_target[t]) <= 500]

print(f"Running DRfold2 on {len(short_missing)} missing targets")

for target_id in tqdm(short_missing, desc="DRfold2"):
    seq = sequence_by_target[target_id]
    pdb_path = run_drfold2(target_id, seq)
    if pdb_path:
        drfold2_predictions[target_id] = pdb_path

print(f"DRfold2 completed: {len(drfold2_predictions)} predictions")

## Phase 4: Energy Minimization (OpenMM)

In [None]:
try:
    from openmm.app import *
    from openmm import *
    from openmm.unit import *
except ImportError:
    from simtk.openmm.app import *
    from simtk.openmm import *
    from simtk.unit import *

from pdbfixer import PDBFixer

def coords_to_pdb(coords, sequence, output_path):
    """Write a single-atom-per-residue (C1') PDB file.

    Each (coordinate, residue letter) pair becomes one ATOM record on chain
    A, numbered from 1; pairing stops at the shorter of the two inputs. The
    file is terminated with an END record.
    """
    with open(output_path, 'w') as f:
        for serial, (xyz, base) in enumerate(zip(coords, sequence), start=1):
            px, py, pz = xyz
            # Identity columns first, then coordinates plus the fixed
            # occupancy / B-factor / element tail.
            identity = f"ATOM  {serial:5d}  C1' {base:3s} A{serial:4d}    "
            position = f"{px:8.3f}{py:8.3f}{pz:8.3f}  1.00 50.00           C\n"
            f.write(identity + position)
        f.write("END\n")

def energy_minimize(pdb_path, output_path, max_iter=200):
    """Run OpenMM energy minimization on a PDB file.

    The structure is completed with PDBFixer, minimized with the amber14
    force field plus GBn2 implicit solvent, and written to `output_path`.

    Args:
        pdb_path: Input PDB file.
        output_path: Where the minimized (or, on failure, copied) PDB goes.
        max_iter: Maximum number of minimizer iterations.

    Returns:
        True when minimization succeeded. False when any step failed, in
        which case the input file is copied to `output_path` unchanged so
        that callers always find a file there.
    """
    import shutil

    try:
        fixer = PDBFixer(filename=pdb_path)
        fixer.findMissingResidues()
        fixer.findMissingAtoms()
        fixer.addMissingAtoms()
        # NOTE(review): no hydrogens are added before createSystem; confirm
        # the force field can template the residues in this state.

        forcefield = ForceField('amber14-all.xml', 'implicit/gbn2.xml')
        system = forcefield.createSystem(fixer.topology, nonbondedMethod=NoCutoff)
        integrator = LangevinMiddleIntegrator(300*kelvin, 1/picosecond, 0.002*picoseconds)
        simulation = Simulation(fixer.topology, system, integrator)
        simulation.context.setPositions(fixer.positions)
        simulation.minimizeEnergy(maxIterations=max_iter)

        positions = simulation.context.getState(getPositions=True).getPositions()
        with open(output_path, 'w') as f:
            PDBFile.writeFile(simulation.topology, positions, f)
        return True
    except Exception as e:
        # `except Exception` instead of a bare `except:` so Ctrl-C and
        # SystemExit still propagate; the failure is reported, not hidden.
        print(f"  Minimization failed for {pdb_path}: {type(e).__name__}: {e}")
        shutil.copy(pdb_path, output_path)
        return False

# Minimize predictions
def _read_relaxed_c1_prime(pdb_file, expected_len):
    """Return the C1' coordinates of a relaxed PDB, or None if unusable.

    The result is rejected (None) when the file cannot be parsed, when the
    number of C1' atoms differs from `expected_len`, or when any coordinate
    is not finite.
    """
    xyz = []
    try:
        with open(pdb_file, 'r') as handle:
            for line in handle:
                # Columns 13-16 hold the atom name in the PDB format.
                if line.startswith('ATOM') and line[12:16].strip() == "C1'":
                    xyz.append([float(line[30:38]), float(line[38:46]), float(line[46:54])])
    except (OSError, ValueError):
        return None
    if len(xyz) != expected_len:
        return None
    relaxed = np.array(xyz)
    return relaxed if np.isfinite(relaxed).all() else None


minimized = {}
for target_id, preds in tqdm(rhofold_predictions.items(), desc="Minimizing"):
    seq = test_seqs[test_seqs['target_id']==target_id]['sequence'].values[0]
    minimized[target_id] = []

    for pred in preds:
        pdb_path = f"predictions/{target_id}_s{pred['seed']}.pdb"
        relax_path = f"relaxed/{target_id}_s{pred['seed']}.pdb"

        coords_to_pdb(pred['coords'], seq, pdb_path)

        # Previously the relaxed file was written but never read back, so
        # this phase had no effect on the submission. Use the relaxed C1'
        # positions when minimization succeeded and they pass the sanity
        # checks; otherwise keep the raw prediction.
        final_coords = pred['coords']
        if energy_minimize(pdb_path, relax_path):
            relaxed_coords = _read_relaxed_c1_prime(relax_path, len(pred['coords']))
            if relaxed_coords is not None:
                final_coords = relaxed_coords

        minimized[target_id].append({
            'coords': final_coords,
            'source': pred['source'],
            'seed': pred['seed']
        })

print(f"Minimization completed: {len(minimized)} targets")

## Phase 5: Diverse Ensemble Selection

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist

def load_coords(pdb_path):
    """Read C1' atom positions from a PDB file.

    Every line that starts with 'ATOM' and contains the text C1' contributes
    one (x, y, z) row taken from the fixed coordinate columns.

    Returns:
        An (n, 3) NumPy array, or None when no such line exists.
    """
    with open(pdb_path, 'r') as f:
        c1_lines = [ln for ln in f if ln.startswith('ATOM') and "C1'" in ln]

    if not c1_lines:
        return None

    xyz_rows = [
        [float(ln[30:38]), float(ln[38:46]), float(ln[46:54])]
        for ln in c1_lines
    ]
    return np.array(xyz_rows)

def compute_rmsd(c1, c2):
    """Root-mean-square deviation between two coordinate arrays.

    The coordinates are compared as given (no superposition is applied).
    Returns the sentinel 100.0 when either input is None or when the two
    inputs differ in length.
    """
    comparable = c1 is not None and c2 is not None and len(c1) == len(c2)
    if not comparable:
        return 100.0

    delta = c1 - c2
    per_atom_sq = np.sum(delta**2, axis=1)
    return np.sqrt(np.mean(per_atom_sq))

def select_diverse(predictions, n=5):
    """Select up to `n` structurally diverse predictions via clustering.

    Pairwise RMSDs are clustered with complete-linkage hierarchical
    clustering into at most `n` clusters, and the first member of each
    cluster is kept.

    Args:
        predictions: List of dicts, each with a 'coords' entry.
        n: Number of predictions to return.

    Returns:
        `predictions` itself when it has `n` or fewer entries; otherwise a
        list of `n` predictions. If clustering fails, the first `n`
        predictions are returned.
    """
    if len(predictions) <= n:
        return predictions

    # Local import: only `pdist` is imported from this module at file level.
    from scipy.spatial.distance import squareform

    # RMSD matrix (symmetric, zero diagonal)
    coords = [p['coords'] for p in predictions]
    m = len(predictions)
    rmsd_mat = np.zeros((m, m))
    for i in range(m):
        for j in range(i + 1, m):
            rmsd_mat[i, j] = rmsd_mat[j, i] = compute_rmsd(coords[i], coords[j])

    try:
        # linkage() expects a condensed distance vector. The RMSD matrix
        # already IS the distance matrix, so it is converted with
        # squareform(); pdist(rmsd_mat) would instead cluster on Euclidean
        # distances between matrix rows.
        Z = linkage(squareform(rmsd_mat, checks=False), method='complete')
        clusters = fcluster(Z, t=n, criterion='maxclust')
    except Exception as e:
        print(f"  Clustering failed ({type(e).__name__}: {e}); using first {n} predictions")
        return predictions[:n]

    # One representative (first member) per cluster label, in label order.
    chosen = []
    for label in range(1, n + 1):
        members = [i for i, cl in enumerate(clusters) if cl == label]
        if members:
            chosen.append(members[0])

    # fcluster may return fewer than n clusters. Top up with predictions that
    # have not been picked yet rather than repeating already selected ones;
    # there are always enough because len(predictions) > n here.
    if len(chosen) < n:
        taken = set(chosen)
        for i in range(m):
            if len(chosen) >= n:
                break
            if i not in taken:
                chosen.append(i)
                taken.add(i)

    return [predictions[i] for i in chosen[:n]]

# Select ensembles
# For every test target, pool the RhoFold+ entries with the DRfold2 model
# (when one exists and yields coordinates) and keep a diverse subset.
ensembles = {}
for target_id in tqdm(test_seqs['target_id'], desc="Ensemble"):
    candidates = list(minimized.get(target_id, []))

    drfold_pdb = drfold2_predictions.get(target_id)
    if drfold_pdb is not None:
        drfold_coords = load_coords(drfold_pdb)
        if drfold_coords is not None:
            candidates.append({'coords': drfold_coords, 'source': 'drfold2'})

    # Targets with no candidate at all get an empty ensemble.
    ensembles[target_id] = select_diverse(candidates) if candidates else []

print(f"Ensembles created: {len(ensembles)} targets")

## Phase 6: Generate Submission

In [None]:
# Build one output row per row of the sample submission, filling the five
# (x, y, z) triplets from the target's ensemble.
submission_rows = []
origin = (0.0, 0.0, 0.0)

for _, row in tqdm(sample_sub.iterrows(), total=len(sample_sub), desc="Building"):
    # IDs have the form "<target_id>_<residue number>" (1-based).
    parts = row['ID'].rsplit('_', 1)
    target_id = parts[0]
    res_idx = int(parts[1]) - 1

    new_row = {'ID': row['ID'], 'resname': row['resname'], 'resid': row['resid']}
    ensemble = ensembles.get(target_id, [])

    for model_idx in range(1, 6):
        xyz = origin
        if len(ensemble) > 0:
            # When fewer than five models exist, the last one is reused for
            # the remaining slots.
            member = ensemble[min(model_idx, len(ensemble)) - 1]
            coords = member.get('coords')
            if coords is not None and res_idx < len(coords):
                xyz = coords[res_idx]
        x, y, z = xyz

        for axis, value in (('x', x), ('y', y), ('z', z)):
            new_row[f'{axis}_{model_idx}'] = round(float(value), 3)

    submission_rows.append(new_row)

submission = pd.DataFrame(submission_rows)
# Column order: ID, resname, resid, then x/y/z for models 1..5.
cols = ['ID', 'resname', 'resid']
for i in range(1, 6):
    cols += [f'{c}_{i}' for c in ['x', 'y', 'z']]
submission = submission[cols]

print(f"Submission shape: {submission.shape}")

In [None]:
# Validation
# The submission must have exactly the template's rows and no missing values.
rows_match = len(submission) == len(sample_sub)
assert rows_match, "Row count mismatch!"
has_missing = submission.isnull().any().any()
assert not has_missing, "Contains NaN!"

# Share of rows whose first model has a non-zero x coordinate (zero is the
# placeholder written when no prediction was available).
non_zero = (submission['x_1'] != 0).sum()
print(f"Non-zero predictions: {non_zero}/{len(submission)} ({100*non_zero/len(submission):.1f}%)")

# Save
submission.to_csv('submission.csv', index=False)
print("\nSaved submission.csv")
preview = submission.head()
print(preview)

## Done!

Download `submission.csv` and submit to the competition.