In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors
import glob
import pandas as pd
# Find RMSD: glob pattern (note the trailing "/*") matching every entry directly
# under the fragment dataset directory.
# NOTE(review): the pattern is not expanded in this cell -- presumably it is passed
# to glob.glob() further down to enumerate the inputs for the RMSD search; confirm.
fragment_path = "/home/pc/Documents/combind_fragment/combind_fragment/fragment_dataset_add_bond_orders/*"

In [None]:
import os

def compute_cross_folder(stats_root, fragment_folder, poses_folder, protein_name_position, feature_names, rmsd_cutoff):
    """
    Compute density statistics for poses in folder B with respect to
    near-native fragments in folder A.

    For every requested feature, two density estimates are fitted over the
    pre-computed cross-folder feature table and written to
    ``<stats_root>/<protein>/native_<feature>.de`` and
    ``<stats_root>/<protein>/reference_<feature>.de``. A density file that
    already exists is left untouched.

    Args:
        stats_root: Directory under which the per-protein output directory
            is created.
        fragment_folder: Folder A; its ``features`` sub-folder is loaded to
            decide which fragments are near-native.
        poses_folder: Folder B; must contain a
            ``features_cross_<basename of fragment_folder>`` entry holding
            the cross-folder features.
        protein_name_position: Index into ``poses_folder.split('/')`` that
            selects the protein name.
        feature_names: Comma-separated feature names (e.g. ``"mcss,hbond"``)
            or an iterable of names. Surrounding whitespace is ignored.
        rmsd_cutoff: A fragment or pose is treated as near-native when its
            RMSD is less than or equal to this value.

    Raises:
        NotImplementedError: If the cross-folder feature entry does not exist.

    NOTE(review): ``Features`` and ``DensityEstimate`` are defined outside
    this block. The code assumes ``Features.raw`` maps column names
    (``name1``, ``rmsd1``, ``rmsd2``, feature names) to equally long
    array-likes with one entry per pair -- confirm against their definitions.
    """
    # Imported locally so the function works even when the surrounding
    # notebook cell has not imported numpy.
    import numpy as np

    if isinstance(feature_names, str):
        feature_names = feature_names.split(',')
    # Strip whitespace so "mcss, hbond" selects the same columns as "mcss,hbond".
    feature_names = [name.strip() for name in feature_names if name.strip()]

    # Extract protein name
    protein = poses_folder.split('/')[protein_name_position]
    output_dir = os.path.join(stats_root, protein)
    os.makedirs(output_dir, exist_ok=True)
    print(f'Writing output to {output_dir}')

    # Only folder A's own features are needed here: near-nativeness of the
    # folder B poses is read from the cross-folder table ('rmsd2') below.
    features_A = Features(fragment_folder + "/features")
    features_A.load_features()

    # Identify near-native fragments in folder A. The names are de-duplicated
    # so the reported count refers to distinct fragments even if a name
    # appears on several rows.
    native_fragments_A = np.asarray(features_A.raw['rmsd1']) <= rmsd_cutoff
    native_fragment_names = np.unique(
        np.asarray(features_A.raw['name1'])[native_fragments_A])

    print(f"Found {len(native_fragment_names)} near-native fragments in folder A")

    # Cross-folder features must have been computed beforehand. Trailing
    # slashes are stripped first: os.path.basename('a/b/') is '' and would
    # otherwise yield the wrong path 'features_cross_'.
    fragment_label = os.path.basename(fragment_folder.rstrip('/'))
    cross_features_path = f"{poses_folder}/features_cross_{fragment_label}"
    if not os.path.exists(cross_features_path):
        raise NotImplementedError("Cross-folder feature computation not implemented. "
                                  "Please run 'combind featurize' with both folders first.")
    cross_features = Features(cross_features_path)
    cross_features.load_features()

    # Keep only pairs whose first member (name1) is a near-native fragment
    # from folder A. np.isin always returns a boolean mask (also for empty
    # input) and avoids a per-row linear scan over the fragment names.
    # Note: no additional "same molecule" filtering is applied.
    is_cross_pair = np.isin(np.asarray(cross_features.raw['name1']),
                            native_fragment_names)

    # Identify which poses from B are near-native
    native_poses_B = np.asarray(cross_features.raw['rmsd2']) <= rmsd_cutoff

    for feature in feature_names:
        if feature not in cross_features.raw:
            print(f'Skipping {feature}: not present in cross-folder features')
            continue

        print(f'Density estimate for {feature}')

        values = cross_features.raw[feature]

        # Mask off infinite values
        not_infinite = (values != float('inf'))

        # Native: fragment from A is near-native AND pose from B is near-native
        nat_vals = values[is_cross_pair & native_poses_B & not_infinite]

        # Reference: all valid cross-folder pairs
        ref_vals = values[is_cross_pair & not_infinite]

        # 'mcss' is estimated on the domain (0, 6), every other feature on
        # (0, 1); the bandwidth is 3% of the domain width in both cases.
        if feature == 'mcss':
            sd = 0.03 * 6
            domain = (0, 6)
        else:
            sd = 0.03
            domain = (0, 1)

        _write_density_if_missing(
            os.path.join(output_dir, f'native_{feature}.de'),
            'native', nat_vals, domain, sd)
        _write_density_if_missing(
            os.path.join(output_dir, f'reference_{feature}.de'),
            'reference', ref_vals, domain, sd)


def _write_density_if_missing(path, label, values, domain, sd):
    """Fit a DensityEstimate to `values` and write it to `path`, unless `path` already exists."""
    if os.path.exists(path):
        return
    density = DensityEstimate(domain=domain, sd=sd).fit(values)
    print(f'Writing {label} density to {path} ({len(values)} values)')
    density.write(path)