In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from omegaconf import OmegaConf
from models.flow_module import FlowModule
import torch
from data.pdb_dataloader import PdbDataModule
import glob
import GPUtil
from data import utils as du
import numpy as np
import tree
from data import so3_utils
from data import all_atom
from analysis import utils as au
from openfold.utils.superimposition import superimpose
import matplotlib.pyplot as plt
import pandas as pd
import shutil
from openfold.utils.superimposition import superimpose


In [3]:
def calc_novelty(novelty_path):
    """Summarise a tab-separated Foldseek hit table into one score per sample.

    Every non-blank line of ``novelty_path`` must contain exactly four
    tab-separated fields. The first field is used as the sample identifier and
    the fourth is parsed as a float score; the two middle fields are ignored.

    NOTE(review): the foldseek commands recorded in this notebook request
    ``--format-output 'query,target,alntmscore,lddt'``. If the file read here
    has that layout, the fourth field is the lddt column rather than
    alntmscore -- confirm which column is intended.

    Args:
        novelty_path: Path to the tab-separated results file.

    Returns:
        DataFrame with columns ``sample`` and ``alntm``: one row per distinct
        sample, holding the maximum score over all of that sample's lines.

    Raises:
        ValueError: If a non-blank line does not have exactly four fields or
            its fourth field cannot be parsed as a float.
    """
    samples = []
    scores = []
    # The handle and the per-line sample field get distinct names; previously
    # the loop rebound the name of the handle it was iterating over.
    with open(novelty_path) as results_file:
        for line in results_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a stray newline at end of file).
                continue
            sample, _, _, tm_score = line.split('\t')
            score = float(tm_score)
            samples.append(sample)
            scores.append(score)
    foldseek_df = pd.DataFrame({'sample': samples, 'alntm': scores})
    novelty_summary = foldseek_df.groupby('sample').agg({'alntm': 'max'}).reset_index()
    return novelty_summary


# FrameFlow analysis

In [81]:
# 100 timestep
# Load every `top_samples.csv` found one directory below results_dir.
results_dir = '/data/rsg/chemistry/jyim/projects/flow-matching/inference_outputs/large_model_2023-09-30_15-47-47_epoch=244-step=241325/ts_100/frame_mpnn/ode'
all_csvs = glob.glob(os.path.join(results_dir, '*/top_samples.csv'))
all_results_csv = [pd.read_csv(csv_path) for csv_path in all_csvs]
results_df = pd.concat(all_results_csv)

# A row counts as designable when its `rmsd` value is below 2.0.
is_designable = results_df.rmsd < 2.0
designable_percent = is_designable.mean()
designable_df = results_df[is_designable]
print(f'Designable percent: {designable_percent:.2f}. Designable number: {designable_df.shape[0]}')

# Write samples to a directory for diversity calculation.
# Each designable PDB is copied to write_dir and its new path is appended,
# one per line, to filename.txt.
write_dir = '/data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_100'
os.makedirs(write_dir, exist_ok=True)
pdb_list_path = os.path.join(write_dir, 'filename.txt')
with open(pdb_list_path, 'w') as list_file:
    for _, sample in designable_df.iterrows():
        src_path = sample.esmf_pdb_path.replace('./', '../')
        dest_path = os.path.join(write_dir, f'len_{sample.length}_id_{sample.sample_id}.pdb')
        list_file.write(dest_path + '\n')
        shutil.copy(src_path, dest_path)

Designable percent: 0.42. Designable number: 288


./maxcluster64bit -l \
/data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_100/filename.txt \
-C 2 -in -Rl /data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_100/all_by_all_lite \
-TM -Tm 0.5 > /data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_100//maxcluster_results.txt

In [85]:
# NOTE(review): both numbers are hand-entered literals. Presumably this is
# <cluster count from the maxcluster output> / <total number of samples> --
# confirm and record where each value comes from.
132 / 690

0.19130434782608696

In [None]:
foldseek easy-search /data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_100 \
/Mounts/rbg-storage1/users/jyim/programs/foldseek/pdb aln_noise_01_seqs_100_esmf.m8 tmpFolder \
--alignment-type 1 --format-output 'query,target,alntmscore,lddt' --tmscore-threshold 0.0 --exhaustive-search --max-seqs 10000000000

In [5]:
# Mean, over samples, of the per-sample maximum score that calc_novelty
# extracts from the results file.
foldseek_results_path = '/data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_100/aln_noise_01_seqs_100_esmf.m8'
novelty_df = calc_novelty(foldseek_results_path)
novelty_df['alntm'].mean()

0.6541659649122806

In [76]:
# 500 timestep
# Load every `top_samples.csv` found one directory below results_dir.
results_dir = '/data/rsg/chemistry/jyim/projects/flow-matching/inference_outputs/large_model_2023-09-30_15-47-47_epoch=244-step=241325/ts_500/frame_mpnn/ode'
all_csvs = glob.glob(os.path.join(results_dir, '*/top_samples.csv'))
all_results_csv = [pd.read_csv(csv_path) for csv_path in all_csvs]
results_df = pd.concat(all_results_csv)

# A row counts as designable when its `rmsd` value is below 2.0.
is_designable = results_df.rmsd < 2.0
designable_percent = is_designable.mean()
designable_df = results_df[is_designable]
print(f'Designable percent: {designable_percent:.2f}. Designable number: {designable_df.shape[0]}')

# Write samples to a directory for diversity calculation.
# Each designable PDB is copied to write_dir and its new path is appended,
# one per line, to filename.txt.
write_dir = '/data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_500'
os.makedirs(write_dir, exist_ok=True)
pdb_list_path = os.path.join(write_dir, 'filename.txt')
with open(pdb_list_path, 'w') as list_file:
    for _, sample in designable_df.iterrows():
        src_path = sample.esmf_pdb_path.replace('./', '../')
        dest_path = os.path.join(write_dir, f'len_{sample.length}_id_{sample.sample_id}.pdb')
        list_file.write(dest_path + '\n')
        shutil.copy(src_path, dest_path)

Designable percent: 0.49. Designable number: 338


./maxcluster64bit -l \
/data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_500/filename.txt \
-C 2 -in -Rl /data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_500//all_by_all_lite \
-TM -Tm 0.5 > /data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_500//maxcluster_results.txt

In [86]:
# NOTE(review): both numbers are hand-entered literals. Presumably this is
# <cluster count from the maxcluster output> / <total number of samples> --
# confirm and record where each value comes from.
133 / 690

0.1927536231884058

foldseek easy-search /data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_500 \
/Mounts/rbg-storage1/users/jyim/programs/foldseek/pdb aln_noise_01_seqs_100_esmf.m8 tmpFolder \
--alignment-type 1 --format-output 'query,target,alntmscore,lddt' --tmscore-threshold 0.0 --exhaustive-search --max-seqs 10000000000

In [6]:
# Mean, over samples, of the per-sample maximum score that calc_novelty
# extracts from the results file.
# NOTE(review): the output recorded for this cell is identical, to every
# digit, to the output recorded for the ts_100 results file -- confirm the
# ts_500 .m8 file was regenerated and is not a copy.
foldseek_results_path = '/data/rsg/chemistry/jyim/third_party/frameflow_samples/ts_500/aln_noise_01_seqs_100_esmf.m8'
novelty_df = calc_novelty(foldseek_results_path)
novelty_df['alntm'].mean()

0.6541659649122806

# FrameDiff analysis

In [87]:
def read_samples(results_dir):
    """Collect every ``sc_results.csv`` under ``results_dir`` into one DataFrame.

    The layout walked is
    ``<results_dir>/<length_dir>/<sample_dir>/self_consistency/sc_results.csv``.
    Entries whose name contains a ``.`` are skipped at both directory levels,
    and sample directories without the CSV are ignored. For each table read,
    the integer after the first underscore of ``<length_dir>`` is stored in a
    ``length`` column and the position of ``<sample_dir>`` in the enumeration
    of its parent's listing is stored in a ``sample_id`` column.

    NOTE(review): ``sample_id`` follows ``os.listdir`` order (which Python
    documents as arbitrary) and also counts skipped entries, so ids can differ
    between file systems -- confirm this is acceptable.

    Args:
        results_dir: Root directory holding one sub-directory per length.

    Returns:
        The ``pd.concat`` of all per-sample tables.
    """
    print(f'Reading samples from {results_dir}')
    sample_tables = []
    for length_name in os.listdir(results_dir):
        if '.' not in length_name:
            length_dir = os.path.join(results_dir, length_name)
            length = int(length_name.split('_')[1])
            for sample_idx, sample_name in enumerate(os.listdir(length_dir)):
                if '.' in sample_name:
                    continue
                csv_path = os.path.join(length_dir, sample_name, 'self_consistency', 'sc_results.csv')
                if not os.path.exists(csv_path):
                    continue
                table = pd.read_csv(csv_path, index_col=0)
                sample_tables.append(table.assign(length=length, sample_id=sample_idx))
    return pd.concat(sample_tables)


def sc_filter(raw_df, metric):
    """Keep the best row per ``(length, sample_id)`` and flag designability.

    Rows are ordered best-first by ``metric`` (highest ``tm_score`` or lowest
    ``rmsd``), a boolean ``designable`` column is added (``tm_score > 0.5`` or
    ``rmsd < 2.0``), and the first entry of each ``(length, sample_id)`` group
    is kept. The mean of ``designable`` over the kept rows is printed.

    Note: ``GroupBy.first`` takes the first non-NA value of each column, so a
    kept row can combine values from several input rows when NAs are present.

    Args:
        raw_df: DataFrame with ``length``, ``sample_id`` and ``metric`` columns.
        metric: Either ``'tm_score'`` or ``'rmsd'``.

    Returns:
        DataFrame with one row per ``(length, sample_id)``; ``raw_df`` itself
        is not modified.

    Raises:
        ValueError: If ``metric`` is neither ``'tm_score'`` nor ``'rmsd'``.
    """
    # Pick the sort direction and the designability rule for the metric.
    if metric == 'tm_score':
        ascending, is_designable = False, (lambda score: score > 0.5)
    elif metric == 'rmsd':
        ascending, is_designable = True, (lambda score: score < 2.0)
    else:
        raise ValueError(f'Unknown metric {metric}')
    ranked = raw_df.sort_values(metric, ascending=ascending)
    ranked['designable'] = ranked[metric].map(is_designable)
    best = ranked.groupby(['length', 'sample_id']).first().reset_index()
    percent_designable = best['designable'].mean()
    print(f'Percent designable: {percent_designable}')
    return best

In [89]:
# 500 timesteps
results_dir = '/Mounts/rbg-storage1/users/jyim/se3_diffusion_scope_results/scope_noise_scale_10'
# Keep rows whose sample_id (the enumeration index assigned by read_samples)
# is below 8.
# NOTE(review): the original comment said this ensures only 8 sequences per
# backbone are considered -- confirm that matches what sample_id indexes.
samples_df = read_samples(results_dir)
samples_df = samples_df.loc[lambda frame: frame.sample_id < 8]

# Best row per (length, sample_id) by rmsd, then the designable subset.
scrmsd_results = sc_filter(samples_df, 'rmsd')
designable_scrmsd = scrmsd_results[scrmsd_results['designable']]

# Write samples to a directory for diversity calculation.
# Each designable PDB is copied to write_dir and its new path is appended,
# one per line, to filename.txt.
write_dir = '/data/rsg/chemistry/jyim/third_party/framediff_samples/ts_500'
os.makedirs(write_dir, exist_ok=True)
pdb_list_path = os.path.join(write_dir, 'filename.txt')
with open(pdb_list_path, 'w') as list_file:
    for _, sample in designable_scrmsd.iterrows():
        # Rewrite the recorded relative prefix to the mounted results location.
        src_path = sample.sample_path.replace('./inference_outputs/scope_noise_scale_10', '/Mounts/rbg-storage1/users/jyim/se3_diffusion_scope_results/scope_noise_scale_10')
        dest_path = os.path.join(write_dir, f'len_{sample.length}_id_{sample.sample_id}.pdb')
        list_file.write(dest_path + '\n')
        shutil.copy(src_path, dest_path)


Reading samples from /Mounts/rbg-storage1/users/jyim/se3_diffusion_scope_results/scope_noise_scale_10
Percent designable: 0.4221014492753623


./maxcluster64bit -l \
/data/rsg/chemistry/jyim/third_party/framediff_samples/ts_500/filename.txt \
-C 2 -in -Rl /data/rsg/chemistry/jyim/third_party/framediff_samples/ts_500//all_by_all_lite \
-TM -Tm 0.5 > /data/rsg/chemistry/jyim/third_party/framediff_samples/ts_500//maxcluster_results.txt

In [93]:
# The denominator is the number of rows currently in scrmsd_results.
# NOTE(review): 84 is a hand-entered literal, presumably the cluster count
# read from the maxcluster output -- confirm and record its source.
84 / scrmsd_results.shape[0]

0.15217391304347827

foldseek easy-search /data/rsg/chemistry/jyim/third_party/framediff_samples/ts_500 \
/Mounts/rbg-storage1/users/jyim/programs/foldseek/pdb aln_noise_01_seqs_100_esmf.m8 tmpFolder \
--alignment-type 1 --format-output 'query,target,alntmscore,lddt' --tmscore-threshold 0.0 --exhaustive-search --max-seqs 10000000000

In [7]:
# Mean, over samples, of the per-sample maximum score that calc_novelty
# extracts from the results file.
foldseek_results_path = '/data/rsg/chemistry/jyim/third_party/framediff_samples/ts_500/aln_noise_01_seqs_100_esmf.m8'
novelty_df = calc_novelty(foldseek_results_path)
novelty_df['alntm'].mean()

0.6625827893175075

In [91]:
# 100 timesteps
results_dir = '/Mounts/rbg-storage1/users/jyim/se3_diffusion_scope_results/scope_noise_scale_10_ts_100'
# Keep rows whose sample_id (the enumeration index assigned by read_samples)
# is below 8.
# NOTE(review): the original comment said this ensures only 8 sequences per
# backbone are considered -- confirm that matches what sample_id indexes.
samples_df = read_samples(results_dir)
samples_df = samples_df.loc[lambda frame: frame.sample_id < 8]

# Best row per (length, sample_id) by rmsd, then the designable subset.
scrmsd_results = sc_filter(samples_df, 'rmsd')
designable_scrmsd = scrmsd_results[scrmsd_results['designable']]

# Write samples to a directory for diversity calculation.
# Each designable PDB is copied to write_dir and its new path is appended,
# one per line, to filename.txt.
write_dir = '/data/rsg/chemistry/jyim/third_party/framediff_samples/ts_100'
os.makedirs(write_dir, exist_ok=True)
pdb_list_path = os.path.join(write_dir, 'filename.txt')
with open(pdb_list_path, 'w') as list_file:
    for _, sample in designable_scrmsd.iterrows():
        # Rewrite the recorded relative prefix to the mounted results location.
        src_path = sample.sample_path.replace('./inference_outputs/scope_noise_scale_10_ts_100', '/Mounts/rbg-storage1/users/jyim/se3_diffusion_scope_results/scope_noise_scale_10_ts_100')
        dest_path = os.path.join(write_dir, f'len_{sample.length}_id_{sample.sample_id}.pdb')
        list_file.write(dest_path + '\n')
        shutil.copy(src_path, dest_path)


Reading samples from /Mounts/rbg-storage1/users/jyim/se3_diffusion_scope_results/scope_noise_scale_10_ts_100
Percent designable: 0.3894927536231884


./maxcluster64bit -l \
/data/rsg/chemistry/jyim/third_party/framediff_samples/ts_100/filename.txt \
-C 2 -in -Rl /data/rsg/chemistry/jyim/third_party/framediff_samples/ts_100//all_by_all_lite \
-TM -Tm 0.5 > /data/rsg/chemistry/jyim/third_party/framediff_samples/ts_100//maxcluster_results.txt

In [94]:
# The denominator is the number of rows currently in scrmsd_results.
# NOTE(review): 68 is a hand-entered literal, presumably the cluster count
# read from the maxcluster output -- confirm and record its source.
68 / scrmsd_results.shape[0]

0.12318840579710146

foldseek easy-search /data/rsg/chemistry/jyim/third_party/framediff_samples/ts_100 \
/Mounts/rbg-storage1/users/jyim/programs/foldseek/pdb aln_noise_01_seqs_100_esmf.m8 tmpFolder \
--alignment-type 1 --format-output 'query,target,alntmscore,lddt' --tmscore-threshold 0.0 --exhaustive-search --max-seqs 10000000000

In [8]:
# Mean, over samples, of the per-sample maximum score that calc_novelty
# extracts from the results file.
foldseek_results_path = '/data/rsg/chemistry/jyim/third_party/framediff_samples/ts_100/aln_noise_01_seqs_100_esmf.m8'
novelty_df = calc_novelty(foldseek_results_path)
novelty_df['alntm'].mean()

0.6652191588785048

# GENIE results

In [4]:
def extract_ca(path):
    """Return the ``'bb_positions'`` entry parsed from the PDB file at ``path``.

    The file is parsed with ``du.parse_pdb_feats`` using ``'a'`` as its first
    argument.

    NOTE(review): the function name suggests C-alpha coordinates; presumably
    ``'bb_positions'`` holds one position per residue -- confirm against
    ``du.parse_pdb_feats``.
    """
    return du.parse_pdb_feats('a', path)['bb_positions']

In [5]:
# Locations of the Genie evaluation outputs used below.
genie_samples_dir = '/Mounts/rbg-storage1/users/jyim/genie_results/eval_results/eval_results/ts_750'
# Directory searched for the `<design>-resample_<k>.pdb` files.
structures_dir = os.path.join(genie_samples_dir, 'structures')
# Glob pattern (not a directory) matching the design PDB files.
designs_dir = os.path.join(genie_samples_dir, 'pdbs/*')

In [6]:
# For every Genie design of length >= 60, record the lowest RMSD between the
# design and any of its 8 resampled structures, then report the fraction of
# designs whose lowest RMSD is below 2.0.
designs_to_score = {
    'sample_id': [],
    'length': [],
    'min_rmsd': [],
    'pdb_path': []
}
all_samples = glob.glob(designs_dir)

for i,design_path in enumerate(all_samples):
    design_fname = os.path.basename(design_path).replace('.pdb', '')
    # File names are parsed as `<length>_<sample_id>`.
    length, sample_id = [int(x) for x in design_fname.split('_')]
    if length < 60:
        continue
    design_pdb = extract_ca(design_path)
    # Mask of ones with one entry per row of design_pdb (no position is masked).
    res_mask = np.ones_like(design_pdb)[:, 0]
    min_rmsd = 1000
    if i % 50 == 0:
        print(f'On {i}/{len(all_samples)}')
    # The inner index has its own name so the outer enumerate counter `i` is
    # no longer overwritten inside the loop body.
    for resample_idx in range(8):
        resample_path = os.path.join(structures_dir, f'{design_fname}-resample_{resample_idx}.pdb')
        resample_pdb = extract_ca(resample_path)
        _, rmsd = superimpose(
            torch.tensor(resample_pdb[None]),
            torch.tensor(design_pdb[None]),
            torch.tensor(res_mask[None])
        )
        # Store a plain Python float instead of whatever superimpose returns
        # (presumably a one-element tensor for this batch of one -- confirm),
        # so the DataFrame column below is numeric.
        rmsd = float(rmsd)
        if rmsd < min_rmsd:
            min_rmsd = rmsd
    designs_to_score['sample_id'].append(sample_id)
    designs_to_score['length'].append(length)
    designs_to_score['min_rmsd'].append(min_rmsd)
    designs_to_score['pdb_path'].append(design_path)

results_df = pd.DataFrame(designs_to_score)
designability = (results_df.min_rmsd < 2.0).mean()
print(f'Designability: {designability}')

On 0/790
On 50/790
On 100/790
On 150/790
On 200/790
On 250/790
On 300/790
On 350/790
On 400/790
On 450/790
On 500/790
On 550/790
On 600/790
On 650/790
On 700/790
Designability: 0.11014492753623188


In [8]:
# Copy every design with min_rmsd < 2.0 into genie_save_dir and record each
# copied path, one per line, in filename.txt.
genie_save_dir = '/data/rsg/chemistry/jyim/third_party/genie_samples/ts_750'
os.makedirs(genie_save_dir, exist_ok=True)
pdb_list_path = os.path.join(genie_save_dir, 'filename.txt')
designable_df = results_df[results_df.min_rmsd < 2.0]
with open(pdb_list_path, 'w') as list_file:
    for src_path in designable_df['pdb_path']:
        dest_path = os.path.join(genie_save_dir, os.path.basename(src_path))
        shutil.copy(src_path, dest_path)
        list_file.write(dest_path + '\n')

foldseek easy-search /data/rsg/chemistry/jyim/third_party/genie_samples/ts_750 \
/Mounts/rbg-storage1/users/jyim/programs/foldseek/pdb aln_noise_01_seqs_100_esmf.m8 tmpFolder \
--alignment-type 1 --format-output 'query,target,alntmscore,lddt' --tmscore-threshold 0.0 --exhaustive-search --max-seqs 10000000000

In [14]:
# Number of rows in results_df whose min_rmsd is below 2.0.
num_designable = (results_df.min_rmsd < 2.0).sum()

In [15]:
# NOTE(review): 60 is a hand-entered literal, presumably the cluster count
# read from the maxcluster output -- confirm and record its source.
60 / num_designable

0.7894736842105263

In [None]:
./maxcluster64bit -l \
/data/rsg/chemistry/jyim/third_party/genie_samples/ts_750/filename.txt \
-C 2 -in -Rl /data/rsg/chemistry/jyim/third_party/genie_samples/ts_750/all_by_all_lite \
-TM -Tm 0.5 > /data/rsg/chemistry/jyim/third_party/genie_samples/ts_750/maxcluster_results.txt

In [10]:
# Mean, over samples, of the per-sample maximum score that calc_novelty
# extracts from the results file.
foldseek_results_path = '/data/rsg/chemistry/jyim/third_party/genie_samples/ts_750/aln_noise_01_seqs_100_esmf.m8'
novelty_df = calc_novelty(foldseek_results_path)
novelty_df['alntm'].mean()

0.5147700000000001