## Step 1: Write YAML

## Step 2: RFdiffusion

In [22]:
import yaml

# Read the experiment definitions and print one sbatch submission line per experiment.
with open('experiments-new.yaml', 'r') as handle:
    experiment_definitions = yaml.safe_load(handle)

# Each definition is expanded into the template by keyword,
# so it must provide the `codename`, `hotspots` and `contig` keys.
command_template = (
    "export EXPERIMENT={codename}4B; "
    "export HOTSPOTS='{hotspots}';"
    "export CONTIGMAP='{contig}'; "
    "sbatch rf.slurm.sh"
)

print('export NUMDESIGNS=1000;')
print('cd $HOME2/crysalin-redux')
for entry in experiment_definitions.values():
    print(command_template.format(**entry))

export NUMDESIGNS=1000;
cd $HOME2/crysalin-redux
export EXPERIMENT=bed4B; export HOTSPOTS='[B37,B39,B41,B43,B70,B71]';export CONTIGMAP='[A5-9/14-32/A24-331/0 B1-651/0]'; sbatch rf.slurm.sh
export EXPERIMENT=bedbig4B; export HOTSPOTS='[B37,B39,B41,B43,B70,B71]';export CONTIGMAP='[A5-9/28-48/A24-331/0 B1-651/0]'; sbatch rf.slurm.sh
export EXPERIMENT=mid4B; export HOTSPOTS='[B43,B68,B70,B75,B147,B170,B173,B174]';export CONTIGMAP='[A5-40/3-20/A44-331/0 B1-651/0]'; sbatch rf.slurm.sh
export EXPERIMENT=side4B; export HOTSPOTS='[B124,B125,B126,B127,B171,B173,B174,B175,B176,B177,B178,B178,B182,B207,B209]';export CONTIGMAP='[A5-63/6-20/A67-331/0 B1-651/0]'; sbatch rf.slurm.sh
export EXPERIMENT=posttag4B; export HOTSPOTS='[A135,A139,A148,A149,A151,A158,A194,A258,B37,B39,B507,B564,B568]';export CONTIGMAP='[A5-160/2-20/A167-331/0 B1-651/0]'; sbatch rf.slurm.sh
export EXPERIMENT=pocketfill4B; export HOTSPOTS='[A205,A325,B568,B572,B576,B579,B587,B588]';export CONTIGMAP='[A5-25/5-10/A32-51/5-10/A57-8

## Step 3: ProteinMPNN

In [21]:
import json, os
from pathlib import Path
import pandas as pd
from IPython.display import display
import json

# `data` collects one summary dict per experiment folder;
# `ongoing` lists the experiment names that the summary loop checks folder names against.
data, ongoing = [], []

def get_logged(name):
    """
    Count the distinct ``target_name`` values recorded in ``output/<name>/log.jsonl``.

    The log is read line by line. NUL characters and surrounding whitespace are
    removed from each line; blank lines and lines that are not valid JSON are skipped.

    :param name: name of the experiment folder inside ``output``
    :return: number of unique ``target_name`` values, or -1 when the log file
        does not exist or contains no parsable line
    """
    log_path = Path('output') / name / 'log.jsonl'
    if not log_path.exists():
        return -1
    records = []
    # context manager so the handle is closed even if reading fails midway
    with log_path.open('r') as handle:
        for line in handle:
            cleaned = line.replace('\x00', '').strip()
            if not cleaned:
                continue
            try:
                records.append(json.loads(cleaned))
            except json.JSONDecodeError:
                # best-effort parsing: an unparsable line is ignored, not fatal
                continue
    if len(records) == 0:
        return -1
    return len(pd.DataFrame(records).target_name.unique())

def _stems(folder, pattern):
    """Return the stems of the files in ``folder`` matching ``pattern``; empty list if ``folder`` is absent."""
    return [p.stem for p in folder.glob(pattern)] if folder.exists() else []


# Build one summary row per experiment folder found in ``output``.
for path in Path('output').glob('*'):
    if not path.is_dir() or path.name == '.ipynb_checkpoints':
        continue
    # ``path.name`` (not ``path.stem``) so that a folder name containing a dot is not truncated
    is_ongoing = path.name in ongoing
    fixed_chains_path = path / 'fixed_chains.json'
    if fixed_chains_path.exists():
        # context manager so the JSON handle is closed straight away
        with fixed_chains_path.open('r') as handle:
            n_expected = len(json.load(handle))
    else:
        n_expected = -1
    pdb_names = _stems(path, '*.pdb')
    seqs_paths = _stems(path / 'seqs', '*.fa')
    unrelaxed_paths = _stems(path / 'unrelaxed_pdbs', '*.pdb.gz')
    relaxed_paths = _stems(path / 'relaxed_pdbs', '*.pdb.gz')
    tuned_paths = _stems(path / 'tuned_pdbs', '*.pdb.gz')
    summary_paths = _stems(path / 'summary', '*.pkl')
    n_analysed = get_logged(path.name)
    # NOTE(review): the factors 6 and 5 presumably are the number of records per .fa file
    # and the number of designed sequences among them; confirm against the ProteinMPNN settings.
    data.append({'name': path.name,
                 'n_pdbs': len(pdb_names),
                 'n_json': n_expected,
                 'n_seqs': len(seqs_paths) * 6,
                 'n_seqs_missing': max(0, (len(pdb_names) - len(seqs_paths)) * 6),
                 'n_unrelaxed': len(unrelaxed_paths),
                 'n_relaxed': len(relaxed_paths),
                 'n_tuned': len(tuned_paths),
                 'n_analysed': n_analysed,
                 # get_logged returns -1 when there is no usable log: treat that as zero
                 # analysed here so the sentinel does not inflate the missing count by one
                 'n_thread_missing': max(0, len(seqs_paths) * 5 - max(n_analysed, 0)),
                 'n_summary': len(summary_paths),
                 'n_summary_missing': max(0, len(tuned_paths) - len(summary_paths)),
                 })
df = pd.DataFrame(data).sort_values('n_summary', ascending=False)
df['is_four'] = df.name.str.contains('4')
display(df)

Unnamed: 0,name,n_pdbs,n_json,n_seqs,n_seqs_missing,n_unrelaxed,n_relaxed,n_tuned,n_analysed,n_thread_missing,n_summary,n_summary_missing,is_four
1,side,615,615,1518,2172,1236,1236,1236,1512,0,1236,0,False
28,bedbig4B,1000,440,4026,1974,843,835,819,1373,1982,794,25,True
16,side4,699,531,1776,2418,792,792,792,1584,0,792,0,True
24,side4B,699,60,2886,1308,804,798,781,1033,1372,755,26,True
23,bed4B,701,60,2910,1296,766,758,747,1648,777,724,23,True
27,posttag4B,704,60,2832,1392,515,508,499,2574,0,483,16,True
12,inner3,684,684,1608,2496,462,462,462,1602,0,462,0,False
26,pocketfill4B,695,60,2760,1410,469,468,461,2681,0,441,20,True
2,inner,613,613,3594,84,378,372,365,1782,1213,351,14,False
18,posttag4,703,534,1788,2430,228,228,228,1482,8,228,0,True


In [None]:
def assign_supergroup(name):
    """
    Map an experiment name to its supergroup.

    The supergroup is the first of ``bed``, ``pocket``, ``mid``, ``inner``,
    ``posttag``, ``side``, ``ref`` (in that order) that occurs as a substring of ``name``.

    :param name: experiment name
    :return: the matching supergroup label, or ``'error'`` when none matches
    """
    candidates = 'bed pocket mid inner posttag side ref'.split()
    return next((label for label in candidates if label in name), 'error')

# Label every experiment with its supergroup (adds a column to `df` in place).
df['supergroup'] = df['name'].map(assign_supergroup)

In [None]:
import plotly.express as px

# One bar chart per count column, grouped by supergroup.
for column, title in (('n_analysed', 'N designs'), ('n_summary', 'N valid designs')):
    px.bar(df, x='supergroup', y=column, title=title, template='plotly_white').show()

In [None]:
# Print the sbatch lines that resubmit ProteinMPNN, smallest backlog first.
# NOTE(review): the selection uses `n_thread_missing`, the same column as the threading step;
# `n_seqs_missing` may have been intended for this step. Confirm before relying on it.
print('export WORKPATH_ROOT=/opt/xchem-fragalysis-2/mferla/crysalin-redux/output')
needs_mpnn = df.loc[df['n_thread_missing'] > 500].sort_values('n_thread_missing')
for name in needs_mpnn['name']:
    print(f'export WORKPATH=$WORKPATH_ROOT/{name}; sbatch /opt/xchem-fragalysis-2/mferla/crysalin-redux/proteinMPNN.slurm.sh;')

## Step 4: Thread

In [None]:
# Print the sbatch lines that resubmit threading/tuning for experiments which already have
# sequences but still lack more than 500 threaded designs, smallest backlog first.
print('export WORKPATH_ROOT=/opt/xchem-fragalysis-2/mferla/crysalin-redux/output')
has_sequences = df['n_seqs'] >= 1
large_backlog = df['n_thread_missing'] > 500
needs_threading = df.loc[has_sequences & large_backlog].sort_values('n_thread_missing')
for name in needs_threading['name']:
    print(f'export EXPERIMENT={name}; sbatch /opt/xchem-fragalysis-2/mferla/crysalin-redux/tune.slurm.sh;')

In [None]:
# Print the sbatch lines that rescore experiments with more than 50 tuned models
# lacking a summary, smallest backlog first.
needs_rescore = df.loc[df['n_summary_missing'] > 50].sort_values('n_summary_missing')
for name in needs_rescore['name']:
    print(f'export EXPERIMENT={name}; sbatch /opt/xchem-fragalysis-2/mferla/crysalin-redux/rescore.slurm.sh;')