In [8]:
import operator
import os

import pandas as pd
import requests

def fetch_results(url):
    """
    Return the ``results`` rows of a Fragalysis API listing.

    :param url: URL of the (first page of the) listing
    :return: list with the row dictionaries of every page that was fetched
    :raises requests.HTTPError: if a page answers with an error status
    """
    rows = []
    while url:
        response = requests.get(url)
        # Fail on the HTTP error itself rather than on a confusing JSON/KeyError below.
        response.raise_for_status()
        payload = response.json()
        rows.extend(payload['results'])
        # NOTE(review): assumes a paginated listing exposes the following page under
        # ``next`` (missing or None on the last page) -- confirm against the API.
        url = payload.get('next')
    return rows


# Compound sets and targets as data frames, one row per API record.
csets = pd.DataFrame(fetch_results('https://fragalysis.diamond.ac.uk/api/compound-sets/'))
targets = pd.DataFrame(fetch_results('https://fragalysis.diamond.ac.uk/api/targets/'))

# Resolve each compound set's target id to the target title.
id2name = targets.set_index('id').title.to_dict()
csets['target_name'] = csets.target.map(id2name)

# Last path component of the submitted SDF location.
csets['sdf_name'] = csets.submitted_sdf.map(os.path.basename)

## Download

In [109]:
from rdkit import Chem
from pathlib import Path

def retrieve(sdf_url):
    """
    Download an SDF into the working directory unless a copy is already there.

    The local file is named after the last path component of ``sdf_url``.
    The request is made and checked *before* the file is created, so a failed
    download no longer leaves an empty file behind that a later call would
    mistake for a cached copy. Zero-byte files (such as those left by an earlier
    failed attempt) are treated as missing and fetched again.

    :param sdf_url: URL of the SDF file
    :return: name of the local file
    :raises requests.HTTPError: if the server answers with an error status
    """
    name = Path(sdf_url).parts[-1]
    target = Path(name)
    if target.exists() and target.stat().st_size > 0:
        return name
    response = requests.get(sdf_url)
    response.raise_for_status()
    # Only reached for a successful response: now it is safe to create the file.
    target.write_text(response.text)
    return name

# Collect the compounds of every compound set submitted for the target, each tagged
# with the dataset (file stem) it came from; the first record of a file is its header.
hits = []
dataset_info = {}
filenames: pd.Series = csets.loc[csets.target_name == 'D68EV3CPROA'].submitted_sdf.apply(retrieve)
filename: str
for filename in filenames.to_list():
    name = filename.split('.')[0]
    # Read the file in a single pass and split header/compounds by position, the same
    # way the conversion cell does, instead of mixing next() with a later loop over
    # the same supplier.
    with Chem.SDMolSupplier(filename) as sdfh:
        records = list(sdfh)
    if not records or records[0] is None:
        raise ValueError(f'{filename}: no readable header record')
    header = records[0]
    dataset_info[name] = header.GetPropsAsDict()
    for vc in records[1:]:
        # The original called SetProp on the ``hits`` list itself, which raises
        # AttributeError and never stores anything.
        # NOTE(review): tagging the molecule and appending it is the presumed intent.
        vc.SetProp('dataset', name)
        hits.append(vc)

HTTPError: 404 Client Error: Not Found for url: https://fragalysis.diamond.ac.uk/media/code/media/compound_sets/compound-set_fragnetv1_sujHlIc.sdf

## Convert

In [140]:
from typing import Dict
import yaml

# Fragalysis SDF header property -> Postera submission column, kept as YAML text so the
# pairs are easy to scan. Lines starting with "#" inside the literal are YAML comments.
fragalysis2postera_yaml = (
'''
ref_url: design_submission_source_link
submitter_email: designer_email
# made up
submitter_name: designer_name
submitter_institution: designer_institution
generation_date: design_date
method: design_notes
# not matched to Fragalysis header
# related_molecules
'''
)

# Author's notes on fields marked "made up", noted as being per hit rather than per header:
#   design_reference_hits: ref_mols
#   design_rationale: Computational follow-ups as part of Project 2.

fragalysis2postera: Dict[str, str] = yaml.load(fragalysis2postera_yaml, yaml.FullLoader)

In [141]:
# Placeholder input: a local SDF is used until the download step (which currently
# ends in an HTTP 404 error) is fixed.
filename = 'Fragmenstein-Tris.sdf'

from rdkit import Chem
from rdkit.Chem import AllChem

# Read the file once and split it by position: the first record is the header,
# everything after it is a compound. This avoids taking the header with next() and
# then relying on a second iteration of the same supplier starting over.
with Chem.SDMolSupplier(filename) as sdfh:
    records = list(sdfh)
if not records or records[0] is None:
    # Fail here with a clear message instead of a bare StopIteration (empty file)
    # or an AttributeError much later when the header properties are read.
    raise ValueError(f'{filename}: no readable header record')
header = records[0]
vcs = records[1:]
    
import re
# Rewrite each compound's ``ref_mols`` property as a comma-separated list of the
# codes matched by the pattern (an ``x``, word characters, ``_`` and two more characters).
hit_code_pattern = re.compile(r'(x\w+_\w\w)')
for vc in vcs:
    matched_codes = hit_code_pattern.findall(vc.GetProp('ref_mols'))
    ref_mols = ','.join(matched_codes)
    vc.SetProp('ref_mols', ref_mols)
        
#header_info = header.GetPropsAsDict()

def get_catalogue(name):
    """
    Extract the catalogue ID from a compound name.

    A name containing ``Z`` yields everything before its first hyphen; otherwise a
    name containing ``PV`` yields ``'PV-'`` followed by its second hyphen-separated
    field.

    :param name: compound name (the molecule's ``_Name`` property at the call site)
    :return: the catalogue ID
    :raises ValueError: if the name contains neither marker, or a ``PV`` name has
        no second hyphen-separated field
    """
    # NOTE(review): the ``Z`` test is a substring match and is checked first, so a PV
    # name that happens to contain a ``Z`` takes this branch -- confirm that is intended.
    if 'Z' in name:
        return name.split('-')[0]
    if 'PV' in name:
        fields = name.split('-')
        if len(fields) < 2:
            # Previously surfaced as an uninformative IndexError.
            raise ValueError(f'PV compound name without a catalogue number: {name!r}')
        return 'PV-' + fields[1]
    raise ValueError(f'Cannot determine the catalogue ID of compound name: {name!r}')


# One row per compound: catalogue ID, SMILES with all hydrogens removed, and the
# reference-hit codes stored in the compound's ``ref_mols`` property.
rows = []
for vc in vcs:
    rows.append({
        'Catalog ID': get_catalogue(vc.GetProp('_Name')),
        'original_SMILES': Chem.MolToSmiles(AllChem.RemoveAllHs(vc)),
        'design_reference_hits_XChem': vc.GetProp('ref_mols'),
    })
submissions = pd.DataFrame(rows)

# Copy each mapped header property into every row under its Postera column name.
for col_fragalysis in fragalysis2postera:
    col_post = fragalysis2postera[col_fragalysis]
    submissions[col_post] = header.GetProp(col_fragalysis)

In [142]:
import re
# Gather the reference hits from the local SDF files into a single list.
hits = []
for hit_file in ('D68EV3CPROA.filtered.sdf', 'D68_new.sdf', 'hits.sdf'):
    with Chem.SDMolSupplier(hit_file) as sdfh:
        hits += list(sdfh)

# Hit name -> SMILES with all hydrogens removed.
hitname2smiles = {}
for hit in hits:
    hitname2smiles[hit.GetProp('_Name')] = Chem.MolToSmiles(AllChem.RemoveAllHs(hit))

# Also register every hit under the part of its name before the first underscore.
short_names = {}
for full_name, smiles in hitname2smiles.items():
    short_names[full_name.split('_')[0]] = smiles
hitname2smiles.update(short_names)

def get_smiles(hit_name):
    """
    Look up the SMILES of a reference hit in ``hitname2smiles``.

    The full name is tried first, then the part before its first underscore.
    ``'*'`` is returned when neither is known. (The author notes this lookup
    would need to be customised.)
    """
    for candidate in (hit_name, hit_name.split('_')[0]):
        if candidate in hitname2smiles:
            return hitname2smiles[candidate]
    return '*'

In [144]:
# Constant columns shared by every submitted compound.
submissions['designer_institution'] = 'University of Oxford'
submissions['design_rationale'] = 'Computational follow-ups as part of Project 2.'

# Split the comma-separated hit codes of each row, then join the SMILES of those
# hits with '.' into a single string per row.
ref_hits = submissions['design_reference_hits_XChem'].apply(lambda codes: codes.split(','))
submissions['design_reference_hits_SMILES'] = [
    '.'.join(map(get_smiles, hit_names)) for hit_names in ref_hits
]

submissions.to_csv('test_set.csv')
submissions

Unnamed: 0,Catalog ID,original_SMILES,design_reference_hits_XChem,design_submission_source_link,designer_email,designer_name,designer_institution,design_date,design_notes,design_rationale,design_reference_hits_SMILES
0,Z814868448,Cc1cc(C(=O)NCC(C)(C)C)no1,"x1071_0A,x1052_1A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,CS(=O)(=O)NCC(=O)N1CCCCC1.COC(=O)CNC(=O)c1cc(C...
1,PV-004271221850,COC(=O)Nc1cccc(C(=O)NC2CCN(C(=O)CCC(C)(C)C)CC2)c1,"x1071_0A,x1020_0A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,CS(=O)(=O)NCC(=O)N1CCCCC1.*
2,PV-005890491569,CC(=O)Nc1cc(C(=O)NC2CCN(C(=O)CCC(C)(C)F)CC2)ccc1F,"x1071_0A,x1020_0A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,CS(=O)(=O)NCC(=O)N1CCCCC1.*
3,Z1763400558,CC(=O)NNC(=O)c1coc(C)c1,"x1071_0A,x1052_1A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,CS(=O)(=O)NCC(=O)N1CCCCC1.COC(=O)CNC(=O)c1cc(C...
4,PV-001573842640,CC(=O)NC[C@H](NC(=O)Cc1ccc(=O)[nH]c1)C1CCCCC1,"x1071_0A,x1498_0A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,CS(=O)(=O)NCC(=O)N1CCCCC1.Oc1ccccn1
...,...,...,...,...,...,...,...,...,...,...,...
158,Z2170211741,Cc1cc(NC(=O)CN[C@]2(C)CCOC2=O)no1,"x2149_0A,x1052_1A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,Cc1csc(CN[C@H]2CCNC2=O)n1.COC(=O)CNC(=O)c1cc(C...
159,Z1211401012,CC(=O)c1ccc(OCc2ccc3c(c2)CCC3)cc1,"x0771_1A,x1140_0A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,NC(=O)c1ccc2c(c1)CCN2.*
160,Z3777535315,CN1C(=O)CC[C@@H]1CNC(=O)NCc1ccc2c(c1)CCO2,"x2149_0A,x1140_0A",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,Cc1csc(CN[C@H]2CCNC2=O)n1.*
161,Z1260734436,CCc1ccc(CC(=O)NCCc2cccnc2)cc1,"x1084_0A,x0980_0B",https://github.com/matteoferla/Fragment-hit-fo...,matteo.ferla@stats.ox.ac.uk,Matteo Ferla,University of Oxford,2023-09-05,Fragmenstein-Tris,Computational follow-ups as part of Project 2.,CS(=O)(=O)Nc1ccc(CN)cc1.Cc1ccc(F)cc1NC(=O)c1cc...
