In [3]:
import json

from rdkit import Chem
from rdkit.Chem import RDConfig
import os
import sys
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer


In [4]:
# Directories whose files are loaded as JSON benchmark results.
folders = [
#     '/data/ongh0068/l1000/FYP-DrugDiscoveryWithDeepLearning/distribution_learning_organised/tl_l1000',
    '/data/ongh0068/l1000/FYP-DrugDiscoveryWithDeepLearning/distribution_learning_benchmark'
]

# Maps "<folder prefix>_<file name>" -> parsed JSON content of that file.
json_file_to_data = {}

for parent_folder in folders:
    # Text before the first underscore of the folder's base name; it is the
    # same for every file in the folder, so compute it once.
    folder_prefix = parent_folder.split('/')[-1].split('_')[0]

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the collated table stable across runs and machines.
    for json_file in sorted(os.listdir(parent_folder)):
        full_json_file_path = os.path.join(parent_folder, json_file)

        # The context manager closes the handle even if json.load raises;
        # a bare open() here leaked one descriptor per result file.
        with open(full_json_file_path) as json_file_content:
            data = json.load(json_file_content)

        json_file_to_data[folder_prefix + '_' + json_file] = data
    

In [5]:
from rdkit.Chem import Draw

# Per-file summary: file name -> {lower-cased benchmark name: score, ...}.
# Later cells add further keys to the inner dictionaries.
tmp = {}

import matplotlib.pyplot as plt
for file_name, data in json_file_to_data.items():
    # Flatten the list of benchmark records into {name: score}.
    benchmark_scores = {}
    for benchmark in data['results']:
        name = benchmark['benchmark_name'].lower()
        score = benchmark['score']
        benchmark_scores[name] = score
    tmp[file_name] = benchmark_scores

    all_smiles = data['samples']

    # A '.' in a SMILES string separates disconnected fragments, so samples
    # without one are the connected molecules.
    smiles = [smile for smile in all_smiles if '.' not in smile]
    # Only needed by the (currently commented-out) grid drawing below.
    mols = [Chem.MolFromSmiles(smile) for smile in smiles]

    # Fraction of samples that are connected. Divide by the real sample count
    # instead of a hard-coded 100 so the value stays correct for any number of
    # samples; NaN marks a file with no samples at all.
    if all_smiles:
        pct_connected = len(smiles) / len(all_smiles)
    else:
        pct_connected = float('nan')
    tmp[file_name]['pct_connected'] = pct_connected
    
#     img = Draw.MolsToGridImage(mols[:24], subImgSize=(800,800), maxMols = 100, molsPerRow=8, returnPNG = True)
#     break
    # # #     img.save(f"/data/ongh0068/l1000/FYP-DrugDiscoveryWithDeepLearning/molecule_imgs/{file_name.strip('.json')}.svg")
#     with open(f"/data/ongh0068/l1000/FYP-DrugDiscoveryWithDeepLearning/molecule_imgs/{file_name.strip('.json')}.png", 'wb') as png:
#         png.write(img.data)
# img

In [6]:
# Result file name -> the list stored under its 'samples' key.
file_to_smiles = {
    file_name: payload['samples']
    for file_name, payload in json_file_to_data.items()
}
from rdkit.Chem.QED import qed

def compute_sa_score(mols, threshold = 4.5):
    """Score molecules with ``sascorer`` and report the share at or below ``threshold``.

    Args:
        mols: Iterable of molecule objects accepted by
            ``sascorer.calculateScore``.
        threshold: Inclusive upper bound on the score for a molecule to be
            counted as easily synthesized. Defaults to 4.5.

    Returns:
        A tuple ``(sa_scores, pct_easily_synthesized)``. ``sa_scores`` is the
        list of per-molecule scores in input order. ``pct_easily_synthesized``
        is the fraction (0.0-1.0, not a percentage) of scores that are
        ``<= threshold``, or NaN when ``mols`` is empty.
    """
    sa_scores = [sascorer.calculateScore(mol) for mol in mols]

    # An empty input has no meaningful fraction; return NaN instead of
    # raising ZeroDivisionError.
    if not sa_scores:
        return sa_scores, float('nan')

    # Honour the caller's threshold (a literal 4.5 used to be compared here,
    # which silently ignored the parameter).
    n_easily_synthesized = sum(1 for score in sa_scores if score <= threshold)
    pct_easily_synthesized = n_easily_synthesized / len(sa_scores)
    return sa_scores, pct_easily_synthesized

# Add QED and SA-score results for every result file to its entry in `tmp`.
for file, smiles_list in file_to_smiles.items():
    # Chem.MolFromSmiles returns None for SMILES it cannot parse; drop those
    # so the scoring functions below are never handed None.
    parsed = (Chem.MolFromSmiles(smile) for smile in smiles_list)
    mols = [mol for mol in parsed if mol is not None]

    qed_scores = [qed(mol) for mol in mols]
    sa_scores, pct_easily_synthesized = compute_sa_score(mols)

    tmp[file]['sa_scores'] = sa_scores
    tmp[file]['pct_easily_synthesized'] = pct_easily_synthesized
    tmp[file]['qed_scores'] = qed_scores
    
    


In [9]:
import pandas as pd

# pd.DataFrame(tmp) puts one result file per column; transpose so that each
# row is a result file and each column is a metric.
per_file_columns = pd.DataFrame(tmp)
collated_results = per_file_columns.transpose()

In [10]:
# Row-wise mean of the five benchmark score columns, stored as 'avg'.
benchmark_columns = [
    'validity',
    'uniqueness',
    'novelty',
    'kl divergence',
    'frechet chemnet distance',
]
collated_results['avg'] = collated_results[benchmark_columns].mean(axis=1)

In [15]:
# Bare expression as the last line of the cell: renders the collated table
# (one row per result file) as the cell output.
collated_results

Unnamed: 0,validity,uniqueness,novelty,kl divergence,frechet chemnet distance,pct_connected,sa_scores,pct_easily_synthesized,qed_scores,avg
distribution_(dep) only_l1000_vae_distribution_learning_results.json,1.0,0.1927,0.3472,0.352732,0.003628,1.0,"[1.2006534495197254, 2.3479034547150075, 2.347...",1.0,"[0.6240969235528274, 0.7480847519703256, 0.748...",0.379252
distribution_(dep) tl_l1000_vae_no_oclr_distribution_learning_results.json,1.0,0.6851,0.9069,0.20213,0.008619,0.96,"[1.3717690098246216, 1.0, 1.6366758942163866, ...",0.98,"[0.4531479654842905, 0.45880627965754545, 0.47...",0.56055
distribution_aae_no_oclr_genstep_drop_distribution_learning_results.json,1.0,0.9979,0.9934,0.868493,0.266483,0.82,"[1.7640408111116113, 1.223558174344987, 2.6734...",0.93,"[0.7462618266508545, 0.819024741146282, 0.8359...",0.825255
distribution_aae_no_oclr_no_genstep_drop_eps8_distribution_learning_results.json,1.0,1.0,0.9979,0.796869,0.349227,0.97,"[1.610128343263323, 2.136933090732949, 1.43533...",0.83,"[0.7059717831375069, 0.7951448561418578, 0.692...",0.828799
distribution_aae_oclr_genstep_drop_distribution_learning_results.json,1.0,0.9999,0.997,0.824105,0.331389,0.94,"[2.7723123555279976, 3.0891091423671266, 2.546...",0.77,"[0.6407329604822894, 0.8333688618995713, 0.626...",0.830479
distribution_film_conv_distribution_learning_results_070223.json,1.0,0.9549,0.9677,0.471463,0.08738,0.85,"[2.324850509601001, 3.029688207894453, 2.27384...",0.97,"[0.4745494696119243, 0.5210601106354689, 0.450...",0.696289
distribution_fs_l1000_aae_best_distribution_learning_results.json,1.0,0.6696,0.9997,0.342405,0.000744,1.0,"[2.15647147811352, 2.5455947574377245, 2.74560...",0.84,"[0.39603861487183056, 0.3884977017613924, 0.42...",0.60249
distribution_fs_l1000_vae_best_distribution_learning_results.json,1.0,0.2939,0.5581,0.570839,0.004221,0.87,"[1.9861600297291062, 1.9861600297291062, 2.167...",1.0,"[0.7172286191574994, 0.7172286191574994, 0.776...",0.485412
distribution_fs_l1000_wae_best_distribution_learning_results.json,1.0,0.9609,0.9982,0.83814,0.122329,0.88,"[2.3545263113329895, 2.045223483381074, 2.2856...",0.88,"[0.7661053286603293, 0.7960646291005125, 0.579...",0.783914
distribution_gat_conv_distribution_learning_results_130223.json,1.0,0.8869,0.9581,0.420199,0.028582,0.88,"[7.3284153846153846, 7.3284153846153846, 3.565...",0.94,"[0.3597849378839701, 0.3597849378839701, 0.348...",0.658756


In [12]:
# collated_results.to_csv('guac_collated.csv', index = True)

In [None]:
import pandas as pd
# One long-format frame per result file: a row per scored molecule with its
# SA score, QED score and the label of the file it came from.
dfs = []
for key in tmp:
    sa_scores = tmp[key]['sa_scores']
    qed_scores = tmp[key]['qed_scores']

    # Remove only a trailing '.json'. str.strip('.json') treats its argument
    # as a set of characters and also eats any trailing '.', 'j', 's', 'o' or
    # 'n', which turned '..._results.json' into '..._result'.
    suffix = '.json'
    label = key[:-len(suffix)] if key.endswith(suffix) else key
    dist_type = [label] * len(sa_scores)

    # Building from a dict of columns keeps the score columns numeric; the
    # list-of-rows-then-transpose form made every column object dtype.
    df = pd.DataFrame({
        'sa_score': sa_scores,
        'qed_scores': qed_scores,
        'dist_type': dist_type,
    })
    dfs.append(df)
    

In [None]:
# Stack the per-file frames vertically. Each frame keeps its own row labels,
# so index values repeat across files in the combined frame.
df_guac_sa_scores = pd.concat(dfs, axis=0, ignore_index=False)