In [1]:

from typing import List
import argparse
import os
import numpy as np
from pickle import load


from guacamol.distribution_matching_generator import DistributionMatchingGenerator
from guacamol.assess_distribution_learning import assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger

import pandas as pd

class RandomSmilesSampler(DistributionMatchingGenerator):  # TODO: this sampler should be made deterministic
    """
    Generator that samples SMILES strings from a predefined list.

    Entries are drawn with `np.random.choice` without replacement, so one
    call never returns the same list position twice. Duplicates can still
    appear if the list itself contains repeated SMILES.
    """

    def __init__(self, molecules: List[str]) -> None:
        """
        Args:
            molecules: list of molecules from which the samples will be drawn
        """
        self.molecules = molecules

    def generate(self, number_samples: int) -> List[str]:
        """
        Draw `number_samples` entries from `self.molecules` without replacement.

        Randomness comes from NumPy's global legacy RNG, so the result is only
        reproducible if the caller seeds it (e.g. `np.random.seed(...)`).

        Args:
            number_samples: how many SMILES strings to return

        Returns:
            A list of built-in `str` objects.

        Raises:
            ValueError: propagated from `np.random.choice` when
                `number_samples` is larger than the number of molecules.
        """
        sampled = np.random.choice(self.molecules, size=number_samples, replace=False)
        # `.tolist()` converts the NumPy string scalars into plain Python `str`,
        # matching the declared `List[str]` return type; `list(sampled)` would
        # hand back `np.str_` elements instead.
        return sampled.tolist()


setup_default_logger()

# --- Benchmark configuration -------------------------------------------------
# `description` is not referenced anywhere else in this cell.
description = 'Molecule distribution learning benchmark for random smiles sampler'
# Reference SMILES file handed to the benchmark as `chembl_training_file`.
dist_file = 'Data/SmilesOri_FirstAlldatabase.smiles'
output_dir = None
suite = 'v2'
# Seed for NumPy's global RNG, which RandomSmilesSampler draws from.
random_seed = 42

# Fall back to the current working directory when no output directory is set.
if output_dir is None:
    output_dir = os.getcwd()

# Generated molecules: the `smiles` column of the CSV produced by this run.
smiles_list = pd.read_csv('mols_gen/250209_database_allmolecules_main_2_22_sinfps_timepred_2_22_sinfps_sinexplicit/all_generated_molecules.csv').smiles.to_list()

generator = RandomSmilesSampler(molecules=smiles_list)

# NOTE(review): every benchmark cell in this notebook writes to this same
# path, so a later run overwrites the results of an earlier one.
json_file_path = os.path.join(output_dir, 'Data/distribution_learning_results.json')

# TODO: also print the SMILES

# Without a fixed seed each rerun of this cell benchmarks a different random
# subset of `smiles_list`, so the reported scores are not reproducible.
np.random.seed(random_seed)

assess_distribution_learning(generator,
                             chembl_training_file=dist_file,
                             json_output_file=json_file_path,
                             benchmark_version=suite)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
INFO : Benchmarking distribution learning, version v2
INFO : Number of benchmarks: 5
INFO : Running benchmark 1/5: Validity
INFO : Results for the benchmark "Validity":
INFO :   Score: 1.000000
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_valid': 10000}
INFO : Running benchmark 2/5: Uniqueness
INFO : Results for the benchmark "Uniqueness":
INFO :   Score: 0.999300
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_unique': 9993}
INFO : Running benchmark 3/5: Novelty
INFO : Results for the benchmark "N

In [1]:

from typing import List
import argparse
import os
import numpy as np
from pickle import load


from guacamol.distribution_matching_generator import DistributionMatchingGenerator
from guacamol.assess_distribution_learning import assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger

import pandas as pd

class RandomSmilesSampler(DistributionMatchingGenerator):  # TODO: this sampler should be made deterministic
    """
    Generator that samples SMILES strings from a predefined list.

    Entries are drawn with `np.random.choice` without replacement, so one
    call never returns the same list position twice. Duplicates can still
    appear if the list itself contains repeated SMILES.
    """

    def __init__(self, molecules: List[str]) -> None:
        """
        Args:
            molecules: list of molecules from which the samples will be drawn
        """
        self.molecules = molecules

    def generate(self, number_samples: int) -> List[str]:
        """
        Draw `number_samples` entries from `self.molecules` without replacement.

        Randomness comes from NumPy's global legacy RNG, so the result is only
        reproducible if the caller seeds it (e.g. `np.random.seed(...)`).

        Args:
            number_samples: how many SMILES strings to return

        Returns:
            A list of built-in `str` objects.

        Raises:
            ValueError: propagated from `np.random.choice` when
                `number_samples` is larger than the number of molecules.
        """
        sampled = np.random.choice(self.molecules, size=number_samples, replace=False)
        # `.tolist()` converts the NumPy string scalars into plain Python `str`,
        # matching the declared `List[str]` return type; `list(sampled)` would
        # hand back `np.str_` elements instead.
        return sampled.tolist()




setup_default_logger()

# --- Benchmark configuration -------------------------------------------------
# `description` is not referenced anywhere else in this cell.
description = 'Molecule distribution learning benchmark for random smiles sampler'
# Reference SMILES file handed to the benchmark as `chembl_training_file`.
dist_file = 'Data/SmilesOri_FirstAlldatabase.smiles'
output_dir = None
suite = 'v2'
# Seed for NumPy's global RNG, which RandomSmilesSampler draws from.
random_seed = 42

# Fall back to the current working directory when no output directory is set.
if output_dir is None:
    output_dir = os.getcwd()

# Generated molecules: the `smiles` column of the CSV produced by this run.
smiles_list = pd.read_csv('mols_gen/250211_database_allmolecules_main_2_22_confps_timepred_2_22_confps_sinexplicit/all_generated_molecules.csv').smiles.to_list()

generator = RandomSmilesSampler(molecules=smiles_list)

# NOTE(review): every benchmark cell in this notebook writes to this same
# path, so a later run overwrites the results of an earlier one.
json_file_path = os.path.join(output_dir, 'Data/distribution_learning_results.json')

# TODO: also print the SMILES

# Without a fixed seed each rerun of this cell benchmarks a different random
# subset of `smiles_list`, so the reported scores are not reproducible.
np.random.seed(random_seed)

assess_distribution_learning(generator,
                             chembl_training_file=dist_file,
                             json_output_file=json_file_path,
                             benchmark_version=suite)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
INFO : Benchmarking distribution learning, version v2
INFO : Number of benchmarks: 5
INFO : Running benchmark 1/5: Validity
INFO : Results for the benchmark "Validity":
INFO :   Score: 1.000000
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_valid': 10000}
INFO : Running benchmark 2/5: Uniqueness
INFO : Results for the benchmark "Uniqueness":
INFO :   Score: 0.998900
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_unique': 9989}
INFO : Running benchmark 3/5: Novelty
INFO : Results for the benchmark "N

In [4]:

from typing import List
import argparse
import os
import numpy as np
from pickle import load


from guacamol.distribution_matching_generator import DistributionMatchingGenerator
from guacamol.assess_distribution_learning import assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger

import pandas as pd


class SmilesSampler(DistributionMatchingGenerator):
    """
    Deterministic generator backed by a fixed list of SMILES strings.

    Every call serves the leading entries of that list in stored order, so
    repeated calls with the same argument return the same molecules.
    """

    def __init__(self, molecules: List[str]) -> None:
        """
        Args:
            molecules: SMILES strings that `generate` serves from
        """
        self.molecules = molecules

    def generate(self, number_samples: int) -> List[str]:
        """
        Args:
            number_samples: end index of the slice taken from the list

        Returns:
            `self.molecules[:number_samples]`, i.e. the first
            `number_samples` entries, or the whole list if it is shorter.
        """
        leading_entries = self.molecules[:number_samples]
        return leading_entries



setup_default_logger()

# Benchmark settings; `dist_file` is passed on as `chembl_training_file`.
description = 'Molecule distribution learning benchmark for random smiles sampler'
suite = 'v2'
dist_file = 'Data/guacamol_v1_train.smiles'
output_dir = None

# Fall back to the current working directory when no output directory is set.
output_dir = os.getcwd() if output_dir is None else output_dir

# One entry per line of the text file, with surrounding whitespace removed.
with open('Data/digress_guacamol_smiles.txt', 'r') as f:
    smiles_list = list(map(str.strip, f))

generator = SmilesSampler(molecules=smiles_list)
json_file_path = os.path.join(output_dir, 'Data/distribution_learning_results.json')

# TODO: also print the SMILES

assess_distribution_learning(
    generator,
    chembl_training_file=dist_file,
    json_output_file=json_file_path,
    benchmark_version=suite,
)


INFO : Benchmarking distribution learning, version v2
INFO : Number of benchmarks: 5
INFO : Running benchmark 1/5: Validity
INFO : Results for the benchmark "Validity":
INFO :   Score: 0.852100
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_valid': 8521}
INFO : Running benchmark 2/5: Uniqueness
INFO : Results for the benchmark "Uniqueness":
INFO :   Score: 0.852100
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_unique': 8521}
INFO : Running benchmark 3/5: Novelty
INFO : Results for the benchmark "Novelty":
INFO :   Score: 0.848200
INFO :   Sampling time: 0:00:01
INFO :   Metadata: {'number_samples': 10000, 'number_novel': 8482}
INFO : Running benchmark 4/5: KL divergence
INFO : Results for the benchmark "KL divergence":
INFO :   Score: 0.926076
INFO :   Sampling time: 0:00:01
INFO :   Metadata: {'number_samples': 10000, 'kl_divs': {'BertzCT': 0.007065721572074872, 'MolLogP': 0.02151486683421383, 'MolWt': 0.0040

In [3]:



from typing import List
import argparse
import os
import numpy as np
from pickle import load


from guacamol.distribution_matching_generator import DistributionMatchingGenerator
from guacamol.assess_distribution_learning import assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger

import pandas as pd

class SmilesSampler(DistributionMatchingGenerator):
    """
    Serves SMILES strings from the front of a predefined list.

    No randomness is involved: the same request always yields the same
    molecules, in the order they appear in the list.
    """

    def __init__(self, molecules: List[str]) -> None:
        """
        Args:
            molecules: the list of SMILES strings to serve from
        """
        self.molecules = molecules

    def generate(self, number_samples: int) -> List[str]:
        """
        Args:
            number_samples: end index of the slice taken from the list

        Returns:
            The slice of `self.molecules` from index 0 up to (not including)
            `number_samples`; the whole list if it has fewer entries.
        """
        stop = number_samples
        return self.molecules[0:stop]



setup_default_logger()

# Run configuration; `dist_file` is the reference set given to the benchmark.
description = 'Molecule distribution learning benchmark for random smiles sampler'
dist_file = 'Data/guacamol_v1_train.smiles'
suite = 'v2'
output_dir = None

# With no explicit output directory, results go under the working directory.
if output_dir is None:
    output_dir = os.getcwd()

# Read the generated molecules: one entry per line, whitespace-trimmed.
with open('Data/mol_samples_231108.txt', 'r') as f:
    smiles_list = [raw_line.strip() for raw_line in f.readlines()]

json_file_path = os.path.join(output_dir, 'Data/distribution_learning_results.json')
generator = SmilesSampler(molecules=smiles_list)

# TODO: also print the SMILES

assess_distribution_learning(
    generator,
    chembl_training_file=dist_file,
    json_output_file=json_file_path,
    benchmark_version=suite,
)


INFO : Benchmarking distribution learning, version v2
INFO : Number of benchmarks: 5
INFO : Running benchmark 1/5: Validity
INFO : Results for the benchmark "Validity":
INFO :   Score: 1.000000
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_valid': 10000}
INFO : Running benchmark 2/5: Uniqueness
INFO : Results for the benchmark "Uniqueness":
INFO :   Score: 0.999200
INFO :   Sampling time: 0:00:00
INFO :   Metadata: {'number_samples': 10000, 'number_unique': 9992}
INFO : Running benchmark 3/5: Novelty
INFO : Results for the benchmark "Novelty":
INFO :   Score: 0.993800
INFO :   Sampling time: 0:00:01
INFO :   Metadata: {'number_samples': 10000, 'number_novel': 9938}
INFO : Running benchmark 4/5: KL divergence
INFO : Results for the benchmark "KL divergence":
INFO :   Score: 0.473495
INFO :   Sampling time: 0:00:01
INFO :   Metadata: {'number_samples': 10000, 'kl_divs': {'BertzCT': 2.8281577315751383, 'MolLogP': 2.076892817755942, 'MolWt': 7.5808145