In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from matplotlib import pyplot as plt
import selfies as sf
import mutations as mut
from functools import partial
import metrics as met
import functions as fn
from constants import *
from rdkit import Chem
from rdkit.Chem import Draw
import seaborn as sns
import pandas as pd
from datetime import datetime
import time
from tqdm import tqdm
from guacamol import standard_benchmarks
import numpy as np
import random
from copy import copy
import crossovers as xo

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED_SIZE_LIST = [10, 100, 1000, 10000, 100000]
SEED_SIZE = SEED_SIZE_LIST[1]
SEED_LIST = [0,1,2]
SEED = SEED_LIST[2]
BUDGET_LIST = [100, 1000, 10000, 100000]
BUDGET = BUDGET_LIST[2]
GENERATIONS = 50




SAVE = True

# Single metric optimization

In [4]:
#f = standard_benchmarks.zaleplon_with_other_formula().objective.score_list()
celecoxib = 'O=S(=O)(c3ccc(n1nc(cc1c2ccc(cc2)C)C(F)(F)F)cc3)N'
troglitazone = 'O=C1NC(=O)SC1Cc4ccc(OCC3(Oc2c(c(c(O)c(c2CC3)C)C)C)C)cc4'
metric_function_list = [
    partial(standard_benchmarks.median_tadalafil_sildenafil().objective.score_list)
    #partial(met.compound_similarity,target_smile = troglitazone)
]
metrics = ['Metric 1']

In [5]:
mutation_function_list = [
    partial(mut.replacement)
    ,partial(mut.addition,fragment_size=1,rings=False)
    ,partial(mut.deletion,n=1)
]

In [6]:
hyperparameters_range = {
    'first_generation_fraction': 0.05,
    'crossover': True,
    'crossover_ratio': 0.2,
    'next_generation_fraction': 0.05,
    'initial_sample_fraction': 1.5,
    'minimal_next_generation_fraction': 0.0025,
    'max_gens': 500,
    'percent_best_fraction': 0.1,
    'include_initial_pop': False
}


In [15]:
current_dateTime = datetime.now()
time_format = f"{current_dateTime.date()}_{str(current_dateTime.hour).zfill(2)}-{str(current_dateTime.minute).zfill(2)}-{str(current_dateTime.second).zfill(2)}"
def eval(hyperparameters, seed=0):
    for SEED in [SEED_LIST[seed]]:#SEED_LIST[1:]:
        for SEED_SIZE in [SEED_SIZE_LIST[2]]:#SEED_SIZE_LIST[1:]:#[SEED_SIZE_LIST[1]]:#
            SEED_PATH = f"../data/seed_{SEED}/rand_{SEED_SIZE}.tsv"
            seed_df = pd.read_table(SEED_PATH) 
            initial_pop = fn.convert_seeds_to_df(seed_df,metric_function_list)
            for BUDGET in [BUDGET_LIST[1]]:#BUDGET_LIST[:4]:#BUDGET_LIST[:4]#
                setup_name = f"seed_{SEED}_rand_{SEED_SIZE}"
                file_name = f"{time_format}_{setup_name}_budget_{BUDGET}"
                B0 = copy(BUDGET)
                generation = 0
                t0 = time.time()
                
                first_generation_fraction = hyperparameters['first_generation_fraction'] 
                crossover_ratio = hyperparameters['crossover_ratio']
                next_generation_fraction = hyperparameters['next_generation_fraction']
                initial_sample_fraction = hyperparameters['initial_sample_fraction']
                minimal_next_generation_fraction = hyperparameters['minimal_next_generation_fraction']

                percent_best_fraction = hyperparameters['percent_best_fraction']

                N = int(BUDGET * first_generation_fraction) #1st generation size
                if len(initial_pop)>=N:
                    initial_best = initial_pop.head(N)
                else:
                    initial_best = initial_pop
                gen_history = pd.DataFrame(initial_best)
                temp_best = initial_best.copy(deep=True)
                cost_history = []


                pbar = tqdm(desc=f"Working with budget of {B0}",
                            total=B0)
                while (BUDGET > len(temp_best)):
                    #percentage_done = round(np.floor((B0-BUDGET)/B0*1000)/10,2)
                    #print(f"{percentage_done}% done")
                    diversity = fn.get_last_diversity(temp_best)
                    homogenity = 1 - diversity

                    #crossover_ratio =  0.1#0.01 + ((diversity)/10)
                    #print(f'crossover_ratio = {crossover_ratio}, N={N}')

                    temp_pop, cost = fn.populate_from_df(temp_best,N,metric_function_list,mutation_function_list,
                                                generation+1,include_seeds=True,fitness='Metric 1',crossover=True, crossover_type=0,
                                                crossover_ratio=crossover_ratio, fitness_proportional = True)
                    cost_history.append(cost)
                    BUDGET -= cost
                    temp_pop.reset_index(drop=True,inplace=True)


                    #if homogenity>0:
                    #    temp_best = fn.get_percent_best(temp_pop, metrics,0.05+((homogenity**2)/2),minimize=False)
                    #else:
                    temp_best = fn.get_percent_best(temp_pop, metrics,percent_best_fraction,minimize=False)
                    gen_history = pd.concat([gen_history,temp_best])
                    
                    
                    
                    if initial_sample_fraction>0:
                        initial_sample_size = int(np.ceil(  len(temp_best) *  initial_sample_fraction   ))
                        #initial_sample_size = 3 * int(np.ceil( len(temp_best)* homogenity**2  ))
                        #if initial_sample_size>len(initial_pop):
                        #    initial_sample_size = len(initial_pop)

                        temp_best = pd.concat([temp_best, initial_pop.sample(initial_sample_size)])

                
                    #if homogenity==0:
                    N = int(np.ceil(BUDGET * next_generation_fraction))
                        #print(f"homo {homogenity} , N {N}")
                    #else:
                    #    N = int(np.ceil(BUDGET * 0.1  * homogenity))
                        #print(f"no homo N {N}")
                    #print(f"------- {N} ------- BUDGET / GENERATIONS {BUDGET / GENERATIONS} * homogenity {homogenity} + 0.1  = {(BUDGET / GENERATIONS) * homogenity + 0.1}")
                    if N<=int(np.ceil(minimal_next_generation_fraction*B0)):
                        N = int(np.ceil(minimal_next_generation_fraction*B0))
                    pbar.update(cost)
                    generation += 1
                    if generation>=500:
                        break
                pbar.update(BUDGET)
                pbar.close()
                print(f"Done in {time.time()-t0}")
                #if SAVE:
                #    gen_history.to_csv("out_exp/"+file_name+".csv")

    ###
    latest_gen = gen_history[gen_history['Generation']==gen_history['Generation'].max()]
    #latest_gen.head()
    return float(latest_gen.head(1)['Metric 1'])


In [8]:

for SEED in SEED_LIST:
    print(eval(hyperparameters_range,SEED))

Working with budget of 1000: 100%|██████████| 1000/1000 [00:07<00:00, 133.97it/s]


Done in 7.467577695846558
0.224289330760265


Working with budget of 1000: 100%|██████████| 1000/1000 [00:08<00:00, 119.77it/s]


Done in 8.351029396057129
0.23945038904205695


Working with budget of 1000: 100%|██████████| 1000/1000 [00:08<00:00, 124.04it/s]

Done in 8.063612461090088
0.21551765683383306





In [9]:
import pandas as pd

def create_dataframe(hyperparameters, algorithm_output):
    # Append the algorithm output to the hyperparameters dictionary
    hyperparameters['Algorithm Output'] = algorithm_output

    # Create a pandas DataFrame from the dictionary
    # The keys of the dictionary become the column names, and the values become the row values
    df = pd.DataFrame(hyperparameters, index=[0])

    return df


In [10]:
hyperparameters_range = {
    'Learning Rate': 0.01,
    'Epochs': 100,
    'Regularization': True
}
algorithm_output = 0.85

df = create_dataframe(hyperparameters_range, algorithm_output)
print(df)


   Learning Rate  Epochs  Regularization  Algorithm Output
0           0.01     100            True              0.85


In [11]:
first_generation_fraction = [0.05, 0.10, 0.2]
crossover_ratio = [0, 0.1, 0.2, 0.4]
next_generation_fraction = [0.05, 0.10, 0.2]
initial_sample_fraction = [1.0, 2.0, 3.0]
minimal_next_generation_fraction = [0.0025, 0.01]
percent_best_fraction = [0.05, 0.1, 0.15, 0.2]

hyperparameters_range = {
    'first_generation_fraction': 0.05,
    'crossover': True,
    'crossover_ratio': 0.2,
    'next_generation_fraction': 0.05,
    'initial_sample_fraction': 1.5,
    'minimal_next_generation_fraction': 0.0025,
    'max_gens': 500,
    'percent_best_fraction': 0.1,
    'include_initial_pop': False
}

In [16]:
import random

hyperparameter_spaces = {
    'first_generation_fraction': [0.05, 0.10, 0.2],
    'crossover_ratio': [0, 0.1, 0.2, 0.4],
    'next_generation_fraction': [0.05, 0.10, 0.2],
    'initial_sample_fraction': [0.0, 1.0, 2.0, 3.0],
    'minimal_next_generation_fraction': [0.0025, 0.01],
    'percent_best_fraction': [0.05, 0.1, 0.15, 0.2]
}
mutation_ratio = 0.25

def generate_random_hyperparameters():
    return {k: random.choice(v) for k, v in hyperparameter_spaces.items()}

def crossover(hyperparameters1, hyperparameters2):
    return {k: random.choice([v1, v2]) for (k, v1), (_, v2) in zip(hyperparameters1.items(), hyperparameters2.items())}

def mutate(hyperparameters):
    parameter_to_mutate = random.choice(list(hyperparameters.keys()))
    hyperparameters[parameter_to_mutate] = random.choice(hyperparameter_spaces[parameter_to_mutate])
    return hyperparameters

def evolutionary_search(eval, population_size, num_generations):
    population = [generate_random_hyperparameters() for _ in range(population_size)]
    for _ in range(num_generations):
        fitnesses = [eval(h) for h in population]
        best_index = fitnesses.index(max(fitnesses))
        next_generation = [population[best_index]]
        while len(next_generation) < population_size:
            if random.random() < crossover_ratio:
                parent1, parent2 = random.choices(population, k=2)
                offspring = crossover(parent1, parent2)
            else:
                offspring = generate_random_hyperparameters()
            if random.random() < mutation_ratio:
                offspring = mutate(offspring)
            next_generation.append(offspring)
        population = next_generation
    best_hyperparameters = max(population, key=eval)
    return best_hyperparameters


In [17]:
evolutionary_search(eval,10,3)

Working with budget of 1000: 100%|██████████| 1000/1000 [00:07<00:00, 128.46it/s]


Done in 7.7846527099609375


Working with budget of 1000: 100%|██████████| 1000/1000 [00:08<00:00, 113.42it/s]


Done in 8.816608428955078


Working with budget of 1000: 100%|██████████| 1000/1000 [00:05<00:00, 178.47it/s]


Done in 5.603183269500732


Working with budget of 1000: 100%|██████████| 1000/1000 [00:12<00:00, 78.07it/s]


Done in 12.809284925460815


Working with budget of 1000: 100%|██████████| 1000/1000 [00:10<00:00, 92.53it/s]


Done in 10.807486772537231


Working with budget of 1000: 100%|██████████| 1000/1000 [00:09<00:00, 103.41it/s]


Done in 9.670337438583374


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 311.33it/s]


Done in 3.2199950218200684


Working with budget of 1000: 100%|██████████| 1000/1000 [00:06<00:00, 149.35it/s]


Done in 6.695791482925415


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 295.14it/s]


Done in 3.388165235519409


Working with budget of 1000: 100%|██████████| 1000/1000 [00:06<00:00, 158.70it/s]

Done in 6.309215307235718





TypeError: '<' not supported between instances of 'float' and 'list'

In [18]:
import numpy as np
import random

# Defining hyperparameters and their possible values
hyperparameters_range = {
    'first_generation_fraction' : [0.05, 0.10, 0.2],
    'crossover_ratio' : [0, 0.1, 0.2, 0.4],
    'next_generation_fraction' : [0.05, 0.10, 0.2],
    'initial_sample_fraction' : [1.0, 2.0, 3.0],
    'minimal_next_generation_fraction' : [0.0025, 0.01],
    'percent_best_fraction' : [0.05, 0.1, 0.15, 0.2]
}

# Defining parameters for the evolutionary algorithm
POPULATION_SIZE = 20
MUTATION_RATE = 0.1
NUM_GENERATIONS = 5

def median_eval(hyperparameters):
    evl = []
    for SEED in SEED_LIST:
        evl.append(eval(hyperparameters))
    return np.median(evl)

def create_individual(hyperparameters):
    return {hp: np.random.choice(values) for hp, values in hyperparameters.items()}

def create_population(hyperparameters, size):
    return [create_individual(hyperparameters) for _ in range(size)]

def select_parents(population):
    # Select two parents at random
    return random.sample(population, 2)

def crossover(parent1, parent2):
    # Single point crossover
    child = {}
    crossover_point = len(parent1) // 2
    for i, key in enumerate(parent1):
        child[key] = parent1[key] if i < crossover_point else parent2[key]
    return child

def mutate(individual, hyperparameters):
    # Randomly change one hyperparameter
    if np.random.rand() < MUTATION_RATE:
        mutation_param = np.random.choice(list(hyperparameters.keys()))
        individual[mutation_param] = np.random.choice(hyperparameters[mutation_param])
    return individual

# Initialize population
population = create_population(hyperparameters_range, POPULATION_SIZE)

# Main genetic algorithm loop
for _ in range(NUM_GENERATIONS):
    # Evaluate population and sort by fitness
    population = sorted(population, key=lambda x: median_eval(x), reverse=True)
    
    # Crossover and mutation
    next_generation = []
    while len(next_generation) < POPULATION_SIZE:
        parent1, parent2 = select_parents(population[:POPULATION_SIZE // 2])
        child = crossover(parent1, parent2)
        child = mutate(child, hyperparameters_range)
        next_generation.append(child)
    population = next_generation

# After all generations, the best individual is at the top of the sorted population
best_hyperparameters = population[0]
print(f'Best hyperparameters: {best_hyperparameters}')


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 253.90it/s]


Done in 3.9465279579162598


Working with budget of 1000: 100%|██████████| 1000/1000 [00:12<00:00, 78.90it/s]


Done in 12.674926280975342


Working with budget of 1000: 100%|██████████| 1000/1000 [00:11<00:00, 84.38it/s]


Done in 11.85953164100647


Working with budget of 1000: 100%|██████████| 1000/1000 [00:06<00:00, 150.44it/s]


Done in 6.647111892700195


Working with budget of 1000: 100%|██████████| 1000/1000 [00:02<00:00, 344.73it/s]


Done in 2.900822639465332


Working with budget of 1000: 100%|██████████| 1000/1000 [00:07<00:00, 141.86it/s]


Done in 7.048993110656738


Working with budget of 1000: 100%|██████████| 1000/1000 [00:04<00:00, 203.91it/s]


Done in 4.904177188873291


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 301.86it/s]


Done in 3.3208303451538086


Working with budget of 1000: 100%|██████████| 1000/1000 [00:05<00:00, 196.46it/s]


Done in 5.106179714202881


Working with budget of 1000: 100%|██████████| 1000/1000 [00:04<00:00, 209.05it/s]


Done in 4.7912843227386475


Working with budget of 1000: 100%|██████████| 1000/1000 [00:04<00:00, 230.14it/s]


Done in 4.345175743103027


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 296.41it/s]


Done in 3.3736579418182373


Working with budget of 1000: 100%|██████████| 1000/1000 [00:07<00:00, 137.41it/s]


Done in 7.277475118637085


Working with budget of 1000: 100%|██████████| 1000/1000 [00:07<00:00, 141.51it/s]


Done in 7.066479921340942


Working with budget of 1000: 100%|██████████| 1000/1000 [00:05<00:00, 169.95it/s]


Done in 5.883950710296631


Working with budget of 1000: 100%|██████████| 1000/1000 [00:06<00:00, 143.15it/s]


Done in 6.9856789112091064


Working with budget of 1000: 100%|██████████| 1000/1000 [00:10<00:00, 92.27it/s]


Done in 10.838303565979004


Working with budget of 1000: 100%|██████████| 1000/1000 [00:11<00:00, 90.20it/s]


Done in 11.086402893066406


Working with budget of 1000: 100%|██████████| 1000/1000 [00:15<00:00, 65.90it/s]


Done in 15.174853563308716


Working with budget of 1000: 100%|██████████| 1000/1000 [00:06<00:00, 150.66it/s]


Done in 6.637535810470581


Working with budget of 1000: 100%|██████████| 1000/1000 [00:07<00:00, 132.08it/s]


Done in 7.571267604827881


Working with budget of 1000: 100%|██████████| 1000/1000 [00:10<00:00, 97.40it/s]


Done in 10.26664137840271


Working with budget of 1000: 100%|██████████| 1000/1000 [00:11<00:00, 87.02it/s]


Done in 11.491799116134644


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 289.13it/s]


Done in 3.458672523498535


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 299.37it/s]


Done in 3.340378761291504


Working with budget of 1000: 100%|██████████| 1000/1000 [00:12<00:00, 81.32it/s]


Done in 12.305607080459595


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 306.84it/s]


Done in 3.2590415477752686


Working with budget of 1000: 100%|██████████| 1000/1000 [00:10<00:00, 92.95it/s]


Done in 10.759009599685669


Working with budget of 1000: 100%|██████████| 1000/1000 [00:09<00:00, 110.25it/s]


Done in 9.07060432434082


Working with budget of 1000: 100%|██████████| 1000/1000 [00:03<00:00, 306.21it/s]

Done in 3.2656939029693604
Best hyperparameters: {'first_generation_fraction': 0.1, 'crossover_ratio': 0.4, 'next_generation_fraction': 0.05, 'initial_sample_fraction': 1.0, 'minimal_next_generation_fraction': 0.0025, 'percent_best_fraction': 0.1}





In [22]:
hps = {'first_generation_fraction': 0.1, 'crossover_ratio': 0.4, 'next_generation_fraction': 0.05, 'initial_sample_fraction': 1.0, 'minimal_next_generation_fraction': 0.0025, 'percent_best_fraction': 0.1}
evl = []
for SEED in SEED_LIST:
    evl.append(eval(hps))
print(evl)

Working with budget of 1000: 100%|██████████| 1000/1000 [00:11<00:00, 83.76it/s]


Done in 11.938983917236328


Working with budget of 1000: 100%|██████████| 1000/1000 [00:10<00:00, 93.82it/s]


Done in 10.666263341903687


Working with budget of 1000: 100%|██████████| 1000/1000 [00:11<00:00, 90.64it/s]

Done in 11.034534931182861
[0.2195285199793807, 0.22884042849156705, 0.21212944742725728]





In [None]:
import numpy as np
import random

# Defining hyperparameters and their possible values
hyperparameters_range = {
    'first_generation_fraction' : [0.05, 0.10, 0.2],
    'crossover_ratio' : [0, 0.1, 0.2, 0.4],
    'next_generation_fraction' : [0.05, 0.10, 0.2],
    'initial_sample_fraction' : [1.0, 2.0, 3.0],
    'minimal_next_generation_fraction' : [0.0025, 0.01],
    'percent_best_fraction' : [0.05, 0.1, 0.15, 0.2]
}

# Defining parameters for the evolutionary algorithm
POPULATION_SIZE = 20
MUTATION_RATE = 0.1
NUM_GENERATIONS = 5
ELITE_SIZE = 2
TOURNAMENT_SIZE = 4

def median_eval(hyperparameters):
    evl = []
    for SEED in SEED_LIST:
        evl.append(eval(hyperparameters))
    return np.median(evl)

def create_individual(hyperparameters):
    return {hp: np.random.choice(values) for hp, values in hyperparameters.items()}

def create_population(hyperparameters, size):
    return [create_individual(hyperparameters) for _ in range(size)]

def tournament_selection(population, tournament_size):
    # Select tournament_size individuals and return the best
    return max(random.sample(population, tournament_size), key=median_eval)

def crossover(parent1, parent2):
    # Multiple points crossover
    child = {}
    for key in parent1:
        child[key] = np.random.choice([parent1[key], parent2[key]])
    return child

def mutate(individual, hyperparameters):
    # Randomly change one hyperparameter
    if np.random.rand() < MUTATION_RATE:
        mutation_param = np.random.choice(list(hyperparameters.keys()))
        individual[mutation_param] = np.random.choice(hyperparameters[mutation_param])
    return individual

# Initialize population
population = create_population(hyperparameters_range, POPULATION_SIZE)

# Main genetic algorithm loop
for _ in range(NUM_GENERATIONS):
    # Evaluate population and sort by fitness
    population = sorted(population, key=lambda x: median_eval(x), reverse=True)

    # Elitism: carry the best individuals to the next generation
    next_generation = population[:ELITE_SIZE]

    # Crossover and mutation
    while len(next_generation) < POPULATION_SIZE:
        parent1 = tournament_selection(population, TOURNAMENT_SIZE)
        parent2 = tournament_selection(population, TOURNAMENT_SIZE)
        child = crossover(parent1, parent2)
        child = mutate(child, hyperparameters_range)
        next_generation.append(child)
    population = next_generation

# After all generations, the best individual is at the top of the sorted population
best_hyperparameters = population[0]
print(f'Best hyperparameters: {best_hyperparameters}')
