## Item

In [11]:
class Item:
    #property_list = ['qed', 'logp', 'donor']

    def __init__(self, value, property_list):
        self.value = value
        self.property_list = property_list if property_list is not None else self.property_list
        # raw scores are the original objective values
        self.raw_scores = [ 0 for prop in self.property_list]
        # scores are the objective values (after judgement) for MOO
        self.scores = [ 0 for prop in self.property_list]

## eval

In [None]:
def judge_donor(  requirement, simi_requirement, similarity, donor_input_mol, donor_response_mol, delta= 1e-9 ):
    if requirement == 'increase':
        return (donor_response_mol > donor_input_mol + delta and similarity>simi_requirement)
    if requirement == 'decrease':
        return (donor_response_mol < donor_input_mol - delta  and similarity>simi_requirement)
    if requirement == 'the same':        
        return (abs(donor_input_mol - donor_response_mol) < delta and similarity>simi_requirement)
    if requirement == 'increase, >=2':
        return (donor_response_mol - donor_input_mol >= 2 and similarity>simi_requirement)
    if requirement == 'decrease, >=2':
        return (donor_input_mol - donor_response_mol>=2 and similarity>simi_requirement)
        
    raise ValueError(f'Invalid requirement: {requirement}')

def judge_qed(  requirement, simi_requirement, similarity, qed_input_mol, qed_response_mol, delta=1e-9 ):
    if abs(qed_response_mol) == 0:
        return False
    if requirement == 'increase':
        return (qed_response_mol > qed_input_mol + delta and similarity>simi_requirement)
    if requirement == 'decrease':
        return (qed_response_mol < qed_input_mol - delta and similarity>simi_requirement)
    if requirement == 'increase, >=0.1':
        return ( qed_response_mol - qed_input_mol >= 0.1 and similarity>simi_requirement)
    if requirement == 'decrease, >=0.1':
        return ( qed_input_mol - qed_response_mol >= 0.1 and similarity>simi_requirement)
    raise ValueError(f'Invalid requirement: {requirement}')

def judge_logp(  requirement, simi_requirement, similarity, logp_input_mol, logp_response_mol, delta=1e-9 ):
    if abs(logp_response_mol) == 100:
        return False
    if requirement == 'increase':
        return (logp_response_mol > logp_input_mol + delta and similarity>simi_requirement)
    if requirement == 'decrease':
        return (logp_response_mol < logp_input_mol - delta and similarity>simi_requirement)
    if 'range,' in requirement:
        a, b = [int(x) for x in requirement.split(',')[1:]]
        return (a<=logp_response_mol<=b and similarity>simi_requirement)
    raise ValueError(f'Invalid requirement: {requirement}')

import numpy as np
from tqdm import tqdm
import requests
import json

# 定义 API 端点 URL
url = 'http://cpu1.ms.wyue.site:8000/process'


def get_evaluation(evaluate_metric, smiles):
    data = {
        "ops": evaluate_metric,
        "data":smiles
    }
    response = requests.post(url, json=data)
    result = response.json()['results']
    return result

def mean_sr(r):
    return r.mean(), (r>0).sum()/len(r)
def eval_mo_results(df,similarity_requ=0.4,length=99999,ops=['qed','logp','donor'],qed_meta=None,logp_meta=None,donor_meta=None,candidate_num=20):
    #key = 'mo_gpt4_direct_wo_history'
    #similarity_requ = 0.4
    hist_success_times = []
    for index,row in tqdm(df[:length].iterrows()):
        mol = row['input_mol']
        #print(mol)
        combine_mols = [[mol,row[f'response{i+1}']] for i in range(candidate_num)]
        eval_res = get_evaluation(['similarity']+ops,combine_mols)
        #print(eval_res)
        if 'donor' in ops:
            input_mol_donor = eval_res['donor'][0][0]
        if 'qed' in ops:
            input_mol_qed = eval_res['qed'][0][0]
        if 'logp' in ops:
            input_mol_logp = eval_res['logp'][0][0]
        success_times = 0
        for i in range(candidate_num):
            if len(ops) == 3:
                if judge_donor(donor_meta[index]['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_donor,eval_res['donor'][i][1]) and \
                    judge_logp(logp_meta[index]['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_logp,eval_res['logp'][i][1]) and \
                    judge_qed(qed_meta[index]['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_qed,eval_res['qed'][i][1]):
                    success_times += 1
            elif len(ops)==1 and ops[0]=='qed' and judge_qed(qed_meta[index]['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_qed,eval_res['qed'][i][1]):
                success_times+=1
            elif len(ops)==1 and ops[0]=='logp' and judge_logp(logp_meta[index]['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_logp,eval_res['logp'][i][1]):
                success_times+=1
            elif len(ops)==1 and ops[0]=='donor' and judge_donor(donor_meta[index]['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_donor,eval_res['donor'][i][1]):
                success_times+=1
        #print('success times:',success_times)
        hist_success_times.append(success_times)
    return np.array(hist_success_times)





In [3]:
len(obj['final_pops'])

150

In [5]:
import json
with open('/home/v-nianran/src/MOLLM/data/uniform150_test.json', 'r') as json_file:
    dataset= json.load(json_file)
dataset['requirements'][:2]

[{'qed_requ': {'source_smiles': 'O=C([C@H]1CCCC[C@H]1N1CCCC1=O)N1CC2(CC(F)C2)C1',
   'reference_smiles': 'NNC(=O)C(=O)NC1CC2(C1)CN(C(=O)[C@H]1CCCC[C@H]1N1CCCC1=O)C2',
   'property': 'QED',
   'requirement': 'decrease'},
  'logp_requ': {'source_smiles': 'CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1',
   'reference_smiles': 'COc1ccc([C@@H](O)C(=O)N[C@H]2[C@@H]3COC[C@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1',
   'property': 'logP',
   'requirement': 'range, 2, 3'},
  'donor_requ': {'source_smiles': 'O=C(NC[C@H]1CCOc2ccccc21)c1ccc(F)c(C(F)(F)F)c1',
   'reference_smiles': 'CC(C)C[NH+](CC(=O)[O-])C(F)(F)c1cc(C(=O)NC[C@H]2CCOc3ccccc32)ccc1F',
   'property': 'donor',
   'requirement': 'increase'}},
 {'qed_requ': {'source_smiles': 'CC(C)c1cccc(N(C)CCN(C)C(=O)CCNc2cc(-c3ccsc3)cc(-c3cccc(O)c3)n2)c1',
   'reference_smiles': 'CN(C)CCN(C)C(=O)CCNc1cc(-c2ccsc2)cc(-c2cccc(O)c2)n1',
   'property': 'QED',
   'requirement': 'increase, >=0.1'},
  'logp_requ': {'source_smiles'

In [28]:
def judge_donor(  requirement, simi_requirement, similarity, donor_input_mol, donor_response_mol, delta= 1e-9 ):
    if requirement == 'increase':
        return (donor_response_mol > donor_input_mol + delta and similarity>simi_requirement)
    if requirement == 'decrease':
        return (donor_response_mol < donor_input_mol - delta  and similarity>simi_requirement)
    if requirement == 'the same':        
        return (abs(donor_input_mol - donor_response_mol) < delta and similarity>simi_requirement)
    if requirement == 'increase, >=2':
        return (donor_response_mol - donor_input_mol >= 2 and similarity>simi_requirement)
    if requirement == 'decrease, >=2':
        return (donor_input_mol - donor_response_mol>=2 and similarity>simi_requirement)
        
    raise ValueError(f'Invalid requirement: {requirement}')

def judge_qed(  requirement, simi_requirement, similarity, qed_input_mol, qed_response_mol, delta=1e-9 ):
    if abs(qed_response_mol) == 0:
        return False
    if requirement == 'increase':
        return (qed_response_mol > qed_input_mol + delta and similarity>simi_requirement)
    if requirement == 'decrease':
        return (qed_response_mol < qed_input_mol - delta and similarity>simi_requirement)
    if requirement == 'increase, >=0.1':
        return ( qed_response_mol - qed_input_mol >= 0.1 and similarity>simi_requirement)
    if requirement == 'decrease, >=0.1':
        return ( qed_input_mol - qed_response_mol >= 0.1 and similarity>simi_requirement)
    raise ValueError(f'Invalid requirement: {requirement}')

def judge_logp(  requirement, simi_requirement, similarity, logp_input_mol, logp_response_mol, delta=1e-9 ):
    if abs(logp_response_mol) == 100:
        return False
    if requirement == 'increase':
        return (logp_response_mol > logp_input_mol + delta and similarity>simi_requirement)
    if requirement == 'decrease':
        return (logp_response_mol < logp_input_mol - delta and similarity>simi_requirement)
    if 'range,' in requirement:
        a, b = [int(x) for x in requirement.split(',')[1:]]
        return (a<=logp_response_mol<=b and similarity>simi_requirement)
    raise ValueError(f'Invalid requirement: {requirement}')

In [34]:
import re
from tqdm import tqdm
import requests
import json
import numpy as np
url = 'http://cpu1.ms.wyue.site:8000/process'
import time

def get_evaluation(evaluate_metric, smiles):
    data = {
        "ops": evaluate_metric,
        "data":smiles
    }
    response = requests.post(url, json=data)
    result = response.json()['results']
    return result

def extract_smiles_from_string(text):
    pattern = r"<mol>(.*?)</mol>"
    smiles_list = re.findall(pattern, text)
    return smiles_list
def eval_mo_results(dataset,obj,similarity_requ=0.4,ops=['qed','logp','donor'],candidate_num=20):
    hist_success_times = []
    prompts = dataset['prompts']
    requs = dataset['requirements']
    for index,prompt in tqdm(enumerate(prompts)):
        mol = extract_smiles_from_string(prompt)[0]
        #print(mol)
        final_pops = obj['final_pops'][index]
        combine_mols = [[mol, i.value] for i in final_pops]
        eval_res = get_evaluation(['similarity']+ops,combine_mols)
        #print(eval_res)
        if 'donor' in ops:
            input_mol_donor = eval_res['donor'][0][0]
        if 'qed' in ops:
            input_mol_qed = eval_res['qed'][0][0]
        if 'logp' in ops:
            input_mol_logp = eval_res['logp'][0][0]
        success_times = 0
        for i in range(candidate_num):
            if len(ops) == 3:
                if judge_donor(requs[index]['donor_requ']['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_donor,eval_res['donor'][i][1]) and \
                    judge_logp(requs[index]['logp_requ']['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_logp,eval_res['logp'][i][1]) and \
                    judge_qed(requs[index]['qed_requ']['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_qed,eval_res['qed'][i][1]):
                    success_times += 1
            elif len(ops)==1 and ops[0]=='qed' and judge_qed(requs[index]['qed_requ']['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_qed,eval_res['qed'][i][1]):
                success_times+=1
            elif len(ops)==1 and ops[0]=='logp' and judge_logp(requs[index]['logp_requ']['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_logp,eval_res['logp'][i][1]):
                success_times+=1
            elif len(ops)==1 and ops[0]=='donor' and judge_donor(requs[index]['donor_requ']['requirement'],similarity_requ,eval_res['similarity'][i],input_mol_donor,eval_res['donor'][i][1]):
                success_times+=1
        #print('success times:',success_times)
        hist_success_times.append(success_times)
    return np.array(hist_success_times)

In [35]:
import pickle
filepath = '/home/v-nianran/src/results/donoruniform150.pkl'
with open(filepath, 'rb') as f:
    obj = pickle.load(f)
r = eval_mo_results(dataset,obj,similarity_requ=0.4,ops=['donor'],candidate_num=20)

150it [00:57,  2.59it/s]


In [43]:
exps = [
    ['qed'],
    ['logp'],
    ['donor'],
    ['qed','logp'],
    ['qed','donor'],
    ['logp','donor'],
    ['qed','logp','donor']
]
rs = []
for e in exps:
    filepath = '/home/v-nianran/src/results/' + '_'.join(e) + 'uniform150.pkl'
    with open(filepath, 'rb') as f:
        obj = pickle.load(f)
    r = eval_mo_results(dataset,obj,similarity_requ=0.4,ops=e,candidate_num=20)
    print(e,mean_sr(r))
    rs.append(r)

150it [01:03,  2.36it/s]


['qed'] (1.34, 0.3333333333333333)


150it [01:05,  2.28it/s]


['logp'] (1.8066666666666666, 0.38666666666666666)


150it [00:58,  2.59it/s]


['donor'] (2.513333333333333, 0.9133333333333333)


150it [01:11,  2.10it/s]

['qed', 'logp'] (0.0, 0.0)





FileNotFoundError: [Errno 2] No such file or directory: '/home/v-nianran/src/results/qed_donoruniform150.pkl'

In [44]:
exps = [
    ['logp','donor'],
    ['qed','logp','donor']
]
for e in exps:
    filepath = '/home/v-nianran/src/results/' + '_'.join(e) + 'uniform150.pkl'
    with open(filepath, 'rb') as f:
        obj = pickle.load(f)
    r = eval_mo_results(dataset,obj,similarity_requ=0.4,ops=e,candidate_num=20)
    print(e,mean_sr(r))
    rs.append(r)

150it [01:06,  2.24it/s]


['logp', 'donor'] (0.0, 0.0)


150it [01:11,  2.10it/s]

['qed', 'logp', 'donor'] (0.7333333333333333, 0.25333333333333335)





In [None]:
filepath = '/home/v-nianran/src/results/donoruniform150.pkl'
with open(filepath, 'rb') as f:
    obj = pickle.load(f)
r = eval_mo_results(dataset,obj,similarity_requ=0.4,ops=['donor'],candidate_num=20)

In [42]:
def mean_sr(r):
    k = r.clip(0,5)
    return k.mean(), (k>0).sum()/len(k)
mean_sr(r)

(2.513333333333333, 0.9133333333333333)

In [None]:
import matplotlib.pyplot as plt

'''# Initial population data
initial_qed = [i.fitness[0] for i in init_population]
initial_sa = [i.logp for i in init_population]
initial_donor = [i.donor_num for i in init_population]

# Plot initial population
plt.scatter(initial_qed, initial_sa, color='blue', label='Initial Population')

# Plot initial population points with donor_num > 2 using a star marker
for i in range(len(init_population)):
    if initial_donor[i] > 2:
        plt.scatter(initial_qed[i], initial_sa[i], color='yellow', marker='*')'''

# Final population data
population = final_pops[0]
final_qed = [i.fitness[0] for i in population]
final_sa = [i.logp for i in population]
final_donor = [i.donor_num for i in population]

# Plot final population
plt.scatter(final_qed, final_sa, color='red', label='Final Population')

# Plot final population points with donor_num > 2 using a star marker
for i in range(len(population)):
    if final_donor[i] > 2:
        plt.scatter(final_qed[i], final_sa[i], color='yellow', marker='*')

# Labels and legend
plt.xlabel('QED')
plt.ylabel('Normalized SA')
plt.legend()
plt.title('Initial and Final Populations in QED vs. Normalized SA (without LLM, random generating new candidates)')
plt.show()

## Rewarding system

In [12]:
from eval import get_evaluation
class Rewarding_system:
    def __init__(self):
        self.all_rewards = {}

    def register_reward(self, reward_name, reward_function):
        self.all_rewards[reward_name] = reward_function

    def get_reward(self, reward_name, items):
        return self.all_rewards[reward_name](*items)

    def evaluate(self,ops,smiles_list):
        return get_evaluation(ops,smiles_list)

    def evaluate_items(self, items, qed_requ, logp_requ, donor_requ, donor_num):
        smiles_list = [[item.value] for item in items]
        fitnesses, donors, logps, qeds = self._get_evaluation(smiles_list, qed_requ, logp_requ, donor_requ, donor_num)

        for i, item in enumerate(items):
            item.scores = {'qed': qeds[i], 'logp': logps[i], 'donor': donors[i]}
            item.fitness = fitnesses[i]

    def _get_evaluation(self, smiles_list, qed_requ, logp_requ, donor_requ, donor_num):
        # 具体的评估函数与之前的类似
        pass

## History

In [13]:
import pickle
import os
class History_Buffer:
    def __init__(self):
        self.prompts = []
        self.generations = []
        self.responses = []
        self.save_path = 'checkpoint/'

    def save_to_pkl(self, filename):
        with open(os.path.join(self.save_path,filename), 'wb') as f:
            pickle.dump(self, f)
        print(f"Data saved to {filename}")

    def load_from_pkl(self,filename):
        with open(os.path.join(self.save_path,filename), 'rb') as f:
            obj = pickle.load(f)
        print(f"Data loaded from {filename}")
        return obj

    def push(self,prompts,generation,responses):
        self.prompts.append(prompts)
        self.generations.append(generation)
        self.responses.append(responses)

## LLM

In [14]:
import os
from openai import AzureOpenAI
from azure.identity import AzureCliCredential, ChainedTokenCredential, DefaultAzureCredential, get_bearer_token_provider
from openai import AzureOpenAI
class LLM:
    def __init__(self,model='chatgpt'):
        self.model_choice = model
        self.model = self._init_model(model)
        self.chat = self._init_chat(model)

    def _init_chat(self,model):
        if model == 'chatgpt':
            return self.gpt_chat

    def _init_model(self,model):
        if model == 'chatgpt':
            return self._init_chatgpt()

    def _init_chatgpt(self):
        # Set the necessary variables
        resource_name = "gcrgpt4aoai2c"
        endpoint = f"https://{resource_name}.openai.azure.com/"
        api_version = "2024-02-15-preview"  # Replace with the appropriate API version

        azure_credential = ChainedTokenCredential(
            AzureCliCredential(),
            DefaultAzureCredential(
                exclude_cli_credential=True,
                # Exclude other credentials we are not interested in.
                exclude_environment_credential=True,
                exclude_shared_token_cache_credential=True,
                exclude_developer_cli_credential=True,
                exclude_powershell_credential=True,
                exclude_interactive_browser_credential=True,
                exclude_visual_studio_code_credentials=True,
                managed_identity_client_id=os.environ.get("DEFAULT_IDENTITY_CLIENT_ID"),
            )
        )

        token_provider = get_bearer_token_provider(azure_credential,
            "https://cognitiveservices.azure.com/.default")
        client = AzureOpenAI(
            api_version=api_version,
            azure_endpoint=endpoint,
            azure_ad_token_provider=token_provider
        )
        
        
        return client
    
    def gpt_chat(self, content):
        completion = self.model.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "user",
                    "content": content,
                },
            ],
        )
        res = completion.choices[0].message.content
        return res
llm = LLM()

In [15]:
llm.chat('yes')

'Hello! How can I assist you today?'

## MOO

In [16]:
def generate_sentence(original_mol,requirements,properties):
    sentences = []
    for property in properties:
        value = requirements[property+'_requ']
        property_name = value["property"]
        source_smiles = value["source_smiles"]
        requirement = value["requirement"]

        # Check for specific requirement patterns directly using symbols
        if "increase" in requirement:
            if ">=" in requirement:
                threshold = requirement.split(">=")[-1].strip()
                sentence = f"help me increase the {property_name} value by at least {threshold}."
            elif ">" in requirement:
                threshold = requirement.split(">")[-1].strip()
                sentence = f"help me increase the {property_name} value to more than {threshold}."
            else:
                sentence = f"help me increase the {property_name} value."

        elif "decrease" in requirement:
            if "<=" in requirement:
                threshold = requirement.split("<=")[-1].strip()
                sentence = f"help me decrease the {property_name} value to at most {threshold}."
            elif "<" in requirement:
                threshold = requirement.split("<")[-1].strip()
                sentence = f"help me decrease the {property_name} value to less than {threshold}."
            else:
                sentence = f"help me decrease the {property_name} value."

        elif "range" in requirement:
            # Extract the range values from the string
            range_values = requirement.split(",")[1:]
            range_start = range_values[0].strip()
            range_end = range_values[1].strip()
            sentence = f"help me keep the {property_name} value within the range {range_start} to {range_end}."

        elif "the same" in requirement:
            sentence = f"help me keep the {property_name} value the same."

        elif any(op in requirement for op in [">=", "<=", "=", ">", "<"]):
            # Directly use the symbols for constraints
            sentence = f"help me ensure the {property_name} value is {requirement}."
            
        else:
            sentence = f"help me modify the {property_name} value."
        sentences.append(sentence)
    sentences = f'Suggest new molecules based on molecule <mol>{original_mol}</mol> and ' + 'and '.join(sentences) 
    return sentences

# Example metadata
metadata = {
    "qed_requ": {
        "source_smiles": "O=C([C@H]1CCCC[C@H]1N1CCCC1=O)N1CC2(CC(F)C2)C1",
        "reference_smiles": "NNC(=O)C(=O)NC1CC2(C1)CN(C(=O)[C@H]1CCCC[C@H]1N1CCCC1=O)C2",
        "property": "QED",
        "requirement": "decrease <=2"
    },
    "logp_requ": {
        "source_smiles": "CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1",
        "reference_smiles": "COc1ccc([C@@H](O)C(=O)N[C@H]2[C@@H]3COC[C@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1",
        "property": "logP",
        "requirement": "range, 2, 3"
    },
    "donor_requ": {
        "source_smiles": "O=C(NC[C@H]1CCOc2ccccc21)c1ccc(F)c(C(F)(F)F)c1",
        "reference_smiles": "CC(C)C[NH+](CC(=O)[O-])C(F)(F)c1cc(C(=O)NC[C@H]2CCOc3ccccc32)ccc1F",
        "property": "donor",
        "requirement": "increase >= 1"
    },
}

# Generate sentences based on metadata
generate_sentence('CCH',metadata,['qed','donor'])

'Suggest new molecules based on molecule <mol>CCH</mol> and help me decrease the QED value to at most 2.and help me increase the donor value by at least 1.'

In [22]:
import numpy as np
import matplotlib.pyplot as plt
import concurrent.futures
import re
import copy
from tqdm import tqdm
# Set your OpenAI API key
import random
import torch
from functools import partial
import os
from openai import AzureOpenAI
from tdc.generation import MolGen

from eval import get_evaluation
import time
from model.util import nsga2_selection,so_selection
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def extract_smiles_from_string(text):
    pattern = r"<mol>(.*?)</mol>"
    smiles_list = re.findall(pattern, text)
    return smiles_list

def split_list(lst, n):
    """Splits the list lst into n nearly equal parts."""
    k, m = divmod(len(lst), n)
    return [lst[i*k + min(i, m):(i+1)*k + min(i+1, m)] for i in range(n)]

class MOO:
    def __init__(self, reward_system, llm,property_list,config):
        self.reward_system = reward_system
        self.config = config
        self.llm = llm
        self.history = History_Buffer()
        self.property_list = property_list
        self.moles_df = None
        self.pop_size = self.config.get('optimization.pop_size')
        
    def init_mol_dataset(self):
        print('Loading ZINC dataset...')
        data = MolGen(name='ZINC')
        split = data.get_split()
        self.moles_df = split['train']

    def generate_initial_population(self, mol1, n):
        num_blocks = 200
        combs = [[mol1, mol2] for mol2 in self.moles_df.smiles]
        combs_blocks = split_list(combs, num_blocks)

        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = [executor.submit(get_evaluation, ['similarity'], block) for block in combs_blocks]
            results = [future.result() for future in futures]

        combined_results = []
        for result in results:
            combined_results.extend(result['similarity'])

        self.moles_df['similarity'] = combined_results
        top_n = self.moles_df.nlargest(n - 1, 'similarity').smiles.values.tolist()
        top_n.append(mol1)
        return [Item(i,self.property_list) for i in top_n]

    def crossover(self, parent_list):
        pop_content = self._prepare_pop_content(parent_list)
        prompt = generate_sentence(self.original_mol,self.requirement_meta,self.property_list)

        prompt = (
        prompt + 
        "I have some molecules with their objective values. "
        + pop_content +
        " Give me two new molecules that are different from all points above, and not dominated by any of the above. "
        "You can do it by applying crossover on the points I give to you. "
        f"Please note when you try to achieving these objectives, the molecules given you propose should be similar to the original molecule <mol>{self.original_mol}</mol>. "
        "Do not write code. Do not give any explanation. Each output new molecule must start with <mol> and end with </mol> in SIMLE form"
        )
        
        response = self.llm.chat(prompt)
        new_smiles = extract_smiles_from_string(response)
        return [Item(smile,self.property_list) for smile in new_smiles],prompt,response

    def _prepare_pop_content(self, ind_list):
        pop_content = ""
        for ind in ind_list:
            pop_content += f"<mol>{ind.value}</mol>,"
            for index,property in enumerate(ind.property_list):
                pop_content += f"{property}:{ind.raw_scores[index]},  "
            pop_content += '\n'
        return pop_content

    def evaluate(self, smiles_list):
        ops = self.property_list
        res = self.reward_system.evaluate(ops,smiles_list)
        results = np.zeros([len(smiles_list),len(ops)])
        raw_results = np.zeros([len(smiles_list),len(ops)])
        for i,op in enumerate(ops):
            part_res = res[op]
            for j,inst in enumerate(part_res):
                inst = inst[0]
                raw_results[j,i] = inst
                if op=='qed':
                    if 'increase' in self.requirement_meta['qed_requ']['requirement']:
                        results[j,i] = -inst
                    else:
                        results[j,i] = inst
                elif op=='logp':
                    logp_requ = self.requirement_meta['logp_requ']['requirement']
                    if 'increase' in logp_requ:
                        results[j,i] = -inst / 10
                    elif 'range' in logp_requ:
                        a, b = [int(x) for x in logp_requ.split(',')[1:]]
                        mid = (b+a)/2
                        results[j,i] = np.clip(abs(inst-mid) * 1/ ( (b-a)/2), a_min=0,a_max=1)    # 2-3    2.5  3-2.5=0.5 * 2     1 0
                        #if a<=inst<=b:
                        #    results[j,i] = 0
                        #else:
                        #    results[j,i] = 1     
                    else:
                        results[j,i] = inst/10
                elif op=='donor':
                    #print('donor num',inst,donor_num,j,i)
                    donor_requ = self.requirement_meta['donor_requ']['requirement']
                    donor_num = self.requirement_meta['donor_num']
                    if donor_requ == 'increase' and inst - donor_num>0:
                        results[j,i] = 0
                    elif donor_requ == 'decrease' and donor_num - inst>0:
                        results[j,i] = 0
                    elif donor_requ == 'same' and donor_num == inst:
                        results[j,i] = 0
                    elif donor_requ == 'increase, >=2' and inst - donor_num>=2:
                        results[j,i] = 0
                    else:
                        results[j,i] = 1  
                else:
                    raise NotImplementedError
        return results,raw_results

    def evaluate_all(self,items):
        smiles_list = [i.value for i in items]
        smiles_list = [[i,smiles_list[0]] for i in smiles_list]
        fitnesses,raw_results = self.evaluate(smiles_list)
        for i,ind in enumerate(items):
            ind.scores = fitnesses[i]
            ind.raw_scores = raw_results[i]

    def run(self, prompt,requirements):
        """High level logic"""
        self.requirement_meta = requirements
        ngen= self.config.get('optimization.ngen')
        

        self.init_mol_dataset()
        
        #initialization 
        mol = extract_smiles_from_string(prompt)[0]
        self.original_mol = mol

        population = self.generate_initial_population(mol1=mol, n=self.pop_size)
        donor_num = get_evaluation(['donor'], [[mol, mol]])['donor'][0][0]
        self.requirement_meta['donor_num'] = donor_num
        self.evaluate_all(population)

        init_pops = copy.deepcopy(population)

        #offspring_times = self.config.get('optimization.eval_budge') // ngen //2
        offspring_times = self.pop_size //2
        for gen in tqdm(range(ngen)):
            offspring = self.generate_offspring(population, prompt, offspring_times)
            population = self.select_next_population(population, offspring, self.pop_size)
        return init_pops,population

    def generate_offspring(self, population, prompt, offspring_times):
        offspring = []
        #for _ in range(offspring_times): # 20 10 crossver+mutation 20 
        parents = [random.sample(population, 2) for i in range(offspring_times)]
        while True:
            try:
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    futures = [executor.submit(self.crossover, parent_list=parent_list) for parent_list in parents]
                    results = [future.result() for future in futures]
                    children, prompts, responses = zip(*results) #[[item,item],[item,item]] # ['who are you value 1', 'who are you value 2'] # ['yes, 'no']
                    break
            except Exception as e:
                print('retry in 60s, exception ',e)
                time.sleep(90)
        #parents = random.sample(population, 2)
        #children,prompt,response = self.crossover(prompt, parents) 
        for child_pair in children:
            self.evaluate_all(child_pair)
            offspring.extend(child_pair)
        self.history.push(prompts,children,responses) 
        return offspring

    def select_next_population(self, population, offspring, pop_size):
        combined_population = population + offspring
        if len(self.property_list)>1:
            return nsga2_selection(combined_population, pop_size)
        else:
            return so_selection(combined_population, pop_size)
c,p,r = None,None,None

## MOLLM

In [23]:
import yaml
import json
class ConfigLoader:
    def __init__(self, config_path="config.yaml"):
        self.config = self._load_config(config_path)

    def _load_config(self, config_path):
        with open(config_path, 'r') as file:
            config = yaml.safe_load(file)
        return config

    def get(self, key, default=None):
        keys = key.split('.')
        value = self.config
        for k in keys:
            value = value.get(k, {})
        if value == {}:
            return default
        return value

config_loader = ConfigLoader("config/base.yaml")

pop_size = config_loader.get("optimization.pop_size")
eval_budget = config_loader.get("optimization.eval_budget")
ngen = config_loader.get("optimization.ngen")
qed_requ = config_loader.get("optimization.qed_requ")
logp_requ = config_loader.get("optimization.logp_requ")
donor_requ = config_loader.get("optimization.donor_requ")
model_name = config_loader.get("model.name")

print(f"Population size: {pop_size}, Evaluation budget: {eval_budget}")

class MOLLM:
    def __init__(self, config='config/base.yaml'):
        self.config = ConfigLoader(config)
        self.property_list = self.config.get('goals')
        self.reward_system = Rewarding_system()
        self.llm = LLM()
        self.history = []
        self.load_dataset()
        self.init_pops = []
        self.final_pops = []
        self.save_dir = self.config.get('save_dir')
        self.save_suffix = self.config.get('save_suffix')
    
    def load_dataset(self):
        with open(self.config.get('dataset.path'), 'r') as json_file:
            self.dataset= json.load(json_file)
    
    def run(self):
        for i in range(len(self.dataset['prompts'])):
            moo = MOO(self.reward_system, self.llm,self.property_list,self.config)
            init_pops,final_pops = moo.run(self.dataset['prompts'][i], self.dataset['requirements'][i])
            self.history.append(moo.history)
            self.final_pops.append(final_pops)
            self.init_pops.append(init_pops)
            self.save_to_pkl(os.path.join(self.save_dir,'_'.join(self.property_list) + self.save_suffix +'.pkl'))

    def save_to_pkl(self, filepath):
        data = {
            'history':self.history,
            'init_pops':self.init_pops,
            'final_pops':self.final_pops
        }
        with open(filepath, 'wb') as f:
            pickle.dump(data, f)
        print(f"Data saved to {filepath}")

    def load_from_pkl(self,filepath):
        with open(filepath, 'rb') as f:
            obj = pickle.load(f)
        print(f"Data loaded from {filepath}")
        return obj
        

Population size: 4, Evaluation budget: 200


## Test and Tutorial

In [20]:
mollm = MOLLM()

In [32]:
## test initialization
mol = 'CCH'
mollm.moo.init_mol_dataset()
init_pops = mollm.moo.generate_initial_population(mol,10)
for i in init_pops:
    print(i.value)

Found local copy...
Loading...
Done!


Loading ZINC dataset...
CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1
N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)cc2)cc1
CC[NH+](CC)[C@](C)(CC)[C@H](O)c1cscc1Br
O=C(Nc1nc[nH]n1)c1cccnc1Nc1cccc(F)c1
Cc1c(/C=N/c2cc(Br)ccn2)c(O)n2c(nc3ccccc32)c1C#N
C[C@@H]1CN(C(=O)c2cc(Br)cn2C)CC[C@H]1[NH3+]
CCOc1ccc(OCC)c([C@H]2C(C#N)=C(N)N(c3ccccc3C(F)(F)F)C3=C2C(=O)CCC3)c1
Cc1ccccc1C(=O)N1CCC2(CC1)C[C@H](c1ccccc1)C(=O)N2C
CCCc1cc(NC(=O)CN2C(=O)NC3(CCC(C)CC3)C2=O)n(C)n1
CCH


In [33]:
## test evaluation in MOO
mollm.moo.requirement_meta['donor_num'] = 1
mollm.moo.evaluate_all(init_pops)

In [34]:
## test generation offspring
prompt = 'How can we modify the molecule  to decrease its logP value? and still for this molecule Consider a molecule with the SMILES string . Propose changes that could increase its QED value by at least 0.1 compared to the pre-optimized value to make it more drug-like. and still for this molecule  Support me in transforming the molecule  by incorporating the same hydrogen bond donors. This molecule is <mol>CC=C(C=CC(=C(C)CC)N1CCOC(C(Cc2cccs2)C(O)CCl)C1)Nc1nc(-c2ccc(C=N)c(N)c2)cn2ccnc12</mol>'
offs = mollm.moo.generate_offspring(init_pops, prompt, 2)
offs

[<__main__.Item at 0x703ac2c1e250>,
 <__main__.Item at 0x7039c42151f0>,
 <__main__.Item at 0x7039c3f5c880>,
 <__main__.Item at 0x703b14d9db80>]

In [35]:
for i in offs:
    print(i.value)

CC=C(C=CC(=C(C)CC)N1CCOC(C(Cc2cccs2)C(O)C(Cl)CCl)C1)Nc1nc(-c2ccc(C=N)c(N)c2)cn2ccnc12
CC=C(C=CC(=C(C)CC)N1CCOC(C(Cc2cccs2)C(O)C(O)N1)Nc1nc(-c2ccc(C=N)c(N)c2)cn2ccnc12
CC=C(C=CC(=C(C)CC)N1CCOC(C(Cc2cccs2)C(O)C(=O))C1)Nc1nc(-c2ccc(C=N)c(N)c2)cn2ccnc12
CC=C(C=CC(=C(C)CC)N1CCOC(C(Cc2cccs2)C(OH)CCl)C1)Nc1nc(-c2ccc(C=N)c(F)c2)cn2ccnc12


In [37]:
## test selection next generation
next_gen = mollm.moo.select_next_population(init_pops, offs, 3)
for i in next_gen:
    print(i.scores)

[-0.82714972]
[-0.82237984]
[-0.81871365]


In [None]:
## overall 
mollm.moo.run

In [1]:
# load checkpoint
import pickle
with open('/home/v-nianran/src/results/qeduniform150.pkl','rb') as f:
    file = pickle.load(f)
file.keys()

dict_keys(['history', 'init_pops', 'final_pops'])

In [6]:
for i in range(2,5):
    print(i)

2
3
4


In [3]:
from tdc import Oracle
oracle = Oracle(name = 'LogP')

oracle(['c1ccccc1','CCH'])


[2.095171701744179, 0.0]

## make dataset

In [15]:
import pandas as pd
donor_df = pd.read_csv('/home/v-nianran/src/data/dpo_donor_test.csv')
qed_df = pd.read_csv('/home/v-nianran/src/data/dpo_qed_test.csv')
logp_df = pd.read_csv('/home/v-nianran/src/data/dpo_logp_test.csv')

def extract_smiles_from_string(text):
    pattern = r"<mol>(.*?)</mol>"
    smiles_list = re.findall(pattern, text)
    return smiles_list
num = 50
df = pd.concat([logp_df.iloc[:num],qed_df.iloc[num:num*2],donor_df.iloc[num*2:num*3]])
moles = [extract_smiles_from_string(p)[0] for p in df.prompt]
moles

['CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1',
 'CC=C(C=CC(=C(C)CC)N1CCOC(C(Cc2cccs2)C(O)CCl)C1)Nc1nc(-c2ccc(C=N)c(N)c2)cn2ccnc12',
 'CCOc1cc(C)ccc1Cn1c(-c2cccc(C)n2)nnc1N1CCN(S(C)(=O)=O)[C@@H](C)C1',
 'CCC1Cc2c(nn(-c3ccc4ccccc4c3)c2-c2cc(Cl)cc(Cl)c2)C(CC)N1C(=O)c1cccc(-n2cnnc2)c1',
 'CC1Oc2ccc(-c3cn4ccsc4n3)cc2N(C(Cc2cccs2)C(CCl)C(=O)c2ccc(-c3ccccc3)cc2)C1=O',
 'Cc1cccc(C2CCCCCN2S(=O)(=O)c2cccc3nsnc23)c1',
 'CCC(C)=C(CC)CCC(=O)NC1CCN(C2CCN(C(=O)[C@H]3CC[C@H](C(=O)OC)C3)CC2)CC1',
 'CCCc1ccc(-c2ccc([C@H](O)C[C@H]3CCCN3C[C@@H]3CCC4(CCCC4)O3)cc2)s1',
 'CC(C)(C)c1nc([NH2+]c2nc(Nc3ccc4c(c3)CNCC4(C)C)ncc2C(O)NC2CC2)cs1',
 'CCc1cc2c(CN3CCN(C(=O)c4cccs4)CC3)cc(=O)oc2cc1-c1c(C)cc(C)cc1C',
 'COc1cc(C(=O)OCCN2CCO[C@H](C)C2)ccc1[N+](=O)[O-]',
 'CCOc1ccccc1CCCNC(=O)c1ccccc1SCc1ccccc1',
 'O=C(CCc1ccc[nH]1)N1CC[C@]2(C1)CN(C(=O)C1=COCC1)CCN2C(=O)Oc1c(Cl)cc(-c2ccccc2)cc1Cl',
 'O=C(CCCCN1CCCCCC1)c1ccc2nc(Cc3ccccc3)c(=O)n(Cc3ccc(F)cc3)c2n1',
 'C[C@@H](F)CCN1CC[C@

In [16]:
combine_prompt = []
for index,mol in enumerate(moles):
    new_p = re.sub(r'<mol>.*?</mol>', '', logp_df.prompt[index]) + ' and still for this molecule ' +  \
        re.sub(r'<mol>.*?</mol>', '', qed_df.prompt[index]) + ' and still for this molecule  ' +\
        re.sub(r'<mol>.*?</mol>', '', donor_df.prompt[index]) + f' This molecule is <mol>{mol}</mol>'
    combine_prompt.append(new_p)
len(combine_prompt),combine_prompt[1]

(150,
 'How can we modify the molecule  to decrease its logP value? and still for this molecule Consider a molecule with the SMILES string . Propose changes that could increase its QED value by at least 0.1 compared to the pre-optimized value to make it more drug-like. and still for this molecule  Support me in transforming the molecule  by incorporating the same hydrogen bond donors. This molecule is <mol>CC=C(C=CC(=C(C)CC)N1CCOC(C(Cc2cccs2)C(O)CCl)C1)Nc1nc(-c2ccc(C=N)c(N)c2)cn2ccnc12</mol>')

In [17]:
import json
with open('/home/v-nianran/src/data/test_qed.metadata.json','r') as f:
    qed_meta = json.load(f)
with open('/home/v-nianran/src/data/test_logp.metadata.json','r') as f:
    logp_meta = json.load(f)
with open('/home/v-nianran/src/data/test_donor.metadata.json','r') as f:
    donor_meta = json.load(f)
qed_meta[80],logp_meta[80],donor_meta[80]

({'source_smiles': 'CC(Cc1cncc(Br)c1)c1ccc(CC(=O)N2C[C@@H]3COC[C@H](C2)[C@@H]3NC(=O)c2cc3c(n2C)-c2ncccc2OC3)nc1',
  'reference_smiles': 'Cn1c(C(=O)N[C@H]2[C@@H]3COC[C@H]2CN(C(=O)Cc2ccc(O)cn2)C3)cc2c1-c1ncccc1OC2',
  'property': 'QED',
  'requirement': 'increase'},
 {'source_smiles': 'C=C(C)C(c1ccccc1OCc1cccc(C(=O)NC(C)c2cccc(Br)c2)c1)C(C)(F)Br',
  'reference_smiles': 'CC(NC(=O)c1cccc(COc2ccccc2C(N)=O)c1)c1cccc(Br)c1',
  'property': 'logP',
  'requirement': 'range, 4, 5'},
 {'source_smiles': 'CC(C)(C)c1cc(C(N)=[NH+]O)ccc1C(=O)NN1CC(C(=O)OCC(=O)c2ccc(Cl)cc2Cl)CC1=O',
  'reference_smiles': 'CC(C)(C)c1ccc(C(=O)NN2CC(C(=O)OCC(=O)c3ccc(Cl)cc3Cl)CC2=O)cc1',
  'property': 'donor',
  'requirement': 'decrease, >=2'})

In [18]:
data = {
    'prompts':combine_prompt,
    'requirements':[
        {'qed_requ': qed_meta[i],
        'logp_requ': logp_meta[i],
        'donor_requ': donor_meta[i]} for i in range(len(combine_prompt))
    ]
}
json_file_path = 'data/uniform150_test.json'

with open(json_file_path, 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [19]:
with open(json_file_path, 'r') as json_file:
    new_data = json.load(json_file,)
new_data['requirements'][0]

{'qed_requ': {'source_smiles': 'O=C([C@H]1CCCC[C@H]1N1CCCC1=O)N1CC2(CC(F)C2)C1',
  'reference_smiles': 'NNC(=O)C(=O)NC1CC2(C1)CN(C(=O)[C@H]1CCCC[C@H]1N1CCCC1=O)C2',
  'property': 'QED',
  'requirement': 'decrease'},
 'logp_requ': {'source_smiles': 'CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1',
  'reference_smiles': 'COc1ccc([C@@H](O)C(=O)N[C@H]2[C@@H]3COC[C@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1',
  'property': 'logP',
  'requirement': 'range, 2, 3'},
 'donor_requ': {'source_smiles': 'O=C(NC[C@H]1CCOc2ccccc21)c1ccc(F)c(C(F)(F)F)c1',
  'reference_smiles': 'CC(C)C[NH+](CC(=O)[O-])C(F)(F)c1cc(C(=O)NC[C@H]2CCOc3ccccc32)ccc1F',
  'property': 'donor',
  'requirement': 'increase'}}

In [54]:
len(new_data['prompts'])

150

## Experiments

In [24]:
mollm = MOLLM('config/base.yaml')
mollm.run()

Found local copy...
Loading...
Done!


Loading ZINC dataset...


  0%|          | 0/3 [00:00<?, ?it/s]

Suggest new molecules based on molecule <mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol> and help me decrease the QED value.I have some molecules with their objective values. <mol>CCn1cc(NC(=O)C2CCN(C(=O)CCc3ccccc3Cl)CC2)cn1</mol>,qed:0.8245023098421231,  
<mol>COC(=O)[C@@H](c1ccccc1Cl)N1CCN(C(=O)CCc2ccccc2Cl)CC1</mol>,qed:0.6444932819174114,  
 Give me two new molecules that are different from all points above, and not dominated by any of the above. You can do it by applying crossover on the points I give to you. Please note when you try to achieving these objectives, the molecules given you propose should be similar to the original molecule <mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol>. Do not write code. Do not give any explanation. Each output new molecule must start with <mol> and end with </mol> in SIMLE form
Suggest new molecules based on molecule <mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=

 33%|███▎      | 1/3 [00:03<00:06,  3.48s/it]

Suggest new molecules based on molecule <mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol> and help me decrease the QED value.I have some molecules with their objective values. <mol>CCCCC(CC)COCOc1ccc(C(=O)NC2CCC3C(COc3ccccc3Cl)CCC2C3)cc1</mol>,qed:0.229442435633374,  
<mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol>,qed:0.21636799460509812,  
 Give me two new molecules that are different from all points above, and not dominated by any of the above. You can do it by applying crossover on the points I give to you. Please note when you try to achieving these objectives, the molecules given you propose should be similar to the original molecule <mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol>. Do not write code. Do not give any explanation. Each output new molecule must start with <mol> and end with </mol> in SIMLE form
Suggest new molecules based on m

 67%|██████▋   | 2/3 [00:06<00:03,  3.30s/it]

Suggest new molecules based on molecule <mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol> and help me decrease the QED value.I have some molecules with their objective values. <mol>CCCCC(CC)COCOc1ccc([C@@H](CO)C(=O)N[C@@H]2CCC3C(COc3ccccc3Cl)CCC2C3)cc1</mol>,qed:0.18847463002625045,  
<mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol>,qed:0.21636799460509812,  
 Give me two new molecules that are different from all points above, and not dominated by any of the above. You can do it by applying crossover on the points I give to you. Please note when you try to achieving these objectives, the molecules given you propose should be similar to the original molecule <mol>CCCCC(CC)COCOc1ccc([C@@H](O)C(=O)N[C@@H]2[C@H]3COC[C@@H]2CN(C(=O)CCc2ccccc2Cl)C3)cc1</mol>. Do not write code. Do not give any explanation. Each output new molecule must start with <mol> and end with </mol> in SIMLE form
Suggest new mol

 67%|██████▋   | 2/3 [00:08<00:04,  4.26s/it]


KeyboardInterrupt: 