In [50]:
import openai
import pandas as pd
import lmql
import numpy as np
import pandas as pd
import json

In [12]:
toxic_smiles = "CC1=C(C(=O)C2=C(C1=O)N3C[C@H]4[C@@H]([C@@]3([C@@H]2COC(=O)N)OC)N4)N"
prompt = f"Analyze the SMILES sequence below and classify it as either 'Hepatotoxicity' or 'NonHepatotoxicity': '{toxic_smiles}'"

response = openai.ChatCompletion.create(
    model="gpt-4o", messages=[{"role": "user", "content": prompt}]
)

In [13]:
classifications = response["choices"][0]["message"]["content"].splitlines()
classifications

["To classify the given SMILES sequence as 'Hepatotoxicity' or 'NonHepatotoxicity', we need to analyze the chemical structure it represents and assess its potential for liver toxicity. The SMILES you've provided represents a chemical structure that can be quite complex to interpret directly without a specific chemical database or computational tools to predict toxicity.",
 '',
 'Here are some key steps to evaluate the hepatotoxicity of the molecule:',
 '',
 '1. **Structural Features**: Evaluate if the SMILES string corresponds to a structure with known hepatotoxic functional groups or substructures, such as nitro groups, certain aromatic amines, or specific halogenated compounds.',
 '',
 '2. **Literature and Databases**: Utilize cheminformatics tools or databases to match the SMILES against known hepatotoxic compounds. Resources like PubChem, ChemSpider, or specific toxicology databases could be helpful.',
 '',
 '3. **Toxicity Prediction Software**: Use in silico methods like QSAR (Qua

# Few shot learning

In [2]:
# Sample DataFrame with SMILES strings
data = pd.DataFrame(
    {
        "Smiles": [
            "C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN(C2)C3=C(C=C4C(=O)C(=CN(C4=N3)C5=C(C=C(C=C5)F)F)C(=O)O)F)N",
            "C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl",
            "C1=CC(=CC=C1N/C(=N/C(=NCCCCCCN=C(/N=C(/NC2=CC=C(C=C2)Cl)\\N)N)N)/N)Cl",
        ],
        "Class": ["Hepatotoxicity", "Hepatotoxicity", "NonHepatotoxicity"],
    }
)

data

Unnamed: 0,Smiles,Class
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,Hepatotoxicity
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,Hepatotoxicity
2,C1=CC(=CC=C1N/C(=N/C(=NCCCCCCN=C(/N=C(/NC2=CC=...,NonHepatotoxicity


In [79]:
# Define few-shot examples to guide the model
few_shot_examples = """
1. SMILES: S=C=Nc1c2c(ccc1)cccc2
Classification: Hepatotoxicity

2. SMILES: Clc1ccc(C[C@@H](NC(=O)[C@H](NC(=O)C)Cc2cc3c(cc2)cccc3)C(=O)N[C@@H](C(=O)N[C@H](C(=O)N([C@H](C(=O)N[C@@H](C(=O)N[C@H](C(=O)N[C@H](C(=O)N2[C@@H](CCC2)C(=O)N[C@H](C)C(=O)N)CCCCNC(C)C)CC(C)C)CC(=O)N)Cc2ccc(O)cc2)C)CO)Cc2cccnc2)cc1
Classification: Hepatotoxicity

3. SMILES: O(c1c([C@]23C[C@@H]4C[C@H](C2)C[C@H](C3)C4)cc(cc1)c1cc2c(cc1)cc(cc2)C(=O)O)C
Classification: NonHepatotoxicity
"""

# Define the LMQL query with few-shot examples
@lmql.query(model="gpt-4", temperature=0)
async def classify_smiles(few_shot_examples, smiles):
    '''lmql
    "**Context**: Below are a few examples of SMILES strings with their toxicity classification.\n"
    "** Task**: Please use these examples to classify the new SMILES strings as either 'Hepatotoxicity' or 'NonHepatotoxicity'.\n"

    "**Instructions**: analyze the chemical structure it represents and assess its potential for liver toxicity\n"
    "1. Structural Features: Evaluate if the SMILES string corresponds to a structure with known hepatotoxic functional groups or substructures, such as nitro groups, certain aromatic amines, or specific halogenated compounds.\n"
    "2. Literature and Databases: Utilize cheminformatics tools or databases to match the SMILES against known hepatotoxic compounds. Resources like PubChem, ChemSpider, or specific toxicology databases could be helpful.\n"
    "3. Experimental Data and Reports: Check scientific literature or toxicology reports for any experimental data or case studies related to substances with similar structures.\n"
    "4. Output the answer is its predicted classification in list format.\n"
    
    # Few-shot examples
    {few_shot_examples}

    # Q&A prompt template
    "Q: Analyze the SMILES sequence below and classify it as either 'Hepatotoxicity' or 'NonHepatotoxicity': {smiles}\n"
    "A: Let's think step by step.\n"
    "[REASONING]"
    "Provide the classification for each SMILES string in a Python list format: [ANSWER]."
    
    # return just the ANSWER to the caller
    return ANSWER    

    '''

In [80]:
classifications = await classify_smiles(few_shot_examples, ["C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl", "C1=CC(=CC=C1N/C(=N/C(=NCCCCCCN=C(/N=C(/NC2=CC=C(C=C2)Cl)\\N)N)N)/N)Cl"])
classifications

" ['Hepatotoxicity', 'Hepatotoxicity']"

In [81]:
eval(classifications)[0]

'Hepatotoxicity'

In [82]:
# Function to classify a batch of SMILES strings
async def classify_smiles_batch(smiles_batch):
    # Run the LMQL query with batch input
    response = await classify_smiles(few_shot_examples, smiles_batch)
    
    # Evaluate response to convert string list format into an actual Python list
    try:
        classifications = eval(response)  # Assumes response is in Python list format
        return classifications
    except Exception as e:
        print(f"Error parsing response: {e}")
        return [None] * len(smiles_batch)  # Return None for failed parsing

# Process the data in batches
batch_size = 2
smiles_batches = [data['Smiles'][i:i + batch_size].tolist() for i in range(0, len(data), batch_size)]
results = []

# Classify each batch and store results
for batch in smiles_batches:
    batch_results = await classify_smiles_batch(batch)
    results.extend(batch_results)

# Add results to DataFrame
data['Classification'] = results
print(data)

                                              Smiles              Class  \
0  C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...     Hepatotoxicity   
1                      C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl     Hepatotoxicity   
2  C1=CC(=CC=C1N/C(=N/C(=NCCCCCCN=C(/N=C(/NC2=CC=...  NonHepatotoxicity   

      Classification  
0     Hepatotoxicity  
1  NonHepatotoxicity  
2     Hepatotoxicity  


# Batch prediction

In [93]:
# Define the LMQL query with few-shot examples
@lmql.query(model="gpt-4", temperature=0, max_len=8192)
async def classify_smiles(few_shot_examples, smiles):
    '''lmql
    "**Context**: Below are a few examples of SMILES strings with their toxicity classification.\n"
    "** Task**: Please use these examples to classify the new SMILES strings as either 'Hepatotoxicity' or 'NonHepatotoxicity'.\n"

    "**Instructions**: analyze the chemical structure it represents and assess its potential for liver toxicity\n"
    "1. Structural Features: Evaluate if the SMILES string corresponds to a structure with known hepatotoxic functional groups or substructures, such as nitro groups, certain aromatic amines, or specific halogenated compounds.\n"
    "2. Literature and Databases: Utilize cheminformatics tools or databases to match the SMILES against known hepatotoxic compounds. Resources like PubChem, ChemSpider, or specific toxicology databases could be helpful.\n"
    "3. Experimental Data and Reports: Check scientific literature or toxicology reports for any experimental data or case studies related to substances with similar structures.\n"
    "4. Output the answer is its predicted classification in list format.\n"
    
    # Few-shot examples
    {few_shot_examples}

    # Q&A prompt template
    "Q: Analyze the SMILES sequence below and classify it as either 'Hepatotoxicity' or 'NonHepatotoxicity': {smiles}\n"
    "A: Let's think step by step.\n"
    "[REASONING]"
    "Provide the classification for each SMILES string in a Python list format: [ANSWER]."
    
    # return just the ANSWER to the caller
    return ANSWER    

    '''

In [89]:
# Load pd_train
pd_train = pd.read_csv("data_smiles/Training_Group.csv")
pd_train["label"] = pd_train["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_train.shape)
pd_train["label"].value_counts()

(1241, 3)


label
1    683
0    558
Name: count, dtype: int64

In [90]:
# Load pd_test
pd_test = pd.read_csv("data_smiles/Testing_Group.csv")
pd_test["label"] = pd_test["Liver"].apply(lambda x: 1 if x == "Hepatotoxicity" else 0)
print(pd_test.shape)
pd_test["label"].value_counts()

(286, 3)


label
1    221
0     65
Name: count, dtype: int64

In [94]:
# Sample 10 examples, aiming for a balanced selection if possible
sampled_data = pd_train.groupby("label").apply(lambda x: x.sample(n=5, random_state=42)).reset_index(drop=True)
print(sampled_data[["Smiles", "Liver"]])

# Generate the few-shot examples
few_shot_examples = ""
for i, row in enumerate(sampled_data.itertuples(), 1):
    few_shot_examples += f"{i}. SMILES: {row.Smiles}\nClassification: {row.Liver}\n\n"

print("Few-shot Examples:\n", few_shot_examples)

                                              Smiles              Liver
0  O=C1[C@@]2([C@H]([C@H]3[C@H](CC2)[C@@]2(C(=CC(...  NonHepatotoxicity
1  O([C@@H](COC(=O)[C@@H](N)C(C)C)CO)Cn1c2[nH]c(n...  NonHepatotoxicity
2    Clc1c(S[C@@H](CCc2ccc(Cl)cc2)Cn2ccnc2)c(Cl)ccc1  NonHepatotoxicity
3  FC(F)([C@@]1(O[C@H]2[C@@H]([C@H](C(=O)C2)CCCCC...  NonHepatotoxicity
4             Clc1cc2nccc(N[C@@H](CCCN(CC)CC)C)c2cc1  NonHepatotoxicity
5  ClC1=C(N2[C@@H]([C@H](NC(=O)[C@H](N)c3ccccc3)C...     Hepatotoxicity
6  O1[C@@H]([C@H](NC(=O)[C@H](NC(=O)[C@H](NC(=O)[...     Hepatotoxicity
7          Clc1c2c([C@@H](CNCC2)c2ccc(O)cc2)cc(O)c1O     Hepatotoxicity
8              Clc1ccc([C@H]2S(=O)(=O)CCC(=O)N2C)cc1     Hepatotoxicity
9  S1(=O)(=O)N(/C(=C(/O)\Nc2sc(cn2)C)/C(=O)c2c1cc...     Hepatotoxicity
Few-shot Examples:
 1. SMILES: O=C1[C@@]2([C@H]([C@H]3[C@H](CC2)[C@@]2(C(=CC(=O)C=C2)C(=C)C3)C)CC1)C
Classification: NonHepatotoxicity

2. SMILES: O([C@@H](COC(=O)[C@@H](N)C(C)C)CO)Cn1c2[nH]c(nc(=O)c2nc1)N
Cl

  sampled_data = pd_train.groupby("label").apply(lambda x: x.sample(n=5, random_state=42)).reset_index(drop=True)


In [95]:
classifications = await classify_smiles(few_shot_examples, ["C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl", "C1=CC(=CC=C1N/C(=N/C(=NCCCCCCN=C(/N=C(/NC2=CC=C(C=C2)Cl)\\N)N)N)/N)Cl"])
classifications


 (<class 'TimeoutError'>)

Retrying... (attempt: 0)


" ['Hepatotoxicity', 'Hepatotoxicity']"

In [99]:
# Function to classify a batch of SMILES strings
from time import sleep

async def classify_smiles_batch(smiles_batch):
    # Run the LMQL query with batch input
    response = await classify_smiles(few_shot_examples, smiles_batch)
    
    # Evaluate response to convert string list format into an actual Python list
    try:
        classifications = eval(response)  # Assumes response is in Python list format
        return classifications
    except Exception as e:
        print(f"Error parsing response: {e}")
        return [None] * len(smiles_batch)  # Return None for failed parsing

# Process the data in batches
batch_size = 10
data = pd_test.copy()
smiles_batches = [data['Smiles'][i:i + batch_size].tolist() for i in range(0, len(data), batch_size)]
results = []

# Classify each batch and store results
print("Number of batches:", len(smiles_batches))
for i, batch in enumerate(smiles_batches):
    print(f"Processing batch {i+1}/{len(smiles_batches)}...")
    batch_results = await classify_smiles_batch(batch)
    results.extend(batch_results)
    sleep(10)

# Add results to DataFrame
data['Classification'] = results
data.head()

Number of batches: 29
Processing batch 1/29...
Processing batch 2/29...
Processing batch 3/29...
Processing batch 4/29...
Processing batch 5/29...
Processing batch 6/29...
Processing batch 7/29...
Processing batch 8/29...
Processing batch 9/29...
Processing batch 10/29...
Processing batch 11/29...
Processing batch 12/29...
Processing batch 13/29...
Processing batch 14/29...
Processing batch 15/29...
Processing batch 16/29...
Processing batch 17/29...
Processing batch 18/29...
Processing batch 19/29...
Processing batch 20/29...
Processing batch 21/29...
Processing batch 22/29...
Processing batch 23/29...
Processing batch 24/29...
Processing batch 25/29...
Processing batch 26/29...
Processing batch 27/29...
Processing batch 28/29...



 (<class 'TimeoutError'>)

Retrying... (attempt: 0)


Processing batch 29/29...


Unnamed: 0,Smiles,Liver,label,Classification
0,C[C@@H](C(=O)N[C@@H](C)C(=O)NC1[C@H]2[C@@H]1CN...,Hepatotoxicity,1,NonHepatotoxicity
1,C=CCOC1=C(C=C(C=C1)CC(=O)O)Cl,Hepatotoxicity,1,Hepatotoxicity
2,CCCN(CCC)C(=O)CC1=C(N=C2N1C=C(C=C2)Cl)C3=CC=C(...,Hepatotoxicity,1,Hepatotoxicity
3,C1CC2=CC=CC=C2C(C3=CC=CC=C31)NCCCCCCC(=O)O,Hepatotoxicity,1,NonHepatotoxicity
4,C1=CC=C(C=C1)CN2C3=CC=CC=C3C(=N2)OCC(=O)O,Hepatotoxicity,1,NonHepatotoxicity


In [102]:
# strip all classification values
data["Classification"] = data["Classification"].apply(lambda x: x.strip() if x is not None else x)

In [103]:
data["Classification"].value_counts()

Classification
Hepatotoxicity       158
NonHepatotoxicity    128
Name: count, dtype: int64

In [105]:
# calculate accuracy
accuracy = (data["Liver"] == data["Classification"]).mean()
print(f"Accuracy: {accuracy:.2%}")


Accuracy: 54.20%


In [106]:
# save the results
data.to_csv("output/Testing_Group_gpt4_Results.csv", index=False)