In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file (if present)
load_dotenv()

from Tools.evaluation_tools import read_json_file, load_automol_model, guide_prompt, automol_predict, write_to_file
from agents import get_data_agent, get_mcp_model_agent, AUTHORIZED_IMPORTS
from mcp_server.manage_mcp_tools import MCPServerControl

from smolagents import (
    CodeAgent,
    LiteLLMModel,
    ToolCollection
)

# LLM backend selection.
# Exactly one `model = ...` assignment below is active; the commented-out
# lines are alternative backends kept for quick switching.

# Gemini through its OpenAI-compatible endpoint.
# NOTE: OpenAIServerModel is not among the smolagents names imported above;
# add it to that import before enabling this block.
#model = OpenAIServerModel(
#    model_id="gemini-2.5-pro-exp-03-25",
#    api_base="https://generativelanguage.googleapis.com/v1beta/openai/",
#    api_key=os.environ["GEMINI_API_KEY"],
#    temperature=0.2
#)

# Ollama through LiteLLM.
#model = LiteLLMModel(
#  model_id='ollama_chat/granite3.2:8b-instruct-q4_K_M'
#)

# Claude through LiteLLM.
# Observed behaviour: Claude 3.5 does not pass the sdf file on to the training agent, Claude 3.7 does.
#model = LiteLLMModel("claude-3-7-sonnet-20250219", temperature=0.2)
#model = LiteLLMModel("claude-3-5-haiku-20241022", temperature=0.2)

# OpenRouter-hosted models through LiteLLM (the active choice is the uncommented line).
#model = LiteLLMModel("openrouter/qwen/qwen3-235b-a22b:free", temperature=0.2)
model = LiteLLMModel("openrouter/meta-llama/llama-4-maverick", temperature=0.2)
#model = LiteLLMModel("openrouter/anthropic/claude-3.7-sonnet", temperature=0.2)
#model = LiteLLMModel("openrouter/anthropic/claude-3.5-haiku", temperature=0.2)
#model = LiteLLMModel("openrouter/qwen/qwen3-30b-a3b:free", temperature=0.2)

# Connect to the MCP server whose tools are handed to the data agent.
# `mcp_control` must be closed explicitly when done (see the last cell of the notebook).
mcp_control = MCPServerControl(
    ['http://127.0.0.1:8000/sse']
)
mcp_data_tools = mcp_control.get_tools()

# Connect to the MCP server whose tools are handed to the model agent.
# `mcp_model_control` must be closed explicitly when done (see the last cell of the notebook).
mcp_model_control = MCPServerControl(
    ['http://127.0.0.1:8001/sse']
)
mcp_model_tools = mcp_model_control.get_tools()

In [None]:
# Manager agent: owns the local automol/file tools and delegates to the two
# MCP-backed sub-agents, all of which share the same LLM (`model`).
manager_agent = CodeAgent(
    model=model,
    tools=[
        read_json_file,
        write_to_file,
        load_automol_model,
        automol_predict,
    ],
    managed_agents=[
        get_data_agent(model, mcp_data_tools),
        get_mcp_model_agent(model, mcp_model_tools),
    ],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    add_base_tools=True,
    max_steps=10,
)

In [None]:
def display_gen_output(output):
    """Print an agent result.

    When ``output`` is a dict, each of its values is printed on its own line
    (the keys are not shown). Any other object is printed as-is.
    """
    if not isinstance(output, dict):
        print(output)
        return
    for value in output.values():
        print(value)

In [None]:
# Minimal request: classification model for the TDC Caco2_Wang data set.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': 'Can you train a classification model for Caco2_Wang from the therapeutic data commons (tdc)? ',
    },
)
display_gen_output(output)

In [None]:
# Prediction request: apply an existing automol model to the SMILES in a csv
# file and save the predictions to a new csv.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': """Can you predict the properties using automol model located in automol_example_model.pt
    using the smiles from in the column smiles in ../Data/ChEMBL_SMILES.csv? and save as a new csv. No need to create any plots""",
    },
)
display_gen_output(output)

In [None]:
# Minimal request: train a model for column prop1 of the local ChEMBL csv.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': 'Can you train a model for prop1 in ../Data/ChEMBL_SMILES.csv?',
    },
)
display_gen_output(output)

In [None]:
# Detailed request: regression on TDC Caco2_Wang, five feature combinations,
# plus predicted-vs-true scatter plots.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': """Can you train a regression model using automol for the data set named Caco2_Wang using the provided validation set from the therapeutic data commons (tdc)
     with the smiles in column Drug and the target in column Y with the smallest execution time. Train 5 models with different combinations of used features. 
     Use the following features to select from: Bottleneck, rdkit, fps_1024_2 and maccs.
     Can you additionaly create a scatter plot with the predicted values of the y-axis and the true values on the x-axis for the 5 combinations of selected features?""",
    },
)
display_gen_output(output)

In [None]:
# Detailed request: regression on prop1 of the local ChEMBL csv with prop2 as
# an extra property feature, NaN rows removed first, five feature
# combinations, plus predicted-vs-true scatter plots.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': """Can you train a regression model using automol for the csv file in ../Data/ChEMBL_SMILES.csv with the smiles in column smiles and the target in column prop1 with the smallest execution time. 
    Can you additionally add column prop2 as property feature for the blender during training.  Firstly remove rows with nan values for prop1 and prop2 from the data file. 
    Divide the data before training 5 models with different combinations of used features. 
     Use the following features to select from: Bottleneck, rdkit, fps_1024_2 and maccs.
     Can you additionaly create a scatter plot with the predicted values of the y-axis and the true values on the x-axis for the 5 combinations of selected features?""",
    },
)
display_gen_output(output)

In [None]:
# Detailed request: classification on the already-categorical prop5 column of
# the local ChEMBL csv, NaN rows removed first, five feature combinations,
# plus confusion-matrix plots.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': """Can you train a classification model using automol for the csv file in ../Data/ChEMBL_SMILES.csv with the smiles in column smiles and the target in column prop5 with the smallest execution time.
    The target is already categorical. Firstly remove rows with nan values for prop5 from the data file. 
    Divide the data before training 5 models with different combinations of used features. 
     Use the following features to select from: Bottleneck, rdkit, fps_1024_2 and maccs.
     Can you additionaly create a confusion matrix plots for the 5 combinations of selected features?""",
    },
)
display_gen_output(output)

In [None]:
# Switch the LLM to Claude and rebuild the manager agent for the cells below.
# This rebinds the notebook-wide `model` and `manager_agent`: any earlier cell
# re-run after this one will use the Claude-backed agent as well.
# NOTE(review): a comment in the setup cell reports that Claude 3.5 does not
# pass the sdf file to the training agent while 3.7 does, yet the sdf-based
# prompts below run on 3.5 haiku - confirm this choice is intentional.
#model = LiteLLMModel("claude-3-7-sonnet-20250219", temperature=0.2)
model = LiteLLMModel("claude-3-5-haiku-20241022", temperature=0.2)

manager_agent = CodeAgent(
    model=model,
    tools=[
        read_json_file,
        write_to_file,
        load_automol_model,
        automol_predict,
    ],
    managed_agents=[
        get_data_agent(model, mcp_data_tools),
        get_mcp_model_agent(model, mcp_model_tools),
    ],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    add_base_tools=True,
    max_steps=10,
)

In [None]:
# Detailed request: regression on pChEMBL for the ABL docking set (sdf file
# plus pdb folder), five feature combinations with prolif available, plus
# predicted-vs-true scatter plots.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': """Can you train a regression model using automol for the data set named ABL, with the sdf_file located in ../Data/manuscript_data/ABL/Selected_dockings.sdf and the pdb folder in ../Data/manuscript_data/ABL/pdbs?
     The target I am interesed in is pChEMBL, use the smallest execution time. Train 5 models with different combinations of used features. 
     Use the following features to select from: Bottleneck, rdkit, fps_1024_2, prolif and maccs.
     Can you additionaly create a scatter plot with the predicted values of the y-axis and the true values on the x-axis for the 5 combinations of selected features?""",
    },
)
display_gen_output(output)

In [None]:
# Detailed request: two-class classification on pChEMBL (threshold 7) for the
# ABL docking set, NaN targets removed first, five feature combinations with
# prolif used at least once, plus confusion-matrix plots.
output = manager_agent.run(
    guide_prompt(),
    additional_args={
        'user_prompt': """Can you train a classification model using automol for the data set named ABL, with the sdf_file located in ../Data/manuscript_data/ABL/Selected_dockings.sdf and the pdb folder in ../Data/manuscript_data/ABL/pdbs?
     The target I am interesed in is pChEMBL, use the smallest execution time. Train 5 models with different combinations of used features. 
     The target is continuous and needs to be divide in two classes, use the threshold 7. Firstly remove rows with nan values for the target from the data file. 
     Divide the data before training 5 models with different combinations of used features, include prolif atleast once. 
     Use the following features to select from: Bottleneck, rdkit, fps_1024_2, prolif with all interactions and maccs.
     Can you additionaly create a confusion matrix plots for the 5 combinations of selected features?""",
    },
)
display_gen_output(output)

In [None]:
# Shut down both MCP connections opened in the setup cell.
# try/finally ensures the model-server connection is still closed when
# closing the data-server connection raises; the original error propagates.
try:
    mcp_control.close()
finally:
    mcp_model_control.close()