In [1]:
import numpy as np
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from mol_tools.tools import QaAgent, SmilesFilter, SMILESEnergyPredictionTool, RetriverTool
from langchain_openai import ChatOpenAI
from dotenv import find_dotenv, load_dotenv
import logging

In [2]:
# set up
logging.basicConfig(level=logging.INFO)
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
llm = ChatOpenAI(model_name="gpt-3.5-turbo") 
embeddings_provider = OpenAIEmbeddings(model="text-embedding-3-small")
# loader = WebBaseLoader("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8264982/")
loader = PyPDFLoader("storage/13045_2021_Article_1121.pdf")
docs = loader.load() 
text = " ".join([page.page_content for page in docs])

# Part 1: Custom Retriever Tool Development

#### Ask questions about the paper

In [3]:
questions = [
    "What disease are they discussing in the paper?",
    "Summarize the main takeaways from the paper.",
    "Can you list the protein targets they highlight in the paper?",
    "Can you list the small molecule drugs they highlight in the paper?",
]

In [5]:
retriever_tool = RetriverTool(
    docs, embeddings_provider, vector_path="storage/PMC8264982.faiss"
)
qa = QaAgent(retriever_tool, llm)
# disable logging
logging.disable(logging.CRITICAL)
for q in questions:
    print("Question:", q, "\n Answer:", qa.answer(q))
#enable logging
logging.disable(logging.NOTSET)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Question: What disease are they discussing in the paper? 
 Answer: The paper is discussing advanced non-small cell lung cancer (NSCLC).


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Question: Summarize the main takeaways from the paper. 
 Answer: The paper discusses the importance of predictive biomarkers in advanced non-small cell lung cancer (NSCLC), focusing on actionable mutations such as ALK, ROS1, EGFR, MET, BRAF, RET, and NTRK. It provides an overview of FDA-approved targeted therapies for these mutations, highlighting the efficacy and common adverse effects associated with each treatment. The paper also discusses the resistance mechanisms that can develop with EGFR inhibitors and the ongoing clinical trials exploring optimal treatment options for patients who have progressed after osimertinib therapy. Overall, the paper emphasizes the importance of personalized medicine in treating advanced NSCLC and the need for further research in this area.


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Question: Can you list the protein targets they highlight in the paper? 
 Answer: The protein targets highlighted in the paper include NFE2L2, KEAP1, PI3K, MTORC1, HER2, SHP2, B7-H3, PTK7, EGFR, PTK7, CEACAM5, TROP2, IDO1, BCL2, RXRs, S15, MET/HGFR, PDGFR, FGFR, VEGFR, c-kit, IDO1, p53, Exportin 1, CD27, WEE1, PARP7, and RXRs.


INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Question: Can you list the small molecule drugs they highlight in the paper? 
 Answer: Based on the provided context, the small molecule drugs highlighted in the paper are:
1. TAK228
2. Nazartinib (EGF816)
3. Alflutinib (AST2818)
4. BLU-945
5. BDTX-189
6. CLN-081 (TAS6417)
7. DZD9008
8. Tarloxotinib
9. Poziotinib
10. Mobocertinib (TAK-788)


#### Search for SMILES strings

In [6]:
%%capture
smiles_filter = SmilesFilter()
text = " ".join([page.page_content for page in docs])
smiles_tokens = smiles_filter.filter(text)

INFO:root:Processing 13598 tokens


In [7]:
print("SMILES tokens:", smiles_tokens)

SMILES tokens: ['I/II', 'I/II', 'I/II', 'II', 'III', 'III', 'I/II', 'II', 'II', 'IIB', 'CI', 'II', 'I/II', 'I/II', 'CNS', 'II', 'PPP', 'CI', 'CI', 'CI', 'CI', 'I/II', '\nCNS', 'I/II', 'CNS', 'I/II', '\nI', 'I/II', 'III', 'I/II', '\nOS', 'I/II', 'II', 'CI', 'I/II', 'I/II', 'II', 'II', 'CI', 'I/II', '\nI/II', 'III', 'I/II', 'I/II', 'II', 'IB', 'I/II', 'I/II', '\nI/II', 'CI', 'CI', 'I-II', 'CI', 'II', 'I/II', 'II', 'CI', 'I/II', 'CNS', 'II', 'I/II', 'OS', 'III', 'II', 'I/II', 'III', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', '\nI/II', '\nNCCN', '\nII', 'CI', 'CI', 'CI', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'CNS', 'II', 'II', 'III', 'I/II', 'III', '\nI/II', 'I/II', 'II', 'I/II', 'II', 'II']


# Part 2: Predictive Machine Learning Model Tooling Implementation

#### Trian a model (Regression)

In [9]:
from mol_tools.trainer import train
from sklearn.metrics import mean_absolute_error
from mol_tools.data_utils import get_freesolv_data
X_train, X_test, y_train, y_test = get_freesolv_data()
mlp_model = train(X_train, y_train, model= "mlp_regression")
tree_regressor = train(X_train, y_train, model="tree_regression")
ensamble_regressor = train(X_train, y_train, model="AutoSklearnRegressor")
models = {"mlp": mlp_model, "tree": tree_regressor, "ensamble": ensamble_regressor}

INFO:root:Data loaded with 481 samples
INFO:root:Training <class 'mol_tools.models.SkLearnMLPRegression'> model


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
INFO:root:Model saved at storage/_mlp_regression.pkl
INFO:root:Data loaded with 481 samples
INFO:root:Training <class 'mol_tools.models.SkLearnTreeRegression'> model
INFO:root:Model saved at storage/_tree_regression.pkl
INFO:root:Data loaded with 481 samples
INFO:root:Training <class 'mol_tools.models.AutoSklearnRegressorModel'> model


In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
results = {}
for model in models:
    y_hat = models[model].predict(X_test)
    mae = mean_absolute_error(y_test, y_hat)
    results[model] = mae
# barplot
# small size 
plt.figure(figsize=(2, 2))
sns.barplot(x=list(results.keys()), y=list(results.values()))
plt.ylabel("MAE")
    

NameError: name 'models' is not defined

Here, MLP seems like the best model, lets stick to it.

### Use the model to make poreditions for SMILEs tokens discovered above

In [7]:

from mol_tools.data_utils import get_freesolv_data
X_train, X_test, y_train, y_test = get_freesolv_data()

In [8]:
smiles_tokens = ['I/II', 'I/II', 'I/II', 'II', 'III', 'III', 'I/II', 'II', 'II', 'IIB', 'CI', 'II', 'I/II', 'I/II', 'CNS', 'II', 'PPP', 'CI', 'CI', 'CI', 'CI', 'I/II', '\nCNS', 'I/II', 'CNS', 'I/II', '\nI', 'I/II', 'III', 'I/II', '\nOS', 'I/II', 'II', 'CI', 'I/II', 'I/II', 'II', 'II', 'CI', 'I/II', '\nI/II', 'III', 'I/II', 'I/II', 'II', 'IB', 'I/II', 'I/II', '\nI/II', 'CI', 'CI', 'I-II', 'CI', 'II', 'I/II', 'II', 'CI', 'I/II', 'CNS', 'II', 'I/II', 'OS', 'III', 'II', 'I/II', 'III', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', '\nI/II', '\nNCCN', '\nII', 'CI', 'CI', 'CI', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'CNS', 'II', 'II', 'III', 'I/II', 'III', '\nI/II', 'I/II', 'II', 'I/II', 'II', 'II']
smiles_tokens = np.unique(smiles_tokens)

In [9]:
smiles_temp_predictor = SMILESEnergyPredictionTool(
    model_path="storage/_mlp_regression.pkl"
)
   
for smiles in smiles_tokens:
    print("SMILES:", smiles, "\n Energy:", smiles_temp_predictor.run(smiles))

SMILES: 
CNS 
 Energy: -6.472814943457294
SMILES: 
I 
 Energy: -0.9677948146753201
SMILES: 
I/II 
 Energy: -1.0995580127160332
SMILES: 
II 
 Energy: -2.0305915914540313
SMILES: 
NCCN 
 Energy: -6.971683576325196
SMILES: 
OS 
 Energy: -2.8051132488132233
SMILES: CI 
 Energy: -2.056325737667905
SMILES: CNS 
 Energy: -6.472814943457294
SMILES: I-II 
 Energy: -1.0995580127160332
SMILES: I/II 
 Energy: -1.0995580127160332
SMILES: IB 
 Energy: -1.9819246736701257
SMILES: II 
 Energy: -2.0305915914540313
SMILES: IIB 
 Energy: -1.7200701126943927
SMILES: III 
 Energy: -1.0995580127160332
SMILES: OS 
 Energy: -2.8051132488132233
SMILES: PPP 
 Energy: -1.3624383257479176
