In [8]:
import numpy as np
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from mol_tools.tools import QaAgent, SmilesFilter, SMILESEnergyPredictionTool
from langchain_openai import ChatOpenAI
from dotenv import find_dotenv, load_dotenv
import logging

In [2]:
# set up
logging.basicConfig(level=logging.INFO)
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
llm = ChatOpenAI(model_name="gpt-3.5-turbo") 
embeddings_provider = OpenAIEmbeddings(model="text-embedding-3-small")
# loader = WebBaseLoader("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8264982/")
loader = PyPDFLoader("storage/13045_2021_Article_1121.pdf")
docs = loader.load() 
text = " ".join([page.page_content for page in docs])

# Part 1: Custom Retriever Tool Development

#### Ask questions about the paper

In [8]:
questions = [
    "What disease are they discussing in the paper?",
    "Summarize the main takeaways from the paper.",
    "Can you list the protein targets they highlight in the paper?",
    "Can you list the small molecule drugs they highlight in the paper?",
]

In [10]:
qa = QaAgent(
        docs, llm, embeddings_provider, vector_path="storage/PMC8264982.faiss"
    )
for q in questions:
    print("Question:", q, "\n Answer:", qa.answer(q))

Question: What disease are they discussing in the paper? 
 Answer: The paper discusses advanced non-small cell lung cancer (NSCLC).
Question: Summarize the main takeaways from the paper. 
 Answer: The paper discusses the importance of predictive biomarkers in advanced non-small cell lung cancer (NSCLC), focusing on EGFR mutations as oncogenic drivers in a subset of patients. The use of EGFR inhibitors, such as osimertinib, is highlighted as standard of care in the first-line metastatic setting for untreated EGFR mutant NSCLC. Resistance mechanisms to EGFR TKIs, including acquired resistance and treatment options after progression on osimertinib, are also discussed. Ongoing clinical trials, such as ORCHARD, Checkmate 722, and KEYNOTE-789, are exploring optimal treatment strategies for patients with EGFR-mutant NSCLC based on their underlying resistance mechanisms. The distribution of actionable mutations in advanced lung adenocarcinoma, along with available targeted therapies and their 

#### Search for SMILES strings

In [3]:
%%capture
smiles_filter = SmilesFilter()
text = " ".join([page.page_content for page in docs])
smiles_tokens = smiles_filter.filter(text)

INFO:root:Processing 13598 tokens


In [5]:
print("SMILES tokens:", smiles_tokens)

SMILES tokens: ['I/II', 'I/II', 'I/II', 'II', 'III', 'III', 'I/II', 'II', 'II', 'IIB', 'CI', 'II', 'I/II', 'I/II', 'CNS', 'II', 'PPP', 'CI', 'CI', 'CI', 'CI', 'I/II', '\nCNS', 'I/II', 'CNS', 'I/II', '\nI', 'I/II', 'III', 'I/II', '\nOS', 'I/II', 'II', 'CI', 'I/II', 'I/II', 'II', 'II', 'CI', 'I/II', '\nI/II', 'III', 'I/II', 'I/II', 'II', 'IB', 'I/II', 'I/II', '\nI/II', 'CI', 'CI', 'I-II', 'CI', 'II', 'I/II', 'II', 'CI', 'I/II', 'CNS', 'II', 'I/II', 'OS', 'III', 'II', 'I/II', 'III', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', '\nI/II', '\nNCCN', '\nII', 'CI', 'CI', 'CI', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'CNS', 'II', 'II', 'III', 'I/II', 'III', '\nI/II', 'I/II', 'II', 'I/II', 'II', 'II']


# Part 2: Predictive Machine Learning Model Tooling Implementation

#### Trian a model (Regression)

In [6]:
from mol_tools.trainer import train
train()

In [8]:
#eval model on test data
import pickle
from mol_tools.models import MODELS, Model
from sklearn.metrics import mean_absolute_error
from mol_tools.data_utils import get_freesolv_data
X_train, X_test, y_train, y_test = get_freesolv_data()

with open("storage/_AutoSklearnRegressor.pkl", "rb") as f:
    model:Model = pickle.load(f)

y_hat = model.predict(X_test)
mae = mean_absolute_error(y_test, y_hat)

In [9]:
mae # mean absolute value

1.373892990216831

### Use the model to make poreditions for SMILEs tokens discovered above

In [9]:
smiles_tokens = ['I/II', 'I/II', 'I/II', 'II', 'III', 'III', 'I/II', 'II', 'II', 'IIB', 'CI', 'II', 'I/II', 'I/II', 'CNS', 'II', 'PPP', 'CI', 'CI', 'CI', 'CI', 'I/II', '\nCNS', 'I/II', 'CNS', 'I/II', '\nI', 'I/II', 'III', 'I/II', '\nOS', 'I/II', 'II', 'CI', 'I/II', 'I/II', 'II', 'II', 'CI', 'I/II', '\nI/II', 'III', 'I/II', 'I/II', 'II', 'IB', 'I/II', 'I/II', '\nI/II', 'CI', 'CI', 'I-II', 'CI', 'II', 'I/II', 'II', 'CI', 'I/II', 'CNS', 'II', 'I/II', 'OS', 'III', 'II', 'I/II', 'III', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'I/II', '\nI/II', '\nNCCN', '\nII', 'CI', 'CI', 'CI', 'II', 'I/II', 'I/II', 'I/II', 'I/II', 'II', 'CNS', 'II', 'II', 'III', 'I/II', 'III', '\nI/II', 'I/II', 'II', 'I/II', 'II', 'II']
smiles_tokens = np.unique(smiles_tokens)

In [10]:
smiles_temp_predictor = SMILESEnergyPredictionTool(
    model_path="storage/_AutoSklearnRegressor.pkl"
)
   
for smiles in smiles_tokens:
    print("SMILES:", smiles, "\n Energy:", smiles_temp_predictor.run(smiles))

SMILES: 
CNS 
 Energy: -4.174153208732605
SMILES: 
I 
 Energy: -2.567687451839447
SMILES: 
I/II 
 Energy: -2.3948360681533813
SMILES: 
II 
 Energy: -2.4692990481853485
SMILES: 
NCCN 
 Energy: -5.0190078020095825
SMILES: 
OS 
 Energy: -3.0381375551223755
SMILES: CI 
 Energy: -2.6678661108016968
SMILES: CNS 
 Energy: -4.174153208732605
SMILES: I-II 
 Energy: -2.3948360681533813
SMILES: I/II 
 Energy: -2.3948360681533813
SMILES: IB 
 Energy: -2.4980756044387817
SMILES: II 
 Energy: -2.4692990481853485
SMILES: IIB 
 Energy: -2.4729931950569153
SMILES: III 
 Energy: -2.3948360681533813
SMILES: OS 
 Energy: -3.0381375551223755
SMILES: PPP 
 Energy: -2.5146504342556
