In [None]:

import ast
import os
import pickle as pkl
import re
import time

import openai
import pandas as pd
from IPython.display import display, Markdown
from tqdm.notebook import tqdm

pd.set_option('display.max_columns', 60)

from datamining_utils import chatGPT_check_replay,chatGPT_replay,load_api_key_from_file,extract_code_script_from_markdown,find_dictionaries_in_string

%load_ext autoreload
%autoreload 2


In [None]:

# Build the OpenAI client from the key returned by load_api_key_from_file().
from openai import OpenAI

api_key = load_api_key_from_file()
client = OpenAI(api_key=api_key)

# Key Word Agent

In [None]:
# Property being mined; it is interpolated into assistant names, prompts and output paths.
prop = "Liver_microsomes_clearance"

In [None]:
# Create the assistant used to summarise experimental-condition categories.
assistant = client.beta.assistants.create(
    name=f"{prop} knowledge generation chatbot",
    instructions="Please summarize the ADME-T related important experimental conditions",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the id of the assistant that was just created.
assistant.id

In [None]:
# Open a thread whose first user message asks for a summary of the key conditions.
opening_message = {
    "role": "user",
    "content": "summarise the key experimental conditions within the given experiments",
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[opening_message])

In [None]:
# Display the id of the thread that was just created.
thread.id

### Summarize the important experimental conditions

In [None]:
# Load the raw clearance table and join its 50 most frequent assay descriptions
# (value_counts sorts by descending frequency) into one newline-separated string.
df = pd.read_csv('../data/raw_data/cl/chembl_cl_raw_data.csv')

description_counts = df['Assay Description'].value_counts()
top_descriptions = description_counts.index[0:50]
condition = ' \n '.join(top_descriptions)

In [None]:
# Prompt asking for the ten most frequent experimental-condition categories as a Python list.
# `prop` and `condition` (the joined assay descriptions) are interpolated into the first sentence.
mes = f"""
Please summarize the key experimental conditions for experiments related to {prop} within the {condition}, only analysis the in vitro experiments


Please analyze the document and return a list of the top ten most frequently mentioned experimental condition categories. These should be summarized under broad categories, such as pH levels, temperature ranges, or other relevant general conditions, rather than specific values or detailed conditions. Present this information in a Python list format.

Each entry in the list should represent a unique category of experimental conditions. Avoid duplicating similar conditions and focus on capturing the overarching categories that these conditions fall under.

Example output:
```python
['pH Level', 'Temperature Range', 'Light Exposure']
```

Ensure that the list is comprehensive, covering all major categories of experimental conditions mentioned in the document.
"""

In [None]:
# Hand the prompt in `mes` to the chatGPT_replay helper for this thread and assistant.
# NOTE(review): the helper lives in datamining_utils; presumably it posts the message and starts a run — confirm.
chatGPT_replay(client,thread.id, assistant, question_content=mes)

In [None]:
# Check on the assistant's reply via the project helper, then parse the answer.
# NOTE(review): chatGPT_check_replay is defined in datamining_utils; presumably it blocks until the run finishes — confirm.
chatGPT_check_replay(client,thread)

# data[0] is read as the latest message of the thread; its first code block should hold the list.
thread_messages = client.beta.threads.messages.list(thread.id)
answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]

# The reply is model-generated text: parse it as a Python literal only.
# ast.literal_eval accepts lists/strings/numbers but never executes code, unlike eval().
experimental_conditions = ast.literal_eval(answer)
experimental_conditions

In [None]:
# Fixed list of ten condition categories; this assignment overrides any value
# `experimental_conditions` held before this cell ran.
experimental_conditions = [
    "Compound Concentration",
    "Incubation Time",
    "Presence of NADPH/NADP",
    "Enzyme Source",
    "Temperature Range",
    "Analytical Technique",
    "Species",
    "Route of Administration",
    "Type of Microsomes",
    "Protein Amount or Microsomal Protein Concentration",
]


In [None]:
# Persist the condition categories so later sections can reload them.
conditions_path = f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl'

# The per-property output folder may not exist on a fresh checkout; open(..., 'wb')
# does not create missing parent directories.
os.makedirs(os.path.dirname(conditions_path), exist_ok=True)

with open(conditions_path,'wb') as f:
    pkl.dump(experimental_conditions,f)

# Example Agent

In [None]:
# Reload the pickled condition categories from the per-property results folder.
conditions_file = f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl'
with open(conditions_file, 'rb') as handle:
    experimental_conditions = pkl.load(handle)

In [None]:
# Create the assistant used to mine example annotations for this property.
assistant = client.beta.assistants.create(
    name=f"{prop} Data Mining chatbot",
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the id of the example-mining assistant.
assistant.id

In [None]:
# Zero-shot extraction prompt. It is assembled line by line so the trailing
# spaces that are part of the prompt text stay visible; the two `{}` slots
# receive the comma-joined condition names and the property name.
prompt_lines = [
    "",
    "Biomedical data mining task.",
    "",
    "Return python dictionary with key including index, original sentence, {}, and whether is {} experiment or not ",
    "Fill in none if no information given. ",
    "Please include all the sentences",
    "",
    "",
]
condition_names = ', '.join(experimental_conditions)
mes = "\n".join(prompt_lines).format(condition_names, prop)

In [None]:
# Start a fresh thread whose first user message is the extraction prompt in `mes`.
opening_message = {
    "role": "user",
    "content": mes,
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[opening_message])

In [None]:
# Display the id of the example-mining thread.
thread.id

In [None]:
# Load the combined clearance table used for mining examples.
combined_csv = '../data/cl/chembl_cl_combined.csv'
df = pd.read_csv(combined_csv)

In [None]:
# Mine a first round of examples: 8 batches of 5 assay descriptions each,
# drawn from the 40 most frequent descriptions in `df`.
descriptions = df['Assay Description'].value_counts().keys()  # loop-invariant, computed once

# Run states from which polling can never reach 'completed'; without this check
# the wait loop below would spin forever on a failed run.
unrecoverable_states = ('failed', 'cancelled', 'expired', 'incomplete')

batch_frames = []
for i in tqdm(range(0,40,5)):
    info = ' \n '.join(descriptions[i:i+5])

    chatGPT_replay(client,thread.id, assistant, question_content=info)
    time.sleep(3)

    # Poll the most recent run of the thread every 3 seconds until it settles.
    runs = client.beta.threads.runs.list(thread.id)
    while runs.data[0].status != 'completed' and runs.data[0].status not in unrecoverable_states:
        time.sleep(3)
        runs = client.beta.threads.runs.list(thread.id)

    if runs.data[0].status != 'completed':
        # No assistant answer exists for this batch; report it and move on.
        print(f'run for batch {i} ended with status {runs.data[0].status!r}; batch skipped')
        continue

    # data[0] is read as the latest message; its first code block holds the dictionaries.
    thread_messages = client.beta.threads.messages.list(thread.id)
    answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
    answer = pd.DataFrame(find_dictionaries_in_string(answer))

    display(answer)
    batch_frames.append(answer)

# Concatenate once instead of growing a DataFrame inside the loop.
result = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [None]:
# Write the mined examples to disk (without the index column).
examples_csv = f'../data/data_mining_results/{prop}/example_{prop}.csv'
result.to_csv(examples_csv, index=False)

# Manually Validate and Create the Full Prompt with Two-Shot Examples

In [None]:
# Load the validated version of the mined examples.
validated_csv = f'../data/data_mining_results/{prop}/example_{prop}_manuel_validated.csv'
examples = pd.read_csv(validated_csv)

In [None]:
# First 20 validated rows; show their sentences joined the same way the prompts join inputs.
tmp = examples.iloc[0:20]

sentences = tmp['original sentence'].values
' \n '.join(sentences)

In [None]:
# Show the current `tmp` rows as the string form of a column -> list-of-values dictionary.
str(tmp.to_dict(orient='list'))

In [None]:
# Remaining validated rows (from position 20 onward); show their sentences joined.
tmp = examples.iloc[20:]

sentences = tmp['original sentence'].values
' \n '.join(sentences)

In [None]:
# Show the current `tmp` rows as the string form of a column -> list-of-values dictionary.
str(tmp.to_dict(orient='list'))



f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Clearance in human after iv administration \n Intrinsic clearance in human liver microsomes \n Intrinsic clearance in mouse liver microsomes \n Metabolic stability in human liver microsomes assessed as intrinsic clearance \n Intrinsic clearance in rat liver microsomes \n Clearance in human liver microsomes \n Intrinsic clearance in human microsomes \n DNDI: in vitro intrinsic clearance in mouse liver microsomes at a concentration of 1uM \n Metabolic stability in mouse liver microsomes assessed as intrinsic clearance \n DNDI: Metabolism \n Clearance in rat liver microsomes \n Clearance in rat at 1 mg/kg, iv \n Clearance in rat after iv administration \n Metabolic stability in mouse liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis \n Clearance in mouse liver microsomes \n Plasma clearance in rat \n Metabolic stability in human liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis \n Intrinsic clearance in rat microsomes \n Metabolic stability in rat liver microsomes assessed as intrinsic clearance \n Intrinsic clearance in CD1 mouse liver microsomes at 5 uM incubated up to 30 mins in presence of NADPH by UPLC-UV analysis'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4], 'original sentence': ['Clearance in human after iv administration', 'Intrinsic clearance in human liver microsomes', 'Intrinsic clearance in mouse liver microsomes', 'Metabolic stability in human liver microsomes assessed as intrinsic clearance', 'Intrinsic clearance in rat liver microsomes', 'Clearance in human liver microsomes', 'Intrinsic clearance in human microsomes', 'DNDI: in vitro intrinsic clearance in mouse liver microsomes at a concentration of 1uM', 'Metabolic stability in mouse liver microsomes assessed as intrinsic clearance', 'DNDI: Metabolism', 'Clearance in rat liver microsomes', 'Clearance in rat at 1 mg/kg, iv', 'Clearance in rat after iv administration', 'Metabolic stability in mouse liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis', 'Clearance in mouse liver microsomes', 'Plasma clearance in rat', 'Metabolic stability in human liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis', 'Intrinsic clearance in rat microsomes', 'Metabolic stability in rat liver microsomes assessed as intrinsic clearance', 'Intrinsic clearance in CD1 mouse liver microsomes at 5 uM incubated up to 30 mins in presence of NADPH by UPLC-UV analysis'], 'Compound Concentration': ['none', 'none', 'none', 'none', 'none', 'none', 'none', '1uM', 'none', 'none', 'none', '1 mg/kg', 'none', '0.5 uM', 'none', 'none', '0.5 uM', 'none', 'none', '5 uM'], 'Incubation Time': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', '30 mins', 'none', 'none', '30 mins', 'none', 'none', 'up to 30 mins'], 'Presence of NADPH/NADP': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'NADPH'], 'Enzyme Source': ['none', 'liver', 'liver', 'liver', 'liver', 'liver', 
'none', 'liver', 'liver', 'none', 'liver', 'none', 'none', 'liver', 'liver', 'none', 'liver', 'none', 'liver', 'liver'], 'Temperature Range': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Analytical Technique': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'LC-MS/MS', 'none', 'none', 'LC-MS/MS', 'none', 'none', 'UPLC-UV'], 'Species': ['human', 'human', 'mouse', 'human', 'rat', 'human', 'human', 'mouse', 'mouse', 'none', 'rat', 'rat', 'rat', 'mouse', 'mouse', 'rat', 'human', 'rat', 'rat', 'CD1 mouse'], 'Route of Administration': ['iv', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'iv', 'iv', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Type of Microsomes': ['none', 'liver microsomes', 'liver microsomes', 'liver microsomes', 'liver microsomes', 'liver microsomes', 'microsomes', 'liver microsomes', 'liver microsomes', 'none', 'liver microsomes', 'none', 'none', 'liver microsomes', 'liver microsomes', 'none', 'liver microsomes', 'microsomes', 'liver microsomes', 'liver microsomes'], 'Protein Amount or Microsomal Protein Concentration': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Liver_microsomes_clearance experiment': [False, True, True, True, True, True, True, True, True, False, True, False, False, True, True, False, True, True, True, True]}}\n```

Example 2
Input: 'Microsomal stability in human liver microsomes assessed as intrinsic clearance \n Intrinsic clearance in human liver microsomes at 1 to 2.5 uM in presence of NADPH \n Clearance in iv dosed mouse \n Clearance in rat \n Clearance in human liver microsomes at 1 uM incubated for 60 mins followed by NADPH generating system addition and measured by LC-MS/MS analysis \n Clearance in Sprague-Dawley rat at 1 mg/kg, iv by LC-MS/MS analysis \n Clearance in Sprague-Dawley rat at 1 mg/kg, iv \n Intrinsic clearance in human liver microsome \n Clearance in human hepatocytes \n Clearance in rat at 2 mg/kg, iv \n Intrinsic clearance in mouse liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis \n Intrinsic clearance in human liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis \n Metabolic stability in human microsomes assessed as intrinsic clearance \n Clearance in rat hepatocytes \n Intrinsic clearance in human hepatocytes \n Metabolic stability in mouse liver microsomes assessed as clearance \n Unbound clearance in rat \n Intrinsic clearance in mouse microsomes \n Clearance in rat at 0.5 mg/kg, iv \n Intrinsic clearance in mouse microsomes preincubated for 5 mins followed by addition of 1.5 mM of NADPH measured after 5 to 30 mins by LC-MS/MS analysis'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4], 'original sentence': ['Microsomal stability in human liver microsomes assessed as intrinsic clearance', 'Intrinsic clearance in human liver microsomes at 1 to 2.5 uM in presence of NADPH', 'Clearance in iv dosed mouse', 'Clearance in rat', 'Clearance in human liver microsomes at 1 uM incubated for 60 mins followed by NADPH generating system addition and measured by LC-MS/MS analysis', 'Clearance in Sprague-Dawley rat at 1 mg/kg, iv by LC-MS/MS analysis', 'Clearance in Sprague-Dawley rat at 1 mg/kg, iv', 'Intrinsic clearance in human liver microsome', 'Clearance in human hepatocytes', 'Clearance in rat at 2 mg/kg, iv', 'Intrinsic clearance in mouse liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis', 'Intrinsic clearance in human liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis', 'Metabolic stability in human microsomes assessed as intrinsic clearance', 'Clearance in rat hepatocytes', 'Intrinsic clearance in human hepatocytes', 'Metabolic stability in mouse liver microsomes assessed as clearance', 'Unbound clearance in rat', 'Intrinsic clearance in mouse microsomes', 'Clearance in rat at 0.5 mg/kg, iv', 'Intrinsic clearance in mouse microsomes preincubated for 5 mins followed by addition of 1.5 mM of NADPH measured after 5 to 30 mins by LC-MS/MS analysis'], 'Compound Concentration': ['none', '1 to 2.5 uM', 'none', 'none', '1 uM', '1 mg/kg', '1 mg/kg', 'none', 'none', '2 mg/kg', '1 uM', '1 uM', 'none', 'none', 'none', 'none', 'none', 'none', '0.5 mg/kg', 'none'], 'Incubation Time': ['none', 'none', 'none', 'none', '60 mins', 'none', 'none', 'none', 'none', 'none', 'up to 30 mins', 'up to 30 mins', 'none', 'none', 'none', 'none', 'none', 'none', 'none', '5 to 30 mins'], 'Presence of NADPH/NADP': ['none', 'NADPH', 'none', 'none', 'NADPH generating system', 'none', 'none', 'none', 'none', 'none', 'NADPH', 
'NADPH', 'none', 'none', 'none', 'none', 'none', 'none', 'none', '1.5 mM NADPH'], 'Enzyme Source': ['liver', 'liver', 'none', 'none', 'liver', 'none', 'none', 'liver', 'hepatocytes', 'none', 'liver', 'liver', 'none', 'hepatocytes', 'hepatocytes', 'liver', 'none', 'none', 'none', 'none'], 'Temperature Range': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Analytical Technique': ['none', 'none', 'none', 'none', 'LC-MS/MS', 'LC-MS/MS', 'none', 'none', 'none', 'none', 'UPLC-MS/MS', 'UPLC-MS/MS', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'LC-MS/MS'], 'Species': ['human', 'human', 'mouse', 'rat', 'human', 'Sprague-Dawley rat', 'Sprague-Dawley rat', 'human', 'human', 'rat', 'mouse', 'human', 'human', 'rat', 'human', 'mouse', 'rat', 'mouse', 'rat', 'mouse'], 'Route of Administration': ['none', 'none', 'iv', 'none', 'none', 'iv', 'iv', 'none', 'none', 'iv', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'iv', 'none'], 'Type of Microsomes': ['liver microsomes', 'liver microsomes', 'none', 'none', 'liver microsomes', 'none', 'none', 'liver microsomes', 'none', 'none', 'liver microsomes', 'liver microsomes', 'microsomes', 'none', 'none', 'liver microsomes', 'none', 'microsomes', 'none', 'microsomes'], 'Protein Amount or Microsomal Protein Concentration': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Liver_microsomes_clearance experiment': [True, True, False, False, True, False, False, True, False, False, True, True, True, False, False, True, False, True, False, True]}}\n```

"""

# Data Mining Agent

### Load data 

In [None]:
# Load the combined table and keep its distinct assay descriptions,
# ordered by descending frequency (the value_counts default).
df = pd.read_csv('../data/cl/chembl_cl_combined.csv')
description_counts = df['Assay Description'].value_counts()
assay_description = description_counts.index

### Create agent

In [None]:
# Create the assistant that performs the full data-mining pass.
assistant = client.beta.assistants.create(
    name="Data Mining chatbot",
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the id of the data-mining assistant.
assistant.id

In [None]:
# Condition names that become keys of the requested dictionary. They are joined
# into one comma-separated string because the prompt interpolates them as text.
condition_names = (
    "Compound Concentration",
    "Incubation Time",
    "Presence of NADPH/NADP",
    "Enzyme Source",
    "Temperature Range",
    "Analytical Technique",
    "Species",
    "Route of Administration",
    "Type of Microsomes",
    "Protein Amount or Microsomal Protein Concentration",
)

experimental_conditions_list = ", ".join(condition_names)
experimental_conditions_list

In [None]:
# Two-shot extraction prompt: the task instructions followed by two worked examples,
# each pairing 20 newline-separated assay descriptions with the expected dictionary.
# Only `experimental_conditions_list` and `prop` are interpolated; the doubled braces
# ({{ and }}) around the example dictionaries are f-string escapes for literal braces.
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Clearance in human after iv administration \n Intrinsic clearance in human liver microsomes \n Intrinsic clearance in mouse liver microsomes \n Metabolic stability in human liver microsomes assessed as intrinsic clearance \n Intrinsic clearance in rat liver microsomes \n Clearance in human liver microsomes \n Intrinsic clearance in human microsomes \n DNDI: in vitro intrinsic clearance in mouse liver microsomes at a concentration of 1uM \n Metabolic stability in mouse liver microsomes assessed as intrinsic clearance \n DNDI: Metabolism \n Clearance in rat liver microsomes \n Clearance in rat at 1 mg/kg, iv \n Clearance in rat after iv administration \n Metabolic stability in mouse liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis \n Clearance in mouse liver microsomes \n Plasma clearance in rat \n Metabolic stability in human liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis \n Intrinsic clearance in rat microsomes \n Metabolic stability in rat liver microsomes assessed as intrinsic clearance \n Intrinsic clearance in CD1 mouse liver microsomes at 5 uM incubated up to 30 mins in presence of NADPH by UPLC-UV analysis'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4], 'original sentence': ['Clearance in human after iv administration', 'Intrinsic clearance in human liver microsomes', 'Intrinsic clearance in mouse liver microsomes', 'Metabolic stability in human liver microsomes assessed as intrinsic clearance', 'Intrinsic clearance in rat liver microsomes', 'Clearance in human liver microsomes', 'Intrinsic clearance in human microsomes', 'DNDI: in vitro intrinsic clearance in mouse liver microsomes at a concentration of 1uM', 'Metabolic stability in mouse liver microsomes assessed as intrinsic clearance', 'DNDI: Metabolism', 'Clearance in rat liver microsomes', 'Clearance in rat at 1 mg/kg, iv', 'Clearance in rat after iv administration', 'Metabolic stability in mouse liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis', 'Clearance in mouse liver microsomes', 'Plasma clearance in rat', 'Metabolic stability in human liver microsomes assessed as intrinsic clearance at 0.5 uM incubated for 30 mins by LC-MS/MS analysis', 'Intrinsic clearance in rat microsomes', 'Metabolic stability in rat liver microsomes assessed as intrinsic clearance', 'Intrinsic clearance in CD1 mouse liver microsomes at 5 uM incubated up to 30 mins in presence of NADPH by UPLC-UV analysis'], 'Compound Concentration': ['none', 'none', 'none', 'none', 'none', 'none', 'none', '1uM', 'none', 'none', 'none', '1 mg/kg', 'none', '0.5 uM', 'none', 'none', '0.5 uM', 'none', 'none', '5 uM'], 'Incubation Time': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', '30 mins', 'none', 'none', '30 mins', 'none', 'none', 'up to 30 mins'], 'Presence of NADPH/NADP': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'NADPH'], 'Enzyme Source': ['none', 'liver', 'liver', 'liver', 'liver', 'liver', 
'none', 'liver', 'liver', 'none', 'liver', 'none', 'none', 'liver', 'liver', 'none', 'liver', 'none', 'liver', 'liver'], 'Temperature Range': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Analytical Technique': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'LC-MS/MS', 'none', 'none', 'LC-MS/MS', 'none', 'none', 'UPLC-UV'], 'Species': ['human', 'human', 'mouse', 'human', 'rat', 'human', 'human', 'mouse', 'mouse', 'none', 'rat', 'rat', 'rat', 'mouse', 'mouse', 'rat', 'human', 'rat', 'rat', 'CD1 mouse'], 'Route of Administration': ['iv', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'iv', 'iv', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Type of Microsomes': ['none', 'liver microsomes', 'liver microsomes', 'liver microsomes', 'liver microsomes', 'liver microsomes', 'microsomes', 'liver microsomes', 'liver microsomes', 'none', 'liver microsomes', 'none', 'none', 'liver microsomes', 'liver microsomes', 'none', 'liver microsomes', 'microsomes', 'liver microsomes', 'liver microsomes'], 'Protein Amount or Microsomal Protein Concentration': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Liver_microsomes_clearance experiment': [False, True, True, True, True, True, True, True, True, False, True, False, False, True, True, False, True, True, True, True]}}\n```

Example 2
Input: 'Microsomal stability in human liver microsomes assessed as intrinsic clearance \n Intrinsic clearance in human liver microsomes at 1 to 2.5 uM in presence of NADPH \n Clearance in iv dosed mouse \n Clearance in rat \n Clearance in human liver microsomes at 1 uM incubated for 60 mins followed by NADPH generating system addition and measured by LC-MS/MS analysis \n Clearance in Sprague-Dawley rat at 1 mg/kg, iv by LC-MS/MS analysis \n Clearance in Sprague-Dawley rat at 1 mg/kg, iv \n Intrinsic clearance in human liver microsome \n Clearance in human hepatocytes \n Clearance in rat at 2 mg/kg, iv \n Intrinsic clearance in mouse liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis \n Intrinsic clearance in human liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis \n Metabolic stability in human microsomes assessed as intrinsic clearance \n Clearance in rat hepatocytes \n Intrinsic clearance in human hepatocytes \n Metabolic stability in mouse liver microsomes assessed as clearance \n Unbound clearance in rat \n Intrinsic clearance in mouse microsomes \n Clearance in rat at 0.5 mg/kg, iv \n Intrinsic clearance in mouse microsomes preincubated for 5 mins followed by addition of 1.5 mM of NADPH measured after 5 to 30 mins by LC-MS/MS analysis'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4], 'original sentence': ['Microsomal stability in human liver microsomes assessed as intrinsic clearance', 'Intrinsic clearance in human liver microsomes at 1 to 2.5 uM in presence of NADPH', 'Clearance in iv dosed mouse', 'Clearance in rat', 'Clearance in human liver microsomes at 1 uM incubated for 60 mins followed by NADPH generating system addition and measured by LC-MS/MS analysis', 'Clearance in Sprague-Dawley rat at 1 mg/kg, iv by LC-MS/MS analysis', 'Clearance in Sprague-Dawley rat at 1 mg/kg, iv', 'Intrinsic clearance in human liver microsome', 'Clearance in human hepatocytes', 'Clearance in rat at 2 mg/kg, iv', 'Intrinsic clearance in mouse liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis', 'Intrinsic clearance in human liver microsomes at 1 uM measured up to 30 mins in presence of NADPH by UPLC-MS/MS analysis', 'Metabolic stability in human microsomes assessed as intrinsic clearance', 'Clearance in rat hepatocytes', 'Intrinsic clearance in human hepatocytes', 'Metabolic stability in mouse liver microsomes assessed as clearance', 'Unbound clearance in rat', 'Intrinsic clearance in mouse microsomes', 'Clearance in rat at 0.5 mg/kg, iv', 'Intrinsic clearance in mouse microsomes preincubated for 5 mins followed by addition of 1.5 mM of NADPH measured after 5 to 30 mins by LC-MS/MS analysis'], 'Compound Concentration': ['none', '1 to 2.5 uM', 'none', 'none', '1 uM', '1 mg/kg', '1 mg/kg', 'none', 'none', '2 mg/kg', '1 uM', '1 uM', 'none', 'none', 'none', 'none', 'none', 'none', '0.5 mg/kg', 'none'], 'Incubation Time': ['none', 'none', 'none', 'none', '60 mins', 'none', 'none', 'none', 'none', 'none', 'up to 30 mins', 'up to 30 mins', 'none', 'none', 'none', 'none', 'none', 'none', 'none', '5 to 30 mins'], 'Presence of NADPH/NADP': ['none', 'NADPH', 'none', 'none', 'NADPH generating system', 'none', 'none', 'none', 'none', 'none', 'NADPH', 
'NADPH', 'none', 'none', 'none', 'none', 'none', 'none', 'none', '1.5 mM NADPH'], 'Enzyme Source': ['liver', 'liver', 'none', 'none', 'liver', 'none', 'none', 'liver', 'hepatocytes', 'none', 'liver', 'liver', 'none', 'hepatocytes', 'hepatocytes', 'liver', 'none', 'none', 'none', 'none'], 'Temperature Range': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Analytical Technique': ['none', 'none', 'none', 'none', 'LC-MS/MS', 'LC-MS/MS', 'none', 'none', 'none', 'none', 'UPLC-MS/MS', 'UPLC-MS/MS', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'LC-MS/MS'], 'Species': ['human', 'human', 'mouse', 'rat', 'human', 'Sprague-Dawley rat', 'Sprague-Dawley rat', 'human', 'human', 'rat', 'mouse', 'human', 'human', 'rat', 'human', 'mouse', 'rat', 'mouse', 'rat', 'mouse'], 'Route of Administration': ['none', 'none', 'iv', 'none', 'none', 'iv', 'iv', 'none', 'none', 'iv', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'iv', 'none'], 'Type of Microsomes': ['liver microsomes', 'liver microsomes', 'none', 'none', 'liver microsomes', 'none', 'none', 'liver microsomes', 'none', 'none', 'liver microsomes', 'liver microsomes', 'microsomes', 'none', 'none', 'liver microsomes', 'none', 'microsomes', 'none', 'microsomes'], 'Protein Amount or Microsomal Protein Concentration': ['none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none'], 'Liver_microsomes_clearance experiment': [True, True, False, False, True, False, False, True, False, False, True, True, True, False, False, True, False, True, False, True]}}\n```

"""

In [None]:
def replace_single_with_double_quotes(code_str, keep_apostrophes=True):
    """Turn single-quote string delimiters in ``code_str`` into double quotes.

    A blanket ``str.replace`` also rewrites apostrophes inside ordinary words
    (``don't`` becomes ``don"t``), which corrupts prose that is mixed with the
    quoted data. By default a single quote that has an ASCII letter on both
    sides is therefore treated as an apostrophe and left untouched.

    Args:
        code_str: Text containing single-quoted literals.
        keep_apostrophes: When True (default), preserve ``'`` characters that
            sit between two letters. Pass False to replace every single quote,
            which is the original blanket behaviour.

    Returns:
        The converted string.

    Note:
        A quote directly between two letters is always read as an apostrophe,
        so prefixed literals such as ``f'abc'`` keep their opening quote.
    """
    if not keep_apostrophes:
        return code_str.replace("'", '"')
    # Replace a quote unless it is both preceded and followed by a letter.
    return re.sub(r"(?<![A-Za-z])'|'(?![A-Za-z])", '"', code_str)


# Rewrite the quote characters in the prompt so the example dictionaries use double quotes.
mes = replace_single_with_double_quotes(mes)


In [None]:
# Start the mining thread; its first user message is the two-shot prompt in `mes`.
opening_message = {
    "role": "user",
    "content": mes,
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[opening_message])

In [None]:
# Display the id of the mining thread.
thread.id

In [None]:
# Number of distinct assay descriptions held in `assay_description`.
len(assay_description)

In [None]:
# Mine every assay description in batches of 20 and save one CSV per batch.
batch_dir = f'../data/data_mining_results/{prop}/batch'

# Create the output folder up front: a missing directory would otherwise make
# every to_csv call fail and each paid-for batch would only be reported as an error.
os.makedirs(batch_dir, exist_ok=True)

for i in tqdm(range(0,len(assay_description),20)):
    info = ' \n '.join(assay_description[i:i+20])

    chatGPT_replay(client,thread.id, assistant, question_content=info)
    time.sleep(3)
    chatGPT_check_replay(client,thread, dis=False)

    # data[0] is read as the latest message of the thread.
    thread_messages = client.beta.threads.messages.list(thread.id)
    reply_text = thread_messages.data[0].content[0].text.value

    try:
        answer = extract_code_script_from_markdown(reply_text)[0]
        answer = pd.DataFrame(find_dictionaries_in_string(answer)[0])
        answer.to_csv(f'{batch_dir}/{prop}_batch_{i}.csv',index=False)
    except Exception as err:
        # Best effort: report the batch and the raw reply, then carry on with the next one.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f'error for {i}: {err!r}')
        display(reply_text)
    

# Combine results

In [None]:
import os
import pandas as pd

# Gather every per-batch CSV into a single frame.
root_path = '../data/data_mining_results/Liver_microsomes_clearance/batch/'

# os.listdir returns names in arbitrary order, so sort for a reproducible row order,
# and match the extension rather than any name that merely contains "csv".
batch_files = sorted(name for name in os.listdir(root_path) if name.endswith('.csv'))
batch_frames = [pd.read_csv(os.path.join(root_path, name)) for name in batch_files]

# Concatenate once; fall back to an empty frame when no batch file is present.
result = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [None]:
# Inspect the column names of the combined frame.
result.columns

In [None]:
# Keep the sentence, the ten condition columns and the experiment flag, in this order.
selected_columns = [
    'original sentence',
    'Compound Concentration',
    'Incubation Time',
    'Presence of NADPH/NADP',
    'Enzyme Source',
    'Temperature Range',
    'Analytical Technique',
    'Species',
    'Route of Administration',
    'Type of Microsomes',
    'Protein Amount or Microsomal Protein Concentration',
    'Liver_microsomes_clearance experiment',
]
result = result[selected_columns]

In [None]:
# Relabel the columns positionally; the first label becomes 'Assay Description',
# the name of the key column used for the merge with the raw table.
final_column_names = [
    'Assay Description',
    'Compound Concentration',
    'Incubation Time',
    'Presence of NADPH/NADP',
    'Enzyme Source',
    'Temperature Range',
    'Analytical Technique',
    'Species',
    'Route of Administration',
    'Type of Microsomes',
    'Protein Amount or Microsomal Protein Concentration',
    'Liver_microsomes_clearance experiment',
]
result.columns = final_column_names

In [None]:
# Reload the raw clearance table and keep only the columns listed in `col_list`.
raw_measurements = pd.read_csv('../data/raw_data/cl/chembl_cl_raw_data.csv')
col_list = [
    'Molecule ChEMBL ID',
    'Smiles',
    'Standard Type',
    'Standard Relation',
    'Standard Value',
    'Standard Units',
    'Assay ChEMBL ID',
    'Assay Description',
    'Assay Type',
    'BAO Format ID',
    'BAO Label',
    'Document ChEMBL ID',
    'Source Description',
    'Document Journal',
    'Document Year',
]
df = raw_measurements[col_list]

In [None]:
# Attach the mined conditions to every raw measurement through its assay description.
# The mined table comes from model output and may contain the same description more
# than once; keep the first row per description so the left join cannot multiply
# measurement rows.
result_unique = result.drop_duplicates(subset='Assay Description')
df = df.merge(result_unique,on='Assay Description',how='left')
df

In [None]:
# Write the merged table to disk (without the index column).
final_csv = '../data/data_mining_results/Liver_microsomes_clearance/cl_Chembl_data_mining_finished.csv'
df.to_csv(final_csv, index=False)