In [None]:
import ast
import os
import pickle as pkl
import time

import openai
import pandas as pd
from IPython.display import display, Markdown
from openai import OpenAI
from tqdm.notebook import tqdm

from datamining_utils import chatGPT_check_replay,chatGPT_replay,load_api_key_from_file,extract_code_script_from_markdown,find_dictionaries_in_string

pd.set_option('display.max_columns', 60)

%load_ext autoreload
%autoreload 2

In [None]:

# Usage
# Build the OpenAI client shared by every agent section below.
# load_api_key_from_file is a datamining_utils helper (presumably reads the key
# from a local file -- confirm); `OpenAI` is imported in the top import cell.
api_key = load_api_key_from_file()
client = OpenAI(api_key=api_key)

# Key Word Agent

In [None]:
# Property name; reused below in assistant names, prompts and output paths.
prop = 'Cytochrome_P450_enzymes'

In [None]:
# Assistant used to summarise assay descriptions into condition categories.
assistant = client.beta.assistants.create(
    name=f'{prop} knowledge generation chatbot',
    instructions="Please summarize the ADME-T related important experimental conditions",
    model="gpt-4-1106-preview",
)

In [None]:
# Show the id of the assistant held in `assistant`.
assistant.id

In [None]:
# Open a conversation thread seeded with one user message.
thread = client.beta.threads.create(
    messages=[{
        "role": "user",
        "content": "summarise the key experimental conditions within the given experiments",
        "file_ids": assistant.file_ids,
    }]
)

In [None]:
# Show the id of the thread held in `thread`.
thread.id

### Summarize the important experimental conditions

In [None]:
# Read the raw CYP table and keep one row per distinct assay description.
df = pd.read_csv('../data/raw_data/cyp/chembl_cyp_raw_data.csv').drop_duplicates('Assay Description')
# The first 50 unique descriptions, joined with ' \n ', become the text given to the LLM.
condition = ' \n '.join(df['Assay Description'].iloc[:50])

In [None]:
# Prompt for the keyword agent: embeds `prop` and the joined assay descriptions
# (`condition`) and asks for the ten most frequent condition categories,
# returned as a Python list inside a ```python fence.
mes = f"""
Please summarize the key experimental conditions for experiments related to {prop} within the {condition}


Please analyze the document and return a list of the top ten most frequently mentioned experimental condition categories. These should be summarized under broad categories, such as pH levels, temperature ranges, or other relevant general conditions, rather than specific values or detailed conditions. Present this information in a Python list format.

Each entry in the list should represent a unique category of experimental conditions. Avoid duplicating similar conditions and focus on capturing the overarching categories that these conditions fall under.

Example output:
```python
['pH Level', 'Temperature Range', 'Light Exposure']
```

Ensure that the list is comprehensive, covering all major categories of experimental conditions mentioned in the document.
"""

In [None]:
# Hand the prompt to the assistant on this thread via the datamining_utils helper
# (presumably posts `mes` as a user message and starts a run -- confirm in the helper).
chatGPT_replay(client,thread.id, assistant, question_content=mes)

In [None]:
# Helper from datamining_utils (presumably waits for / shows the assistant's reply -- confirm).
chatGPT_check_replay(client,thread)

# data[0] is taken as the assistant's latest message (the API lists newest first by default).
thread_messages = client.beta.threads.messages.list(thread.id)
answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
# The reply is untrusted model output: literal_eval only accepts Python literals,
# whereas eval() would execute arbitrary code returned by the model.
experimental_conditions = ast.literal_eval(answer)
experimental_conditions

In [None]:
# Hard-coded condition categories; this assignment replaces whatever value
# `experimental_conditions` held before, so later cells use exactly this list.
experimental_conditions = [
    'Enzyme Source',
    'Incubation Time',
    'Temperature Range',
    'pH Level',
    'Substrate Concentration',
    'Inhibitor Concentration',
    'Cofactors',
    'Detection Method',
    'Protein Expression System',
    'Cell Type',
]

In [None]:
# Persist the condition list so the next section can reload it.
conditions_path = f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl'
# Create the per-property output folder on first use; open() fails if it is missing.
os.makedirs(os.path.dirname(conditions_path), exist_ok=True)
with open(conditions_path,'wb') as f:
    pkl.dump(experimental_conditions,f)

# Example Agent

In [None]:
# Reload the pickled condition list from the per-property results folder.
# NOTE: unpickling can execute code -- only load files produced by this notebook.
with open(f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl', mode='rb') as pkl_file:
    experimental_conditions = pkl.load(pkl_file)

In [None]:
# Assistant used to draft the few-shot examples for this property.
assistant = client.beta.assistants.create(
    name=f'{prop} Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Show the id of the assistant held in `assistant`.
assistant.id

In [None]:
# Zero-shot instruction for the example agent: lists the condition names and
# the property so the model returns one dictionary covering every sentence.
condition_names = ', '.join(experimental_conditions)
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {condition_names}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please include all the sentences

"""

In [None]:
# New thread whose first user message is the instruction prompt in `mes`.
thread = client.beta.threads.create(
    messages=[{
        "role": "user",
        "content": mes,
        "file_ids": assistant.file_ids,
    }]
)

In [None]:
# Show the id of the thread held in `thread`.
thread.id

In [None]:
# Reload the full raw table (an earlier cell replaced `df` with a de-duplicated copy).
df = pd.read_csv('../data/raw_data/cyp/chembl_cyp_raw_data.csv')

In [None]:
# Draft few-shot examples: send the 40 most frequent assay descriptions to the
# assistant in batches of 10 and collect the parsed replies in `result`.
# value_counts() orders by frequency (most frequent first); computed once, not per batch.
top_descriptions = df['Assay Description'].value_counts().keys()
batch_frames = []
for i in tqdm(range(0,40,10)):
    info = ' \n '.join(top_descriptions[i:i+10])

    chatGPT_replay(client,thread.id, assistant, question_content=info)
    time.sleep(3)
    runs = client.beta.threads.runs.list(thread.id)

    # Poll the first listed run until it completes. A run that ends in a
    # terminal failure state never becomes 'completed', so raise instead of
    # polling forever.
    while runs.data[0].status != 'completed':
        if runs.data[0].status in ('failed', 'cancelled', 'expired'):
            raise RuntimeError(f'run ended with status {runs.data[0].status!r} for batch starting at {i}')
        time.sleep(3)
        runs = client.beta.threads.runs.list(thread.id)

    thread_messages = client.beta.threads.messages.list(thread.id)
    answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
    answer = pd.DataFrame(find_dictionaries_in_string(answer))

    display(answer)
    batch_frames.append(answer)

# Concatenate once at the end instead of growing a frame inside the loop.
result = pd.concat(batch_frames)

In [None]:
# Write the drafted examples to CSV (without the index column).
result.to_csv(f'../data/data_mining_results/{prop}/example_{prop}.csv',index=False)

# Manually Validate and Create Full Prompt with Two-Shot Examples

In [None]:
# Load the validated examples file (presumably a hand-corrected copy of example_{prop}.csv -- confirm).
examples = pd.read_csv(f'../data/data_mining_results/{prop}/example_{prop}_manuel_validated.csv')

In [None]:
# First 20 validated rows; the joined sentences are displayed so they can be
# copied into the prompt (presumably as the "Example 1" input -- confirm).
tmp = examples[0:20]

' \n '.join(tmp[ 'original sentence'].values)

In [None]:
# Column-oriented dict of the current `tmp` slice, shown as a string for copying into the prompt.
str(tmp.to_dict(orient='list'))

In [None]:
# Remaining validated rows (from row 20 onward); the joined sentences are displayed
# so they can be copied into the prompt (presumably as the "Example 2" input -- confirm).
tmp = examples[20::]

' \n '.join(tmp[ 'original sentence'].values)

In [None]:
# Column-oriented dict of the current `tmp` slice, shown as a string for copying into the prompt.
str(tmp.to_dict(orient='list'))

f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Inhibition of CYP2D6 \n Inhibition of CYP3A4 \n Inhibition of CYP2C9 \n Inhibition of CYP3A4 (unknown origin) \n Inhibition of CYP2C19 \n Inhibition of CYP2C9 (unknown origin) \n Inhibition of CYP2D6 (unknown origin) \n Inhibition of human placental microsome CYP19 \n GSK_TCAKS: pIC50 CYP51 from Trypanosoma cruzi. \n Inhibition of human CYP3A4 \n Inhibition of human CYP11B1 expressed in chinese hamster V79 cells \n Inhibition of CYP1A2 \n Inhibition of human recombinant CYP2D6 \n Inhibition of human CYP2C9 \n Inhibition of human CYP11B2 expressed in chinese hamster V79 cells \n Inhibition of human CYP11B1 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate \n Inhibition of CYP2C19 (unknown origin) \n Inhibition of human CYP2D6 \n Inhibition of CYP1A2 (unknown origin) \n Inhibition of human recombinant CYP3A4'

Output:
```python\n{{'index': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'original sentence': ['Inhibition of CYP2D6', 'Inhibition of CYP3A4', 'Inhibition of CYP2C9', 'Inhibition of CYP3A4 (unknown origin)', 'Inhibition of CYP2C19', 'Inhibition of CYP2C9 (unknown origin)', 'Inhibition of CYP2D6 (unknown origin)', 'Inhibition of human placental microsome CYP19', 'GSK_TCAKS: pIC50 CYP51 from Trypanosoma cruzi.', 'Inhibition of human CYP3A4', 'Inhibition of human CYP11B1 expressed in chinese hamster V79 cells', 'Inhibition of CYP1A2', 'Inhibition of human recombinant CYP2D6', 'Inhibition of human CYP2C9', 'Inhibition of human CYP11B2 expressed in chinese hamster V79 cells', 'Inhibition of human CYP11B1 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate', 'Inhibition of CYP2C19 (unknown origin)', 'Inhibition of human CYP2D6', 'Inhibition of CYP1A2 (unknown origin)', 'Inhibition of human recombinant CYP3A4'], 'Enzyme Source': ['CYP2D6', 'CYP3A4', 'CYP2C9', 'CYP3A4', 'CYP2C19', 'CYP2C9', 'CYP2D6', 'CYP19', 'CYP51 from Trypanosoma cruzi', 'CYP3A4', 'CYP11B1', 'CYP1A2', 'CYP2D6', 'CYP2C9', 'CYP11B2', 'CYP11B1', 'CYP2C19', 'CYP2D6', 'CYP1A2', 'CYP3A4'], 'Incubation Time': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'pH Level': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Substrate Concentration': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '[1,2-3H]-11-deoxycorticosterone', nan, nan, nan, nan], 'Inhibitor Concentration': [nan, nan, nan, nan, nan, nan, nan, nan, 'pIC50', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Cofactors': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Detection 
Method': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Protein Expression System': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'chinese hamster V79 cells', nan, 'recombinant', nan, 'chinese hamster V79 cells', 'hamster V79MZh cells', nan, nan, nan, 'recombinant'], 'CYP sources': [nan, nan, nan, 'unknown origin', nan, 'unknown origin', 'unknown origin', 'human placental microsome ', nan, 'human ', 'human ', nan, 'human recombinant ', 'human', 'human', 'human', 'unknown origin', 'human', 'unknown origin', 'human recombinant '], 'Cytochrome_P450_enzymes': [True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True]}}\n```

Example 2
Input: 'Inhibition of human CYP2C19 \n DRUGMATRIX: CYP450, 2D6 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin) \n Inhibition of human CYP11B2 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate \n Inhibition of human recombinant CYP1A2 \n In vitro inhibitory concentration against human placental CYP19 incubated with 500 nM of substrate androstenedione in presence of the compound \n Inhibition of human CYP11B2 expressed in hamster V79 MZh cells \n In vitro inhibitory concentration against human CYP11B2 expressed in V79MZh hamster fibroblasts incubated with 100 nM of substrate deoxy-corticosterone in presence of the compound \n DNDI: CYP Inhibition \n Inhibition of human recombinant CYP2C9 \n Inhibition of CYP2C9 in human liver microsomes \n Inhibition of CYP3A4 in human liver microsomes \n Inhibition of CYP8B1 in human liver microsomes using 7alpha-hydroxy-4-cholesten-3-one as substrate preincubated followed by substrate addition in presence of NADPH and measured after 45 mins by ESI-MS analysis \n DRUGMATRIX: CYP450, 2C19 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin) \n Inhibition of human CYP11B1 expressed in hamster V79 MZh cells \n Reversible inhibition of CYP3A4 \n Inhibition of recombinant CYP21 (unknown origin) expressed in human AD293 cells using 17alpha-hydroxyprogesterone as substrate pretreated for 60 mins followed by substrate addition and measured after 45 mins by LC/MS analysis \n Inhibition of recombinant CYP17 (unknown origin) expressed in human AD293 cells using 21[3H]-17alpha-hydroxyl-pregnenolone as substrate pretreated for 60 mins followed by substrate addition and measured after 4 hrs by Topcount method \n Inhibition of CYP2D6 in human liver microsomes \n Inhibition of CYP3A4 in human liver microsomes using midazolam as substrate after 10 mins by LC/MS/MS analysis \n DRUGMATRIX: CYP450, 3A4 enzyme inhibition (substrate: 7-Benzyloxy-4-(trifluoromethyl)-coumarin)'

Output:
```python\n{{'index': [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'original sentence': ['Inhibition of human CYP2C19', 'DRUGMATRIX: CYP450, 2D6 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin)', 'Inhibition of human CYP11B2 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate', 'Inhibition of human recombinant CYP1A2', 'In vitro inhibitory concentration against human placental CYP19 incubated with 500 nM of substrate androstenedione in presence of the compound', 'Inhibition of human CYP11B2 expressed in hamster V79 MZh cells', 'In vitro inhibitory concentration against human CYP11B2 expressed in V79MZh hamster fibroblasts incubated with 100 nM of substrate deoxy-corticosterone in presence of the compound', 'DNDI: CYP Inhibition', 'Inhibition of human recombinant CYP2C9', 'Inhibition of CYP2C9 in human liver microsomes', 'Inhibition of CYP3A4 in human liver microsomes', 'Inhibition of CYP8B1 in human liver microsomes using 7alpha-hydroxy-4-cholesten-3-one as substrate preincubated followed by substrate addition in presence of NADPH and measured after 45 mins by ESI-MS analysis', 'DRUGMATRIX: CYP450, 2C19 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin)', 'Inhibition of human CYP11B1 expressed in hamster V79 MZh cells', 'Reversible inhibition of CYP3A4', 'Inhibition of recombinant CYP21 (unknown origin) expressed in human AD293 cells using 17alpha-hydroxyprogesterone as substrate pretreated for 60 mins followed by substrate addition and measured after 45 mins by LC/MS analysis', 'Inhibition of recombinant CYP17 (unknown origin) expressed in human AD293 cells using 21[3H]-17alpha-hydroxyl-pregnenolone as substrate pretreated for 60 mins followed by substrate addition and measured after 4 hrs by Topcount method', 'Inhibition of CYP2D6 in human liver microsomes', 'Inhibition of CYP3A4 in human liver microsomes using midazolam as substrate after 10 mins by LC/MS/MS analysis', 'DRUGMATRIX: 
CYP450, 3A4 enzyme inhibition (substrate: 7-Benzyloxy-4-(trifluoromethyl)-coumarin)'], 'Enzyme Source': ['CYP2C19', 'CYP2D6', 'CYP11B2', 'CYP1A2', 'CYP19', 'CYP11B2', 'CYP11B2', nan, 'CYP2C9', 'CYP2C9', 'CYP3A4', 'CYP8B1', 'CYP2C19', 'CYP11B1', 'CYP3A4', 'CYP21', 'CYP17', 'CYP2D6', 'CYP3A4', 'CYP3A4'], 'Incubation Time': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '45 mins', nan, nan, nan, '60 mins pretreatment, 45 mins post-substrate addition', '60 mins pretreatment, 4 hrs post-substrate addition', nan, '10 mins', nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'pH Level': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Substrate Concentration': [nan, '3-Cyano-7-ethoxycoumarin', '[1,2-3H]-11-deoxycorticosterone', nan, '500 nM androstenedione', nan, '100 nM deoxy-corticosterone', nan, nan, nan, nan, '7alpha-hydroxy-4-cholesten-3-one', '3-Cyano-7-ethoxycoumarin', nan, nan, '17alpha-hydroxyprogesterone', '21[3H]-17alpha-hydroxyl-pregnenolone', nan, 'midazolam', '7-Benzyloxy-4-(trifluoromethyl)-coumarin'], 'Inhibitor Concentration': [nan, nan, nan, nan, 'inhibitory concentration in vitro', nan, 'inhibitory concentration in vitro', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Cofactors': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'NADPH', nan, nan, nan, nan, nan, nan, nan, nan], 'Detection Method': [nan, 'DRUGMATRIX', nan, nan, nan, nan, nan, 'DNDI', nan, nan, nan, 'ESI-MS', 'DRUGMATRIX', nan, nan, 'LC/MS', 'Topcount', nan, 'LC/MS/MS', 'DRUGMATRIX'], 'Protein Expression System': [nan, nan, 'hamster V79MZh cells', 'recombinant', nan, 'hamster V79 MZh cells', 'V79MZh hamster fibroblasts', nan, 'recombinant', nan, nan, nan, nan, 'hamster V79 MZh cells', nan, 'human AD293 cells', 'human AD293 cells', nan, nan, nan], 'CYP sources': ['human', nan, 'human', 'human recombinant ', 'human 
placental', 'human', 'human', nan, 'human recombinant ', 'human', 'human', 'human', nan, 'human', nan, 'unknown origin', 'unknown origin', 'human', 'human', nan], 'Cytochrome_P450_enzymes': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```
"""

# Data Mining Agent

### Load data 

In [None]:
# Distinct assay descriptions, ordered by how often each occurs in the raw table.
df = pd.read_csv('../data/raw_data/cyp/chembl_cyp_raw_data.csv')
assay_description = df['Assay Description'].value_counts().index

### Create agent

In [None]:
# Assistant used for the full data-mining pass.
assistant = client.beta.assistants.create(
    name='Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Show the id of the assistant held in `assistant`.
assistant.id

In [None]:
# Condition fields requested from the model, stored directly as the
# comma-separated string that the prompt template interpolates.
experimental_conditions_list = ', '.join([
    'Enzyme Source',
    'Incubation Time',
    'Temperature Range',
    'pH Level',
    'Substrate Concentration',
    'Inhibitor Concentration',
    'Cofactors',
    'Detection Method',
    'Protein Expression System',
    'CYP sources',
])

In [None]:
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Inhibition of CYP2D6 \n Inhibition of CYP3A4 \n Inhibition of CYP2C9 \n Inhibition of CYP3A4 (unknown origin) \n Inhibition of CYP2C19 \n Inhibition of CYP2C9 (unknown origin) \n Inhibition of CYP2D6 (unknown origin) \n Inhibition of human placental microsome CYP19 \n GSK_TCAKS: pIC50 CYP51 from Trypanosoma cruzi. \n Inhibition of human CYP3A4 \n Inhibition of human CYP11B1 expressed in chinese hamster V79 cells \n Inhibition of CYP1A2 \n Inhibition of human recombinant CYP2D6 \n Inhibition of human CYP2C9 \n Inhibition of human CYP11B2 expressed in chinese hamster V79 cells \n Inhibition of human CYP11B1 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate \n Inhibition of CYP2C19 (unknown origin) \n Inhibition of human CYP2D6 \n Inhibition of CYP1A2 (unknown origin) \n Inhibition of human recombinant CYP3A4'

Output:
```python\n{{'index': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'original sentence': ['Inhibition of CYP2D6', 'Inhibition of CYP3A4', 'Inhibition of CYP2C9', 'Inhibition of CYP3A4 (unknown origin)', 'Inhibition of CYP2C19', 'Inhibition of CYP2C9 (unknown origin)', 'Inhibition of CYP2D6 (unknown origin)', 'Inhibition of human placental microsome CYP19', 'GSK_TCAKS: pIC50 CYP51 from Trypanosoma cruzi.', 'Inhibition of human CYP3A4', 'Inhibition of human CYP11B1 expressed in chinese hamster V79 cells', 'Inhibition of CYP1A2', 'Inhibition of human recombinant CYP2D6', 'Inhibition of human CYP2C9', 'Inhibition of human CYP11B2 expressed in chinese hamster V79 cells', 'Inhibition of human CYP11B1 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate', 'Inhibition of CYP2C19 (unknown origin)', 'Inhibition of human CYP2D6', 'Inhibition of CYP1A2 (unknown origin)', 'Inhibition of human recombinant CYP3A4'], 'Enzyme Source': ['CYP2D6', 'CYP3A4', 'CYP2C9', 'CYP3A4', 'CYP2C19', 'CYP2C9', 'CYP2D6', 'CYP19', 'CYP51 from Trypanosoma cruzi', 'CYP3A4', 'CYP11B1', 'CYP1A2', 'CYP2D6', 'CYP2C9', 'CYP11B2', 'CYP11B1', 'CYP2C19', 'CYP2D6', 'CYP1A2', 'CYP3A4'], 'Incubation Time': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'pH Level': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Substrate Concentration': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '[1,2-3H]-11-deoxycorticosterone', nan, nan, nan, nan], 'Inhibitor Concentration': [nan, nan, nan, nan, nan, nan, nan, nan, 'pIC50', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Cofactors': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Detection 
Method': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Protein Expression System': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'chinese hamster V79 cells', nan, 'recombinant', nan, 'chinese hamster V79 cells', 'hamster V79MZh cells', nan, nan, nan, 'recombinant'], 'CYP sources': [nan, nan, nan, 'unknown origin', nan, 'unknown origin', 'unknown origin', 'human placental microsome ', nan, 'human ', 'human ', nan, 'human recombinant ', 'human', 'human', 'human', 'unknown origin', 'human', 'unknown origin', 'human recombinant '], 'Cytochrome_P450_enzymes': [True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True]}}\n```

Example 2
Input: 'Inhibition of human CYP2C19 \n DRUGMATRIX: CYP450, 2D6 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin) \n Inhibition of human CYP11B2 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate \n Inhibition of human recombinant CYP1A2 \n In vitro inhibitory concentration against human placental CYP19 incubated with 500 nM of substrate androstenedione in presence of the compound \n Inhibition of human CYP11B2 expressed in hamster V79 MZh cells \n In vitro inhibitory concentration against human CYP11B2 expressed in V79MZh hamster fibroblasts incubated with 100 nM of substrate deoxy-corticosterone in presence of the compound \n DNDI: CYP Inhibition \n Inhibition of human recombinant CYP2C9 \n Inhibition of CYP2C9 in human liver microsomes \n Inhibition of CYP3A4 in human liver microsomes \n Inhibition of CYP8B1 in human liver microsomes using 7alpha-hydroxy-4-cholesten-3-one as substrate preincubated followed by substrate addition in presence of NADPH and measured after 45 mins by ESI-MS analysis \n DRUGMATRIX: CYP450, 2C19 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin) \n Inhibition of human CYP11B1 expressed in hamster V79 MZh cells \n Reversible inhibition of CYP3A4 \n Inhibition of recombinant CYP21 (unknown origin) expressed in human AD293 cells using 17alpha-hydroxyprogesterone as substrate pretreated for 60 mins followed by substrate addition and measured after 45 mins by LC/MS analysis \n Inhibition of recombinant CYP17 (unknown origin) expressed in human AD293 cells using 21[3H]-17alpha-hydroxyl-pregnenolone as substrate pretreated for 60 mins followed by substrate addition and measured after 4 hrs by Topcount method \n Inhibition of CYP2D6 in human liver microsomes \n Inhibition of CYP3A4 in human liver microsomes using midazolam as substrate after 10 mins by LC/MS/MS analysis \n DRUGMATRIX: CYP450, 3A4 enzyme inhibition (substrate: 7-Benzyloxy-4-(trifluoromethyl)-coumarin)'

Output:
```python\n{{'index': [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'original sentence': ['Inhibition of human CYP2C19', 'DRUGMATRIX: CYP450, 2D6 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin)', 'Inhibition of human CYP11B2 expressed in hamster V79MZh cells using [1,2-3H]-11-deoxycorticosterone as substrate', 'Inhibition of human recombinant CYP1A2', 'In vitro inhibitory concentration against human placental CYP19 incubated with 500 nM of substrate androstenedione in presence of the compound', 'Inhibition of human CYP11B2 expressed in hamster V79 MZh cells', 'In vitro inhibitory concentration against human CYP11B2 expressed in V79MZh hamster fibroblasts incubated with 100 nM of substrate deoxy-corticosterone in presence of the compound', 'DNDI: CYP Inhibition', 'Inhibition of human recombinant CYP2C9', 'Inhibition of CYP2C9 in human liver microsomes', 'Inhibition of CYP3A4 in human liver microsomes', 'Inhibition of CYP8B1 in human liver microsomes using 7alpha-hydroxy-4-cholesten-3-one as substrate preincubated followed by substrate addition in presence of NADPH and measured after 45 mins by ESI-MS analysis', 'DRUGMATRIX: CYP450, 2C19 enzyme inhibition (substrate: 3-Cyano-7-ethoxycoumarin)', 'Inhibition of human CYP11B1 expressed in hamster V79 MZh cells', 'Reversible inhibition of CYP3A4', 'Inhibition of recombinant CYP21 (unknown origin) expressed in human AD293 cells using 17alpha-hydroxyprogesterone as substrate pretreated for 60 mins followed by substrate addition and measured after 45 mins by LC/MS analysis', 'Inhibition of recombinant CYP17 (unknown origin) expressed in human AD293 cells using 21[3H]-17alpha-hydroxyl-pregnenolone as substrate pretreated for 60 mins followed by substrate addition and measured after 4 hrs by Topcount method', 'Inhibition of CYP2D6 in human liver microsomes', 'Inhibition of CYP3A4 in human liver microsomes using midazolam as substrate after 10 mins by LC/MS/MS analysis', 'DRUGMATRIX: 
CYP450, 3A4 enzyme inhibition (substrate: 7-Benzyloxy-4-(trifluoromethyl)-coumarin)'], 'Enzyme Source': ['CYP2C19', 'CYP2D6', 'CYP11B2', 'CYP1A2', 'CYP19', 'CYP11B2', 'CYP11B2', nan, 'CYP2C9', 'CYP2C9', 'CYP3A4', 'CYP8B1', 'CYP2C19', 'CYP11B1', 'CYP3A4', 'CYP21', 'CYP17', 'CYP2D6', 'CYP3A4', 'CYP3A4'], 'Incubation Time': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '45 mins', nan, nan, nan, '60 mins pretreatment, 45 mins post-substrate addition', '60 mins pretreatment, 4 hrs post-substrate addition', nan, '10 mins', nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'pH Level': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Substrate Concentration': [nan, '3-Cyano-7-ethoxycoumarin', '[1,2-3H]-11-deoxycorticosterone', nan, '500 nM androstenedione', nan, '100 nM deoxy-corticosterone', nan, nan, nan, nan, '7alpha-hydroxy-4-cholesten-3-one', '3-Cyano-7-ethoxycoumarin', nan, nan, '17alpha-hydroxyprogesterone', '21[3H]-17alpha-hydroxyl-pregnenolone', nan, 'midazolam', '7-Benzyloxy-4-(trifluoromethyl)-coumarin'], 'Inhibitor Concentration': [nan, nan, nan, nan, 'inhibitory concentration in vitro', nan, 'inhibitory concentration in vitro', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Cofactors': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'NADPH', nan, nan, nan, nan, nan, nan, nan, nan], 'Detection Method': [nan, 'DRUGMATRIX', nan, nan, nan, nan, nan, 'DNDI', nan, nan, nan, 'ESI-MS', 'DRUGMATRIX', nan, nan, 'LC/MS', 'Topcount', nan, 'LC/MS/MS', 'DRUGMATRIX'], 'Protein Expression System': [nan, nan, 'hamster V79MZh cells', 'recombinant', nan, 'hamster V79 MZh cells', 'V79MZh hamster fibroblasts', nan, 'recombinant', nan, nan, nan, nan, 'hamster V79 MZh cells', nan, 'human AD293 cells', 'human AD293 cells', nan, nan, nan], 'CYP sources': ['human', nan, 'human', 'human recombinant ', 'human 
placental', 'human', 'human', nan, 'human recombinant ', 'human', 'human', 'human', nan, 'human', nan, 'unknown origin', 'unknown origin', 'human', 'human', nan], 'Cytochrome_P450_enzymes': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```
"""

In [None]:
def replace_single_with_double_quotes(code_str):
    """Return ``code_str`` with every ``'`` character swapped for ``"``.

    The substitution is purely textual, so apostrophes inside ordinary words
    are converted as well.
    """
    return '"'.join(code_str.split("'"))


# Rewrite every single quote in the prompt as a double quote.
# NOTE(review): this also changes apostrophes in the instructions ("don't" becomes 'don"t') -- confirm that is intended.
mes = replace_single_with_double_quotes(mes)


In [None]:
# New thread whose first user message is the two-shot prompt in `mes`.
thread = client.beta.threads.create(
    messages=[{
        "role": "user",
        "content": mes,
        "file_ids": assistant.file_ids,
    }]
)

In [None]:
# Show the id of the thread held in `thread`.
thread.id

In [None]:
# Number of distinct assay descriptions; the mining loop walks them in steps of 20.
len(assay_description)

In [None]:
# Mine every distinct assay description in batches of 20 and write one CSV per
# batch, so a failed batch loses only its own rows.
batch_dir = f'../data/data_mining_results/{prop}/batch'
# Make sure the output folder exists before the first to_csv call.
os.makedirs(batch_dir, exist_ok=True)
for i in tqdm(range(0,len(assay_description),20)):
    info = ' \n '.join(assay_description[i:i+20])

    chatGPT_replay(client,thread.id, assistant, question_content=info)
    time.sleep(3)
    chatGPT_check_replay(client,thread, dis=False)

    thread_messages = client.beta.threads.messages.list(thread.id)
    reply_text = thread_messages.data[0].content[0].text.value

    try:
        answer = extract_code_script_from_markdown(reply_text)[0]
        answer = pd.DataFrame(find_dictionaries_in_string(answer)[0])
        answer.to_csv(f'{batch_dir}/{prop}_batch_{i}.csv',index=False)
    except Exception as exc:
        # Best effort: report the batch and the raw reply, then move on.
        # Catching Exception (not a bare except) lets Ctrl-C stop the loop.
        print(f'error for {i}: {exc!r}')
        display(reply_text)
    

# Combine result

In [None]:
# Stitch the per-batch CSVs back into a single table of mined conditions.
# (`os` and `pd` come from the import cell at the top of the notebook.)
root_path = '../data/data_mining_results/Cytochrome_P450_enzymes/batch/'
# endswith('.csv') skips names that merely contain "csv"; sorted() fixes the
# otherwise arbitrary os.listdir order so the combined table is reproducible.
batch_files = sorted(file for file in os.listdir(root_path) if file.endswith('.csv'))
batch_frames = [pd.read_csv(os.path.join(root_path, file)) for file in batch_files]
# Concatenate once; pd.concat rejects an empty list, so fall back to an empty frame.
result = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [None]:
# Inspect the column names of the combined table.
result.columns

In [None]:
# Keep only the sentence and the mined fields, in a fixed order; any column
# not listed here is discarded.
result = result.loc[:, [
    'original sentence',
    'Enzyme Source',
    'Incubation Time',
    'Temperature Range',
    'pH Level',
    'Substrate Concentration',
    'Inhibitor Concentration',
    'Cofactors',
    'Detection Method',
    'Protein Expression System',
    'CYP sources',
    'Cytochrome_P450_enzymes',
]]

In [None]:
# Relabel the columns positionally: the first label becomes 'Assay Description'
# (the key used for the merge with the raw table below); the other eleven
# labels are the same names selected in the previous cell.
result.columns = ['Assay Description','Enzyme Source', 'Incubation Time',
       'Temperature Range', 'pH Level', 'Substrate Concentration',
       'Inhibitor Concentration', 'Cofactors', 'Detection Method',
       'Protein Expression System', 'CYP sources', 'Cytochrome_P450_enzymes']

In [None]:
# Raw-table columns carried into the final output.
col_list = [
    'Molecule ChEMBL ID',
    'Smiles',
    'Standard Type',
    'Standard Relation',
    'Standard Value',
    'Standard Units',
    'Assay ChEMBL ID',
    'Assay Description',
    'Assay Type',
    'BAO Format ID',
    'BAO Label',
    'Document ChEMBL ID',
    'Source Description',
    'Document Journal',
    'Document Year',
]
# Reload the raw table and narrow it to those columns.
df = pd.read_csv('../data/raw_data/cyp/chembl_cyp_raw_data.csv')[col_list]

In [None]:
# Attach the mined conditions to every measurement row through the shared
# 'Assay Description' key.
# - Repeated descriptions in `result` would multiply measurement rows in a left
#   merge, so keep the first row per description and let validate= enforce it.
# - df[col_list] restarts from the raw columns, so re-running this cell does not
#   merge onto an already-merged frame (which would add *_x / *_y columns).
mined_unique = result.drop_duplicates('Assay Description')
df = df[col_list].merge(mined_unique,on='Assay Description',how='left',validate='many_to_one')
df

In [None]:
# Write the merged table (raw columns plus mined conditions) to CSV without the index column.
df.to_csv('../data/data_mining_results/Cytochrome_P450_enzymes/chembl_cyp_data_mining_finished.csv',index=False)