In [None]:
import openai
from tqdm.notebook import tqdm
import pandas as pd
import os 
from IPython.display import display, Markdown
import pickle as pkl
pd.set_option('display.max_columns', 60)
import time

from datamining_utils import chatGPT_check_replay,chatGPT_replay,load_api_key_from_file,extract_code_script_from_markdown,find_dictionaries_in_string

%load_ext autoreload
%autoreload 2

In [None]:
# Build the OpenAI client from the API key returned by the project helper.
from openai import OpenAI

api_key = load_api_key_from_file()
client = OpenAI(api_key=api_key)

# Key Word Agent

In [None]:
# Property name; interpolated into assistant names, prompts and output paths below.
prop = "Ames"

In [None]:
# Create the assistant that summarises experimental-condition categories for `prop`.
keyword_agent_settings = {
    "name": f'{prop} knowledge generation chatbot',
    "instructions": "Please summarize the ADME-T related important experimental conditions",
    "model": "gpt-4-1106-preview",
}
assistant = client.beta.assistants.create(**keyword_agent_settings)

In [None]:
# Display the id of the assistant currently bound to `assistant`.
assistant.id

In [None]:
# Open a conversation thread seeded with a single user message that carries
# the assistant's attached file ids.
opening_message = {
    "role": "user",
    "content": "summarise the key experimental conditions within the given experiments",
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[opening_message])

In [None]:
# Display the id of the thread currently bound to `thread`.
thread.id

### Summarize the important experimental conditions

In [None]:
# Load the raw Ames CSV and keep one row per distinct assay description.
df = pd.read_csv('../data/raw_data/ames/chembl_ames_raw_data.csv')
df = df.drop_duplicates('Assay Description')

# Join the first 50 descriptions into a single text block for the prompt.
# Missing descriptions are dropped first because str.join raises TypeError on
# NaN; `.iloc` makes the positional slice explicit.
condition = ' \n '.join(df['Assay Description'].dropna().iloc[:50])

In [None]:
# Prompt for the keyword agent. The template is kept separate from the values
# that are substituted into it (`prop` and the joined assay descriptions).
summary_prompt_template = """
Please summarize the key experimental conditions for experiments related to {prop} within the {condition}


Please analyze the document and return a list of the top five most frequently mentioned experimental condition categories. These should be summarized under broad categories, such as pH levels, temperature ranges, or other relevant general conditions, rather than specific values or detailed conditions. Present this information in a Python list format.

Each entry in the list should represent a unique category of experimental conditions. Avoid duplicating similar conditions and focus on capturing the overarching categories that these conditions fall under.

Example output:
```python
['pH Level', 'Temperature Range', 'Light Exposure']
```

Ensure that the list is comprehensive, covering all major categories of experimental conditions mentioned in the document.
"""
mes = summary_prompt_template.format(prop=prop, condition=condition)

In [None]:
# Send the summarisation prompt to the thread through the project helper
# (defined in datamining_utils; its exact behaviour is not visible here).
chatGPT_replay(client,thread.id, assistant, question_content=mes)

In [None]:
import ast  # local import: only needed to parse the model's answer in this cell

# Project helper from datamining_utils; presumably waits for / reports on the
# run started by the previous cell (implementation not visible here).
chatGPT_check_replay(client,thread)

# Take the first code block from the first message returned for the thread.
thread_messages = client.beta.threads.messages.list(thread.id)
answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]

# The answer is model-generated text, so it must not be executed. literal_eval
# accepts only Python literals (the expected list of strings) and raises
# ValueError/SyntaxError on anything else instead of running it.
experimental_conditions = ast.literal_eval(answer)
experimental_conditions

In [None]:
# Condition categories pinned by hand; this assignment replaces whatever value
# `experimental_conditions` held before this cell ran.
experimental_conditions = [
    'Concentration/Dosage Levels',
    'Temperature Range',
    'Time Duration',
    'Metabolic Activation Presence (e.g., S9 fraction in Ames test)',
    'Cell/Tissue Type or Organism Used',
]

In [None]:
# Persist the condition categories so later sections can reload them.
conditions_dir = f'../data/data_mining_results/{prop}'
# open(..., 'wb') does not create parent directories, so make sure the
# per-property output folder exists before writing.
os.makedirs(conditions_dir, exist_ok=True)
with open(f'{conditions_dir}/{prop}_experimental_conditions_summaried_by_LLMs.pkl','wb') as f:
    pkl.dump(experimental_conditions,f)

# Examples Agent

In [None]:
# Reload the pickled condition categories for `prop`.
# NOTE: pickle.load executes arbitrary code from the file; only load files this project wrote.
conditions_pickle_path = f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl'
with open(conditions_pickle_path, 'rb') as pickle_file:
    experimental_conditions = pkl.load(pickle_file)

In [None]:
# Create the assistant that produces the example annotations.
examples_agent_settings = {
    "name": 'Data Mining chatbot',
    "instructions": "Please mining the key biomedical information within the given data",
    "model": "gpt-4-1106-preview",
}
assistant = client.beta.assistants.create(**examples_agent_settings)

In [None]:
# Display the id of the assistant currently bound to `assistant`.
assistant.id

In [None]:
# Instruction prompt for the examples agent: the condition categories become
# the requested dictionary keys, alongside a flag for whether it is a `prop` experiment.
condition_columns = ', '.join(experimental_conditions)
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {condition_columns}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please include all the sentences

"""

In [None]:
# Open a new thread whose first user message is the instruction prompt `mes`.
instruction_message = {
    "role": "user",
    "content": mes,
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[instruction_message])

In [None]:
# Display the id of the thread currently bound to `thread`.
thread.id

In [None]:
# Send the assay descriptions to the agent in batches of 20 and collect one
# DataFrame of extracted fields per batch.

# Run states after which the run can no longer reach 'completed'.
terminal_failure_states = ('failed', 'cancelled', 'expired', 'incomplete')

# The set of descriptions does not change between batches, so compute it once.
descriptions_by_count = df['Assay Description'].value_counts().keys()

batch_frames = []
for i in tqdm(range(0,40,20)):
    info = ' \n '.join(descriptions_by_count[i:i+20])

    chatGPT_replay(client,thread.id, assistant, question_content=info)
    time.sleep(3)
    runs = client.beta.threads.runs.list(thread.id)

    # Poll until the first listed run (taken to be the latest one) completes.
    # Raise instead of looping forever if it ends in any other terminal state.
    while runs.data[0].status != 'completed':
        if runs.data[0].status in terminal_failure_states:
            raise RuntimeError(
                f"Run {runs.data[0].id} for the batch starting at {i} "
                f"ended with status '{runs.data[0].status}'"
            )
        time.sleep(3)
        runs = client.beta.threads.runs.list(thread.id)

    # Parse the first code block of the first listed message into a DataFrame.
    thread_messages = client.beta.threads.messages.list(thread.id)
    answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
    answer = pd.DataFrame(find_dictionaries_in_string(answer))

    display(answer)
    batch_frames.append(answer)

# Concatenate once at the end rather than growing a DataFrame inside the loop.
result = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [None]:
# Write the collected examples to CSV without the DataFrame index.
examples_csv_path = f'../data/data_mining_results/{prop}/example_{prop}.csv'
result.to_csv(examples_csv_path, index=False)

# Manually Validate and Create Full Prompt with Two-Shot Examples

In [None]:
# Load the validated examples CSV for `prop`.
validated_examples_path = f'../data/data_mining_results/{prop}/example_{prop}_manuel_validated.csv'
examples = pd.read_csv(validated_examples_path)

In [None]:
# First 20 example rows; show their sentences joined into one input string.
tmp = examples.iloc[:20]

' \n '.join(tmp['original sentence'])

In [None]:
# Show the current `tmp` rows as a stringified {column: list of values} dict
# (presumably copied into the few-shot prompt as the expected output).
str(tmp.to_dict(orient='list'))

In [None]:
# Remaining example rows (position 20 onwards); show their sentences joined into one input string.
tmp = examples.iloc[20:]

' \n '.join(tmp['original sentence'])

In [None]:
# Show the current `tmp` rows as a stringified {column: list of values} dict
# (presumably copied into the few-shot prompt as the expected output).
str(tmp.to_dict(orient='list'))



f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (HEL cells) \n Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (Ba/F3 cells) \n Genotoxicity in Salmonella typhimurium by Ames test \n Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: Ba/F3 Cytotoxicity \n Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. 
(Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: JAK2V617F Inhibition \n Mutagenicity in Salmonella typhimurium by Ames test \n Induction of phospholipidosis in bovine corneal fibroblasts assessed as lamellar inclusion bodies after 72 hrs by light microscopy \n Neuroprotective activity against kainate-induced cell damage in mouse primary cortical neurons assessed as reduction in kainate-induced neurofilament loss by measuring increase in MAPK level at 1 uM pretreated followed by kainate challenge measured after 18 hrs by fluorescence assay \n Antiviral activity against HCV infected in human HuH5.2 cells assessed as inhibition of subgenomic RNA replication after 4 days by Steady-Glo luciferase assay \n Antibacterial activity against trimethoprim-sulfamethoxazole-resistant Stenotrophomonas maltophilia assessed as inhibition of microbial growth up to 50 ug/ml incubated for 24 hrs by microdilution assay \n Mutagenicity in Salmonella typhimurium TA100 in presence of liver S9 fraction by AMES test \n Mutagenicity in Salmonella typhimurium TA98 in presence of liver S9 fraction by AMES test \n Antiviral activity against Vaccinia virus WR in BSC1 cells assessed as retention of antiviral potency by measuring exogenous double stranded DNA trapping \n Genotoxicity in Salmonella typhimurium TA98 by Ames test in presence of S9 fractions \n Genotoxicity in Salmonella typhimurium TAMix by Ames test in presence of S9 fractions \n Mutagenicity in Salmonella typhimurium TA98 by AMES test \n Mutagenicity in Salmonella typhimurium TA100 by AMES test \n Antibacterial activity against drug-resistant Pseudomonas aeruginosa clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Acinetobacter calcoaceticus clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial 
susceptibility test \n Antibacterial activity against drug-resistant Serratia marcescens clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test'

Output:
```python\n{{'index': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'original sentence': ['Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (HEL cells)', 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (Ba/F3 cells)', 'Genotoxicity in Salmonella typhimurium by Ames test', 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: Ba/F3 Cytotoxicity', 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. 
(Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: JAK2V617F Inhibition', 'Mutagenicity in Salmonella typhimurium by Ames test', 'Induction of phospholipidosis in bovine corneal fibroblasts assessed as lamellar inclusion bodies after 72 hrs by light microscopy', 'Neuroprotective activity against kainate-induced cell damage in mouse primary cortical neurons assessed as reduction in kainate-induced neurofilament loss by measuring increase in MAPK level at 1 uM pretreated followed by kainate challenge measured after 18 hrs by fluorescence assay', 'Antiviral activity against HCV infected in human HuH5.2 cells assessed as inhibition of subgenomic RNA replication after 4 days by Steady-Glo luciferase assay', 'Antibacterial activity against trimethoprim-sulfamethoxazole-resistant Stenotrophomonas maltophilia assessed as inhibition of microbial growth up to 50 ug/ml incubated for 24 hrs by microdilution assay', 'Mutagenicity in Salmonella typhimurium TA100 in presence of liver S9 fraction by AMES test', 'Mutagenicity in Salmonella typhimurium TA98 in presence of liver S9 fraction by AMES test', 'Antiviral activity against Vaccinia virus WR in BSC1 cells assessed as retention of antiviral potency by measuring exogenous double stranded DNA trapping', 'Genotoxicity in Salmonella typhimurium TA98 by Ames test in presence of S9 fractions', 'Genotoxicity in Salmonella typhimurium TAMix by Ames test in presence of S9 fractions', 'Mutagenicity in Salmonella typhimurium TA98 by AMES test', 'Mutagenicity in Salmonella typhimurium TA100 by AMES test', 'Antibacterial activity against drug-resistant Pseudomonas aeruginosa clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test', 'Antibacterial activity against drug-resistant Acinetobacter calcoaceticus clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial 
susceptibility test', 'Antibacterial activity against drug-resistant Serratia marcescens clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test'], 'Concentration/Dosage Levels': [nan, nan, nan, nan, nan, nan, nan, '1 uM', nan, 'up to 50 ug/ml', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Time Duration': [nan, nan, nan, nan, nan, nan, '72 hrs', '18 hrs', '4 days', '24 hrs', nan, nan, nan, nan, nan, nan, nan, '24 hrs', '24 hrs', '24 hrs'], 'Metabolic Activation Presence': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'liver S9 fraction', 'liver S9 fraction', nan, 'S9 fractions', 'S9 fractions', nan, nan, nan, nan, nan], 'Cell/Tissue Type or Organism Used': ['HEL cells', 'Ba/F3 cells', 'Salmonella typhimurium', 'Ba/F3', nan, 'Salmonella typhimurium', 'bovine corneal fibroblasts', 'mouse primary cortical neurons', 'human HuH5.2 cells', 'Stenotrophomonas maltophilia', 'Salmonella typhimurium TA100', 'Salmonella typhimurium TA98', 'BSC1 cells', 'Salmonella typhimurium TA98', 'Salmonella typhimurium TAMix', 'Salmonella typhimurium TA98', 'Salmonella typhimurium TA100', 'Pseudomonas aeruginosa', 'Acinetobacter calcoaceticus', 'Serratia marcescens'], 'Ames experiment': [False, False, True, False, False, True, False, False, False, False, True, True, False, True, True, True, True, False, False, False]}}\n```

Example 2
Input: 'Antibacterial activity against drug-resistant Enterococcus faecalis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Escherichia coli clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Proteus mirabilis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Klebsiella pneumoniae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Enterobacter cloacae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Genotoxicity in Salmonella typhimurium TA98 by Ames test \n Cytotoxicity against human Huh5-2 cells carrying HCV genotype 1b I389luc-ubi-neo/NS3-3'/5.1 replicon assessed as cell morphological changes after 72 hrs by microscopic analysis \n Genotoxicity in Salmonella typhimurium TAMix by Ames test in absence of S9 fractions \n Genotoxicity in Salmonella typhimurium TA98 by Ames test in absence of S9 fractions \n Genotoxicity in Salmonella Typhimurium by Ames test \n Mutagenic activity in Salmonella Typhimurium TA98 using Ames test; Activity is log of revertants / nmol; a= inactive \n Cytotoxicity against Parkinson's disease patient derived human ONS cells assessed as effect on cell morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay \n Cytotoxicity against Parkinson's disease patient derived human ONS cells assessed as effect on nucleus morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay \n Inhibition of yeast histone acetyltransferase Gcn5-Ada2-Ada3 complex using tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by 
liquid scintillation counting in presence of 1 mM DTT \n Inhibition of recombinant histone acetyltransferase p300 (unknown origin) using dH3-H4 tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by liquid scintillation counting in presence of 1 mM DTT \n Antibacterial activity against Escherichia coli ATCC 11229 assessed as diameter of the inhibition zone at 1 uM/disk after 4 hrs by disk diffusion method \n Substrate activity at N-terminal His6-tagged recombinant Paramecium bursaria chlorella virus 1 CPH expressed in Escherichia coli Rosetta 2 (DE3) cells assessed as substrate hydroxylation at 100 uM incubated for 16 hrs in presence of 2OG by MALDI TOF MS analysis \n Mutagenic activity in Salmonella Typhimurium TA98 assessed as number of revertants after 48 hrs by Ames test \n Mutagenic activity in Salmonella Typhimurium TA100 assessed as number of revertants after 48 hrs by Ames test \n Antifungal activity against Candida maltosa SBUG 700 assessed as diameter of the inhibition zone at 1 uM/disk by disk diffusion method'

Output:
```python\n{{\'index\': [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], \'original sentence\': [\'Antibacterial activity against drug-resistant Enterococcus faecalis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Escherichia coli clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Proteus mirabilis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Klebsiella pneumoniae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Enterobacter cloacae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Genotoxicity in Salmonella typhimurium TA98 by Ames test\', "Cytotoxicity against human Huh5-2 cells carrying HCV genotype 1b I389luc-ubi-neo/NS3-3\'/5.1 replicon assessed as cell morphological changes after 72 hrs by microscopic analysis", \'Genotoxicity in Salmonella typhimurium TAMix by Ames test in absence of S9 fractions\', \'Genotoxicity in Salmonella typhimurium TA98 by Ames test in absence of S9 fractions\', \'Genotoxicity in Salmonella Typhimurium by Ames test\', \'Mutagenic activity in Salmonella Typhimurium TA98 using Ames test; Activity is log of revertants / nmol; a= inactive\', "Cytotoxicity against Parkinson\'s disease patient derived human ONS cells assessed as effect on cell morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay", "Cytotoxicity against Parkinson\'s disease patient derived human ONS cells assessed as effect on nucleus morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay", 
\'Inhibition of yeast histone acetyltransferase Gcn5-Ada2-Ada3 complex using tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by liquid scintillation counting in presence of 1 mM DTT\', \'Inhibition of recombinant histone acetyltransferase p300 (unknown origin) using dH3-H4 tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by liquid scintillation counting in presence of 1 mM DTT\', \'Antibacterial activity against Escherichia coli ATCC 11229 assessed as diameter of the inhibition zone at 1 uM/disk after 4 hrs by disk diffusion method\', \'Substrate activity at N-terminal His6-tagged recombinant Paramecium bursaria chlorella virus 1 CPH expressed in Escherichia coli Rosetta 2 (DE3) cells assessed as substrate hydroxylation at 100 uM incubated for 16 hrs in presence of 2OG by MALDI TOF MS analysis\', \'Mutagenic activity in Salmonella Typhimurium TA98 assessed as number of revertants after 48 hrs by Ames test\', \'Mutagenic activity in Salmonella Typhimurium TA100 assessed as number of revertants after 48 hrs by Ames test\', \'Antifungal activity against Candida maltosa SBUG 700 assessed as diameter of the inhibition zone at 1 uM/disk by disk diffusion method\'], \'Concentration/Dosage Levels\': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, \'log of revertants / nmol\', \'10 uM\', \'10 uM\', \'1 mM DTT\', \'1 mM DTT\', \'1 uM/disk\', \'100 uM\', nan, nan, \'1 uM/disk\'], \'Temperature Range\': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], \'Time Duration\': [\'24 hrs\', \'24 hrs\', \'24 hrs\', \'24 hrs\', \'24 hrs\', nan, \'72 hrs\', nan, nan, nan, nan, \'24 hrs\', \'24 hrs\', \'30 mins\', \'30 mins\', \'4 hrs\', \'16 hrs\', \'48 hrs\', \'48 hrs\', nan], \'Metabolic Activation Presence\': [nan, nan, nan, nan, nan, nan, nan, \'absence of S9 fractions\', \'absence of S9 fractions\', nan, nan, nan, nan, nan, nan, nan, \'2OG\', nan, nan, nan], \'Cell/Tissue 
Type or Organism Used\': [\'Enterococcus faecalis\', \'Escherichia coli\', \'Proteus mirabilis\', \'Klebsiella pneumoniae\', \'Enterobacter cloacae\', \'Salmonella typhimurium TA98\', \'human Huh5-2 cells\', \'Salmonella typhimurium TAMix\', \'Salmonella typhimurium TA98\', \'Salmonella Typhimurium\', \'Salmonella Typhimurium TA98\', "Parkinson\'s disease patient derived human ONS cells", "Parkinson\'s disease patient derived human ONS cells", \'yeast histone acetyltransferase Gcn5-Ada2-Ada3 complex\', \'recombinant histone acetyltransferase p300\', \'Escherichia coli ATCC 11229\', \'Escherichia coli Rosetta 2 (DE3) cells\', \'Salmonella Typhimurium TA98\', \'Salmonella Typhimurium TA100\', \'Candida maltosa SBUG 700\'], \'Ames experiment\': [False, False, False, False, False, True, False, True, True, True, True, False, False, False, False, False, False, True, True, False]}}\n```

"""

# Data Mining Agent

### Load data 

In [None]:
# Reload the raw Ames CSV (no de-duplication here) and collect its distinct
# assay descriptions; value_counts sorts by descending frequency by default.
df = pd.read_csv('../data/raw_data/ames/chembl_ames_raw_data.csv')
description_counts = df['Assay Description'].value_counts()
assay_description = description_counts.keys()

### Create agent

In [None]:
# Create the assistant used for the full data-mining pass.
mining_agent_settings = {
    "name": 'Data Mining chatbot',
    "instructions": "Please mining the key biomedical information within the given data",
    "model": "gpt-4-1106-preview",
}
assistant = client.beta.assistants.create(**mining_agent_settings)

In [None]:
# Display the id of the assistant currently bound to `assistant`.
assistant.id

In [None]:
# Condition categories as one comma-separated string, ready to be interpolated
# into the prompt text.
experimental_conditions_list = ', '.join((
    'Concentration/Dosage Levels',
    'Temperature Range',
    'Time Duration',
    'Metabolic Activation Presence (e.g., S9 fraction in Ames test)',
    'Cell/Tissue Type or Organism Used',
))

In [None]:
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (HEL cells) \n Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (Ba/F3 cells) \n Genotoxicity in Salmonella typhimurium by Ames test \n Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: Ba/F3 Cytotoxicity \n Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. 
(Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: JAK2V617F Inhibition \n Mutagenicity in Salmonella typhimurium by Ames test \n Induction of phospholipidosis in bovine corneal fibroblasts assessed as lamellar inclusion bodies after 72 hrs by light microscopy \n Neuroprotective activity against kainate-induced cell damage in mouse primary cortical neurons assessed as reduction in kainate-induced neurofilament loss by measuring increase in MAPK level at 1 uM pretreated followed by kainate challenge measured after 18 hrs by fluorescence assay \n Antiviral activity against HCV infected in human HuH5.2 cells assessed as inhibition of subgenomic RNA replication after 4 days by Steady-Glo luciferase assay \n Antibacterial activity against trimethoprim-sulfamethoxazole-resistant Stenotrophomonas maltophilia assessed as inhibition of microbial growth up to 50 ug/ml incubated for 24 hrs by microdilution assay \n Mutagenicity in Salmonella typhimurium TA100 in presence of liver S9 fraction by AMES test \n Mutagenicity in Salmonella typhimurium TA98 in presence of liver S9 fraction by AMES test \n Antiviral activity against Vaccinia virus WR in BSC1 cells assessed as retention of antiviral potency by measuring exogenous double stranded DNA trapping \n Genotoxicity in Salmonella typhimurium TA98 by Ames test in presence of S9 fractions \n Genotoxicity in Salmonella typhimurium TAMix by Ames test in presence of S9 fractions \n Mutagenicity in Salmonella typhimurium TA98 by AMES test \n Mutagenicity in Salmonella typhimurium TA100 by AMES test \n Antibacterial activity against drug-resistant Pseudomonas aeruginosa clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Acinetobacter calcoaceticus clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial 
susceptibility test \n Antibacterial activity against drug-resistant Serratia marcescens clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test'

Output:
```python\n{{'index': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'original sentence': ['Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (HEL cells)', 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: irf1 Inhibition (Ba/F3 cells)', 'Genotoxicity in Salmonella typhimurium by Ames test', 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. (Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: Ba/F3 Cytotoxicity', 'Late stage results from the probe development effort to identify inhibitors of the Janus kinase 2 mutant JAK2V617F. 
(Class of assay: screening) [Related pubchem assays (depositor defined):AID1446, AID1486, AID1520, AID1521, AID1691, AID1699, AID1797] Panel member name: JAK2V617F Inhibition', 'Mutagenicity in Salmonella typhimurium by Ames test', 'Induction of phospholipidosis in bovine corneal fibroblasts assessed as lamellar inclusion bodies after 72 hrs by light microscopy', 'Neuroprotective activity against kainate-induced cell damage in mouse primary cortical neurons assessed as reduction in kainate-induced neurofilament loss by measuring increase in MAPK level at 1 uM pretreated followed by kainate challenge measured after 18 hrs by fluorescence assay', 'Antiviral activity against HCV infected in human HuH5.2 cells assessed as inhibition of subgenomic RNA replication after 4 days by Steady-Glo luciferase assay', 'Antibacterial activity against trimethoprim-sulfamethoxazole-resistant Stenotrophomonas maltophilia assessed as inhibition of microbial growth up to 50 ug/ml incubated for 24 hrs by microdilution assay', 'Mutagenicity in Salmonella typhimurium TA100 in presence of liver S9 fraction by AMES test', 'Mutagenicity in Salmonella typhimurium TA98 in presence of liver S9 fraction by AMES test', 'Antiviral activity against Vaccinia virus WR in BSC1 cells assessed as retention of antiviral potency by measuring exogenous double stranded DNA trapping', 'Genotoxicity in Salmonella typhimurium TA98 by Ames test in presence of S9 fractions', 'Genotoxicity in Salmonella typhimurium TAMix by Ames test in presence of S9 fractions', 'Mutagenicity in Salmonella typhimurium TA98 by AMES test', 'Mutagenicity in Salmonella typhimurium TA100 by AMES test', 'Antibacterial activity against drug-resistant Pseudomonas aeruginosa clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test', 'Antibacterial activity against drug-resistant Acinetobacter calcoaceticus clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial 
susceptibility test', 'Antibacterial activity against drug-resistant Serratia marcescens clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test'], 'Concentration/Dosage Levels': [nan, nan, nan, nan, nan, nan, nan, '1 uM', nan, 'up to 50 ug/ml', nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'Time Duration': [nan, nan, nan, nan, nan, nan, '72 hrs', '18 hrs', '4 days', '24 hrs', nan, nan, nan, nan, nan, nan, nan, '24 hrs', '24 hrs', '24 hrs'], 'Metabolic Activation Presence': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'liver S9 fraction', 'liver S9 fraction', nan, 'S9 fractions', 'S9 fractions', nan, nan, nan, nan, nan], 'Cell/Tissue Type or Organism Used': ['HEL cells', 'Ba/F3 cells', 'Salmonella typhimurium', 'Ba/F3', nan, 'Salmonella typhimurium', 'bovine corneal fibroblasts', 'mouse primary cortical neurons', 'human HuH5.2 cells', 'Stenotrophomonas maltophilia', 'Salmonella typhimurium TA100', 'Salmonella typhimurium TA98', 'BSC1 cells', 'Salmonella typhimurium TA98', 'Salmonella typhimurium TAMix', 'Salmonella typhimurium TA98', 'Salmonella typhimurium TA100', 'Pseudomonas aeruginosa', 'Acinetobacter calcoaceticus', 'Serratia marcescens'], 'Ames experiment': [False, False, True, False, False, True, False, False, False, False, True, True, False, True, True, True, True, False, False, False]}}\n```

Example 2
Input: 'Antibacterial activity against drug-resistant Enterococcus faecalis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Escherichia coli clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Proteus mirabilis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Klebsiella pneumoniae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Antibacterial activity against drug-resistant Enterobacter cloacae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test \n Genotoxicity in Salmonella typhimurium TA98 by Ames test \n Cytotoxicity against human Huh5-2 cells carrying HCV genotype 1b I389luc-ubi-neo/NS3-3'/5.1 replicon assessed as cell morphological changes after 72 hrs by microscopic analysis \n Genotoxicity in Salmonella typhimurium TAMix by Ames test in absence of S9 fractions \n Genotoxicity in Salmonella typhimurium TA98 by Ames test in absence of S9 fractions \n Genotoxicity in Salmonella Typhimurium by Ames test \n Mutagenic activity in Salmonella Typhimurium TA98 using Ames test; Activity is log of revertants / nmol; a= inactive \n Cytotoxicity against Parkinson's disease patient derived human ONS cells assessed as effect on cell morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay \n Cytotoxicity against Parkinson's disease patient derived human ONS cells assessed as effect on nucleus morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay \n Inhibition of yeast histone acetyltransferase Gcn5-Ada2-Ada3 complex using tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by 
liquid scintillation counting in presence of 1 mM DTT \n Inhibition of recombinant histone acetyltransferase p300 (unknown origin) using dH3-H4 tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by liquid scintillation counting in presence of 1 mM DTT \n Antibacterial activity against Escherichia coli ATCC 11229 assessed as diameter of the inhibition zone at 1 uM/disk after 4 hrs by disk diffusion method \n Substrate activity at N-terminal His6-tagged recombinant Paramecium bursaria chlorella virus 1 CPH expressed in Escherichia coli Rosetta 2 (DE3) cells assessed as substrate hydroxylation at 100 uM incubated for 16 hrs in presence of 2OG by MALDI TOF MS analysis \n Mutagenic activity in Salmonella Typhimurium TA98 assessed as number of revertants after 48 hrs by Ames test \n Mutagenic activity in Salmonella Typhimurium TA100 assessed as number of revertants after 48 hrs by Ames test \n Antifungal activity against Candida maltosa SBUG 700 assessed as diameter of the inhibition zone at 1 uM/disk by disk diffusion method'

Output:
```python\n{{\'index\': [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], \'original sentence\': [\'Antibacterial activity against drug-resistant Enterococcus faecalis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Escherichia coli clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Proteus mirabilis clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Klebsiella pneumoniae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Antibacterial activity against drug-resistant Enterobacter cloacae clinical isolate assessed as inhibition zone diameter after 24 hrs by antibacterial susceptibility test\', \'Genotoxicity in Salmonella typhimurium TA98 by Ames test\', "Cytotoxicity against human Huh5-2 cells carrying HCV genotype 1b I389luc-ubi-neo/NS3-3\'/5.1 replicon assessed as cell morphological changes after 72 hrs by microscopic analysis", \'Genotoxicity in Salmonella typhimurium TAMix by Ames test in absence of S9 fractions\', \'Genotoxicity in Salmonella typhimurium TA98 by Ames test in absence of S9 fractions\', \'Genotoxicity in Salmonella Typhimurium by Ames test\', \'Mutagenic activity in Salmonella Typhimurium TA98 using Ames test; Activity is log of revertants / nmol; a= inactive\', "Cytotoxicity against Parkinson\'s disease patient derived human ONS cells assessed as effect on cell morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay", "Cytotoxicity against Parkinson\'s disease patient derived human ONS cells assessed as effect on nucleus morphology parameters at 10 uM after 24 hrs by DAPI staining based fluorescence assay", 
\'Inhibition of yeast histone acetyltransferase Gcn5-Ada2-Ada3 complex using tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by liquid scintillation counting in presence of 1 mM DTT\', \'Inhibition of recombinant histone acetyltransferase p300 (unknown origin) using dH3-H4 tetramer and [3H]-acetyl-CoA assessed as acetate incorporation after 30 mins by liquid scintillation counting in presence of 1 mM DTT\', \'Antibacterial activity against Escherichia coli ATCC 11229 assessed as diameter of the inhibition zone at 1 uM/disk after 4 hrs by disk diffusion method\', \'Substrate activity at N-terminal His6-tagged recombinant Paramecium bursaria chlorella virus 1 CPH expressed in Escherichia coli Rosetta 2 (DE3) cells assessed as substrate hydroxylation at 100 uM incubated for 16 hrs in presence of 2OG by MALDI TOF MS analysis\', \'Mutagenic activity in Salmonella Typhimurium TA98 assessed as number of revertants after 48 hrs by Ames test\', \'Mutagenic activity in Salmonella Typhimurium TA100 assessed as number of revertants after 48 hrs by Ames test\', \'Antifungal activity against Candida maltosa SBUG 700 assessed as diameter of the inhibition zone at 1 uM/disk by disk diffusion method\'], \'Concentration/Dosage Levels\': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, \'log of revertants / nmol\', \'10 uM\', \'10 uM\', \'1 mM DTT\', \'1 mM DTT\', \'1 uM/disk\', \'100 uM\', nan, nan, \'1 uM/disk\'], \'Temperature Range\': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], \'Time Duration\': [\'24 hrs\', \'24 hrs\', \'24 hrs\', \'24 hrs\', \'24 hrs\', nan, \'72 hrs\', nan, nan, nan, nan, \'24 hrs\', \'24 hrs\', \'30 mins\', \'30 mins\', \'4 hrs\', \'16 hrs\', \'48 hrs\', \'48 hrs\', nan], \'Metabolic Activation Presence\': [nan, nan, nan, nan, nan, nan, nan, \'absence of S9 fractions\', \'absence of S9 fractions\', nan, nan, nan, nan, nan, nan, nan, \'2OG\', nan, nan, nan], \'Cell/Tissue 
Type or Organism Used\': [\'Enterococcus faecalis\', \'Escherichia coli\', \'Proteus mirabilis\', \'Klebsiella pneumoniae\', \'Enterobacter cloacae\', \'Salmonella typhimurium TA98\', \'human Huh5-2 cells\', \'Salmonella typhimurium TAMix\', \'Salmonella typhimurium TA98\', \'Salmonella Typhimurium\', \'Salmonella Typhimurium TA98\', "Parkinson\'s disease patient derived human ONS cells", "Parkinson\'s disease patient derived human ONS cells", \'yeast histone acetyltransferase Gcn5-Ada2-Ada3 complex\', \'recombinant histone acetyltransferase p300\', \'Escherichia coli ATCC 11229\', \'Escherichia coli Rosetta 2 (DE3) cells\', \'Salmonella Typhimurium TA98\', \'Salmonella Typhimurium TA100\', \'Candida maltosa SBUG 700\'], \'Ames experiment\': [False, False, False, False, False, True, False, True, True, True, True, False, False, False, False, False, False, True, True, False]}}\n```

"""

In [None]:
def replace_single_with_double_quotes(code_str):
    """Return `code_str` with every single quote (') turned into a double quote (").

    The substitution is purely textual: apostrophes inside ordinary words are
    converted as well, not only the quotes that delimit string literals.
    """
    # Splitting on the single quote and re-joining with the double quote
    # swaps every occurrence and leaves all other characters untouched.
    pieces = code_str.split("'")
    return '"'.join(pieces)


# Convert every single quote in the prompt text to a double quote before it is sent.
mes = replace_single_with_double_quotes(mes)


In [None]:
# Open a new thread whose only initial message is the prompt held in `mes`,
# sent with the "user" role and the assistant's file ids.
thread = client.beta.threads.create(
    messages=[dict(role="user", content=mes, file_ids=assistant.file_ids)]
)

In [None]:
# Display the id of the thread created in the previous cell.
thread.id

In [None]:
# Walk through the assay descriptions in fixed-size batches. For each batch the
# joined text is handed to `chatGPT_replay`, the newest entry of the thread's
# message list is read back, and the dictionary found in its code block is
# written to one CSV per batch.
# NOTE(review): `assay_description` must already be defined by an earlier cell.
batch_size = 20  # number of descriptions sent per request
batch_dir = f'../data/data_mining_results/{prop}/batch'
# Create the output folder up front: otherwise every `to_csv` call fails and
# would be reported below as if the reply could not be parsed.
os.makedirs(batch_dir, exist_ok=True)

for i in tqdm(range(0, len(assay_description), batch_size)):
    info = ' \n '.join(assay_description[i:i + batch_size])

    chatGPT_replay(client, thread.id, assistant, question_content=info)
    time.sleep(3)
    chatGPT_check_replay(client, thread, dis=False)

    thread_messages = client.beta.threads.messages.list(thread.id)

    reply_text = None
    try:
        reply_text = thread_messages.data[0].content[0].text.value
        answer = extract_code_script_from_markdown(reply_text)[0]
        answer = pd.DataFrame(find_dictionaries_in_string(answer)[0])
        answer.to_csv(f'{batch_dir}/{prop}_batch_{i}.csv', index=False)
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C still stops the loop,
        # and report the actual cause instead of a bare "error".
        print(f'error for {i}: {err!r}')
        # Show the raw reply only if it was retrieved; the failure may have
        # happened while reading it.
        if reply_text is not None:
            display(reply_text)
    

# Combine results

In [None]:
# Gather every per-batch CSV in `root_path` into a single `result` frame.
root_path = '../data/data_mining_results/Ames/batch/'
batch_frames = [
    pd.read_csv(os.path.join(root_path, file))
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of `result` reproducible between runs.
    for file in sorted(os.listdir(root_path))
    # Test the extension itself so names that merely contain "csv" are skipped.
    if file.endswith('.csv')
]
# Concatenate once instead of growing the frame inside the loop. An empty
# folder still yields an empty DataFrame, as before.
result = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [None]:
# Inspect the column labels of the combined batch results.
result.columns

In [None]:
# Keep only the source sentence and the extracted condition fields, in this
# order; any column not listed here is dropped.
result = result.loc[:, [
    'original sentence',
    'Concentration/Dosage Levels',
    'Temperature Range',
    'Time Duration',
    'Metabolic Activation Presence',
    'Cell/Tissue Type or Organism Used',
    'Ames experiment',
]]

In [None]:
# Relabel the columns by position so the first one is named
# 'Assay Description', the key used by the merge with the raw data.
result = result.set_axis(
    [
        'Assay Description',
        'Concentration/Dosage Levels',
        'Temperature Range',
        'Time Duration',
        'Metabolic Activation Presence',
        'Cell/Tissue Type or Organism Used',
        'Ames experiment',
    ],
    axis=1,
)

In [None]:
# Columns of the raw ChEMBL export that are carried into the final table.
col_list = [
    'Molecule ChEMBL ID',
    'Smiles',
    'Standard Type',
    'Standard Relation',
    'Standard Value',
    'Standard Units',
    'Assay ChEMBL ID',
    'Assay Description',
    'Assay Type',
    'BAO Format ID',
    'BAO Label',
    'Document ChEMBL ID',
    'Source Description',
    'Document Journal',
    'Document Year',
    'Comment',
]
# Re-read the raw file (all rows) and keep just those columns, in list order.
df = pd.read_csv('../data/raw_data/ames/chembl_ames_raw_data.csv').loc[:, col_list]

In [None]:
# Attach the mined conditions to every raw row that shares the same assay
# description. A left merge emits one output row per matching right-hand row,
# so a description that occurs more than once in `result` would duplicate raw
# measurements; only its first occurrence is used.
df = df.merge(
    result.drop_duplicates('Assay Description'),
    on='Assay Description',
    how='left',
)
df

In [None]:
# Write the merged table to CSV without the row index.
df.to_csv('../data/data_mining_results/Ames/chembl_ames_data_mining.csv',index=False)