In [None]:

import openai
from tqdm.notebook import tqdm
import pandas as pd
import os
from IPython.display import display, Markdown
import pickle as pkl
pd.set_option('display.max_columns', 60)
import time

from datamining_utils import chatGPT_check_replay,chatGPT_replay,load_api_key_from_file,extract_code_script_from_markdown,find_dictionaries_in_string

%load_ext autoreload
%autoreload 2


In [None]:

# Usage: obtain the API key through load_api_key_from_file() and build the
# OpenAI client that every agent in this notebook shares.
from openai import OpenAI

api_key = load_api_key_from_file()
client = OpenAI(api_key=api_key)

# Key Word Agent

In [None]:
prop = 'Plasma_Protein_Binding'

In [None]:
# Create the assistant used to summarise experimental conditions for `prop`.
assistant = client.beta.assistants.create(
    name=f'{prop} knowledge generation chatbot',
    instructions="Please summarize the ADME-T related important experimental conditions",
    model="gpt-4-1106-preview",
)

In [None]:
assistant.id

In [None]:
# Open a conversation thread whose first user message is the summarisation request.
thread = client.beta.threads.create(
    messages=[
        dict(
            role="user",
            content="summarise the key experimental conditions within the given experiments",
            file_ids=assistant.file_ids,
        )
    ]
)

In [None]:
thread.id

### Summarize the important experimental conditions

In [None]:
# Build the text sample shown to the keyword agent: the first 50 distinct
# assay descriptions, joined with ' \n '.
df = pd.read_csv('../data/raw_data/ppb/chembl_ppb_raw_data.csv')
df = df.drop_duplicates('Assay Description')
# A missing description is a float NaN and would make str.join raise TypeError,
# so drop it before joining. `.iloc` makes the "first 50 rows" slice explicitly
# positional, because drop_duplicates leaves gaps in the index labels.
condition = ' \n '.join(df['Assay Description'].dropna().astype(str).iloc[:50])

In [None]:
# Prompt for the keyword agent. It interpolates `prop` (the property name) and
# `condition` (the joined assay descriptions) and asks for a Python list of
# condition categories inside a ```python fenced block, which the parsing cell
# below extracts.
mes = f"""
Please summarize the key experimental conditions for experiments related to {prop} within the {condition}


Please analyze the document and return a list of the top five most frequently mentioned experimental condition categories. These should be summarized under broad categories, such as pH levels, temperature ranges, or other relevant general conditions, rather than specific values or detailed conditions. Present this information in a Python list format.

Each entry in the list should represent a unique category of experimental conditions. Avoid duplicating similar conditions and focus on capturing the overarching categories that these conditions fall under.

Example output:
```python
['pH Level', 'Temperature Range', 'Light Exposure']
```

Ensure that the list is comprehensive, covering all major categories of experimental conditions mentioned in the document.
"""

In [None]:
chatGPT_replay(client,thread.id, assistant, question_content=mes)

In [None]:
import ast

# NOTE(review): chatGPT_check_replay presumably blocks until the assistant run
# on this thread has finished — confirm in datamining_utils.
chatGPT_check_replay(client, thread)

# data[0] is expected to be the assistant's latest reply on the thread.
thread_messages = client.beta.threads.messages.list(thread.id)
answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
# The reply is model-generated text. Parse it as a plain Python literal rather
# than eval()-ing it, so a malformed or hostile answer cannot run arbitrary
# code in the kernel (literal_eval raises ValueError/SyntaxError instead).
experimental_conditions = ast.literal_eval(answer)
experimental_conditions

In [None]:
# Pin the condition categories to a fixed list. This rebinds
# `experimental_conditions`, replacing whatever the previous cell parsed from
# the assistant's reply, so everything downstream uses these five categories.
experimental_conditions = ['Species/Origin of Plasma or Serum',
 'Concentration of Tested Compound',
 'Duration of Incubation',
 'Analytical Detection Method',
 'Equilibrium Dialysis for Protein Binding Assessment']

In [None]:
# Persist the condition categories so the later agents can reload them.
conditions_path = f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl'
# The per-property results folder may not exist on a fresh checkout; open()
# would then fail with FileNotFoundError.
os.makedirs(os.path.dirname(conditions_path), exist_ok=True)
with open(conditions_path, 'wb') as f:
    pkl.dump(experimental_conditions, f)

# Example Agent

In [None]:
# Reload the condition categories from the pickle written in the Key Word Agent
# section (same path). pickle.load can execute arbitrary code embedded in the
# file, so only point this at files produced by this notebook.
with open(f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl','rb') as f:
    experimental_conditions = pkl.load(f)

In [None]:
# Create the assistant that produces candidate few-shot examples for `prop`.
assistant = client.beta.assistants.create(
    name=f'{prop} Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
assistant.id

In [None]:
# Instruction for the example agent: the requested dictionary keys are the
# condition categories (comma-joined) plus a flag column named after `prop`.
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {', '.join(experimental_conditions)}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please include all the sentences

"""

In [None]:
# Open a fresh thread whose first user message is the example-agent instruction.
thread = client.beta.threads.create(
    messages=[
        dict(
            role="user",
            content=mes,
            file_ids=assistant.file_ids,
        )
    ]
)

In [None]:
thread.id

In [None]:
df = pd.read_csv('../data/raw_data/ppb/chembl_ppb_raw_data.csv')


In [None]:
# Mine the 40 most frequent assay descriptions in two batches of 20 to produce
# candidate few-shot examples (validated by hand afterwards).

# Run states after which polling can never reach 'completed'.
FAILED_RUN_STATES = {'failed', 'cancelled', 'expired', 'incomplete'}

# value_counts() does not depend on the loop variable, so compute it once.
top_descriptions = df['Assay Description'].value_counts().keys()

batch_frames = []
for i in tqdm(range(0, 40, 20)):
    # The original line was missing its closing parenthesis (SyntaxError).
    info = ' \n '.join(top_descriptions[i:i+20])

    chatGPT_replay(client, thread.id, assistant, question_content=info)
    time.sleep(3)
    runs = client.beta.threads.runs.list(thread.id)

    # Poll the most recent run until it completes. Abort on a failed state
    # instead of spinning forever on a run that will never finish.
    while runs.data[0].status != 'completed':
        if runs.data[0].status in FAILED_RUN_STATES:
            raise RuntimeError(
                f'assistant run for batch {i} ended with status {runs.data[0].status!r}'
            )
        time.sleep(3)
        runs = client.beta.threads.runs.list(thread.id)

    thread_messages = client.beta.threads.messages.list(thread.id)
    answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
    # NOTE(review): the full data mining loop further down indexes this result
    # with [0]; this cell passes the whole return value to DataFrame — confirm
    # which form is intended.
    answer = pd.DataFrame(find_dictionaries_in_string(answer))

    display(answer)
    batch_frames.append(answer)

# Concatenate once instead of re-copying the accumulated frame every iteration.
result = pd.concat(batch_frames)

In [None]:
result.to_csv(f'../data/data_mining_results/{prop}/example_{prop}.csv',index=False)

# Manually Validate and Create the Full Prompt with Two-Shot Examples

In [None]:
examples = pd.read_csv(f'../data/data_mining_results/{prop}/example_{prop}_manuel_validated.csv')

In [None]:
# First 20 validated rows: joined sentences form the input text of "Example 1".
tmp = examples.iloc[0:20]

' \n '.join(tmp['original sentence'].tolist())

In [None]:
str(tmp.to_dict(orient='list'))

In [None]:
# Remaining validated rows (from position 20 on): input text of "Example 2".
tmp = examples.iloc[20:]

' \n '.join(tmp['original sentence'].tolist())

In [None]:
str(tmp.to_dict(orient='list'))



f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Protein binding in human plasma \n Plasma protein binding in human \n Protein binding in mouse plasma \n Protein binding in rat plasma \n Plasma protein binding in rat \n Plasma protein binding in mouse \n Protein binding in plasma (unknown origin) \n Protein binding in human plasma after 18 hrs by LC/MS/MS based rapid equilibrium dialysis method \n Protein binding in human plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis \n Protein binding in rat plasma at 200 to 1000 ng/ml measured after 15 mins by UC-LC/MS/MS analysis \n The protein binding is expressed as percent bound as determined by VolSurf \n Protein binding in human plasma after 18 hrs by rapid equilibrium dialysis method \n Protein binding in dog plasma \n Percentage binding to plasma protein. \n Plasma protein binding in dog \n Plasma protein binding in mouse assessed as unbound fraction \n Protein binding in human plasma at 10 uM after 4 hrs by RF-MS analysis \n Protein binding in human plasma assessed as bound fraction \n Protein binding in human serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis \n Plasma protein binding in human assessed as unbound fraction'

Output:
```python\n{{'index': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'original sentence': ['Protein binding in human plasma', 'Plasma protein binding in human', 'Protein binding in mouse plasma', 'Protein binding in rat plasma', 'Plasma protein binding in rat', 'Plasma protein binding in mouse', 'Protein binding in plasma (unknown origin)', 'Protein binding in human plasma after 18 hrs by LC/MS/MS based rapid equilibrium dialysis method', 'Protein binding in human plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis', 'Protein binding in rat plasma at 200 to 1000 ng/ml measured after 15 mins by UC-LC/MS/MS analysis', 'The protein binding is expressed as percent bound as determined by VolSurf', 'Protein binding in human plasma after 18 hrs by rapid equilibrium dialysis method', 'Protein binding in dog plasma', 'Percentage binding to plasma protein.', 'Plasma protein binding in dog', 'Plasma protein binding in mouse assessed as unbound fraction', 'Protein binding in human plasma at 10 uM after 4 hrs by RF-MS analysis', 'Protein binding in human plasma assessed as bound fraction', 'Protein binding in human serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis', 'Plasma protein binding in human assessed as unbound fraction'], 'Species/Origin of Plasma or Serum': ['Human', 'Human', 'Mouse', 'Rat', 'Rat', 'Mouse', nan, 'Human', 'Human', 'Rat', nan, 'Human', 'Dog', nan, 'Dog', 'Mouse', 'Human', 'Human', 'Human', 'Human'], 'Concentration of Tested Compound': [nan, nan, nan, nan, nan, nan, nan, nan, '5 uM', '200 to 1000 ng/ml', nan, nan, nan, nan, nan, nan, '10 uM', nan, '1 ug/ml', nan], 'Duration of Incubation': [nan, nan, nan, nan, nan, nan, nan, '18 hrs', '5 hrs', '15 mins', nan, '18 hrs', nan, nan, nan, nan, '4 hrs', nan, '4 hrs', nan], 'Analytical Detection Method': [nan, nan, nan, nan, nan, nan, nan, 'LC/MS/MS', nan, 'UC-LC/MS/MS', 'VolSurf', nan, nan, nan, nan, nan, 'RF-MS', nan, 'LC-MS/MS', nan], 'Equilibrium Dialysis for 
Protein Binding Assessment': [nan, nan, nan, nan, nan, nan, nan, 'Rapid Equilibrium Dialysis', 'Rapid Equilibrium Dialysis', nan, nan, 'Rapid Equilibrium Dialysis', nan, nan, nan, nan, nan, nan, nan, nan], 'Plasma_Protein_Binding': [True, True, True, True, True, True, True, True, True, True, False, True, True, False, True, True, True, True, True, True]}}\n```

Example 2
Input: 'Protein binding in mouse plasma assessed as bound fraction \n Protein binding in ICR mouse serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis \n Protein binding in mouse plasma by dialysis method \n Plasma protein binding in human incubated for 30 mins by HPLC-MS analysis \n Protein binding in human plasma by LC-MS/MS based equilibrium analysis \n Protein binding in human plasma by dialysis method \n Protein binding in rat plasma assessed as bound fraction \n Protein binding in mouse serum assessed as fraction unbound \n Plasma protein binding in CD-1 mouse incubated for 30 mins by HPLC-MS analysis \n Plasma protein binding in rat incubated for 24 hrs by LC-MS/MS analysis \n Plasma protein binding in human at 5 uM by HPLC analysis \n Plasma protein binding in rat at 1 uM \n Protein binding in mouse plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis based LC-MS/MS analysis \n Plasma protein binding in human incubated for 24 hrs by LC-MS/MS analysis \n Protein binding in human plasma at 1 uM \n Protein binding in human plasma measured after 4 hrs by LC-MS/MS analysis \n Plasma protein binding in human at 5 uM measured after 4 hrs by LC-MS/MS analysis \n Protein binding in human plasma at 1 uM by LC-MS/MS analysis \n Protein binding in mouse plasma at 2 uM by equilibrium dialysis relative to control \n Protein binding in human plasma at 2 uM by equilibrium dialysis relative to control'

Output:
```python\n{{'index': [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'original sentence': ['Protein binding in mouse plasma assessed as bound fraction', 'Protein binding in ICR mouse serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis', 'Protein binding in mouse plasma by dialysis method', 'Plasma protein binding in human incubated for 30 mins by HPLC-MS analysis', 'Protein binding in human plasma by LC-MS/MS based equilibrium analysis', 'Protein binding in human plasma by dialysis method', 'Protein binding in rat plasma assessed as bound fraction', 'Protein binding in mouse serum assessed as fraction unbound', 'Plasma protein binding in CD-1 mouse incubated for 30 mins by HPLC-MS analysis', 'Plasma protein binding in rat incubated for 24 hrs by LC-MS/MS analysis', 'Plasma protein binding in human at 5 uM by HPLC analysis', 'Plasma protein binding in rat at 1 uM', 'Protein binding in mouse plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis based LC-MS/MS analysis', 'Plasma protein binding in human incubated for 24 hrs by LC-MS/MS analysis', 'Protein binding in human plasma at 1 uM', 'Protein binding in human plasma measured after 4 hrs by LC-MS/MS analysis', 'Plasma protein binding in human at 5 uM measured after 4 hrs by LC-MS/MS analysis', 'Protein binding in human plasma at 1 uM by LC-MS/MS analysis', 'Protein binding in mouse plasma at 2 uM by equilibrium dialysis relative to control', 'Protein binding in human plasma at 2 uM by equilibrium dialysis relative to control'], 'Species/Origin of Plasma or Serum': ['Mouse', 'Mouse', 'Mouse', 'Human', 'Human', 'Human', 'Rat', 'Mouse', 'Mouse', 'Rat', 'Human', 'Rat', 'Mouse', 'Human', 'Human', 'Human', 'Human', 'Human', 'Mouse', 'Human'], 'Concentration of Tested Compound': [nan, '1 ug/ml', nan, nan, nan, nan, nan, nan, nan, nan, '5 uM', '1 uM', '5 uM', nan, '1 uM', nan, '5 uM', '1 uM', '2 uM', '2 uM'], 'Duration of Incubation': [nan, '4 hrs', nan, '30 mins', nan, 
nan, nan, nan, '30 mins', '24 hrs', nan, nan, '5 hrs', '24 hrs', nan, '4 hrs', '4 hrs', nan, nan, nan], 'Analytical Detection Method': [nan, 'LC-MS/MS', nan, 'HPLC-MS', 'LC-MS/MS', nan, nan, nan, 'HPLC-MS', 'LC-MS/MS', 'HPLC', nan, 'LC-MS/MS', 'LC-MS/MS', nan, 'LC-MS/MS', 'LC-MS/MS', 'LC-MS/MS', nan, nan], 'Equilibrium Dialysis for Protein Binding Assessment': [nan, nan, 'Dialysis', nan, 'Equilibrium Analysis', 'Dialysis', nan, nan, nan, nan, nan, nan, 'Rapid Equilibrium Dialysis', nan, nan, nan, nan, nan, 'Equilibrium Dialysis', 'Equilibrium Dialysis'], 'Plasma_Protein_Binding': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```

"""

# Data Mining Agent

### Load data 

In [None]:
# Distinct assay descriptions, ordered from most to least frequent
# (the index of value_counts()).
df = pd.read_csv('../data/raw_data/ppb/chembl_ppb_raw_data.csv')
description_counts = df['Assay Description'].value_counts()
assay_description = description_counts.index

### Create agent

In [None]:
# Create the assistant that mines the full set of assay descriptions.
assistant = client.beta.assistants.create(
    name='Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
assistant.id

In [None]:
# Comma-separated condition categories, interpolated into the mining prompt.
experimental_conditions_list = ', '.join((
    'Species/Origin of Plasma or Serum',
    'Concentration of Tested Compound',
    'Duration of Incubation',
    'Analytical Detection Method',
    'Equilibrium Dialysis for Protein Binding Assessment',
))
experimental_conditions_list

In [None]:
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Protein binding in human plasma \n Plasma protein binding in human \n Protein binding in mouse plasma \n Protein binding in rat plasma \n Plasma protein binding in rat \n Plasma protein binding in mouse \n Protein binding in plasma (unknown origin) \n Protein binding in human plasma after 18 hrs by LC/MS/MS based rapid equilibrium dialysis method \n Protein binding in human plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis \n Protein binding in rat plasma at 200 to 1000 ng/ml measured after 15 mins by UC-LC/MS/MS analysis \n The protein binding is expressed as percent bound as determined by VolSurf \n Protein binding in human plasma after 18 hrs by rapid equilibrium dialysis method \n Protein binding in dog plasma \n Percentage binding to plasma protein. \n Plasma protein binding in dog \n Plasma protein binding in mouse assessed as unbound fraction \n Protein binding in human plasma at 10 uM after 4 hrs by RF-MS analysis \n Protein binding in human plasma assessed as bound fraction \n Protein binding in human serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis \n Plasma protein binding in human assessed as unbound fraction'

Output:
```python\n{{'index': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20], 'original sentence': ['Protein binding in human plasma', 'Plasma protein binding in human', 'Protein binding in mouse plasma', 'Protein binding in rat plasma', 'Plasma protein binding in rat', 'Plasma protein binding in mouse', 'Protein binding in plasma (unknown origin)', 'Protein binding in human plasma after 18 hrs by LC/MS/MS based rapid equilibrium dialysis method', 'Protein binding in human plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis', 'Protein binding in rat plasma at 200 to 1000 ng/ml measured after 15 mins by UC-LC/MS/MS analysis', 'The protein binding is expressed as percent bound as determined by VolSurf', 'Protein binding in human plasma after 18 hrs by rapid equilibrium dialysis method', 'Protein binding in dog plasma', 'Percentage binding to plasma protein.', 'Plasma protein binding in dog', 'Plasma protein binding in mouse assessed as unbound fraction', 'Protein binding in human plasma at 10 uM after 4 hrs by RF-MS analysis', 'Protein binding in human plasma assessed as bound fraction', 'Protein binding in human serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis', 'Plasma protein binding in human assessed as unbound fraction'], 'Species/Origin of Plasma or Serum': ['Human', 'Human', 'Mouse', 'Rat', 'Rat', 'Mouse', nan, 'Human', 'Human', 'Rat', nan, 'Human', 'Dog', nan, 'Dog', 'Mouse', 'Human', 'Human', 'Human', 'Human'], 'Concentration of Tested Compound': [nan, nan, nan, nan, nan, nan, nan, nan, '5 uM', '200 to 1000 ng/ml', nan, nan, nan, nan, nan, nan, '10 uM', nan, '1 ug/ml', nan], 'Duration of Incubation': [nan, nan, nan, nan, nan, nan, nan, '18 hrs', '5 hrs', '15 mins', nan, '18 hrs', nan, nan, nan, nan, '4 hrs', nan, '4 hrs', nan], 'Analytical Detection Method': [nan, nan, nan, nan, nan, nan, nan, 'LC/MS/MS', nan, 'UC-LC/MS/MS', 'VolSurf', nan, nan, nan, nan, nan, 'RF-MS', nan, 'LC-MS/MS', nan], 'Equilibrium Dialysis for 
Protein Binding Assessment': [nan, nan, nan, nan, nan, nan, nan, 'Rapid Equilibrium Dialysis', 'Rapid Equilibrium Dialysis', nan, nan, 'Rapid Equilibrium Dialysis', nan, nan, nan, nan, nan, nan, nan, nan], 'Plasma_Protein_Binding': [True, True, True, True, True, True, True, True, True, True, False, True, True, False, True, True, True, True, True, True]}}\n```

Example 2
Input: 'Protein binding in mouse plasma assessed as bound fraction \n Protein binding in ICR mouse serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis \n Protein binding in mouse plasma by dialysis method \n Plasma protein binding in human incubated for 30 mins by HPLC-MS analysis \n Protein binding in human plasma by LC-MS/MS based equilibrium analysis \n Protein binding in human plasma by dialysis method \n Protein binding in rat plasma assessed as bound fraction \n Protein binding in mouse serum assessed as fraction unbound \n Plasma protein binding in CD-1 mouse incubated for 30 mins by HPLC-MS analysis \n Plasma protein binding in rat incubated for 24 hrs by LC-MS/MS analysis \n Plasma protein binding in human at 5 uM by HPLC analysis \n Plasma protein binding in rat at 1 uM \n Protein binding in mouse plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis based LC-MS/MS analysis \n Plasma protein binding in human incubated for 24 hrs by LC-MS/MS analysis \n Protein binding in human plasma at 1 uM \n Protein binding in human plasma measured after 4 hrs by LC-MS/MS analysis \n Plasma protein binding in human at 5 uM measured after 4 hrs by LC-MS/MS analysis \n Protein binding in human plasma at 1 uM by LC-MS/MS analysis \n Protein binding in mouse plasma at 2 uM by equilibrium dialysis relative to control \n Protein binding in human plasma at 2 uM by equilibrium dialysis relative to control'

Output:
```python\n{{'index': [21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40], 'original sentence': ['Protein binding in mouse plasma assessed as bound fraction', 'Protein binding in ICR mouse serum at 1 ug/ml incubated for 4 hrs by LC-MS/MS analysis', 'Protein binding in mouse plasma by dialysis method', 'Plasma protein binding in human incubated for 30 mins by HPLC-MS analysis', 'Protein binding in human plasma by LC-MS/MS based equilibrium analysis', 'Protein binding in human plasma by dialysis method', 'Protein binding in rat plasma assessed as bound fraction', 'Protein binding in mouse serum assessed as fraction unbound', 'Plasma protein binding in CD-1 mouse incubated for 30 mins by HPLC-MS analysis', 'Plasma protein binding in rat incubated for 24 hrs by LC-MS/MS analysis', 'Plasma protein binding in human at 5 uM by HPLC analysis', 'Plasma protein binding in rat at 1 uM', 'Protein binding in mouse plasma at 5 uM incubated for 5 hrs by rapid equilibrium dialysis based LC-MS/MS analysis', 'Plasma protein binding in human incubated for 24 hrs by LC-MS/MS analysis', 'Protein binding in human plasma at 1 uM', 'Protein binding in human plasma measured after 4 hrs by LC-MS/MS analysis', 'Plasma protein binding in human at 5 uM measured after 4 hrs by LC-MS/MS analysis', 'Protein binding in human plasma at 1 uM by LC-MS/MS analysis', 'Protein binding in mouse plasma at 2 uM by equilibrium dialysis relative to control', 'Protein binding in human plasma at 2 uM by equilibrium dialysis relative to control'], 'Species/Origin of Plasma or Serum': ['Mouse', 'Mouse', 'Mouse', 'Human', 'Human', 'Human', 'Rat', 'Mouse', 'Mouse', 'Rat', 'Human', 'Rat', 'Mouse', 'Human', 'Human', 'Human', 'Human', 'Human', 'Mouse', 'Human'], 'Concentration of Tested Compound': [nan, '1 ug/ml', nan, nan, nan, nan, nan, nan, nan, nan, '5 uM', '1 uM', '5 uM', nan, '1 uM', nan, '5 uM', '1 uM', '2 uM', '2 uM'], 'Duration of Incubation': [nan, '4 hrs', nan, '30 mins', nan, 
nan, nan, nan, '30 mins', '24 hrs', nan, nan, '5 hrs', '24 hrs', nan, '4 hrs', '4 hrs', nan, nan, nan], 'Analytical Detection Method': [nan, 'LC-MS/MS', nan, 'HPLC-MS', 'LC-MS/MS', nan, nan, nan, 'HPLC-MS', 'LC-MS/MS', 'HPLC', nan, 'LC-MS/MS', 'LC-MS/MS', nan, 'LC-MS/MS', 'LC-MS/MS', 'LC-MS/MS', nan, nan], 'Equilibrium Dialysis for Protein Binding Assessment': [nan, nan, 'Dialysis', nan, 'Equilibrium Analysis', 'Dialysis', nan, nan, nan, nan, nan, nan, 'Rapid Equilibrium Dialysis', nan, nan, nan, nan, nan, 'Equilibrium Dialysis', 'Equilibrium Dialysis'], 'Plasma_Protein_Binding': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```

"""

In [None]:
def replace_single_with_double_quotes(code_str):
    """Turn single-quote string delimiters in ``code_str`` into double quotes.

    A single quote sitting between two ASCII letters is treated as an
    apostrophe (``don't``, ``dog's``) and left alone; a blanket replace turned
    the prompt's "don't" into ``don"t``. Every other single quote is replaced,
    so text without in-word apostrophes is converted exactly as before.

    Args:
        code_str: Text that uses single quotes as string delimiters.

    Returns:
        The converted text as a new string.
    """
    import re

    # Match a quote unless it has a letter on BOTH sides.
    return re.sub(r"(?<![A-Za-z])'|'(?![A-Za-z])", '"', code_str)


mes = replace_single_with_double_quotes(mes)


In [None]:
# Open the mining thread; its first user message is the two-shot prompt in `mes`.
thread = client.beta.threads.create(
    messages=[
        dict(
            role="user",
            content=mes,
            file_ids=assistant.file_ids,
        )
    ]
)

In [None]:
thread.id

In [None]:
len(assay_description)

In [None]:
# Mine every distinct assay description in batches of 20 and save one CSV per
# batch, so a failed batch can be inspected and re-run without losing the rest.
batch_dir = f'../data/data_mining_results/{prop}/batch'
os.makedirs(batch_dir, exist_ok=True)  # to_csv does not create missing folders

for i in tqdm(range(0, len(assay_description), 20)):
    info = ' \n '.join(assay_description[i:i+20])

    chatGPT_replay(client, thread.id, assistant, question_content=info)
    time.sleep(3)
    chatGPT_check_replay(client, thread, dis=False)

    thread_messages = client.beta.threads.messages.list(thread.id)
    reply = thread_messages.data[0].content[0].text.value

    # Only the parsing of the model's reply is best-effort. `except Exception`
    # (not a bare except) keeps Ctrl-C / kernel interrupt working during this
    # long loop, and the exception is reported so failures can be diagnosed.
    try:
        answer = extract_code_script_from_markdown(reply)[0]
        answer = pd.DataFrame(find_dictionaries_in_string(answer)[0])
    except Exception as exc:
        print(f'error for {i}: {exc!r}')
        display(reply)
        continue

    # Written outside the try so disk errors surface instead of being
    # reported as a parse failure.
    answer.to_csv(f'{batch_dir}/{prop}_batch_{i}.csv', index=False)
    

# Combine results

In [None]:
# Gather every per-batch CSV written by the data mining loop into one frame.
# The folder is derived from `prop`, matching the path the loop writes to.
root_path = f'../data/data_mining_results/{prop}/batch/'

# endswith('.csv') avoids picking up names that merely contain "csv"
# (e.g. "x.csv.bak"); sorting makes the row order reproducible, since
# os.listdir returns entries in arbitrary order.
batch_files = sorted(name for name in os.listdir(root_path) if name.endswith('.csv'))
batch_frames = [pd.read_csv(os.path.join(root_path, name)) for name in batch_files]

# Concatenate once (no repeated copying) and give rows a fresh index instead
# of repeating each batch's own row labels; pd.concat rejects an empty list,
# so fall back to an empty frame when no batch files exist.
result = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()

In [None]:
result.columns

In [None]:
# Keep only the sentence, the five mined condition columns and the property
# flag, in this order (drops the model-generated 'index' column and any extras).
result = result.loc[:, [
    'original sentence',
    'Species/Origin of Plasma or Serum',
    'Concentration of Tested Compound',
    'Duration of Incubation',
    'Analytical Detection Method',
    'Equilibrium Dialysis for Protein Binding Assessment',
    'Plasma_Protein_Binding',
]]

In [None]:
# Rename the sentence column to the raw data's key so the two frames can be
# merged. Renaming by name (rather than assigning the whole column list by
# position) cannot mislabel columns if their order differs, and re-running the
# cell is a harmless no-op.
result = result.rename(columns={'original sentence': 'Assay Description'})

In [None]:
# Reload the raw ChEMBL export and keep only the columns carried into the
# final data set.
col_list = [
    'Molecule ChEMBL ID',
    'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
    'Standard Units', 'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
    'BAO Label',
    'Document ChEMBL ID', 'Source Description',
    'Document Journal', 'Document Year',
]
df = pd.read_csv('../data/raw_data/ppb/chembl_ppb_raw_data.csv')[col_list]

In [None]:
# Attach the mined conditions to every measurement row. The model may return
# the same sentence more than once across batches; duplicate keys on the right
# side of a left join would multiply measurement rows, so keep one row of
# mined conditions per assay description first.
mined_conditions = result.drop_duplicates('Assay Description')
df = df.merge(mined_conditions, on='Assay Description', how='left')

In [None]:
df.to_csv('../data/data_mining_results/Plasma_Protein_Binding/ppb_Chembl_data_mining_finished.csv',index=False)