In [None]:

import ast
import os
import pickle as pkl

import openai
import pandas as pd
from IPython.display import display, Markdown
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', 60)
import time

from datamining_utils import chatGPT_check_replay,chatGPT_replay,load_api_key_from_file,extract_code_script_from_markdown,find_dictionaries_in_string

%load_ext autoreload
%autoreload 2


In [None]:

# Build the OpenAI client that the agent cells below share.
from openai import OpenAI

api_key = load_api_key_from_file()
client = OpenAI(api_key=api_key)

# Key Word Agent

In [None]:
# Property being mined; interpolated into the prompts and the output paths below.
prop = 'water_solubility'

In [None]:
# Assistant used to summarize the experimental conditions found in the assay text.
assistant = client.beta.assistants.create(
    name='knowledge generation chatbot',
    instructions="Please summarize the ADME-T related important experimental conditions",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the ID of the assistant that was just created.
assistant.id

In [None]:
# Open a conversation thread seeded with the summarization request.
seed_message = {
    "role": "user",
    "content": "summarise the key experimental conditions within the given experiments",
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[seed_message])

In [None]:
# Display the ID of the thread that was just created.
thread.id

### summarize the important experimental conditions

In [None]:
# Join the first 50 distinct assay descriptions into one text block for the prompt.
df = pd.read_csv('../data/raw_data/solubility/chembl_sol_raw_data.csv').drop_duplicates('Assay Description')
condition = ' \n '.join(df['Assay Description'].iloc[:50])

In [None]:
# Prompt for the keyword agent: `prop` and the joined assay descriptions in
# `condition` are interpolated, and the model is asked to answer with a Python
# list of the top five experimental-condition categories inside a fenced code block.
mes = f"""
Please summarize the key experimental conditions for experiments related to {prop} within the {condition}


Please analyze the document and return a list of the top five most frequently mentioned experimental condition categories. These should be summarized under broad categories, such as pH levels, temperature ranges, or other relevant general conditions, rather than specific values or detailed conditions. Present this information in a Python list format.

Each entry in the list should represent a unique category of experimental conditions. Avoid duplicating similar conditions and focus on capturing the overarching categories that these conditions fall under.

Example output:
```python
['pH Level', 'Temperature Range', 'Light Exposure']
```

Ensure that the list is comprehensive, covering all major categories of experimental conditions mentioned in the document.
"""

In [None]:
# Hand the prompt to the project helper.
# NOTE(review): chatGPT_replay presumably posts `mes` to the thread and starts
# a run for `assistant` — confirm in datamining_utils.
chatGPT_replay(
    client,
    thread.id,
    assistant,
    question_content=mes,
)

In [None]:
# NOTE(review): chatGPT_check_replay is presumably what waits for the run started
# in the previous cell to finish — confirm in datamining_utils.
chatGPT_check_replay(client, thread)

# data[0] is read as the assistant's reply (the API lists newest messages first by default).
thread_messages = client.beta.threads.messages.list(thread.id)
reply_text = thread_messages.data[0].content[0].text.value
answer = extract_code_script_from_markdown(reply_text)[0]

# The reply is untrusted model output: parse it as a Python literal instead of
# executing it with eval(), so a malformed or malicious reply cannot run code.
# Raises ValueError/SyntaxError if the reply is not a plain literal.
experimental_conditions = ast.literal_eval(answer)
experimental_conditions

In [None]:
# Persist the parsed condition categories so later sections can reload them.
conditions_path = f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl'
with open(conditions_path, 'wb') as out_file:
    pkl.dump(experimental_conditions, out_file)

# Examples Agent

In [None]:
# Reload the condition categories saved by the keyword agent section.
conditions_path = f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl'
with open(conditions_path, 'rb') as in_file:
    experimental_conditions = pkl.load(in_file)

In [None]:
# Assistant used to draft the few-shot examples; rebinds `assistant`.
assistant = client.beta.assistants.create(
    name='Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the ID of the assistant that was just created.
assistant.id

In [None]:
# Zero-shot mining prompt: the condition categories and the property name are
# interpolated into the list of dictionary keys the model must return.
condition_columns = ', '.join(experimental_conditions)
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {condition_columns}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please include all the sentences

"""

In [None]:
# Open a new thread whose first user message is the mining prompt in `mes`.
seed_message = {
    "role": "user",
    "content": mes,
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[seed_message])

In [None]:
# Display the ID of the thread that was just created.
thread.id

In [None]:
# Reload the raw table; this rebinds `df`, replacing the de-duplicated frame built earlier.
df = pd.read_csv('../data/raw_data/solubility/chembl_sol_raw_data.csv')

In [None]:
# Mine two batches of 20 assay descriptions (offsets 0 and 20) to obtain
# candidate few-shot examples, collected into `result`.

# Run states that will never become 'completed'; polling must stop on them.
TERMINAL_FAILURES = {'failed', 'cancelled', 'expired', 'incomplete'}

# Hoisted out of the loop: the unique descriptions are the same on every iteration.
descriptions = df['Assay Description'].value_counts().keys()

frames = []
for i in tqdm(range(0, 40, 20)):
    info = ' \n '.join(descriptions[i:i+20])

    chatGPT_replay(client, thread.id, assistant, question_content=info)
    time.sleep(3)

    # Poll the first listed run until it completes. Fail loudly on a terminal
    # failure state instead of looping forever.
    runs = client.beta.threads.runs.list(thread.id)
    while runs.data[0].status != 'completed':
        if runs.data[0].status in TERMINAL_FAILURES:
            raise RuntimeError(
                f'run for batch starting at {i} ended with status {runs.data[0].status!r}'
            )
        time.sleep(3)
        runs = client.beta.threads.runs.list(thread.id)

    # data[0] is read as the assistant's reply to this batch.
    thread_messages = client.beta.threads.messages.list(thread.id)
    answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
    answer = pd.DataFrame(find_dictionaries_in_string(answer))

    display(answer)
    frames.append(answer)

# Concatenate once at the end rather than growing a frame inside the loop.
result = pd.concat(frames)

In [None]:
# Write the candidate examples to disk for manual validation.
example_path = f'../data/data_mining_results/water_solubility/example_{prop}.csv'
result.to_csv(example_path, index=False)

# Manually Validate and Create the Full Prompt with Two-Shot Examples

In [None]:
# Load the example file with the `_manuel_validated` suffix (a separate file from
# the one written by the examples agent above).
examples = pd.read_csv(f'../data/data_mining_results/water_solubility/example_{prop}_manuel_validated.csv')

In [None]:
# First 20 validated rows: show their sentences joined with ' \n ', the same
# separator used when building the batch inputs.
tmp = examples.iloc[:20]

' \n '.join(tmp['original sentence'].tolist())

In [None]:
# Show the current `tmp` rows as a {column: [values, ...]} dictionary string.
str(tmp.to_dict(orient='list'))

In [None]:
# Remaining validated rows (position 20 onward): show their sentences joined with ' \n '.
tmp = examples.iloc[20:]

' \n '.join(tmp['original sentence'].tolist())

In [None]:
# Show the current `tmp` rows as a {column: [values, ...]} dictionary string.
str(tmp.to_dict(orient='list'))



f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Aqueous solubility of the compound \n ASTRAZENECA: Solubility in pH7.4 buffer using solid starting material using the method described in J. Assoc. Lab. Autom. 2011, 16, 276-284. Experimental range 0.10 to 1500 uM \n Solubility of the compound \n Aqueous solubility of the compound at pH 7.4 \n Kinetic solubility of the compound \n Solubility in water \n Aqueous solubility at pH 7.4 \n Solubility at pH 7.4 \n SUPPLEMENTARY: Lyophilisation Solubility Assay (LYSA) \n Aqueous solubility of compound \n Kinetic aqueous solubility of the compound \n Solubility of the compound at pH 7.4 \n Kinetic solubility of compound \n Solubility of the compound in water \n Solubility of the compound at pH 7 \n Aqueous solubility \n Kinetic solubility of the compound at pH 7.4 \n Solubility of the compound at pH 6.8 \n Aqueous solubility of compound at pH 7.4 \n Solubility of compound in PBS pH 7.4 incubated for 2 hrs by spectrophotometry'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'original sentence': ['Aqueous solubility of the compound', 'ASTRAZENECA: Solubility in pH7.4 buffer using solid starting material using the method described in J. Assoc. Lab. Autom. 2011, 16, 276-284. Experimental range 0.10 to 1500 uM', 'Solubility of the compound', 'Aqueous solubility of the compound at pH 7.4', 'Kinetic solubility of the compound', 'Solubility in water', 'Aqueous solubility at pH 7.4', 'Solubility at pH 7.4', 'SUPPLEMENTARY: Lyophilisation Solubility Assay (LYSA)', 'Aqueous solubility of compound', 'Kinetic aqueous solubility of the compound', 'Solubility of the compound at pH 7.4', 'Kinetic solubility of compound', 'Solubility of the compound in water', 'Solubility of the compound at pH 7', 'Aqueous solubility', 'Kinetic solubility of the compound at pH 7.4', 'Solubility of the compound at pH 6.8', 'Aqueous solubility of compound at pH 7.4', 'Solubility of compound in PBS pH 7.4 incubated for 2 hrs by spectrophotometry'], 'pH Level': [nan, 7.4, nan, 7.4, nan, nan, 7.4, 7.4, nan, nan, nan, 7.4, nan, nan, 7.0, nan, 7.4, 6.8, 7.4, 7.4], 'Solvent/System Composition': ['water', 'buffer', nan, 'water', nan, 'water', 'water', nan, nan, 'water', 'water', nan, nan, 'water', nan, 'water', nan, nan, 'water', 'PBS'], 'Time Period': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '2 hrs'], 'Measurement Technique': [nan, 'method described in J. Assoc. Lab. Autom. 
2011, 16, 276-284', nan, nan, 'kinetic solubility', nan, nan, nan, 'Lyophilisation Solubility Assay (LYSA)', nan, 'kinetic solubility', nan, 'kinetic solubility', nan, nan, nan, 'kinetic solubility', nan, nan, 'spectrophotometry'], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'water_solubility': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```

Example 2
Input: 'Solubility (buffer pH 7.4) \n Thermodynamic aqueous solubility of the compound \n GSK_TB: GSK in-house kinetic solubility assay. DMSO stock solution diluted to 100 ml with pH 7.4 phosphate buffered saline. \n Aqueous solubility of the compound at pH 7 \n Thermodynamic solubility of the compound at pH 7.4 \n Solubility of the compound at pH 2 \n Aqueous solubility in phosphate buffered saline by multi-screen solubility assay \n Solubility in sodium phosphate buffer at pH 7.4 \n Aqueous solubility of the compound at pH 6.8 \n Solubility of compound in water \n Solubility of compound at pH 7.4 \n Solubility in 100 mM glycylglycine buffer at pH 4.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM glycylglycine buffer at pH 3.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 6.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 6.5 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 7.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 8.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM glycylglycine buffer at pH 5.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 7.5 incubated for 18 hrs by UPLC analysis \n Solubility of compound'

Output:
```python\n{{'index': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 'original sentence': ['Solubility (buffer pH 7.4)', 'Thermodynamic aqueous solubility of the compound', 'GSK_TB: GSK in-house kinetic solubility assay. DMSO stock solution diluted to 100 ml with pH 7.4 phosphate buffered saline.', 'Aqueous solubility of the compound at pH 7', 'Thermodynamic solubility of the compound at pH 7.4', 'Solubility of the compound at pH 2', 'Aqueous solubility in phosphate buffered saline by multi-screen solubility assay', 'Solubility in sodium phosphate buffer at pH 7.4', 'Aqueous solubility of the compound at pH 6.8', 'Solubility of compound in water', 'Solubility of compound at pH 7.4', 'Solubility in 100 mM glycylglycine buffer at pH 4.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM glycylglycine buffer at pH 3.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 6.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 6.5 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 7.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 8.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM glycylglycine buffer at pH 5.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 7.5 incubated for 18 hrs by UPLC analysis', 'Solubility of compound'], 'pH Level': [7.4, nan, 7.4, 7.0, 7.4, 2.0, nan, 7.4, 6.8, nan, 7.4, 4.0, 3.0, 6.0, 6.5, 7.0, 8.0, 5.0, 7.5, nan], 'Solvent/System Composition': ['buffer', 'water', 'DMSO stock solution diluted with pH 7.4 phosphate buffered saline', 'water', nan, nan, 'phosphate buffered saline', 'sodium phosphate buffer', 'water', 'water', nan, '100 mM glycylglycine buffer', '100 mM glycylglycine buffer', '100 mM bistrispropane buffer', '100 mM bistrispropane buffer', '100 mM bistrispropane buffer', '100 mM 
bistrispropane buffer', '100 mM glycylglycine buffer', '100 mM bistrispropane buffer', nan], 'Time Period': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', nan], 'Measurement Technique': [nan, 'thermodynamic solubility', 'kinetic solubility', nan, 'thermodynamic solubility', nan, 'multi-screen solubility assay', nan, nan, nan, nan, 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'water_solubility': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```

"""

# Data Mining Agent

### Load data 

In [None]:
# Unique assay descriptions taken from the index of value_counts()
# (missing values are excluded by value_counts by default).
df = pd.read_csv('../data/raw_data/solubility/chembl_sol_raw_data.csv')
description_counts = df['Assay Description'].value_counts()
assay_description = description_counts.index

### Create agent

In [None]:
# Assistant used for the full data-mining pass; rebinds `assistant`.
assistant = client.beta.assistants.create(
    name='Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the ID of the assistant that was just created.
assistant.id

In [None]:
# Condition categories requested in the mining prompt, flattened into one
# comma-separated string ready for interpolation.
experimental_conditions_list = ', '.join((
    'pH Level',
    'Solvent/System Composition',
    'Time Period',
    'Measurement Technique',
    'Temperature Range',
))

In [None]:
# Display the comma-separated condition string that goes into the prompt.
experimental_conditions_list

In [None]:
# Two-shot mining prompt: task instructions followed by two worked examples,
# each pairing 20 input sentences (joined with ' \n ') with the expected
# dictionary output inside a fenced code block.
# `experimental_conditions_list` and `prop` are interpolated; the doubled
# braces {{ }} are f-string escapes that produce literal braces.
# NOTE(review): both example outputs contain a hard line break in the middle of
# a quoted value (after "Autom. " and after "100 mM ") — confirm this is intended.
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Aqueous solubility of the compound \n ASTRAZENECA: Solubility in pH7.4 buffer using solid starting material using the method described in J. Assoc. Lab. Autom. 2011, 16, 276-284. Experimental range 0.10 to 1500 uM \n Solubility of the compound \n Aqueous solubility of the compound at pH 7.4 \n Kinetic solubility of the compound \n Solubility in water \n Aqueous solubility at pH 7.4 \n Solubility at pH 7.4 \n SUPPLEMENTARY: Lyophilisation Solubility Assay (LYSA) \n Aqueous solubility of compound \n Kinetic aqueous solubility of the compound \n Solubility of the compound at pH 7.4 \n Kinetic solubility of compound \n Solubility of the compound in water \n Solubility of the compound at pH 7 \n Aqueous solubility \n Kinetic solubility of the compound at pH 7.4 \n Solubility of the compound at pH 6.8 \n Aqueous solubility of compound at pH 7.4 \n Solubility of compound in PBS pH 7.4 incubated for 2 hrs by spectrophotometry'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'original sentence': ['Aqueous solubility of the compound', 'ASTRAZENECA: Solubility in pH7.4 buffer using solid starting material using the method described in J. Assoc. Lab. Autom. 2011, 16, 276-284. Experimental range 0.10 to 1500 uM', 'Solubility of the compound', 'Aqueous solubility of the compound at pH 7.4', 'Kinetic solubility of the compound', 'Solubility in water', 'Aqueous solubility at pH 7.4', 'Solubility at pH 7.4', 'SUPPLEMENTARY: Lyophilisation Solubility Assay (LYSA)', 'Aqueous solubility of compound', 'Kinetic aqueous solubility of the compound', 'Solubility of the compound at pH 7.4', 'Kinetic solubility of compound', 'Solubility of the compound in water', 'Solubility of the compound at pH 7', 'Aqueous solubility', 'Kinetic solubility of the compound at pH 7.4', 'Solubility of the compound at pH 6.8', 'Aqueous solubility of compound at pH 7.4', 'Solubility of compound in PBS pH 7.4 incubated for 2 hrs by spectrophotometry'], 'pH Level': [nan, 7.4, nan, 7.4, nan, nan, 7.4, 7.4, nan, nan, nan, 7.4, nan, nan, 7.0, nan, 7.4, 6.8, 7.4, 7.4], 'Solvent/System Composition': ['water', 'buffer', nan, 'water', nan, 'water', 'water', nan, nan, 'water', 'water', nan, nan, 'water', nan, 'water', nan, nan, 'water', 'PBS'], 'Time Period': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '2 hrs'], 'Measurement Technique': [nan, 'method described in J. Assoc. Lab. Autom. 
2011, 16, 276-284', nan, nan, 'kinetic solubility', nan, nan, nan, 'Lyophilisation Solubility Assay (LYSA)', nan, 'kinetic solubility', nan, 'kinetic solubility', nan, nan, nan, 'kinetic solubility', nan, nan, 'spectrophotometry'], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'water_solubility': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```

Example 2
Input: 'Solubility (buffer pH 7.4) \n Thermodynamic aqueous solubility of the compound \n GSK_TB: GSK in-house kinetic solubility assay. DMSO stock solution diluted to 100 ml with pH 7.4 phosphate buffered saline. \n Aqueous solubility of the compound at pH 7 \n Thermodynamic solubility of the compound at pH 7.4 \n Solubility of the compound at pH 2 \n Aqueous solubility in phosphate buffered saline by multi-screen solubility assay \n Solubility in sodium phosphate buffer at pH 7.4 \n Aqueous solubility of the compound at pH 6.8 \n Solubility of compound in water \n Solubility of compound at pH 7.4 \n Solubility in 100 mM glycylglycine buffer at pH 4.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM glycylglycine buffer at pH 3.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 6.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 6.5 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 7.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 8.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM glycylglycine buffer at pH 5.0 incubated for 18 hrs by UPLC analysis \n Solubility in 100 mM bistrispropane buffer at pH 7.5 incubated for 18 hrs by UPLC analysis \n Solubility of compound'

Output:
```python\n{{'index': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 'original sentence': ['Solubility (buffer pH 7.4)', 'Thermodynamic aqueous solubility of the compound', 'GSK_TB: GSK in-house kinetic solubility assay. DMSO stock solution diluted to 100 ml with pH 7.4 phosphate buffered saline.', 'Aqueous solubility of the compound at pH 7', 'Thermodynamic solubility of the compound at pH 7.4', 'Solubility of the compound at pH 2', 'Aqueous solubility in phosphate buffered saline by multi-screen solubility assay', 'Solubility in sodium phosphate buffer at pH 7.4', 'Aqueous solubility of the compound at pH 6.8', 'Solubility of compound in water', 'Solubility of compound at pH 7.4', 'Solubility in 100 mM glycylglycine buffer at pH 4.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM glycylglycine buffer at pH 3.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 6.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 6.5 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 7.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 8.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM glycylglycine buffer at pH 5.0 incubated for 18 hrs by UPLC analysis', 'Solubility in 100 mM bistrispropane buffer at pH 7.5 incubated for 18 hrs by UPLC analysis', 'Solubility of compound'], 'pH Level': [7.4, nan, 7.4, 7.0, 7.4, 2.0, nan, 7.4, 6.8, nan, 7.4, 4.0, 3.0, 6.0, 6.5, 7.0, 8.0, 5.0, 7.5, nan], 'Solvent/System Composition': ['buffer', 'water', 'DMSO stock solution diluted with pH 7.4 phosphate buffered saline', 'water', nan, nan, 'phosphate buffered saline', 'sodium phosphate buffer', 'water', 'water', nan, '100 mM glycylglycine buffer', '100 mM glycylglycine buffer', '100 mM bistrispropane buffer', '100 mM bistrispropane buffer', '100 mM bistrispropane buffer', '100 mM 
bistrispropane buffer', '100 mM glycylglycine buffer', '100 mM bistrispropane buffer', nan], 'Time Period': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', '18 hrs', nan], 'Measurement Technique': [nan, 'thermodynamic solubility', 'kinetic solubility', nan, 'thermodynamic solubility', nan, 'multi-screen solubility assay', nan, nan, nan, nan, 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', 'UPLC analysis', nan], 'Temperature Range': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'water_solubility': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True]}}\n```

"""

In [None]:
def replace_single_with_double_quotes(code_str: str) -> str:
    """Return ``code_str`` with every single quote (') replaced by a double quote (")."""
    quote_table = str.maketrans("'", '"')
    return code_str.translate(quote_table)


# Convert every single quote in the prompt to a double quote.
# NOTE(review): this also rewrites apostrophes in the instructions
# ("don't" becomes don"t) — confirm that is acceptable.
mes = replace_single_with_double_quotes(mes)


In [None]:
# Open the mining thread; its first user message is the two-shot prompt in `mes`.
seed_message = {
    "role": "user",
    "content": mes,
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[seed_message])

In [None]:
# Display the ID of the thread that was just created.
thread.id

In [None]:
# Number of unique assay descriptions; the mining loop walks them in steps of 20.
len(assay_description)

In [None]:
# Mine every unique assay description in batches of 20 and write one CSV per batch.
# The batch directory matches the one the "Combine result" section reads from
# (the original wrote to 'solubility/batch/', which that section never reads).
batch_dir = '../data/data_mining_results/water_solubility/batch/'
os.makedirs(batch_dir, exist_ok=True)  # otherwise every to_csv below would fail

for i in tqdm(range(0, len(assay_description), 20)):
    info = ' \n '.join(assay_description[i:i+20])

    chatGPT_replay(client, thread.id, assistant, question_content=info)
    time.sleep(3)
    chatGPT_check_replay(client, thread, dis=False)

    # data[0] is read as the assistant's reply to this batch.
    thread_messages = client.beta.threads.messages.list(thread.id)
    reply_text = thread_messages.data[0].content[0].text.value

    try:
        answer = extract_code_script_from_markdown(reply_text)[0]
        answer = pd.DataFrame(find_dictionaries_in_string(answer)[0])
        answer.to_csv(os.path.join(batch_dir, f'{prop}_batch_{i}.csv'), index=False)
    except Exception as exc:
        # Best effort: report the failed batch and move on. Catching Exception
        # rather than using a bare except keeps Ctrl-C / kernel interrupt working.
        print(f'error for {i}: {exc!r}')
        display(reply_text)
    

# Combine result

In [None]:
# Combine every per-batch CSV into a single frame.
root_path = '../data/data_mining_results/water_solubility/batch/'

# Match on the '.csv' extension (the original substring test also accepted names
# that merely contain 'csv') and sort so the row order is reproducible, since
# os.listdir returns entries in arbitrary order.
batch_files = sorted(name for name in os.listdir(root_path) if name.endswith('.csv'))
frames = [pd.read_csv(os.path.join(root_path, name)) for name in batch_files]

# Concatenate once; fall back to an empty frame when no batch file exists.
result = pd.concat(frames) if frames else pd.DataFrame()

In [None]:
# Inspect which columns the combined batch frame contains.
result.columns

In [None]:
# Keep only the sentence, the five condition columns and the two solubility flags.
# NOTE(review): both 'water_solubility' and 'Water Solubility' are selected —
# presumably the batches used both spellings; this raises KeyError if either is absent.
mined_columns = [
    'original sentence',
    'pH Level',
    'Solvent/System Composition',
    'Time Period',
    'Measurement Technique',
    'Temperature Range',
    'water_solubility',
    'Water Solubility',
]
result = result[mined_columns]

In [None]:
# Relabel the columns by position: the first column becomes 'Assay Description',
# the key used for the merge with the raw table; the other labels are unchanged.
renamed_columns = [
    'Assay Description',
    'pH Level',
    'Solvent/System Composition',
    'Time Period',
    'Measurement Technique',
    'Temperature Range',
    'water_solubility',
    'Water Solubility',
]
result.columns = renamed_columns

In [None]:
# Reload the raw table and keep only the columns listed in `col_list`.
col_list = [
    'Molecule ChEMBL ID',
    'Smiles',
    'Standard Type',
    'Standard Relation',
    'Standard Value',
    'Standard Units',
    'Assay ChEMBL ID',
    'Assay Description',
    'Assay Type',
    'BAO Format ID',
    'BAO Label',
    'Document ChEMBL ID',
    'Source Description',
    'Document Journal',
    'Document Year',
]

df = pd.read_csv('../data/raw_data/solubility/chembl_sol_raw_data.csv')[col_list]

In [None]:
# Attach the mined conditions to each measurement row via its assay description.
# Keep one mined row per description first: a description that appears more than
# once in `result` would otherwise multiply the measurement rows in the left merge.
# validate='many_to_one' makes pandas raise if that uniqueness does not hold.
mined_unique = result.drop_duplicates('Assay Description')
df = df.merge(mined_unique, on='Assay Description', how='left', validate='many_to_one')
df

In [None]:
# Write the merged table to CSV without the index column.
df.to_csv('../data/data_mining_results/water_solubility/chembl_solubility_data_mining.csv',index=False)