In [None]:
import ast
import os
import pickle as pkl

import openai
import pandas as pd
from IPython.display import display, Markdown
from tqdm.notebook import tqdm
pd.set_option('display.max_columns', 60)
import time

from datamining_utils import chatGPT_check_replay,chatGPT_replay,load_api_key_from_file,extract_code_script_from_markdown,find_dictionaries_in_string

%load_ext autoreload
%autoreload 2

In [None]:
# Obtain the API key through load_api_key_from_file (helper imported from
# datamining_utils) and build the OpenAI client that every later cell uses.
api_key = load_api_key_from_file()
from openai import OpenAI
client = OpenAI(api_key = api_key)

# Key Word Agent

In [None]:
# Property being mined; interpolated into the assistant name, the prompts and
# the result file paths further down.
prop = 'Blood_Brain_Barrier'

In [None]:
# Register the assistant that summarises experimental-condition categories.
assistant = client.beta.assistants.create(
    name=f'{prop} knowledge generation chatbot',
    instructions="Please summarize the ADME-T related important experimental conditions",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the id of the assistant that was just created.
assistant.id

In [None]:
# Open a conversation thread seeded with one user message that carries the
# assistant's file ids.
opening_message = {
    "role": "user",
    "content": "summarise the key experimental conditions within the given experiments",
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[opening_message])

In [None]:
# Display the id of the thread that was just opened.
thread.id

### summarize the important experimental conditions

In [None]:
# Load the raw ChEMBL export, keep one row per distinct assay description, and
# join the first 50 descriptions into a single text sample for the prompt.
df = pd.read_csv('../data/raw_data/bbb/chembl_bbb_raw_data.csv').drop_duplicates('Assay Description')
condition = ' \n '.join(df['Assay Description'].iloc[:50])

In [None]:
# Prompt for the keyword assistant: it embeds the property name and the sampled
# assay descriptions, and asks for the top five condition categories as a
# Python list inside a fenced code block (the reply is parsed from that fence
# in a later cell).
mes = f"""
Please summarize the key experimental conditions for experiments related to {prop} within the {condition}


Please analyze the document and return a list of the top five most frequently mentioned experimental condition categories. These should be summarized under broad categories, such as pH levels, temperature ranges, or other relevant general conditions, rather than specific values or detailed conditions. Present this information in a Python list format.

Each entry in the list should represent a unique category of experimental conditions. Avoid duplicating similar conditions and focus on capturing the overarching categories that these conditions fall under.

Example output:
```python
['pH Level', 'Temperature Range', 'Light Exposure']
```

Ensure that the list is comprehensive, covering all major categories of experimental conditions mentioned in the document.
"""

In [None]:
# Hand the prompt to the assistant on the current thread.
# NOTE(review): chatGPT_replay lives in datamining_utils; presumably it posts
# the message and starts a run — confirm there.
chatGPT_replay(client,thread.id, assistant, question_content=mes)

In [None]:
# NOTE(review): chatGPT_check_replay comes from datamining_utils; presumably it
# waits for / shows the assistant's reply — confirm there.
chatGPT_check_replay(client,thread)

# Take the first message returned for the thread and pull the first fenced code
# snippet out of its text.
thread_messages = client.beta.threads.messages.list(thread.id)
answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]

# The snippet is model-generated text, so it is parsed strictly as a Python
# literal rather than executed with eval(); anything that is not a plain
# literal raises instead of running.
experimental_conditions = ast.literal_eval(answer)
experimental_conditions

In [None]:
# Fixed set of condition categories; this assignment replaces whatever the
# previous cell parsed from the model reply.
experimental_conditions = [
    'Cell Line Models',
    'Temperature Conditions',
    'Permeability Assays',
    'pH Levels',
    'Concentration and Dosing Parameters',
]

In [None]:
# Pickle the category list into the property's results folder; the
# "Example Agent" section reads it back from the same path.
with open(f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl','wb') as f:
    pkl.dump(experimental_conditions,f)

# Example Agent

In [None]:
# Restore the category list pickled by the keyword-agent section.
with open(f'../data/data_mining_results/{prop}/{prop}_experimental_conditions_summaried_by_LLMs.pkl','rb') as f:
    experimental_conditions = pkl.load(f)

In [None]:
# Register the assistant used to draft the few-shot examples.
assistant = client.beta.assistants.create(
    name='Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the id of the example-drafting assistant.
assistant.id

In [None]:
# Instruction prompt for the example-drafting thread. The requested dictionary
# keys are: index, original sentence, one key per condition category, and a
# flag for whether the sentence is a {prop} experiment.
condition_columns = ', '.join(experimental_conditions)
mes = (
    "\nBiomedical data mining task.\n"
    "\n"
    f"Return python dictionary with key including index, original sentence, {condition_columns}, and whether is {prop} experiment or not \n"
    "Fill in none if no information given. \n"
    "Please include all the sentences\n"
    "\n"
)

In [None]:
# Open a fresh thread whose first user message is the instruction prompt.
instruction_message = {
    "role": "user",
    "content": mes,
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[instruction_message])

In [None]:
# Display the id of the example-drafting thread.
thread.id

In [None]:
# Reload the raw ChEMBL export; this replaces the de-duplicated frame that the
# keyword-agent section stored under the same name.
df = pd.read_csv('../data/raw_data/bbb/chembl_bbb_raw_data.csv')

In [None]:
# Mine the 40 most frequent assay descriptions in two batches of 20 to produce
# draft few-shot examples.

# value_counts() does not change between iterations, so it is computed once.
frequent_descriptions = df['Assay Description'].value_counts().keys()

batch_frames = []
for i in tqdm(range(0, 40, 20)):
    info = ' \n '.join(frequent_descriptions[i:i+20])

    chatGPT_replay(client, thread.id, assistant, question_content=info)
    time.sleep(3)
    runs = client.beta.threads.runs.list(thread.id)

    # Poll the first listed run (presumably the newest — confirm the API's
    # default ordering) every 3 s until it completes. A run that ends in
    # another terminal state can never become 'completed', so stop with an
    # error instead of polling forever.
    while runs.data[0].status != 'completed':
        status = runs.data[0].status
        if status in ('failed', 'cancelled', 'expired'):
            raise RuntimeError(f'assistant run ended with status {status!r} for batch starting at {i}')
        time.sleep(3)
        runs = client.beta.threads.runs.list(thread.id)

    # Parse the reply: first fenced snippet -> dictionaries -> DataFrame.
    thread_messages = client.beta.threads.messages.list(thread.id)
    answer = extract_code_script_from_markdown(thread_messages.data[0].content[0].text.value)[0]
    answer = pd.DataFrame(find_dictionaries_in_string(answer))

    display(answer)
    batch_frames.append(answer)

# Concatenate once at the end instead of growing a frame inside the loop.
result = pd.concat(batch_frames)

In [None]:
# Write the drafted examples to CSV (without the index column).
result.to_csv(f'../data/data_mining_results/{prop}/example_{prop}.csv',index=False)

# Manually Validate and Create Full Prompt with Two-Shot Examples

In [None]:
# Load the validated examples.
# NOTE(review): this file name differs from the CSV written in the previous
# section; presumably it is a hand-corrected copy created outside the notebook
# — confirm.
examples = pd.read_csv(f'../data/data_mining_results/{prop}/example_{prop}_manuel_validated.csv')

In [None]:
# First 20 validated rows; their sentences joined with ' \n ' give the input
# text of the first example.
tmp = examples.iloc[:20]

' \n '.join(tmp['original sentence'])

In [None]:
# Render the current slice as a column -> list-of-values dictionary string.
str(tmp.to_dict(orient='list'))

In [None]:
# Remaining validated rows (position 20 onward); their sentences joined with
# ' \n ' give the input text of the second example.
tmp = examples.iloc[20:]

' \n '.join(tmp['original sentence'])

In [None]:
# Render the current slice as a column -> list-of-values dictionary string.
str(tmp.to_dict(orient='list'))



f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Permeability of the compound by PAMPA \n Permeability of the compound at pH 7.4 by PAMPA \n SUPPLEMENTARY: PAMPA permeability assay \n Permeability of compound by PAMPA assay \n Permeability from apical to basolateral side in human Caco2 cells \n Permeability across apical to basolateral side in human Caco2 cells \n Permeability across human Caco2 cells \n Permeability from basolateral to apical side in human Caco2 cells \n Permeability at pH 7.4 by PAMPA method \n Permeability in human Caco2 cells \n Permeability from apical to basolateral side of human Caco2 cells \n Permeability of the compound at pH 7.4 by PAMPA assay \n Permeability of the compound by PAMPA method at pH 7.4 \n Permeability of the compound at pH 7.4 after 30 mins by PAMPA \n Membrane permeability by PAMPA \n Membrane permeability by PAMPA assay \n Permeability of compound by PAMPA \n Permeability of compound by PAMPA-BBB assay \n Passive transcellular permeability of the compound at pH 4 by PAMPA \n Passive transcellular permeability of the compound at pH 7.4 by PAMPA'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'original sentence': ['Permeability of the compound by PAMPA', 'Permeability of the compound at pH 7.4 by PAMPA', 'SUPPLEMENTARY: PAMPA permeability assay', 'Permeability of compound by PAMPA assay', 'Permeability from apical to basolateral side in human Caco2 cells', 'Permeability across apical to basolateral side in human Caco2 cells', 'Permeability across human Caco2 cells', 'Permeability from basolateral to apical side in human Caco2 cells', 'Permeability at pH 7.4 by PAMPA method', 'Permeability in human Caco2 cells', 'Permeability from apical to basolateral side of human Caco2 cells', 'Permeability of the compound at pH 7.4 by PAMPA assay', 'Permeability of the compound by PAMPA method at pH 7.4', 'Permeability of the compound at pH 7.4 after 30 mins by PAMPA', 'Membrane permeability by PAMPA', 'Membrane permeability by PAMPA assay', 'Permeability of compound by PAMPA', 'Permeability of compound by PAMPA-BBB assay', 'Passive transcellular permeability of the compound at pH 4 by PAMPA', 'Passive transcellular permeability of the compound at pH 7.4 by PAMPA'], 'Cell Line Models': ['PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'Caco2', 'Caco2', 'Caco2', 'Caco2', 'PAMPA', 'Caco2', 'Caco2', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA-BBB', 'PAMPA', 'PAMPA'], 'Temperature Conditions': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'Permeability Assays': ['None', 'None', 'None', 'None', 'apical to basolateral', 'apical to basolateral', 'None', 'basolateral to apical', 'PAMPA', 'None', 'apical to basolateral', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'pH Levels': ['None', '7.4', 'None', 'None', 'None', 'None', 'None', 'None', '7.4', 'None', 'None', '7.4', '7.4', '7.4', 'None', 'None', 'None', 'None', '4', '7.4'], 
'Concentration and Dosing Parameters': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'after 30 mins', 'None', 'None', 'None', 'None', 'None', 'None'], 'Blood_Brain_Barrier': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False]}}\n```

Example 2
Input: 'Permeability of compound at pH 7.4 by PAMPA assay \n Permeability across human Caco-2 cells \n Permeability of compound \n Permeability across basolateral to apical side in human Caco2 cells \n Permeability from apical to basolateral side in human Caco-2 cells \n Permeability of the compound by PAMPA assay \n Permeability of the compound \n Permeability of the compound at pH 7.4 incubated for 30 mins by UV plate reader based PAMPA assay \n Permeability of the compound by PAMPA assay at pH 7.4 \n Permeability in RRCK cells \n Permeability of compound at pH 7.4 by PAMPA \n Permeability in MDCK cells \n Permeability across apical to basolateral side in human Caco2 cells measured up to 2 hrs by LC-MS/MS analysis or scintillation counting method \n Passive permeability in MDCK cells at 2 uM after 90 mins by LC-MS/MS analysis \n Apparent permeability of compound at pH 7.4 by artificial membrane permeability assay \n Permeability across Caco-2 cell membrane \n Permeability of the compound at pH 5.5 by PAMPA \n Permeability of the compound by PAMPA-BBB assay \n Permeability of the compound at pH 5.5 by PAMPA assay \n Permeability of the compound at 10 uM by PAMPA'

Output:
```python\n{{'index': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 'original sentence': ['Permeability of compound at pH 7.4 by PAMPA assay', 'Permeability across human Caco-2 cells', 'Permeability of compound', 'Permeability across basolateral to apical side in human Caco2 cells', 'Permeability from apical to basolateral side in human Caco-2 cells', 'Permeability of the compound by PAMPA assay', 'Permeability of the compound', 'Permeability of the compound at pH 7.4 incubated for 30 mins by UV plate reader based PAMPA assay', 'Permeability of the compound by PAMPA assay at pH 7.4', 'Permeability in RRCK cells', 'Permeability of compound at pH 7.4 by PAMPA', 'Permeability in MDCK cells', 'Permeability across apical to basolateral side in human Caco2 cells measured up to 2 hrs by LC-MS/MS analysis or scintillation counting method', 'Passive permeability in MDCK cells at 2 uM after 90 mins by LC-MS/MS analysis', 'Apparent permeability of compound at pH 7.4 by artificial membrane permeability assay', 'Permeability across Caco-2 cell membrane', 'Permeability of the compound at pH 5.5 by PAMPA', 'Permeability of the compound by PAMPA-BBB assay', 'Permeability of the compound at pH 5.5 by PAMPA assay', 'Permeability of the compound at 10 uM by PAMPA'], 'Cell Line Models': ['PAMPA', 'Caco-2', 'None', 'Caco2', 'Caco-2', 'PAMPA', 'None', 'PAMPA', 'PAMPA', 'RRCK', 'PAMPA', 'MDCK', 'Caco2', 'MDCK', 'PAMPA', 'Caco-2', 'PAMPA', 'PAMPA-BBB', 'PAMPA', 'PAMPA'], 'Temperature Conditions': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'Permeability Assays': ['None', 'None', 'None', 'basolateral to apical', 'apical to basolateral', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'apical to basolateral', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'pH Levels': ['7.4', 'None', 'None', 'None', 'None', 'None', 
'None', '7.4', '7.4', 'None', '7.4', 'None', 'None', 'None', '7.4', 'None', '5.5', 'None', '5.5', 'None'], 'Concentration and Dosing Parameters': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'incubated for 30 mins', 'None', 'None', 'None', 'None', 'None', '2 uM', 'None', 'None', 'None', 'None', 'None', '10 uM'], 'Blood_Brain_Barrier': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False]}}\n```

"""

# Data Mining Agent

### Load data 

In [None]:
# Load the raw ChEMBL export and collect its distinct assay descriptions as
# the index of their value counts.
df = pd.read_csv('../data/raw_data/bbb/chembl_bbb_raw_data.csv')
description_counts = df['Assay Description'].value_counts()
assay_description = description_counts.index

### Create agent

In [None]:
# Register the assistant that performs the full data-mining pass.
assistant = client.beta.assistants.create(
    name='Data Mining chatbot',
    instructions="Please mining the key biomedical information within the given data",
    model="gpt-4-1106-preview",
)

In [None]:
# Display the id of the data-mining assistant.
assistant.id

In [None]:
# Comma-separated category names, built directly as the string that the prompt
# interpolates.
experimental_conditions_list = ', '.join((
    'Cell Line Models',
    'Temperature Conditions',
    'Permeability Assays',
    'pH Levels',
    'Concentration and Dosing Parameters',
))
experimental_conditions_list

In [None]:
mes = f"""
Biomedical data mining task.

Return python dictionary with key including index, original sentence, {experimental_conditions_list}, and whether is {prop} experiment or not 
Fill in none if no information given. 
Please don't ignore some sentences.

Example 1 
Input: 'Permeability of the compound by PAMPA \n Permeability of the compound at pH 7.4 by PAMPA \n SUPPLEMENTARY: PAMPA permeability assay \n Permeability of compound by PAMPA assay \n Permeability from apical to basolateral side in human Caco2 cells \n Permeability across apical to basolateral side in human Caco2 cells \n Permeability across human Caco2 cells \n Permeability from basolateral to apical side in human Caco2 cells \n Permeability at pH 7.4 by PAMPA method \n Permeability in human Caco2 cells \n Permeability from apical to basolateral side of human Caco2 cells \n Permeability of the compound at pH 7.4 by PAMPA assay \n Permeability of the compound by PAMPA method at pH 7.4 \n Permeability of the compound at pH 7.4 after 30 mins by PAMPA \n Membrane permeability by PAMPA \n Membrane permeability by PAMPA assay \n Permeability of compound by PAMPA \n Permeability of compound by PAMPA-BBB assay \n Passive transcellular permeability of the compound at pH 4 by PAMPA \n Passive transcellular permeability of the compound at pH 7.4 by PAMPA'

Output:
```python\n{{'index': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], 'original sentence': ['Permeability of the compound by PAMPA', 'Permeability of the compound at pH 7.4 by PAMPA', 'SUPPLEMENTARY: PAMPA permeability assay', 'Permeability of compound by PAMPA assay', 'Permeability from apical to basolateral side in human Caco2 cells', 'Permeability across apical to basolateral side in human Caco2 cells', 'Permeability across human Caco2 cells', 'Permeability from basolateral to apical side in human Caco2 cells', 'Permeability at pH 7.4 by PAMPA method', 'Permeability in human Caco2 cells', 'Permeability from apical to basolateral side of human Caco2 cells', 'Permeability of the compound at pH 7.4 by PAMPA assay', 'Permeability of the compound by PAMPA method at pH 7.4', 'Permeability of the compound at pH 7.4 after 30 mins by PAMPA', 'Membrane permeability by PAMPA', 'Membrane permeability by PAMPA assay', 'Permeability of compound by PAMPA', 'Permeability of compound by PAMPA-BBB assay', 'Passive transcellular permeability of the compound at pH 4 by PAMPA', 'Passive transcellular permeability of the compound at pH 7.4 by PAMPA'], 'Cell Line Models': ['PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'Caco2', 'Caco2', 'Caco2', 'Caco2', 'PAMPA', 'Caco2', 'Caco2', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA', 'PAMPA-BBB', 'PAMPA', 'PAMPA'], 'Temperature Conditions': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'Permeability Assays': ['None', 'None', 'None', 'None', 'apical to basolateral', 'apical to basolateral', 'None', 'basolateral to apical', 'PAMPA', 'None', 'apical to basolateral', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'pH Levels': ['None', '7.4', 'None', 'None', 'None', 'None', 'None', 'None', '7.4', 'None', 'None', '7.4', '7.4', '7.4', 'None', 'None', 'None', 'None', '4', '7.4'], 
'Concentration and Dosing Parameters': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'after 30 mins', 'None', 'None', 'None', 'None', 'None', 'None'], 'Blood_Brain_Barrier': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False]}}\n```

Example 2
Input: 'Permeability of compound at pH 7.4 by PAMPA assay \n Permeability across human Caco-2 cells \n Permeability of compound \n Permeability across basolateral to apical side in human Caco2 cells \n Permeability from apical to basolateral side in human Caco-2 cells \n Permeability of the compound by PAMPA assay \n Permeability of the compound \n Permeability of the compound at pH 7.4 incubated for 30 mins by UV plate reader based PAMPA assay \n Permeability of the compound by PAMPA assay at pH 7.4 \n Permeability in RRCK cells \n Permeability of compound at pH 7.4 by PAMPA \n Permeability in MDCK cells \n Permeability across apical to basolateral side in human Caco2 cells measured up to 2 hrs by LC-MS/MS analysis or scintillation counting method \n Passive permeability in MDCK cells at 2 uM after 90 mins by LC-MS/MS analysis \n Apparent permeability of compound at pH 7.4 by artificial membrane permeability assay \n Permeability across Caco-2 cell membrane \n Permeability of the compound at pH 5.5 by PAMPA \n Permeability of the compound by PAMPA-BBB assay \n Permeability of the compound at pH 5.5 by PAMPA assay \n Permeability of the compound at 10 uM by PAMPA'

Output:
```python\n{{'index': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39], 'original sentence': ['Permeability of compound at pH 7.4 by PAMPA assay', 'Permeability across human Caco-2 cells', 'Permeability of compound', 'Permeability across basolateral to apical side in human Caco2 cells', 'Permeability from apical to basolateral side in human Caco-2 cells', 'Permeability of the compound by PAMPA assay', 'Permeability of the compound', 'Permeability of the compound at pH 7.4 incubated for 30 mins by UV plate reader based PAMPA assay', 'Permeability of the compound by PAMPA assay at pH 7.4', 'Permeability in RRCK cells', 'Permeability of compound at pH 7.4 by PAMPA', 'Permeability in MDCK cells', 'Permeability across apical to basolateral side in human Caco2 cells measured up to 2 hrs by LC-MS/MS analysis or scintillation counting method', 'Passive permeability in MDCK cells at 2 uM after 90 mins by LC-MS/MS analysis', 'Apparent permeability of compound at pH 7.4 by artificial membrane permeability assay', 'Permeability across Caco-2 cell membrane', 'Permeability of the compound at pH 5.5 by PAMPA', 'Permeability of the compound by PAMPA-BBB assay', 'Permeability of the compound at pH 5.5 by PAMPA assay', 'Permeability of the compound at 10 uM by PAMPA'], 'Cell Line Models': ['PAMPA', 'Caco-2', 'None', 'Caco2', 'Caco-2', 'PAMPA', 'None', 'PAMPA', 'PAMPA', 'RRCK', 'PAMPA', 'MDCK', 'Caco2', 'MDCK', 'PAMPA', 'Caco-2', 'PAMPA', 'PAMPA-BBB', 'PAMPA', 'PAMPA'], 'Temperature Conditions': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'Permeability Assays': ['None', 'None', 'None', 'basolateral to apical', 'apical to basolateral', 'None', 'None', 'None', 'None', 'None', 'None', 'None', 'apical to basolateral', 'None', 'None', 'None', 'None', 'None', 'None', 'None'], 'pH Levels': ['7.4', 'None', 'None', 'None', 'None', 'None', 
'None', '7.4', '7.4', 'None', '7.4', 'None', 'None', 'None', '7.4', 'None', '5.5', 'None', '5.5', 'None'], 'Concentration and Dosing Parameters': ['None', 'None', 'None', 'None', 'None', 'None', 'None', 'incubated for 30 mins', 'None', 'None', 'None', 'None', 'None', '2 uM', 'None', 'None', 'None', 'None', 'None', '10 uM'], 'Blood_Brain_Barrier': [False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False]}}\n```

"""

In [None]:
def replace_single_with_double_quotes(code_str):
    """Return ``code_str`` with every single quote (') turned into a double quote (")."""
    pieces = code_str.split("'")
    return '"'.join(pieces)


# Swap every single quote in the prompt for a double quote.
# NOTE(review): this also rewrites apostrophes in the instruction text itself
# ("don't" becomes don"t) — confirm that is acceptable.
mes = replace_single_with_double_quotes(mes)


In [None]:
# Open the mining thread; its first user message is the two-shot prompt.
prompt_message = {
    "role": "user",
    "content": mes,
    "file_ids": assistant.file_ids,
}
thread = client.beta.threads.create(messages=[prompt_message])

In [None]:
# Display the id of the mining thread.
thread.id

In [None]:
# Number of distinct assay descriptions; the mining loop walks them 20 at a time.
len(assay_description)

In [None]:
# Mine every distinct assay description, 20 per request, writing one CSV per batch.

# Create the output folder up front so a missing directory is not misreported
# as a parsing failure on every batch.
batch_dir = f'../data/data_mining_results/{prop}/batch'
os.makedirs(batch_dir, exist_ok=True)

for i in tqdm(range(0, len(assay_description), 20)):
    info = ' \n '.join(assay_description[i:i+20])

    chatGPT_replay(client, thread.id, assistant, question_content=info)
    time.sleep(3)
    # NOTE(review): chatGPT_check_replay comes from datamining_utils;
    # presumably it waits for the run to finish — confirm there.
    chatGPT_check_replay(client, thread, dis=False)

    thread_messages = client.beta.threads.messages.list(thread.id)
    reply_text = thread_messages.data[0].content[0].text.value

    try:
        # First fenced snippet -> first dictionary found -> DataFrame -> CSV.
        answer = extract_code_script_from_markdown(reply_text)[0]
        answer = pd.DataFrame(find_dictionaries_in_string(answer)[0])
        answer.to_csv(f'{batch_dir}/{prop}_batch_{i}.csv', index=False)
    except Exception as err:
        # Best effort: report the failing batch together with the cause and the
        # raw reply, then move on. Catching Exception (not a bare except) keeps
        # Ctrl-C able to stop this long-running loop.
        print(f'error for {i}: {err!r}')
        display(reply_text)
    

# Combine result

In [None]:
import os
import pandas as pd

# Gather every per-batch CSV from the batch folder into one frame.
root_path = '../data/data_mining_results/Blood_Brain_Barrier/batch/'

# sorted(): os.listdir returns names in arbitrary order; sorting makes the row
# order reproducible. endswith('.csv'): a substring test would also pick up
# names that merely contain "csv".
batch_files = sorted(name for name in os.listdir(root_path) if name.endswith('.csv'))
batch_frames = [pd.read_csv(os.path.join(root_path, name)) for name in batch_files]

# Concatenate once; fall back to an empty frame when no batch file exists
# (pd.concat raises on an empty list).
result = pd.concat(batch_frames) if batch_frames else pd.DataFrame()

In [None]:
# Inspect which columns the combined batch files contain.
result.columns

In [None]:
# Keep only the sentence, the five condition categories and the property flag,
# in this order; any other column from the batch files is dropped.
mined_columns = [
    'original sentence',
    'Cell Line Models',
    'Temperature Conditions',
    'Permeability Assays',
    'pH Levels',
    'Concentration and Dosing Parameters',
    'Blood_Brain_Barrier',
]
result = result[mined_columns]

In [None]:
# Relabel the seven columns by position. Compared with the selection in the
# previous cell only the first label changes ('original sentence' ->
# 'Assay Description'), which gives the key used to merge with the raw table.
result.columns = ['Assay Description','Cell Line Models',
       'Temperature Conditions', 'Permeability Assays', 'pH Levels',
       'Concentration and Dosing Parameters', 'Blood_Brain_Barrier']

In [None]:
# Columns of the raw ChEMBL export that are carried into the final dataset.
col_list = [
    'Molecule ChEMBL ID',
    'Smiles', 'Standard Type', 'Standard Relation', 'Standard Value',
    'Standard Units', 'Assay ChEMBL ID', 'Assay Description', 'Assay Type', 'BAO Format ID',
    'BAO Label',
    'Document ChEMBL ID', 'Source Description',
    'Document Journal', 'Document Year',
]
# Read the raw export and keep just those columns, in the order listed.
df = pd.read_csv('../data/raw_data/bbb/chembl_bbb_raw_data.csv')[col_list]

In [None]:
# Attach the mined conditions to every raw measurement via its assay description.
# The mined table is derived from model output and may repeat a sentence; keep
# one annotation per description so the left join cannot multiply measurement
# rows. validate= makes pandas raise if the right-hand keys are ever non-unique.
mined_conditions = result.drop_duplicates(subset='Assay Description')
df = df.merge(mined_conditions, on='Assay Description', how='left', validate='many_to_one')
df

In [None]:
# Write the merged table (raw measurements plus mined conditions) to CSV,
# without the index column.
df.to_csv('../data/data_mining_results/Blood_Brain_Barrier/chembl_bbb_data_mining.csv',index=False)