In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
# Load the molecules to look up, then split them by whether a CAS number is known.
sample_df = pd.read_csv("zero_results.csv")
# sample_df = sample_df.sample(200)

# Rows flagged with the literal placeholder "No CAS info" have no CAS number.
has_no_cas = sample_df["CAS"] == "No CAS info"
cas_df = sample_df.loc[~has_no_cas]
missing_df = sample_df.loc[has_no_cas]
sample_df

Unnamed: 0.1,Unnamed: 0,molecule,CAS
0,0,ACHILLEA MILLEFOLIUM,"977000-16-0, 8022-07-9"
1,1,ALPHA-GALACTOSIDASE,7493-95-0
2,2,ALPHA AMYLASE,"9005-82-7, 6401-81-6, 1109-28-0"
3,3,ALPHA CAROTENE,7488-99-5
4,4,ALPRAZOLAM,28981-97-7
...,...,...,...
157,157,VERBENA OFFICINALIS,977000-41-1
158,158,VITAMIN F,"7771-44-0, 11006-87-4, 506-32-1"
159,159,VITAMIN K,"12001-79-5, 27696-10-2, 81818-54-4"
160,160,XYLANASE,"9025-57-4, 7554-16-7"


In [3]:
# Keep only the name and CAS columns; the other columns of the loaded CSV are dropped.
cas_df = cas_df[['molecule', 'CAS']]

In [4]:
# Display the two-column frame.
cas_df

Unnamed: 0,molecule,CAS
0,ACHILLEA MILLEFOLIUM,"977000-16-0, 8022-07-9"
1,ALPHA-GALACTOSIDASE,7493-95-0
2,ALPHA AMYLASE,"9005-82-7, 6401-81-6, 1109-28-0"
3,ALPHA CAROTENE,7488-99-5
4,ALPRAZOLAM,28981-97-7
...,...,...
157,VERBENA OFFICINALIS,977000-41-1
158,VITAMIN F,"7771-44-0, 11006-87-4, 506-32-1"
159,VITAMIN K,"12001-79-5, 27696-10-2, 81818-54-4"
160,XYLANASE,"9025-57-4, 7554-16-7"


In [5]:
def get_second_cas(cas_string):
    '''
    Pick one CAS number out of a comma-separated string.

    Parameters
    ----------
    cas_string : str
        One or more CAS numbers separated by commas,
        e.g. ``"977000-16-0, 8022-07-9"``.

    Returns
    -------
    str
        The second non-empty entry when at least two are listed, otherwise
        the only entry. Surrounding whitespace is always removed. An input
        with no non-empty entry returns the stripped input.
    '''
    # Ignore empty fragments so a trailing comma ("123-45-6,") does not
    # produce an empty CAS number.
    parts = [part.strip() for part in cas_string.split(',') if part.strip()]
    if len(parts) >= 2:
        return parts[1]
    if parts:
        return parts[0]
    return cas_string.strip()

In [6]:
# Reduce every CAS entry to a single CAS number via get_second_cas.
# `assign` returns a new frame, so the column is never written into a frame
# that was derived from a filtered slice (avoids SettingWithCopyWarning).
cas_df = cas_df.assign(CAS=cas_df['CAS'].apply(get_second_cas))

# (name, CAS) pairs consumed by the lookup functions further down.
molecules = list(zip(cas_df['molecule'], cas_df['CAS']))
molecules

# missing_df['CAS'] = missing_df['CAS'].apply(get_first_cas)
# missing_arr = missing_df['CAS'].tolist()

[('ACHILLEA MILLEFOLIUM', '8022-07-9'),
 ('ALPHA-GALACTOSIDASE', '7493-95-0'),
 ('ALPHA AMYLASE', '6401-81-6'),
 ('ALPHA CAROTENE', '7488-99-5'),
 ('ALPRAZOLAM', '28981-97-7'),
 ('ALPROSTADIL', '745-65-3'),
 ('ALTEPLASE', '105913-11-9'),
 ('AMPHOTERICIN B', '1397-89-3'),
 ('AMPICILLIN', '69-53-4'),
 ('AMPRENAVIR', '161814-49-9'),
 ('AMRINONE', '60719-84-8'),
 ('ANTIFUNGAL', '10043-35-3'),
 ('ARALIA RACEMOSA', '89957-50-6'),
 ('ARANEUS DIADEMATUS', '91745-67-4'),
 ('ATOVAQUONE', '95233-18-4'),
 ('AUROTHIOMALIC ACID', '33796-26-8'),
 ('BERBERIS VULGARIS', '84649-92-3'),
 ('BORIC ACID', '10043-35-3'),
 ('BRASSICA NAPUS', '173740-48-2'),
 ('CADEXOMER IODINE', '94820-09-4'),
 ('CADE OIL', '8013-10-3'),
 ('CAFFEINE', '95789-13-2'),
 ('CALAMINE', '12063-19-3'),
 ('CALCIFEDIOL', '64719-49-9'),
 ('CALCIPOTRIOL', '112828-00-9'),
 ('CAPSICUM', '85940-30-3'),
 ('CARICA PAPAYA', '9001-73-4'),
 ('CHLORAL HYDRATE', '302-17-0'),
 ('CHLORAMBUCIL', '305-03-3'),
 ('CHLORAMPHENICOL', '2787-09-9'),
 ('CHLO

In [7]:
import urllib

def setup_webdriver():
    '''Create a Chrome webdriver that runs with the ``--headless`` argument.

    The driver binary path is obtained from ``ChromeDriverManager().install()``.
    The caller is responsible for calling ``quit()`` on the returned driver.
    '''
    headless_options = Options()
    headless_options.add_argument("--headless")
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=headless_options,
    )

def cas_url_inxight(molecule):
    '''Build the Inxight search URL that filters on the CAS number only.

    ``molecule`` is indexed as ``(name, cas)``; only ``molecule[1]`` is used.
    Every hyphen in the CAS number is written as ``%20`` in the query.
    '''
    encoded_cas = '%20'.join(molecule[1].split('-'))
    return f'https://drugs.ncats.io/substances?q=(root_codes_CAS:"{encoded_cas}")'

def name_url_inxight(molecule):
    '''Build an Inxight search URL filtering on both CAS number and name.

    Parameters
    ----------
    molecule : sequence
        Indexed as ``(name, cas)``.

    Returns
    -------
    str
        Search URL combining ``root_codes_CAS`` and ``root_names_name``.
        The name is prefixed with ``^`` and percent-encoded; hyphens in the
        CAS number are written as ``%20``.
        NOTE(review): ``^`` is presumably a "starts with" anchor in the
        Inxight query syntax -- confirm against the API documentation.
    '''
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    name = molecule[0]
    cas = molecule[1]
    cas_formatted = cas.replace('-', '%20')
    name_encoded = quote(f'^{name}')
    return f'https://drugs.ncats.io/substances?q=(root_codes_CAS:\"{cas_formatted}\")%20AND%20(root_names_name:\"{name_encoded}\")'

def name_exact_inxight(molecule):
    '''Build an Inxight search URL filtering on CAS number and the full name.

    Parameters
    ----------
    molecule : sequence
        Indexed as ``(name, cas)``.

    Returns
    -------
    str
        Search URL combining ``root_codes_CAS`` and ``root_names_name``.
        The name is prefixed with ``^``, percent-encoded, and followed by a
        literal ``$``; hyphens in the CAS number are written as ``%20``.
        NOTE(review): ``^``/``$`` are presumably start/end anchors in the
        Inxight query syntax -- confirm against the API documentation.
    '''
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` has been loaded.
    from urllib.parse import quote

    name = molecule[0]
    cas = molecule[1]
    cas_formatted = cas.replace('-', '%20')
    name_encoded = quote(f'^{name}')
    return f'https://drugs.ncats.io/substances?q=(root_codes_CAS:\"{cas_formatted}\")%20AND%20(root_names_name:\"{name_encoded}$\")'

In [8]:
def get_inxight_url(molecule, driver, timeout=3):
    '''
    Find the top Inxight search result for one molecule.

    The CAS-only search is loaded first and its record count is read. When
    more than one record is reported, the exact-name search is loaded
    instead. The first result card on the current page is then read; if the
    card does not appear in time, the approximate-name search and finally
    the CAS-only search are tried in that order.

    Parameters
    ----------
    molecule : sequence
        Indexed as ``(name, cas)``; passed to the URL builder functions.
    driver : selenium webdriver
        An initialised webdriver used to load the pages.
    timeout : int or float, optional
        Seconds to wait for each page element (default 3).

    Returns
    -------
    tuple
        ``(molecule, url, count, query, query_type)`` where ``url`` is the
        ``href`` of the first result card or ``"N/A"`` if none was found,
        ``count`` is the record count of the CAS-only search (0 when it
        could not be read), and ``query`` / ``query_type`` describe the last
        search that was loaded.
    '''
    url = "N/A"  # Default in case of failure
    count = 0    # Bound up front so the comparison below is always safe
    query = cas_url_inxight(molecule)
    query_type = "ONLY CAS"
    driver.get(query)

    count_locator = (By.CSS_SELECTOR, 'span#record-count:nth-child(2)')
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(count_locator))
        elements = driver.find_elements(*count_locator)
        if elements:
            count_text = elements[0].text
            try:
                count = int(count_text)
                print(count)
            except ValueError:
                # Non-numeric text is treated like a missing count.
                print(f"Could not parse record count: {count_text!r}")
        else:
            print("Element not found.")
    except TimeoutException:
        print("Element not found within specified time.")

    if count > 1:
        # Several records share this CAS number: narrow down by exact name.
        query = name_exact_inxight(molecule)
        query_type = "EXACT NAME"
        driver.get(query)

    # Searches to fall back on, in order, when no result card shows up.
    fallbacks = [
        ("APPROXIMATE NAME", name_url_inxight),
        ("ONLY CAS", cas_url_inxight),
    ]
    card_locator = (By.CSS_SELECTOR, 'a[id="card-title"]')

    while True:
        try:
            WebDriverWait(driver, timeout).until(EC.visibility_of_element_located(card_locator))
            elements = driver.find_elements(*card_locator)
            if elements:
                url = elements[0].get_attribute('href')
            break
        except (NoSuchElementException, TimeoutException):
            if not fallbacks:
                break  # Every search has been tried; keep the "N/A" default
            query_type, build_query = fallbacks.pop(0)
            query = build_query(molecule)
            driver.get(query)

    return molecule, url, count, query, query_type

def get_best_urls(mol_arr):
    '''
    Look up every molecule with a single shared webdriver.

    Parameters
    ----------
    mol_arr : iterable
        Molecules, each passed unchanged to ``get_inxight_url``.

    Returns
    -------
    list of tuple
        One ``(molecule, url, count, query, query_type)`` tuple per input,
        in input order.
    '''
    driver = setup_webdriver()
    try:
        return [tuple(get_inxight_url(entry, driver)) for entry in mol_arr]
    finally:
        # Always release the browser, even when a lookup raises.
        driver.quit()

In [9]:
import requests

def get_additional_data(id, timeout=10):
    '''
    Fetch the ``@additional`` payload of a substance from the Inxight API.

    Parameters
    ----------
    id : str
        Substance identifier inserted into the API URL.
    timeout : int or float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout a stalled connection would block indefinitely.

    Returns
    -------
    The decoded JSON body on HTTP 200, otherwise ``None``. ``None`` is also
    returned when the request fails or the body is not valid JSON, so one
    bad lookup does not abort a whole batch.
    '''
    try:
        r = requests.get(
            f'https://drugs.ncats.io/api/v1/substances({id})/@additional',
            timeout=timeout,
        )
    except requests.RequestException as exc:
        print(f"Request for {id} failed: {exc}")
        return None

    if 200 != r.status_code:
        return None

    try:
        return r.json()
    except ValueError:
        print(f"Response for {id} is not valid JSON")
        return None

In [10]:
def extract_conditions_and_phases(data):
    '''
    Collect condition labels and their highest phases from an API payload.

    Parameters
    ----------
    data : iterable of dict or None
        Entries as returned by ``get_additional_data``. Only entries whose
        ``'name'`` is ``'Conditions'`` and whose ``'value'`` is a dict are
        used; anything else is skipped.

    Returns
    -------
    tuple of (list, list)
        ``(conditions_list, highest_approval_list)``. The two lists are
        always the same length and position ``i`` in both refers to the same
        condition; a missing label or phase is recorded as ``'N/A'`` so the
        pairing is never shifted. Both lists are empty when ``data`` is
        ``None``.
    '''
    conditions_list = []
    highest_approval_list = []

    if data is None:
        return conditions_list, highest_approval_list

    for entry in data:
        # Tolerate entries without a name or with a non-dict value.
        if not isinstance(entry, dict) or entry.get('name') != 'Conditions':
            continue
        condition_info = entry.get('value')
        if not isinstance(condition_info, dict):
            continue

        label = condition_info.get('label')
        highest_phase = condition_info.get('highestPhase')
        if not label and not highest_phase:
            continue  # Nothing worth recording for this entry

        # Append to both lists together so they stay aligned.
        conditions_list.append(label or 'N/A')
        highest_approval_list.append(highest_phase or 'N/A')

    return conditions_list, highest_approval_list

In [11]:
# def extract_event_details(data):
#     event_details = {}
    
#     for item in data:
#         if 'value' in item and isinstance(item['value'], dict):  # Ensure 'value' is a dictionary
#             details = item['value']
#             if 'status' in details and 'sourceID' in details:
#                 if item['name'] == 'Highest Development Event' or item['name'] == 'Earliest Approved Event':
#                     # Gather additional details
#                     source_id = details.get('sourceID', 'No Source ID')
#                     source_url = details.get('sourceURL', 'No Source URL')
                    
#                     detail_info = {
#                         'Status and Year': f"{details['status']} {details.get('year', '')}",
#                         'Source ID': source_id,
#                         'Source URL': source_url
#                     }
                    
#                     # Use the 'name' of the event as the key in the dictionary
#                     event_details[item['name']] = detail_info
    
#     return event_details

def extract_event_details(data):
    '''Summarise the development and approval events of an API payload.

    Only entries whose ``'value'`` is a dict containing both ``'status'`` and
    ``'sourceID'``, and whose ``'name'`` is ``'Highest Development Event'``
    or ``'Earliest Approved Event'``, are kept. Returns a dict keyed by the
    event name; it is empty when ``data`` is ``None`` or nothing matches.
    '''
    wanted_events = ['Highest Development Event', 'Earliest Approved Event']
    collected = {}

    for entry in ([] if data is None else data):
        if 'value' not in entry:
            continue
        payload = entry['value']
        if not isinstance(payload, dict):
            continue
        if 'status' not in payload or 'sourceID' not in payload:
            continue
        if entry['name'] not in wanted_events:
            continue

        status_and_year = f"{payload['status']} {payload.get('year', '')}"
        collected[entry['name']] = {
            'Status and Year': status_and_year,
            'Source ID': payload.get('sourceID', 'No Source ID'),
            'Source URL': payload.get('sourceURL', 'No Source URL'),
        }

    return collected

In [12]:
# Run the browser lookup for every (name, CAS) pair. Each entry of `links` is
# the (molecule, url, count, query, query_type) tuple built by get_inxight_url.
links = get_best_urls(molecules)
links

2
Element not found within specified time.
Element not found within specified time.
1
1
1
Element not found within specified time.
1
1
1
1
1
Element not found within specified time.
Element not found within specified time.
1
Element not found within specified time.
Element not found within specified time.
1
Element not found within specified time.
Element not found within specified time.
2
Element not found within specified time.
Element not found within specified time.
Element not found within specified time.
1
Element not found within specified time.
1
1
1
Element not found within specified time.
1
1
1
Element not found within specified time.
1
1
1
1
1
1
1
1
Element not found within specified time.
1
1
Element not found within specified time.
Element not found within specified time.
1
1
1
1
1
Element not found within specified time.
Element not found within specified time.
Element not found within specified time.
Element not found within specified time.
1
1
Element not found within s

[(('ACHILLEA MILLEFOLIUM', '8022-07-9'),
  'https://drugs.ncats.io/drug/97P5D0WG43',
  2,
  'https://drugs.ncats.io/substances?q=(root_codes_CAS:"8022%2007%209")%20AND%20(root_names_name:"%5EACHILLEA%20MILLEFOLIUM")',
  'APPROXIMATE NAME'),
 (('ALPHA-GALACTOSIDASE', '7493-95-0'),
  'N/A',
  0,
  'https://drugs.ncats.io/substances?q=(root_codes_CAS:"7493%2095%200")',
  'ONLY CAS'),
 (('ALPHA AMYLASE', '6401-81-6'),
  'N/A',
  0,
  'https://drugs.ncats.io/substances?q=(root_codes_CAS:"6401%2081%206")',
  'ONLY CAS'),
 (('ALPHA CAROTENE', '7488-99-5'),
  'https://drugs.ncats.io/drug/45XWE1Z69V',
  1,
  'https://drugs.ncats.io/substances?q=(root_codes_CAS:"7488%2099%205")',
  'ONLY CAS'),
 (('ALPRAZOLAM', '28981-97-7'),
  'https://drugs.ncats.io/drug/YU55MQ3IZY',
  1,
  'https://drugs.ncats.io/substances?q=(root_codes_CAS:"28981%2097%207")',
  'ONLY CAS'),
 (('ALPROSTADIL', '745-65-3'),
  'https://drugs.ncats.io/drug/F5TD010360',
  1,
  'https://drugs.ncats.io/substances?q=(root_codes_CAS:

In [13]:
# new_links_df = pd.DataFrame(links, columns=['molecule', 'query_url', 'results', 'best_match_url', ''])
# new_links_df.head(2)
# links = new_links_df['best_match_url'].to_list()
# links
# identifiers = [url.split('/')[-1] for _, url in links]
# identifiers

# Flatten each lookup result -- ((name, cas), url, count, query, query_type) --
# into one row of six plain values.
data = [
    [name, cas, best_url, n_results, searched_url, search_kind]
    for (name, cas), best_url, n_results, searched_url, search_kind in links
]

link_columns = ['molecule', 'cas', 'best_match_url', 'results', 'query_url', 'query_type']
new_links_df = pd.DataFrame(data, columns=link_columns)
new_links_df.head(2)

Unnamed: 0,molecule,cas,best_match_url,results,query_url,query_type
0,ACHILLEA MILLEFOLIUM,8022-07-9,https://drugs.ncats.io/drug/97P5D0WG43,2,https://drugs.ncats.io/substances?q=(root_code...,APPROXIMATE NAME
1,ALPHA-GALACTOSIDASE,7493-95-0,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS


In [14]:
identifier_arr = new_links_df['best_match_url'].to_list()

# The identifier is the last path segment of the best-match URL. Rows without
# a match carry the placeholder 'N/A'; splitting that on '/' would yield a
# bogus identifier 'A', so it is mapped to 'MISSING' directly.
identifiers = [
    'MISSING' if url == 'N/A' else url.split('/')[-1]
    for url in identifier_arr
]

In [15]:
# The no-match placeholder 'N/A' has 'A' as its last '/'-separated segment;
# relabel any identifier equal to 'A' as 'MISSING'.
identifiers = [identifier if identifier != 'A' else 'MISSING' for identifier in identifiers]

In [16]:
# Number of identifiers that will be queried.
len(identifiers)

162

In [17]:
def extract_events(identifiers):
    '''
    Fetch and summarise the event details of every identifier.

    Parameters
    ----------
    identifiers : iterable
        Substance identifiers passed to ``get_additional_data``.

    Returns
    -------
    list of dict
        One ``extract_event_details`` result per identifier, in input order.
        Each result is also printed as it is produced.
    '''
    events = []

    for identifier in identifiers:
        # Extract once and reuse the result for both printing and storing.
        event_details = extract_event_details(get_additional_data(identifier))
        print(event_details)
        events.append(event_details)

    return events

events = extract_events(identifiers)

# def extract_events(identifiers):
#     events = []
#     for identifier in identifiers:
#         data = get_additional_data(identifier)
#         if data is not None:  # Check if data is None
#             event_details = extract_event_details(data)
#             events.append(event_details)
#         else:
#             print(f"No data available for identifier {identifier}")  # Or handle it differently
#     return events

{'Highest Development Event': {'Status and Year': 'Possibly Marketed Outside US 2016', 'Source ID': '21 CFR 346', 'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=15659e24-7446-498d-9bd3-8d0166bbb994'}, 'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 2016', 'Source ID': '21 CFR 348', 'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=2ebdfffc-c9cd-31df-e054-00144ff88e88'}}
{}
{}
{}
{'Highest Development Event': {'Status and Year': 'US Approved Rx 2010', 'Source ID': 'ANDA090248', 'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=090248'}, 'Earliest Approved Event': {'Status and Year': 'US Approved Rx 1981', 'Source ID': 'NDA018276', 'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=018276'}}
{'Highest Development Event': {'Status and Year': 'US Approved Rx 1996', 'Source ID': 'NDA020700', 'Source URL': 'https:

In [18]:
def extract_conditions(identifiers):
    '''Fetch each identifier's payload and extract its conditions and phases.

    Returns a list with one ``extract_conditions_and_phases`` result per
    identifier, in input order.
    '''
    return [
        extract_conditions_and_phases(get_additional_data(identifier))
        for identifier in identifiers
    ]

conditions_arr = extract_conditions(identifiers)

In [19]:
# Inspect the per-identifier event dictionaries.
events

[{'Highest Development Event': {'Status and Year': 'Possibly Marketed Outside US 2016',
   'Source ID': '21 CFR 346',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=15659e24-7446-498d-9bd3-8d0166bbb994'},
  'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 2016',
   'Source ID': '21 CFR 348',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=2ebdfffc-c9cd-31df-e054-00144ff88e88'}},
 {},
 {},
 {},
 {'Highest Development Event': {'Status and Year': 'US Approved Rx 2010',
   'Source ID': 'ANDA090248',
   'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=090248'},
  'Earliest Approved Event': {'Status and Year': 'US Approved Rx 1981',
   'Source ID': 'NDA018276',
   'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=018276'}},
 {'Highest Development Event': {'Status and Year': 'US Approved Rx 1996',
   'Sour

In [20]:
# Inspect the per-identifier (conditions, phases) pairs.
conditions_arr

[([], []),
 ([], []),
 ([], []),
 (['Lewis lung carcinoma'], ['Preclinical']),
 (['Panic disorder', 'Anxiety'], ['Approved', 'Approved']),
 (['Peripheral vascular disorders', 'congenital heart defects'],
  ['Phase III', 'Approved']),
 ([], []),
 (['Cryptococcosis',
   'Zygomycosis',
   'Histoplasmosis',
   'Visceral leishmaniasis',
   'Mucormycosis',
   'Coccidioidomycosis',
   'Blastomycosis',
   'Candida infections',
   'Aspergillus infections',
   'Sporotrichosis',
   'Naeglerias',
   'Mycoses'],
  ['Approved',
   'Approved',
   'Approved',
   'Approved',
   'Approved',
   'Approved',
   'Approved',
   'Approved',
   'Approved',
   'Approved',
   'Preclinical',
   'Preclinical']),
 (['Bacterial Infections'], ['Approved']),
 (['HIV-1 infection'], ['Approved']),
 (['Congestive heart failure'], ['Approved']),
 ([], []),
 ([], []),
 ([], []),
 (['Pneumocystis jiroveci pneumonia'], ['Approved']),
 ([], []),
 ([], []),
 ([], []),
 ([], []),
 ([], []),
 ([], []),
 ([], []),
 ([], []),
 ([]

In [21]:
# Flatten every nested event dict into a single-level record whose keys are
# "<event name> <field name>"; identifiers without events give an empty record.
data = [
    {
        f"{event_name} {field}": field_value
        for event_name, details in event.items()
        for field, field_value in details.items()
    }
    for event in events
]

df = pd.DataFrame(data)

In [22]:
# Display the flattened event table.
df

Unnamed: 0,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL
0,Possibly Marketed Outside US 2016,21 CFR 346,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Possibly Marketed Outside US 2016,21 CFR 348,https://dailymed.nlm.nih.gov/dailymed/drugInfo...
1,,,,,,
2,,,,,,
3,,,,,,
4,US Approved Rx 2010,ANDA090248,https://www.accessdata.fda.gov/scripts/cder/da...,US Approved Rx 1981,NDA018276,https://www.accessdata.fda.gov/scripts/cder/da...
...,...,...,...,...,...,...
157,,,,,,
158,,,,,,
159,,,,,,
160,US Previously Marketed 1951,Mytolon Chloride by Winthrop,DeHaen 1940-1975 NMEs,US Previously Marketed 1951,Mytolon Chloride by Winthrop,DeHaen 1940-1975 NMEs


In [23]:
# One record per identifier holding its list of conditions and list of phases.
rows = [dict(conditions=found_conditions, phases=found_phases)
        for found_conditions, found_phases in conditions_arr]

conditions_df = pd.DataFrame(rows)
# Collapse each list into a single "; "-separated string.
for list_column in ('conditions', 'phases'):
    conditions_df[list_column] = conditions_df[list_column].apply('; '.join)
conditions_df

Unnamed: 0,conditions,phases
0,,
1,,
2,,
3,Lewis lung carcinoma,Preclinical
4,Panic disorder; Anxiety,Approved; Approved
...,...,...
157,,
158,,
159,,
160,,


In [24]:
# Put the event columns and the condition columns side by side.
# axis=1 concatenation lines rows up by index label.
merged = pd.concat([df, conditions_df], axis=1)
merged

Unnamed: 0,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL,conditions,phases
0,Possibly Marketed Outside US 2016,21 CFR 346,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Possibly Marketed Outside US 2016,21 CFR 348,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,,
1,,,,,,,,
2,,,,,,,,
3,,,,,,,Lewis lung carcinoma,Preclinical
4,US Approved Rx 2010,ANDA090248,https://www.accessdata.fda.gov/scripts/cder/da...,US Approved Rx 1981,NDA018276,https://www.accessdata.fda.gov/scripts/cder/da...,Panic disorder; Anxiety,Approved; Approved
...,...,...,...,...,...,...,...,...
157,,,,,,,,
158,,,,,,,,
159,,,,,,,,
160,US Previously Marketed 1951,Mytolon Chloride by Winthrop,DeHaen 1940-1975 NMEs,US Previously Marketed 1951,Mytolon Chloride by Winthrop,DeHaen 1940-1975 NMEs,,


In [25]:
# Display the lookup results table.
new_links_df

Unnamed: 0,molecule,cas,best_match_url,results,query_url,query_type
0,ACHILLEA MILLEFOLIUM,8022-07-9,https://drugs.ncats.io/drug/97P5D0WG43,2,https://drugs.ncats.io/substances?q=(root_code...,APPROXIMATE NAME
1,ALPHA-GALACTOSIDASE,7493-95-0,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS
2,ALPHA AMYLASE,6401-81-6,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS
3,ALPHA CAROTENE,7488-99-5,https://drugs.ncats.io/drug/45XWE1Z69V,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS
4,ALPRAZOLAM,28981-97-7,https://drugs.ncats.io/drug/YU55MQ3IZY,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS
...,...,...,...,...,...,...
157,VERBENA OFFICINALIS,977000-41-1,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS
158,VITAMIN F,11006-87-4,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS
159,VITAMIN K,27696-10-2,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS
160,XYLANASE,7554-16-7,https://drugs.ncats.io/drug/T91WJ82JOZ,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS


In [26]:
# Attach the event/condition columns to the lookup results.
# axis=1 concatenation lines rows up by index label, so both frames are
# expected to share the same index -- NOTE(review): confirm they have equal length.
result = pd.concat([new_links_df, merged], axis=1)
result

Unnamed: 0,molecule,cas,best_match_url,results,query_url,query_type,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL,conditions,phases
0,ACHILLEA MILLEFOLIUM,8022-07-9,https://drugs.ncats.io/drug/97P5D0WG43,2,https://drugs.ncats.io/substances?q=(root_code...,APPROXIMATE NAME,Possibly Marketed Outside US 2016,21 CFR 346,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Possibly Marketed Outside US 2016,21 CFR 348,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,,
1,ALPHA-GALACTOSIDASE,7493-95-0,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
2,ALPHA AMYLASE,6401-81-6,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
3,ALPHA CAROTENE,7488-99-5,https://drugs.ncats.io/drug/45XWE1Z69V,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,Lewis lung carcinoma,Preclinical
4,ALPRAZOLAM,28981-97-7,https://drugs.ncats.io/drug/YU55MQ3IZY,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2010,ANDA090248,https://www.accessdata.fda.gov/scripts/cder/da...,US Approved Rx 1981,NDA018276,https://www.accessdata.fda.gov/scripts/cder/da...,Panic disorder; Anxiety,Approved; Approved
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,VERBENA OFFICINALIS,977000-41-1,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
158,VITAMIN F,11006-87-4,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
159,VITAMIN K,27696-10-2,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
160,XYLANASE,7554-16-7,https://drugs.ncats.io/drug/T91WJ82JOZ,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Previously Marketed 1951,Mytolon Chloride by Winthrop,DeHaen 1940-1975 NMEs,US Previously Marketed 1951,Mytolon Chloride by Winthrop,DeHaen 1940-1975 NMEs,,


In [29]:
# Rows whose recorded result count is 0.
zeros_df = result[result['results'] == 0]
zeros_df

Unnamed: 0,molecule,cas,best_match_url,results,query_url,query_type,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL,conditions,phases
1,ALPHA-GALACTOSIDASE,7493-95-0,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
2,ALPHA AMYLASE,6401-81-6,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
6,ALTEPLASE,105913-11-9,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
12,ARALIA RACEMOSA,89957-50-6,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
13,ARANEUS DIADEMATUS,91745-67-4,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152,VALERIANA OFFICINALIS,81397-67-3,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
157,VERBENA OFFICINALIS,977000-41-1,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
158,VITAMIN F,11006-87-4,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,
159,VITAMIN K,27696-10-2,,0,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,,


In [30]:
# Complement of the zero-count rows: every row with a non-zero result count.
found = result[result['results'] != 0]

In [31]:
# Display the rows that had at least one search result.
found

Unnamed: 0,molecule,cas,best_match_url,results,query_url,query_type,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL,conditions,phases
0,ACHILLEA MILLEFOLIUM,8022-07-9,https://drugs.ncats.io/drug/97P5D0WG43,2,https://drugs.ncats.io/substances?q=(root_code...,APPROXIMATE NAME,Possibly Marketed Outside US 2016,21 CFR 346,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Possibly Marketed Outside US 2016,21 CFR 348,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,,
3,ALPHA CAROTENE,7488-99-5,https://drugs.ncats.io/drug/45XWE1Z69V,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,,,,,,,Lewis lung carcinoma,Preclinical
4,ALPRAZOLAM,28981-97-7,https://drugs.ncats.io/drug/YU55MQ3IZY,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2010,ANDA090248,https://www.accessdata.fda.gov/scripts/cder/da...,US Approved Rx 1981,NDA018276,https://www.accessdata.fda.gov/scripts/cder/da...,Panic disorder; Anxiety,Approved; Approved
5,ALPROSTADIL,745-65-3,https://drugs.ncats.io/drug/F5TD010360,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 1996,NDA020700,https://www.accessdata.fda.gov/scripts/cder/da...,US Approved Rx 1981,NDA018484,https://www.accessdata.fda.gov/scripts/cder/da...,Peripheral vascular disorders; congenital hear...,Phase III; Approved
7,AMPHOTERICIN B,1397-89-3,https://drugs.ncats.io/drug/7XU7A7DROE,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2021,ANDA212514,https://www.accessdata.fda.gov/scripts/cder/da...,US Previously Marketed 1958,Fungizone by Squibb,DeHaen 1940-1975 NMEs,Cryptococcosis; Zygomycosis; Histoplasmosis; V...,Approved; Approved; Approved; Approved; Approv...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,VARENICLINE,249296-44-4,https://drugs.ncats.io/drug/W6HS99O8ZO,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2023,ANDA201962,https://www.accessdata.fda.gov/scripts/cder/da...,US Previously Marketed 2006,CHANTIX by PF PRISM CV,https://www.accessdata.fda.gov/scripts/cder/da...,Nicotine dependence,Approved
154,VASOPRESSIN,150683-30-0,https://drugs.ncats.io/drug/21G72T1950,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2018,NDA204441,https://www.accessdata.fda.gov/scripts/cder/da...,US Approved Rx 2009,NDA022275,https://www.accessdata.fda.gov/scripts/cder/da...,Hypervolemic and euvolemic hyponatremia,Approved
155,VECURONIUM BROMIDE,50700-72-6,https://drugs.ncats.io/drug/7E4PHP5N1D,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2010,ANDA090243,https://www.accessdata.fda.gov/scripts/cder/da...,US Previously Marketed 1981,Isoptin by Knoll,OB NME Appendix 1950-1985,Postoperative complications,Approved
156,VEDOLIZUMAB,943609-66-3,https://drugs.ncats.io/drug/9RV78Q2002,1,https://drugs.ncats.io/substances?q=(root_code...,ONLY CAS,US Approved Rx 2014,BLA125476,https://www.accessdata.fda.gov/scripts/cder/da...,US Approved Rx 2014,BLA125476,https://www.accessdata.fda.gov/scripts/cder/da...,,


In [32]:
# Save both subsets. index=False keeps the row index out of the files;
# otherwise it is written as an unnamed first column that comes back as
# "Unnamed: 0" (and "Unnamed: 0.1", ...) each time the CSV is reloaded and re-saved.
zeros_df.to_csv("zero_results_second.csv", index=False)
found.to_csv("found_results_second.csv", index=False)