In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
# Load PUBCHEM_SYNONYMS_FLAGGED.csv and keep only its first 10 rows.
sample_df = pd.read_csv("PUBCHEM_SYNONYMS_FLAGGED.csv").head(10)

# Partition the rows on the "No CAS info" placeholder in the CAS column.
lacks_cas = sample_df["CAS"] == "No CAS info"
cas_df = sample_df[~lacks_cas]
missing_df = sample_df[lacks_cas]

In [3]:
# Narrow both frames to the molecule name and its CAS entry.
cas_df, missing_df = (
    frame[['Parsed Molecule', 'CAS']] for frame in (cas_df, missing_df)
)

In [4]:
# Show the rows whose CAS value is not the "No CAS info" placeholder.
cas_df

Unnamed: 0,Parsed Molecule,CAS
0,"1,2 PROPANEDIOL DIACETATE",623-84-7
1,1 HEXADECANOL,"36653-82-4, 36311-34-9, 124-29-8"
2,1 OCTADECANOL,"112-92-5, 68911-61-5, 26762-44-7"
4,2 OXOGLUTARIC ACID,"328-50-7, 34410-46-3, 17091-15-5"
6,2 PROPANOL,67-63-0
7,4 AMINOBUTYRIC ACID,56-12-2
9,7 OXO-DEHYDROEPIANDROSTERONE,566-19-8


In [5]:
# Show the rows whose CAS value is the "No CAS info" placeholder.
missing_df

Unnamed: 0,Parsed Molecule,CAS
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",No CAS info
5,2 PHENOXYETHANOL,No CAS info
8,7 KETO DEHYDRANDROSTERONE,No CAS info


In [6]:
def get_first_cas(cas_string):
    '''
    Return the first CAS number from a comma-separated CAS string.

    Parameters
    ----------
    cas_string : the raw CAS cell, e.g. "36653-82-4, 36311-34-9", or the
        placeholder "No CAS info".

    Returns
    -------
    The first comma-separated entry with surrounding whitespace removed, or
    None when there is no usable CAS: the placeholder, a non-string value
    (NaN / None), or a blank first entry.
    '''
    # Empty CSV cells are read by pandas as float NaN, which has no .split();
    # treat every non-string as "no CAS" instead of raising AttributeError.
    if not isinstance(cas_string, str):
        return None
    if cas_string.strip() == "No CAS info":
        return None
    first = cas_string.split(',')[0].strip()
    # A blank first entry (e.g. "" or ", 123-45-6") is not a CAS number.
    return first or None

# Reduce every CAS cell to its first listed number. cas_df and missing_df were
# derived from sample_df by boolean filtering and column selection, so build
# new frames with .assign() rather than writing into a column of a derived
# frame (which triggers pandas' SettingWithCopyWarning).
cas_df = cas_df.assign(CAS=cas_df['CAS'].apply(get_first_cas))
cas_arr = cas_df['CAS'].tolist()

# Every row of missing_df holds the "No CAS info" placeholder, so
# get_first_cas maps each entry to None.
missing_df = missing_df.assign(CAS=missing_df['CAS'].apply(get_first_cas))
missing_arr = missing_df['CAS'].tolist()

In [7]:
def setup_webdriver():
    '''Create and return a Chrome webdriver configured with --headless.'''
    opts = Options()
    opts.add_argument("--headless")
    # ChromeDriverManager().install() supplies the driver path for the Service.
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )

def url_inxight(cas):
    '''
    Generates an Inxight search URL for a given CAS.

    The value is percent-encoded before being placed in the ``q`` query
    parameter, so characters such as spaces, commas, ``&`` or ``#`` cannot
    break the URL. Plain CAS numbers (digits and hyphens) are left unchanged.
    '''
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote

    return f"https://drugs.ncats.io/substances?q={quote(str(cas), safe='')}"

In [8]:
def get_inxight_url(molecule, driver, timeout=3):
    '''
    Look up the top Inxight search result for one CAS number.

    Parameters
    ----------
    molecule : the CAS number to search for (it is passed to url_inxight).
    driver : an initialised selenium webdriver.
    timeout : seconds to wait for the first result link to become visible
        (default 3, the previously hard-coded value).

    Returns
    -------
    A (molecule, url) tuple. url is the href of the first
    'a[id="card-title"]' element, or "N/A" when no such element becomes
    visible within the timeout or the element has no href.
    '''
    result_locator = (By.CSS_SELECTOR, 'a[id="card-title"]')
    url = "N/A"  # Default in case of failure

    driver.get(url_inxight(molecule))
    try:
        # until() returns the value produced by the condition, i.e. the first
        # matching element once it is visible, so no second lookup is needed.
        first_result = WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located(result_locator)
        )
        # get_attribute returns None when the attribute is absent; keep the
        # "N/A" placeholder in that case.
        url = first_result.get_attribute('href') or "N/A"
    except (NoSuchElementException, TimeoutException):
        # No visible result within the timeout: fall through with "N/A".
        pass

    return molecule, url

def get_best_urls(cas_arr):
    '''
    Resolve the top Inxight match for each CAS value using one shared webdriver.

    Parameters
    ----------
    cas_arr : an iterable of CAS values

    Returns
    -------
    A list of (molecule, url) tuples, one per input value, in input order
    '''

    browser = setup_webdriver()
    try:
        lookups = (get_inxight_url(cas, browser) for cas in cas_arr)
        return [(molecule, url) for molecule, url in lookups]
    finally:
        # Always shut the browser down, even if a lookup raised.
        browser.quit()

In [9]:
import requests

def get_additional_data(id, timeout=10):
    '''
    Fetch the "@additional" records of a substance from drugs.ncats.io.

    Parameters
    ----------
    id : substance identifier placed in the request URL, e.g. '2KR89I4H1Y'.
        (The name shadows the builtin; it is kept so existing callers that
        pass it by keyword keep working.)
    timeout : seconds before the HTTP request is abandoned (default 10).

    Returns
    -------
    The decoded JSON body when the response status is 200. None when the
    status is anything else, when the request itself fails (connection error,
    timeout), or when the body is not valid JSON.
    '''
    url = f'https://drugs.ncats.io/api/v1/substances({id})/@additional'
    try:
        # requests waits indefinitely unless a timeout is given explicitly.
        r = requests.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if r.status_code != 200:
        return None
    try:
        return r.json()
    except ValueError:
        # Body was not JSON; report it like any other failed fetch.
        return None

In [10]:
def extract_conditions_and_phases(data):
    '''
    Collect condition labels and highest phases from "@additional" records.

    Parameters
    ----------
    data : a list of record dicts as returned by get_additional_data, or None.
        Only records whose 'name' is 'Conditions' and whose 'value' is a dict
        are used; everything else is skipped.

    Returns
    -------
    (conditions_list, highest_approval_list): the truthy 'label' values and
    the truthy 'highestPhase' values, each in record order.

    Note: a record contributes to each list independently, so when a record
    has a label but no highestPhase (or vice versa) the two lists differ in
    length and are not index-aligned.
    '''
    conditions_list = []
    highest_approval_list = []

    # get_additional_data returns None on a failed fetch; treat it as empty.
    for entry in data or []:
        # Records are not uniform: some lack 'name' or 'value', and 'value'
        # may be a plain string rather than a dict.
        if not isinstance(entry, dict) or entry.get('name') != 'Conditions':
            continue
        condition_info = entry.get('value')
        if not isinstance(condition_info, dict):
            continue

        # Extract the condition name
        label = condition_info.get('label')
        if label:
            conditions_list.append(label)

        # Extract the highest phase of approval
        highest_phase = condition_info.get('highestPhase')
        if highest_phase:
            highest_approval_list.append(highest_phase)

    return conditions_list, highest_approval_list

In [11]:
def extract_event_details(data):
    '''
    Collect status/year details from "@additional" records.

    Parameters
    ----------
    data : a list of record dicts as returned by get_additional_data, or None.

    Returns
    -------
    A dict keyed by record 'name'. Each value is a dict with:
      'Status and Year' : "<status> <year>" taken from the record's 'value'
      'Source ID'       : value['sourceID'], or 'No Source ID' if absent
      'Source URL'      : value['sourceURL'], or 'No Source URL' if absent
    Only records whose 'value' is a dict containing both 'status' and 'year'
    are included. Records without a 'name' are skipped. If several qualifying
    records share a name, the last one wins.
    '''
    event_details = {}

    # get_additional_data returns None on a failed fetch; treat it as empty.
    for item in data or []:
        if not isinstance(item, dict):
            continue
        details = item.get('value')
        # 'value' may be missing or a plain string; only dicts carry events.
        if not isinstance(details, dict):
            continue
        if 'status' not in details or 'year' not in details:
            continue
        # Without a name there is no key to store the event under.
        if 'name' not in item:
            continue

        # Use the 'name' of the event as the key in the dictionary
        event_details[item['name']] = {
            'Status and Year': f"{details['status']} {details['year']}",
            'Source ID': details.get('sourceID', 'No Source ID'),
            'Source URL': details.get('sourceURL', 'No Source URL'),
        }

    return event_details

In [12]:
# Resolve the top Inxight result for every CAS number in cas_arr.
# get_best_urls starts a webdriver and loads one search page per CAS.
links = get_best_urls(cas_arr)
links

[('623-84-7', 'https://drugs.ncats.io/drug/5Z492UNF9O'),
 ('36653-82-4', 'https://drugs.ncats.io/drug/936JST6JCN'),
 ('112-92-5', 'https://drugs.ncats.io/drug/2KR89I4H1Y'),
 ('328-50-7', 'https://drugs.ncats.io/drug/8ID597Z82X'),
 ('67-63-0', 'https://drugs.ncats.io/drug/ND2M416302'),
 ('56-12-2', 'https://drugs.ncats.io/drug/2ACZ6IPC6I'),
 ('566-19-8', 'https://drugs.ncats.io/drug/2334LJD2E9')]

In [13]:
# The identifier is the last path segment of each result URL. Failed lookups
# carry the "N/A" placeholder, which must not be split on '/' (that would
# yield the bogus identifier 'A'); they become None so that positions stay
# aligned with `links`.
identifiers = [
    url.rstrip('/').split('/')[-1] if url and url != "N/A" else None
    for _, url in links
]
identifiers

['5Z492UNF9O',
 '936JST6JCN',
 '2KR89I4H1Y',
 '8ID597Z82X',
 'ND2M416302',
 '2ACZ6IPC6I',
 '2334LJD2E9']

In [21]:
# Inspect the raw "@additional" payload for one hard-coded identifier
# ('2KR89I4H1Y' appears in the identifiers list above).
data = get_additional_data('2KR89I4H1Y')
data

[{'value': 'Broad Institute Drug List 2024-03-05', 'name': 'Stitcher Label'},
 {'value': 'Clinical', 'name': 'Stitcher Label'},
 {'value': 'DailyMed Rx, December 2021', 'name': 'Stitcher Label'},
 {'value': 'Excipient', 'name': 'Stitcher Label'},
 {'value': 'FDA Excipients, December 2018', 'name': 'Stitcher Label'},
 {'value': 'FRDB, October 2021', 'name': 'Stitcher Label'},
 {'value': 'G-SRS, December 2023', 'name': 'Stitcher Label'},
 {'value': 'Marketed', 'name': 'Stitcher Label'},
 {'value': 'NCATS Pharmaceutical Collection, April 2012',
  'name': 'Stitcher Label'},
 {'value': 'OTC Monographs, December 2018', 'name': 'Stitcher Label'},
 {'value': 'Other', 'name': 'Stitcher Label'},
 {'value': 'SGROUP', 'name': 'Stitcher Label'},
 {'value': 'S_STITCH_V1', 'name': 'Stitcher Label'},
 {'value': 'USPreviouslyMarketed', 'name': 'Stitcher Label'},
 {'value': 'Principal Form', 'name': 'Substance Form'},
 {'name': 'PubMed'},
 {'name': 'PubMed'},
 {'name': 'PubMed'},
 {'name': 'PubMed'},
 {

In [18]:
# Inspect the raw "@additional" payload for another hard-coded identifier
# ('8ID597Z82X' appears in the identifiers list above). Rebinds `data`.
data = get_additional_data('8ID597Z82X')
data

[{'value': 'Broad Institute Drug List 2024-03-05', 'name': 'Stitcher Label'},
 {'value': 'Clinical', 'name': 'Stitcher Label'},
 {'value': 'DrugBank, July 2020', 'name': 'Stitcher Label'},
 {'value': 'FRDB, October 2021', 'name': 'Stitcher Label'},
 {'value': 'G-SRS, December 2023', 'name': 'Stitcher Label'},
 {'value': 'Marketed', 'name': 'Stitcher Label'},
 {'value': 'NCATS Pharmaceutical Collection, April 2012',
  'name': 'Stitcher Label'},
 {'value': 'SGROUP', 'name': 'Stitcher Label'},
 {'value': 'S_STITCH_V1', 'name': 'Stitcher Label'},
 {'value': 'Salts, Hydrates, Esters, etc.', 'name': 'Substance Form'},
 {'name': 'PubMed',
  'value': {'uid': '18161514',
   'pubdate': '2008',
   'epubdate': '',
   'source': 'Drug Chem Toxicol',
   'authors': [{'name': 'Bhattacharya R',
     'authtype': 'Author',
     'clusterid': ''},
    {'name': 'Tulsawani R', 'authtype': 'Author', 'clusterid': ''}],
   'lastauthor': 'Tulsawani R',
   'title': 'In vitro and in vivo evaluation of various carbo