In [32]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [33]:
# Load the flagged PubChem synonym table and draw a random sample of rows.
# A fixed random_state makes the sample (and everything derived from it)
# reproducible across kernel restarts; min() keeps the cell working when the
# file holds fewer than 100 rows.
sample_df = pd.read_csv("PUBCHEM_SYNONYMS_FLAGGED.csv")
sample_df = sample_df.sample(n=min(100, len(sample_df)), random_state=42)

# Split on the "No CAS info" sentinel. .copy() detaches both halves from
# sample_df so that later column assignments do not write into a slice.
has_cas = sample_df["CAS"] != "No CAS info"
cas_df = sample_df[has_cas].copy()
missing_df = sample_df[~has_cas].copy()

In [34]:
# Keep only the molecule name and its CAS string.
# .copy() makes each result an independent frame: a later cell assigns into
# the CAS column, which on a slice of sample_df triggers pandas'
# SettingWithCopyWarning.
cas_df = cas_df[['Parsed Molecule', 'CAS']].copy()
missing_df = missing_df[['Parsed Molecule', 'CAS']].copy()

In [35]:
cas_df

Unnamed: 0,Parsed Molecule,CAS
537,CHOLESTEROL,"57-88-5, 22243-67-0"
2411,SHARK CARTILAGE,305838-77-1
195,ARIPIPRAZOLE,"129722-12-9, 851220-85-4"
2882,XYLOSE,"10257-31-5, 50855-32-8, 25990-60-7, 58-86-6"
1346,INDINAVIR,150378-17-9
...,...,...
1668,METARAMINOL,54-49-9
269,BEDAQUILINE,"843663-66-1, 654653-93-7"
1333,IMMUNOGLOBULIN ANTI-HEPATITIS B SURFACE ANTIGEN,569658-79-3
1691,METHYLENE BLUE,"61-73-4, 7220-79-3, 97130-83-1"


In [36]:
missing_df

Unnamed: 0,Parsed Molecule,CAS
2132,POLYGALA SENEGA,No CAS info
1763,MULTIVITAMINS AND MINERALS,No CAS info
2031,PERSEA GRATISSIMA,No CAS info
171,APHANIZOMENON FLO AQUAE,No CAS info
2135,POLYPODIUM LEUCOTOMOS,No CAS info
1010,EUPHORBIA CERIFERA,No CAS info
1458,LACTOBACILLUS BIFIDUS,No CAS info
2334,RUMEX ACETOSA,No CAS info
1013,EUTERPE OLERACEA,No CAS info
2789,VACCINE- ROTAVIRUS,No CAS info


In [37]:
def get_first_cas(cas_string):
    '''
    Return the first CAS number found in a delimited CAS string.

    Parameters
    ----------
    cas_string : one or more CAS numbers separated by commas and/or
        semicolons, or the sentinel "No CAS info". Non-string values
        (e.g. None, or NaN from an empty CSV cell) are treated as missing.

    Returns
    -------
    The first non-empty entry with surrounding whitespace removed, or None
    when the input is missing, empty, or only contains the sentinel.
    '''
    # re.sub / str.split would raise on NaN or None, so bail out early.
    if not isinstance(cas_string, str):
        return None

    # Commas and semicolons are both accepted as separators.
    for candidate in re.split(r'[;,]', cas_string):
        candidate = candidate.strip()
        # Skip empty tokens (e.g. a leading separator) and the sentinel.
        if candidate and candidate != "No CAS info":
            return candidate

    return None

# Reduce every CAS cell to its first listed number.
# .assign() builds a new frame rather than writing into a slice of
# sample_df, which avoids pandas' SettingWithCopyWarning.
cas_df = cas_df.assign(CAS=cas_df['CAS'].apply(get_first_cas))
cas_arr = cas_df['CAS'].tolist()

# Rows holding the "No CAS info" sentinel map to None.
missing_df = missing_df.assign(CAS=missing_df['CAS'].apply(get_first_cas))
missing_arr = missing_df['CAS'].tolist()

In [38]:
def setup_webdriver():
    '''Create and return a Chrome selenium webdriver started with --headless.'''
    headless_options = Options()
    headless_options.add_argument("--headless")
    # ChromeDriverManager().install() supplies the driver path for the Service.
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=headless_options,
    )

def url_inxight(cas):
    '''Build the Inxight substance-search URL that queries one CAS number.'''
    # Every hyphen becomes a URL-encoded space (%20); the value is then
    # wrapped in URL-encoded double quotes (%22) inside the query.
    encoded_cas = '%20'.join(cas.split('-'))
    return f"https://drugs.ncats.io/substances?q=root_codes_CAS:%22{encoded_cas}%22"

In [39]:
def get_inxight_url(molecule, driver, timeout=3):
    '''
    Look up the top Inxight search result for one CAS number.

    Parameters
    ----------
    molecule : the CAS number to search for (turned into a search URL by
        url_inxight).
    driver : an initialised selenium webdriver.
    timeout : seconds to wait for the first result card to become visible
        (default 3).

    Returns
    -------
    A (molecule, url) tuple. url is the href of the first result card, or
    "N/A" when no card becomes visible in time or the card has no href.
    '''

    url = "N/A"  # Default in case of failure

    driver.get(url_inxight(molecule))
    try:
        # until() returns the element located by the condition, so the DOM
        # does not need to be queried a second time.
        first_card = WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[id="card-title"]'))
        )
        # get_attribute returns None for an absent attribute; keep the "N/A"
        # default in that case so callers always receive a string.
        url = first_card.get_attribute('href') or url

    except (NoSuchElementException, TimeoutException):
        # Best effort: no result card means the "N/A" default is returned.
        pass

    return molecule, url

def get_best_urls(cas_arr):
    '''
    Resolve each CAS value to its top Inxight search result.

    Parameters
    ----------
    cas_arr : an iterable of CAS values

    Returns
    -------
    A list with one (cas, url) tuple per input value, in input order.
    One webdriver is shared by all lookups and is quit afterwards, even if a
    lookup raises.
    '''

    browser = setup_webdriver()
    try:
        lookups = (get_inxight_url(cas, browser) for cas in cas_arr)
        return [(cas, link) for cas, link in lookups]
    finally:
        browser.quit()

In [40]:
import requests

def get_additional_data(id, timeout=10):
    '''
    Fetch the "@additional" data of one Inxight substance.

    Parameters
    ----------
    id : substance identifier, interpolated into the API path.
    timeout : seconds to wait for the server before giving up (default 10).

    Returns
    -------
    The decoded JSON body when the server answers with HTTP 200. None for
    any other status code, for a network error or timeout, and for a body
    that is not valid JSON.
    '''
    endpoint = f'https://drugs.ncats.io/api/v1/substances({id})/@additional'
    try:
        # Without a timeout, requests can block indefinitely on a stalled
        # connection and hang a whole batch of lookups.
        r = requests.get(endpoint, timeout=timeout)
    except requests.RequestException:
        return None

    if r.status_code != 200:
        return None

    try:
        return r.json()
    except ValueError:
        # The JSON decode errors raised by r.json() are ValueError subclasses.
        return None

In [41]:
def extract_conditions_and_phases(data):
    '''
    Collect condition labels and highest approval phases from a list of
    "@additional" records.

    Parameters
    ----------
    data : an iterable of record dicts, or None. Only records whose 'name'
        is 'Conditions' and whose 'value' is a dict are read; everything
        else is skipped.

    Returns
    -------
    A (conditions_list, highest_approval_list) tuple. Each list only
    receives truthy values, so a condition lacking a label or a
    highestPhase makes the lists differ in length; they are not guaranteed
    to be index-aligned. Both lists are empty when data is None or empty.
    '''
    conditions_list = []
    highest_approval_list = []

    # `data or []` tolerates None, which stands for "no data available".
    for entry in data or []:
        if not isinstance(entry, dict) or entry.get('name') != 'Conditions':
            continue

        condition_info = entry.get('value')
        if not isinstance(condition_info, dict):
            continue

        # Extract the condition name
        label = condition_info.get('label')
        if label:
            conditions_list.append(label)

        # Extract the highest phase of approval
        highest_phase = condition_info.get('highestPhase')
        if highest_phase:
            highest_approval_list.append(highest_phase)

    return conditions_list, highest_approval_list

In [42]:
def extract_event_details(data):
    '''
    Pull the development/approval event summaries out of a list of
    "@additional" records.

    Parameters
    ----------
    data : an iterable of record dicts, or None. A record is used only when
        its 'name' is 'Highest Development Event' or 'Earliest Approved
        Event' and its 'value' is a dict holding both 'status' and
        'sourceID'.

    Returns
    -------
    A dict keyed by event name. Each value is a dict with the keys
    'Status and Year' (status, a space, then the year or an empty string
    when the year is absent), 'Source ID' and 'Source URL'
    ('No Source URL' when absent). Empty when data is None or nothing
    matches. If an event name occurs more than once, the last one wins.
    '''
    event_details = {}
    wanted_events = ('Highest Development Event', 'Earliest Approved Event')

    # `data or []` tolerates None, which stands for "no data available".
    for item in data or []:
        if not isinstance(item, dict):
            continue

        details = item.get('value')
        if not isinstance(details, dict):  # Ensure 'value' is a dictionary
            continue
        if 'status' not in details or 'sourceID' not in details:
            continue

        event_name = item.get('name')
        if event_name not in wanted_events:
            continue

        # Use the 'name' of the event as the key in the dictionary
        event_details[event_name] = {
            'Status and Year': f"{details['status']} {details.get('year', '')}",
            'Source ID': details['sourceID'],
            'Source URL': details.get('sourceURL', 'No Source URL'),
        }

    return event_details

# def extract_event_details(data):
#     event_details = {}
#     if data is None:  # Check if data is None before iterating
#         return event_details

#     for item in data:
#         if 'value' in item and isinstance(item['value'], dict):
#             details = item['value']
#             if 'status' in details and 'sourceID' in details:
#                 if item['name'] in ['Highest Development Event', 'Earliest Approved Event']:
#                     detail_info = {
#                         'Status and Year': f"{details['status']} {details.get('year', '')}",
#                         'Source ID': details.get('sourceID', 'No Source ID'),
#                         'Source URL': details.get('sourceURL', 'No Source URL')
#                     }
#                     event_details[item['name']] = detail_info
#     return event_details

In [43]:
# Resolve every CAS number to its top Inxight search hit.
# Each lookup loads one search page in a headless browser and waits up to
# 3 seconds for a result card; failed lookups come back with the URL "N/A".
links = get_best_urls(cas_arr)
links

[('57-88-5', 'https://drugs.ncats.io/drug/97C5T2UQ7J'),
 ('305838-77-1', 'https://drugs.ncats.io/drug/D2YCN1I522'),
 ('129722-12-9', 'https://drugs.ncats.io/drug/82VFR53I78'),
 ('10257-31-5', 'N/A'),
 ('150378-17-9', 'https://drugs.ncats.io/drug/9MG78X43ZT'),
 ('1491-59-4', 'https://drugs.ncats.io/drug/8VLN5B44ZY'),
 ('1338225-97-0', 'https://drugs.ncats.io/drug/913P6LK81M'),
 ('120138-50-3', 'https://drugs.ncats.io/drug/23OW28RS7P'),
 ('121-75-5', 'https://drugs.ncats.io/drug/U5N7SU872W'),
 ('72559-06-9', 'https://drugs.ncats.io/drug/1W306TDA6S'),
 ('134523-00-5', 'https://drugs.ncats.io/drug/A0JWA85V8F'),
 ('3380-34-5', 'https://drugs.ncats.io/drug/4NM5039Y5X'),
 ('68-19-9', 'https://drugs.ncats.io/drug/P6YC3EG204'),
 ('53320-86-8', 'https://drugs.ncats.io/drug/D703131383'),
 ('77-86-1', 'https://drugs.ncats.io/drug/023C2WHX2V'),
 ('138402-11-6', 'https://drugs.ncats.io/drug/J0E2756Z7N'),
 ('29110-47-2', 'https://drugs.ncats.io/drug/30OMY4G3MK'),
 ('472-70-8', 'https://drugs.ncats.io

In [44]:
url_inxight("9005-49-6")

'https://drugs.ncats.io/substances?q=root_codes_CAS:%229005%2049%206%22'

In [13]:
# The substance identifier is the last path segment of the result URL.
# Failed lookups carry the placeholder URL "N/A"; splitting that on '/'
# would produce the bogus identifier 'A', so label those 'MISSING' directly.
identifiers = [
    'MISSING' if url == 'N/A' else url.rsplit('/', 1)[-1]
    for _, url in links
]
identifiers

['5Z492UNF9O', '936JST6JCN', '2KR89I4H1Y']

In [14]:
# A bare 'A' is the last '/'-separated piece of the "N/A" placeholder URL,
# i.e. a lookup that found nothing; relabel it so it is easy to spot.
identifiers = ['MISSING' if ident == 'A' else ident for ident in identifiers]

In [15]:
def extract_events(identifiers):
    '''
    Fetch and summarise the development/approval events of each substance.

    Parameters
    ----------
    identifiers : an iterable of Inxight substance identifiers.

    Returns
    -------
    A list with one event-details dict per identifier, in input order.
    Identifiers whose lookup returned no data contribute an empty dict, so
    the list stays positionally aligned with the input. Each dict is also
    printed as it is produced.
    '''
    events = []

    for identifier in identifiers:
        data = get_additional_data(identifier)
        # get_additional_data returns None when the lookup fails; parse only
        # real payloads, and parse them once.
        details = extract_event_details(data) if data is not None else {}
        print(details)
        events.append(details)

    return events
        
events = extract_events(identifiers)

# def extract_events(identifiers):
#     events = []
#     for identifier in identifiers:
#         data = get_additional_data(identifier)
#         if data is not None:  # Check if data is None
#             event_details = extract_event_details(data)
#             events.append(event_details)
#         else:
#             print(f"No data available for identifier {identifier}")  # Or handle it differently
#     return events

{'Highest Development Event': {'Status and Year': 'Possibly Marketed Outside US 1996', 'Source ID': 'ANDA040166', 'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c0ec47b7-c13a-4fa0-91fe-d7a03c70aec9'}, 'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 1996', 'Source ID': 'ANDA040166', 'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c0ec47b7-c13a-4fa0-91fe-d7a03c70aec9'}}
{'Highest Development Event': {'Status and Year': 'US Previously Marketed 1990', 'Source ID': 'EXOSURF NEONATAL by GLAXOSMITHKLINE', 'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=020044'}, 'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 1984', 'Source ID': 'NU-DERM SUNFADER  Skin Lightener with Sunscreen (SPF 15) PABA FREE by OMP, INC.', 'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c67b0629-5529-412d-99ee-186df4ecad5a'}}
{'Highest Dev

In [16]:
def extract_conditions(identifiers):
    '''
    Fetch the conditions and approval phases of each substance.

    Parameters
    ----------
    identifiers : an iterable of Inxight substance identifiers.

    Returns
    -------
    A list with one (conditions, phases) tuple per identifier, in input
    order. Identifiers whose lookup returned no data contribute ([], []),
    so the list stays positionally aligned with the input.
    '''
    conditions = []

    for identifier in identifiers:
        data = get_additional_data(identifier)
        if data is None:
            # get_additional_data returns None when the lookup fails.
            conditions.append(([], []))
        else:
            conditions.append(extract_conditions_and_phases(data))

    return conditions
        
conditions_arr = extract_conditions(identifiers)

In [17]:
events

[{'Highest Development Event': {'Status and Year': 'Possibly Marketed Outside US 1996',
   'Source ID': 'ANDA040166',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c0ec47b7-c13a-4fa0-91fe-d7a03c70aec9'},
  'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 1996',
   'Source ID': 'ANDA040166',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c0ec47b7-c13a-4fa0-91fe-d7a03c70aec9'}},
 {'Highest Development Event': {'Status and Year': 'US Previously Marketed 1990',
   'Source ID': 'EXOSURF NEONATAL by GLAXOSMITHKLINE',
   'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=020044'},
  'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 1984',
   'Source ID': 'NU-DERM SUNFADER  Skin Lightener with Sunscreen (SPF 15) PABA FREE by OMP, INC.',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c67b0629-5529-412d-99e

In [18]:
conditions_arr

[(['Bacillus cereus infection'], ['Basic research']),
 (['Dry, itchy skin', 'Respiratory distress syndrome, neonatal'],
  ['Approved', 'Approved']),
 (['SUNBURN'], ['Approved'])]

In [19]:
# Flatten each nested event mapping into a single row: the column name is
# the event name and the field name joined by a space.
data = [
    {
        f"{event_name} {field}": field_value
        for event_name, fields in event.items()
        for field, field_value in fields.items()
    }
    for event in events
]

df = pd.DataFrame(data)

In [20]:
df

Unnamed: 0,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL
0,Possibly Marketed Outside US 1996,ANDA040166,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Possibly Marketed Outside US 1996,ANDA040166,https://dailymed.nlm.nih.gov/dailymed/drugInfo...
1,US Previously Marketed 1990,EXOSURF NEONATAL by GLAXOSMITHKLINE,https://www.accessdata.fda.gov/scripts/cder/da...,Possibly Marketed Outside US 1984,NU-DERM SUNFADER Skin Lightener with Sunscree...,https://dailymed.nlm.nih.gov/dailymed/drugInfo...
2,US Previously Marketed,21 CFR 310.545(a)(18)(i)(B) skin protectant:w/...,https://www.gpo.gov/fdsys/pkg/CFR-2018-title21...,Possibly Marketed Outside US 1970,Ala Quin by Crown Laboratories,https://dailymed.nlm.nih.gov/dailymed/drugInfo...


In [21]:
# One row per substance, holding its condition labels and phase labels
# (both still lists at this point).
rows = [
    {'conditions': conditions, 'phases': phases}
    for conditions, phases in conditions_arr
]

# Build the frame, then collapse each list into one "; "-separated string.
conditions_df = pd.DataFrame(rows)
for list_column in ('conditions', 'phases'):
    conditions_df[list_column] = conditions_df[list_column].apply('; '.join)
conditions_df

Unnamed: 0,conditions,phases
0,Bacillus cereus infection,Basic research
1,"Dry, itchy skin; Respiratory distress syndrome...",Approved; Approved
2,SUNBURN,Approved


In [22]:
# Put the event columns and the condition columns side by side.
# concat(axis=1) lines rows up by index; both frames were built from plain
# lists (one entry per identifier, same order), so both carry the default
# 0..n-1 index.
merged = pd.concat([df, conditions_df], axis=1)
merged

Unnamed: 0,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL,conditions,phases
0,Possibly Marketed Outside US 1996,ANDA040166,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Possibly Marketed Outside US 1996,ANDA040166,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Bacillus cereus infection,Basic research
1,US Previously Marketed 1990,EXOSURF NEONATAL by GLAXOSMITHKLINE,https://www.accessdata.fda.gov/scripts/cder/da...,Possibly Marketed Outside US 1984,NU-DERM SUNFADER Skin Lightener with Sunscree...,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,"Dry, itchy skin; Respiratory distress syndrome...",Approved; Approved
2,US Previously Marketed,21 CFR 310.545(a)(18)(i)(B) skin protectant:w/...,https://www.gpo.gov/fdsys/pkg/CFR-2018-title21...,Possibly Marketed Outside US 1970,Ala Quin by Crown Laboratories,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,SUNBURN,Approved


In [23]:
# Turn the (CAS, result URL) tuples from get_best_urls into a two-column frame.
links_df = pd.DataFrame(links, columns=['CAS', 'URL'])
links_df

Unnamed: 0,CAS,URL
0,623-84-7,https://drugs.ncats.io/drug/5Z492UNF9O
1,36653-82-4,https://drugs.ncats.io/drug/936JST6JCN
2,112-92-5,https://drugs.ncats.io/drug/2KR89I4H1Y


In [24]:
# Prefix every row of event/condition data with its CAS number and Inxight URL.
# NOTE(review): this is a join on the default row index, so it assumes
# `merged` has exactly one row per entry of `links`, in the same order.
# Confirm this still holds if any lookup is ever skipped.
result = pd.concat([links_df, merged], axis=1)
result

Unnamed: 0,CAS,URL,Highest Development Event Status and Year,Highest Development Event Source ID,Highest Development Event Source URL,Earliest Approved Event Status and Year,Earliest Approved Event Source ID,Earliest Approved Event Source URL,conditions,phases
0,623-84-7,https://drugs.ncats.io/drug/5Z492UNF9O,Possibly Marketed Outside US 1996,ANDA040166,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Possibly Marketed Outside US 1996,ANDA040166,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,Bacillus cereus infection,Basic research
1,36653-82-4,https://drugs.ncats.io/drug/936JST6JCN,US Previously Marketed 1990,EXOSURF NEONATAL by GLAXOSMITHKLINE,https://www.accessdata.fda.gov/scripts/cder/da...,Possibly Marketed Outside US 1984,NU-DERM SUNFADER Skin Lightener with Sunscree...,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,"Dry, itchy skin; Respiratory distress syndrome...",Approved; Approved
2,112-92-5,https://drugs.ncats.io/drug/2KR89I4H1Y,US Previously Marketed,21 CFR 310.545(a)(18)(i)(B) skin protectant:w/...,https://www.gpo.gov/fdsys/pkg/CFR-2018-title21...,Possibly Marketed Outside US 1970,Ala Quin by Crown Laboratories,https://dailymed.nlm.nih.gov/dailymed/drugInfo...,SUNBURN,Approved


In [25]:
# Write the combined table to final.csv (with to_csv's defaults the row
# index is written as the first column).
# NOTE: DataFrame.to_csv returns None when it is given a path, so
# `sample_output` is always None here.
sample_output = result.to_csv("final.csv")