# Libraries

In [18]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

# Initial Dataframe Manipulations

In [19]:
sample_df = pd.read_csv("PUBCHEM_SYNONYMS_FLAGGED.csv")
sample_df = sample_df.head(10)

cas_df = sample_df[sample_df["CAS"] != "No CAS info"]
missing_df = sample_df[sample_df["CAS"] == "No CAS info"] 

In [20]:
cas_df

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance,Depositor Synonym List,Regular Synonym List,Synonym?
0,1-2-PROPANEDIOL_DIACETATE,"1,2 PROPANEDIOL DIACETATE",https://pubchem.ncbi.nlm.nih.gov/compound/12198,RELEVANT,623-84-7,"134236-23-0, 1432741-27-9, 1432741-27-9, 13423...",COMPOUND,YES,YES,YES
1,1-HEXADECANOL,1 HEXADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/2682,RELEVANT,"36653-82-4, 36311-34-9, 124-29-8","168679-13-8, 124-29-8, 55069-45-9, 8014-51-5, ...",COMPOUND,YES,YES,YES
2,1-OCTADECANOL,1 OCTADECANOL,https://pubchem.ncbi.nlm.nih.gov/compound/8221,RELEVANT,"112-92-5, 68911-61-5, 26762-44-7","193766-48-2, 8014-37-7, 8032-19-7, 8032-21-1, ...",COMPOUND,YES,YES,YES
4,2-OXOGLUTARIC_ACID,2 OXOGLUTARIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/51,FEATURED,"328-50-7, 34410-46-3, 17091-15-5",27175-99-1,COMPOUND,YES,YES,YES
6,2-PROPANOL,2 PROPANOL,https://pubchem.ncbi.nlm.nih.gov/compound/3776,FEATURED,67-63-0,8013-70-5,COMPOUND,YES,YES,YES
7,4-AMINOBUTYRIC_ACID,4 AMINOBUTYRIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/119,FEATURED,56-12-2,3131-86-0,COMPOUND,YES,YES,YES
9,7-OXO-DEHYDROEPIANDROSTERONE,7 OXO-DEHYDROEPIANDROSTERONE,https://pubchem.ncbi.nlm.nih.gov/compound/193313,RELEVANT,566-19-8,No CAS info,COMPOUND,YES,YES,YES


In [21]:
missing_df

Unnamed: 0,Original Molecule,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance,Depositor Synonym List,Regular Synonym List,Synonym?
3,2-3-(2-IODOPROPYLIDENEDIOXY)PROPANOL,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",No Link,Not Found,No CAS info,No CAS info,NEITHER,MISSING,MISSING,NO
5,2-PHENOXYETHANOL,2 PHENOXYETHANOL,https://pubchem.ncbi.nlm.nih.gov/compound/1797...,FEATURED,No CAS info,No CAS info,COMPOUND,NO,MISSING,NO
8,7-KETO_DEHYDRANDROSTERONE,7 KETO DEHYDRANDROSTERONE,No Link,Not Found,No CAS info,No CAS info,NEITHER,MISSING,MISSING,NO


### Helper Functions

In [22]:
def setup_webdriver():
    '''Initializes a headless selenium webdriver'''
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def url_inxight(cas):
    '''Generates an Inxight URL for a given CAS'''
    return f"https://drugs.ncats.io/substances?q={cas}"

In [23]:
def get_inxight_url(molecule, driver):
    '''
    Parameters
    ----------
    A molecule's CAS and an initialised webdriver.

    Returns
    -------
    The top Inxight search result for a given CAS number
    '''
    
    url = "N/A"  # Default in case of failure

    driver.get(url_inxight(molecule))
    try:
        WebDriverWait(driver, 3).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[id="card-title"]')))
        elements = driver.find_elements(By.CSS_SELECTOR, 'a[id="card-title"]')
        if elements:
            element = elements[0]
            url = element.get_attribute('href')

    except (NoSuchElementException, TimeoutException):
        pass  

    return molecule, url

def get_best_urls(cas_arr):
    '''
    Parameters
    ----------
    An array of CAS values

    Returns
    -------
    An array of associated top matches
    '''
    
    best_matches = []
    driver = setup_webdriver()  
    try:
        for cas in cas_arr:
            molecule, url = get_inxight_url(cas, driver)
            best_matches.append((molecule, url))
    finally:
        driver.quit()  

    return best_matches

In [24]:
get_best_urls(["1420477-60-6"])

[('1420477-60-6', 'https://drugs.ncats.io/drug/I42748ELQW')]

In [25]:
import requests

def get_additional_data(id):
    r = requests.get(f'https://drugs.ncats.io/api/v1/substances({id})/@additional')
    if 200 == r.status_code:
        return r.json()
    return None

data = get_additional_data('I42748ELQW')

In [26]:
def extract_conditions_and_phases(data):
    conditions_list = []
    highest_approval_list = []

    for entry in data:
        if entry['name'] == 'Conditions' and 'value' in entry:
            condition_info = entry['value']
            
            # Extract the condition name
            label = condition_info.get('label')
            if label:
                conditions_list.append(label)
            
            # Extract the highest phase of approval
            highest_phase = condition_info.get('highestPhase')
            if highest_phase:
                highest_approval_list.append(highest_phase)

    return conditions_list, highest_approval_list

In [27]:
extract_conditions_and_phases(data)

(['Chronic lymphocytic leukemia',
  'Rheumatoid arthritis',
  'Glioblastoma multiforme',
  'Mantle cell lymphoma',
  'Head and neck squamous cell carcinoma',
  'mantle cell lymphoma'],
 ['Phase III', 'Phase II', 'Phase II', 'Phase II', 'Phase II', 'Approved'])

In [28]:
def extract_event_details(data):
    event_details = {}
    
    for item in data:
        if 'value' in item and isinstance(item['value'], dict):  # Ensure 'value' is a dictionary
            details = item['value']
            if 'status' in details and 'year' in details:
                # Gather additional details
                source_id = details.get('sourceID', 'No Source ID')
                source_url = details.get('sourceURL', 'No Source URL')
                
                # Create a detailed string or dictionary as needed
                detail_info = {
                    'Status and Year': f"{details['status']} {details['year']}",
                    'Source ID': source_id,
                    'Source URL': source_url
                }
                
                # Use the 'name' of the event as the key in the dictionary
                event_details[item['name']] = detail_info
    
    return event_details

In [29]:
extract_event_details(data)

{'Highest Development Event': {'Status and Year': 'US Approved Rx 2022',
  'Source ID': 'NDA216387',
  'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=216387'},
 'Earliest Approved Event': {'Status and Year': 'US Approved Rx 2017',
  'Source ID': 'NDA210259',
  'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=210259'}}

In [11]:
simple = sample_df[['Parsed Molecule', 'CAS']]
simple

Unnamed: 0,Parsed Molecule,CAS
0,"1,2 PROPANEDIOL DIACETATE",623-84-7
1,1 HEXADECANOL,"36653-82-4, 36311-34-9, 124-29-8"
2,1 OCTADECANOL,"112-92-5, 68911-61-5, 26762-44-7"


In [12]:
def get_first_cas(cas_string):
    if cas_string == "No CAS info":
        return 
    return cas_string.split(',')[0].strip()

# Apply this function to the entire CAS column
simple['CAS'] = simple['CAS'].apply(get_first_cas)
cas_arr = simple['CAS'].tolist()
cas_arr

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  simple['CAS'] = simple['CAS'].apply(get_first_cas)


['623-84-7', '36653-82-4', '112-92-5']

In [13]:
links = get_best_urls(cas_arr)

In [14]:
links

[('623-84-7', 'https://drugs.ncats.io/drug/5Z492UNF9O'),
 ('36653-82-4', 'https://drugs.ncats.io/drug/936JST6JCN'),
 ('112-92-5', 'https://drugs.ncats.io/drug/2KR89I4H1Y')]

In [15]:
identifiers = [url.split('/')[-1] for _, url in links]

# Printing the result
print(identifiers)

['5Z492UNF9O', '936JST6JCN', '2KR89I4H1Y']


In [16]:
def extract_events(identifiers):
    data = []
    events = []

    for identifier in identifiers:
        data = get_additional_data(identifier)
        events.append(extract_event_details(data))

    return events
        
extract_events(identifiers)

[{'Highest Development Event': {'Status and Year': 'Possibly Marketed Outside US 1996',
   'Source ID': 'ANDA040166',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c0ec47b7-c13a-4fa0-91fe-d7a03c70aec9'},
  'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 1996',
   'Source ID': 'ANDA040166',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c0ec47b7-c13a-4fa0-91fe-d7a03c70aec9'}},
 {'Highest Development Event': {'Status and Year': 'US Previously Marketed 1990',
   'Source ID': 'EXOSURF NEONATAL by GLAXOSMITHKLINE',
   'Source URL': 'https://www.accessdata.fda.gov/scripts/cder/daf/index.cfm?event=overview.process&ApplNo=020044'},
  'Earliest Approved Event': {'Status and Year': 'Possibly Marketed Outside US 1984',
   'Source ID': 'NU-DERM SUNFADER  Skin Lightener with Sunscreen (SPF 15) PABA FREE by OMP, INC.',
   'Source URL': 'https://dailymed.nlm.nih.gov/dailymed/drugInfo.cfm?setid=c67b0629-5529-412d-99e

In [17]:
def extract_conditions(identifiers):
    conditions = []

    for identifier in identifiers:
        data = get_additional_data(identifier)
        print(extract_conditions_and_phases(data))
        
extract_conditions(identifiers)

(['Bacillus cereus infection'], ['Basic research'])
(['Dry, itchy skin', 'Respiratory distress syndrome, neonatal'], ['Approved', 'Approved'])
(['SUNBURN'], ['Approved'])
