In [1]:
import selenium 
import csv
import re
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

In [2]:
# Work on the first 70 rows of the flagged PubChem synonym table.
sample_df = pd.read_csv("PUBCHEM_SYNONYMS_FLAGGED.csv").head(70)

# Partition the sample on the "No CAS info" placeholder in the CAS column.
cas_df = sample_df[sample_df["CAS"].ne("No CAS info")]
missing_df = sample_df[sample_df["CAS"].eq("No CAS info")]

In [3]:
# Keep only the molecule name and its CAS entry in both partitions.
cas_df, missing_df = (
    frame[['Parsed Molecule', 'CAS']] for frame in (cas_df, missing_df)
)

In [4]:
# Preview the molecules that have a CAS entry (bounded, not the whole frame).
cas_df.head(10)

Unnamed: 0,Parsed Molecule,CAS
0,"1,2 PROPANEDIOL DIACETATE",623-84-7
1,1 HEXADECANOL,"36653-82-4, 36311-34-9, 124-29-8"
2,1 OCTADECANOL,"112-92-5, 68911-61-5, 26762-44-7"
4,2 OXOGLUTARIC ACID,"328-50-7, 34410-46-3, 17091-15-5"
6,2 PROPANOL,67-63-0
7,4 AMINOBUTYRIC ACID,56-12-2
9,7 OXO-DEHYDROEPIANDROSTERONE,566-19-8
10,8 QUINOLINOL,148-24-3
11,ABACAVIR,"136470-78-5, 188062-50-2"
12,ABALOPARATIDE,247062-33-5


In [5]:
# Preview the molecules flagged "No CAS info" (bounded, not the whole frame).
missing_df.head(10)

Unnamed: 0,Parsed Molecule,CAS
3,"2,3-(2 IODOPROPYLIDENEDIOXY)PROPANOL",No CAS info
5,2 PHENOXYETHANOL,No CAS info
8,7 KETO DEHYDRANDROSTERONE,No CAS info
18,ABRUS PRECATORIUS,No CAS info
19,ABSORBABLE GELATIN/COLLAGEN SPONGE,No CAS info
20,ACACIA CATECHU,No CAS info
21,ACACIA SENEGAL,No CAS info
34,ACETONE (KETONE) TESTS,No CAS info
44,ACONITUM NAPELLUS,No CAS info
53,ADRENAL CORTEX,No CAS info


In [6]:
def get_first_cas(cas_string):
    '''
    Return the first CAS number listed in a raw CAS cell.

    Parameters
    ----------
    cas_string : str or missing value
        Raw cell content: one or more CAS numbers separated by commas and/or
        semicolons, or the placeholder "No CAS info".

    Returns
    -------
    str or None
        The first non-empty entry with surrounding whitespace removed, or
        None when the cell is not a string (e.g. the NaN pandas produces for
        an empty CSV cell), holds the placeholder, or has no non-empty entry.
    '''
    # Non-text cells (NaN, None) have no string methods; treat them as
    # "no CAS available" instead of raising.
    if not isinstance(cas_string, str):
        return None

    if cas_string.strip() == "No CAS info":
        return None

    # Semicolons are treated as commas so both separators split the same way.
    for candidate in cas_string.replace(';', ',').split(','):
        candidate = candidate.strip()
        if candidate:
            return candidate
    return None

# Reduce every CAS cell to its first listed number. `assign` builds a new
# frame instead of writing into `cas_df`, which is a selection taken from
# `sample_df` and would otherwise trigger pandas' SettingWithCopyWarning.
cas_df = cas_df.assign(CAS=cas_df['CAS'].apply(get_first_cas))

# (name, CAS) pairs in row order.
molecules = list(zip(cas_df['Parsed Molecule'], cas_df['CAS']))
molecules

# missing_df['CAS'] = missing_df['CAS'].apply(get_first_cas)
# missing_arr = missing_df['CAS'].tolist()

[('1,2 PROPANEDIOL DIACETATE', '623-84-7'),
 ('1 HEXADECANOL', '36653-82-4'),
 ('1 OCTADECANOL', '112-92-5'),
 ('2 OXOGLUTARIC ACID', '328-50-7'),
 ('2 PROPANOL', '67-63-0'),
 ('4 AMINOBUTYRIC ACID', '56-12-2'),
 ('7 OXO-DEHYDROEPIANDROSTERONE', '566-19-8'),
 ('8 QUINOLINOL', '148-24-3'),
 ('ABACAVIR', '136470-78-5'),
 ('ABALOPARATIDE', '247062-33-5'),
 ('ABARELIX', '183552-38-7'),
 ('ABATACEPT', '332348-12-6'),
 ('ABCIXIMAB', '143653-53-6'),
 ('ABEMACICLIB', '1231929-97-7'),
 ('ABIRATERONE ACETATE', '154229-18-2'),
 ('ACALABRUTINIB', '1420477-60-6'),
 ('ACAMPROSATE', '77337-76-9'),
 ('ACARBOSE', '56180-94-0'),
 ('ACEBUTOLOL', '37517-30-9'),
 ('ACEMANNAN', '110042-95-0'),
 ('ACEPROMAZINE', '61-00-7'),
 ('ACESULFAME', '33665-90-6'),
 ('ACETAZOLAMIDE', '59-66-5'),
 ('ACETIC ACID', '64-19-7'),
 ('ACETOHEXAMIDE', '968-81-0'),
 ('ACETOHYDROXAMIC ACID', '546-88-3'),
 ('ACETONE', '67-64-1'),
 ('ACETYL-L-CARNITINE', '3040-38-8'),
 ('ACETYLCHOLINE', '51-84-3'),
 ('ACETYLCYSTEINE', '616-91-1'),


In [7]:
import urllib

def setup_webdriver():
    '''
    Build a Chrome webdriver that is started with the "--headless" argument.

    Returns
    -------
    The webdriver.Chrome instance, backed by a Service whose driver path
    comes from ChromeDriverManager().install().
    '''
    headless_options = Options()
    headless_options.add_argument("--headless")
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=headless_options,
    )

def url_inxight(molecule):
    '''
    Build the Inxight Drugs search URL for one molecule.

    Parameters
    ----------
    molecule : sequence
        Indexable pair whose item 0 is the molecule name and item 1 its CAS
        number, e.g. ('2 PROPANOL', '67-63-0').

    Returns
    -------
    str
        Search URL whose query combines the CAS code and the name with AND.
    '''
    # `import urllib` alone does not guarantee that the `urllib.parse`
    # submodule is loaded, so import the needed function explicitly.
    from urllib.parse import quote

    name = molecule[0]
    cas = molecule[1]
    # Dashes in the CAS number are swapped for URL-encoded spaces (%20).
    cas_formatted = cas.replace('-', '%20')
    # Wrap the name in ^...$ anchors (presumably to request an exact-name
    # match - confirm against the Inxight search syntax) and percent-encode it.
    name_encoded = quote(f'^{name}$')
    return (
        'https://drugs.ncats.io/substances'
        f'?q=(root_codes_CAS:"{cas_formatted}")'
        f'%20AND%20(root_names_name:"{name_encoded}")'
    )

In [8]:
# Spot-check the URL builder on the first (name, CAS) pair.
test = molecules[0]
url_inxight(test)

'https://drugs.ncats.io/substances?q=(root_codes_CAS:"623%2084%207")%20AND%20(root_names_name:"%5E1%2C2%20PROPANEDIOL%20DIACETATE%24")'

In [None]:
def get_inxight_url(molecule, driver):
    '''
    Look up the top Inxight search result for one molecule.

    Parameters
    ----------
    molecule : sequence
        (name, CAS) pair, as accepted by url_inxight.
    driver : selenium webdriver
        An initialised webdriver used to load the search page.

    Returns
    -------
    tuple
        (cas, name, url), where url is the href of the first result card, or
        "N/A" when no card became visible within the 3-second wait.
    '''
    name, cas = molecule[0], molecule[1]
    url = "N/A"  # Default in case of failure

    driver.get(url_inxight(molecule))
    try:
        card_locator = (By.CSS_SELECTOR, 'a[id="card-title"]')
        WebDriverWait(driver, 3).until(EC.visibility_of_element_located(card_locator))
        elements = driver.find_elements(*card_locator)
        if elements:
            href = elements[0].get_attribute('href')
            # Keep the "N/A" default if the card carries no href.
            if href:
                url = href
    except (NoSuchElementException, TimeoutException):
        # Best effort: a search with no visible result keeps the default.
        pass

    return cas, name, url

def get_best_urls(mol_arr):
    '''
    Resolve the top Inxight result for every molecule with one shared driver.

    Parameters
    ----------
    mol_arr : iterable
        (name, CAS) pairs.

    Returns
    -------
    list
        One (cas, url) tuple per input molecule, in input order.
    '''
    best_matches = []
    driver = setup_webdriver()
    try:
        for molecule in mol_arr:
            cas, _name, url = get_inxight_url(molecule, driver)
            best_matches.append((cas, url))
    finally:
        # Always release the browser, even if a lookup raised.
        driver.quit()

    return best_matches

In [None]:
import requests

def get_additional_data(id, timeout=10):
    '''
    Fetch the "@additional" record of one substance from the Inxight API.

    Parameters
    ----------
    id : str
        Substance identifier placed inside `substances(...)` in the API path.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    The decoded JSON body on HTTP 200, otherwise None. None is also returned
    when the request itself fails or the body is not valid JSON.
    '''
    try:
        # requests applies no timeout by default; without one a stalled
        # connection would block the notebook indefinitely.
        response = requests.get(
            f'https://drugs.ncats.io/api/v1/substances({id})/@additional',
            timeout=timeout,
        )
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.RequestException, ValueError):
        # Same "no data" signal as a non-200 status.
        return None

In [None]:
def extract_conditions_and_phases(data):
    '''
    Collect condition labels and highest phases from a list of entries.

    Parameters
    ----------
    data : iterable of dict, or None
        Entries with a 'name' key and, for conditions, a dict under 'value'.
        None (no data available) is treated as an empty list.

    Returns
    -------
    tuple of (list, list)
        (conditions_list, highest_approval_list). The two lists are filled
        independently, so they can differ in length when an entry has a
        label but no 'highestPhase' (or the reverse).
    '''
    conditions_list = []
    highest_approval_list = []

    for entry in data or []:
        # Skip anything that is not a well-formed 'Conditions' entry instead
        # of raising on a missing key or a non-dict value.
        if not isinstance(entry, dict) or entry.get('name') != 'Conditions':
            continue
        condition_info = entry.get('value')
        if not isinstance(condition_info, dict):
            continue

        # Extract the condition name
        label = condition_info.get('label')
        if label:
            conditions_list.append(label)

        # Extract the highest phase of approval
        highest_phase = condition_info.get('highestPhase')
        if highest_phase:
            highest_approval_list.append(highest_phase)

    return conditions_list, highest_approval_list

In [None]:
def extract_event_details(data):
    '''
    Pull the development/approval event summaries out of a list of entries.

    Parameters
    ----------
    data : iterable of dict, or None
        Entries with a 'name' key and a dict under 'value'. None (no data
        available) is treated as an empty list.

    Returns
    -------
    dict
        Maps 'Highest Development Event' and/or 'Earliest Approved Event' to
        a dict with the keys 'Status and Year', 'Source ID' and 'Source URL'.
        Only entries whose value holds both 'status' and 'sourceID' are kept;
        a later entry with the same name overwrites an earlier one.
    '''
    event_details = {}
    wanted_events = ('Highest Development Event', 'Earliest Approved Event')

    for item in data or []:
        # Skip malformed entries instead of raising on a missing 'name'.
        if not isinstance(item, dict) or item.get('name') not in wanted_events:
            continue
        details = item.get('value')
        if not isinstance(details, dict):  # Ensure 'value' is a dictionary
            continue
        if 'status' not in details or 'sourceID' not in details:
            continue

        # Use the 'name' of the event as the key in the dictionary
        event_details[item['name']] = {
            'Status and Year': f"{details['status']} {details.get('year', '')}",
            'Source ID': details['sourceID'],
            'Source URL': details.get('sourceURL', 'No Source URL'),
        }

    return event_details

# def extract_event_details(data):
#     event_details = {}
#     if data is None:  # Check if data is None before iterating
#         return event_details

#     for item in data:
#         if 'value' in item and isinstance(item['value'], dict):
#             details = item['value']
#             if 'status' in details and 'sourceID' in details:
#                 if item['name'] in ['Highest Development Event', 'Earliest Approved Event']:
#                     detail_info = {
#                         'Status and Year': f"{details['status']} {details.get('year', '')}",
#                         'Source ID': details.get('sourceID', 'No Source ID'),
#                         'Source URL': details.get('sourceURL', 'No Source URL')
#                     }
#                     event_details[item['name']] = detail_info
#     return event_details

In [None]:
# `cas_arr` is not defined anywhere above; the scraper consumes the
# (name, CAS) pairs stored in `molecules`.
links = get_best_urls(molecules)
links

In [None]:
# url_inxight indexes its argument as (name, CAS); a bare CAS string would be
# read character by character (name '9', CAS '0').
# NOTE(review): the name below is assumed from the CAS number - confirm.
url_inxight(("HEPARIN", "9005-49-6"))

In [None]:
# Take the last "/"-separated segment of each result URL.
identifiers = [link.rsplit('/', 1)[-1] for _cas, link in links]
identifiers

In [None]:
# Entries equal to 'A' (the last "/"-separated segment of an "N/A" URL) are
# relabelled 'MISSING'.
identifiers = ['MISSING' if ident == 'A' else ident for ident in identifiers]

In [None]:
def extract_events(identifiers):
    '''
    Fetch and summarise the event details for each identifier.

    Parameters
    ----------
    identifiers : iterable
        Substance identifiers passed one by one to get_additional_data.

    Returns
    -------
    list of dict
        One extract_event_details result per identifier, in input order. An
        identifier with no data contributes an empty dict, so the list stays
        the same length as the input.
    '''
    events = []

    for identifier in identifiers:
        data = get_additional_data(identifier)
        # Extract once and reuse the result for both the log line and the
        # list; a failed lookup (None) becomes an empty dict.
        details = extract_event_details(data) if data is not None else {}
        print(details)
        events.append(details)

    return events
        
# One event-details dict per entry of `identifiers`.
events = extract_events(identifiers)

# def extract_events(identifiers):
#     events = []
#     for identifier in identifiers:
#         data = get_additional_data(identifier)
#         if data is not None:  # Check if data is None
#             event_details = extract_event_details(data)
#             events.append(event_details)
#         else:
#             print(f"No data available for identifier {identifier}")  # Or handle it differently
#     return events

In [None]:
def extract_conditions(identifiers):
    '''
    Fetch the conditions and highest phases for each identifier.

    Parameters
    ----------
    identifiers : iterable
        Substance identifiers passed one by one to get_additional_data.

    Returns
    -------
    list of tuple
        One (conditions_list, highest_approval_list) pair per identifier, in
        input order. An identifier with no data contributes ([], []), so the
        list stays the same length as the input.
    '''
    conditions = []

    for identifier in identifiers:
        data = get_additional_data(identifier)
        if data is None:
            # Failed lookup: keep the row, with nothing in it.
            conditions.append(([], []))
        else:
            conditions.append(extract_conditions_and_phases(data))

    return conditions
        
# One (conditions, phases) pair per entry of `identifiers`.
conditions_arr = extract_conditions(identifiers)

In [None]:
# Inspect the extracted event details.
events

In [None]:
# Inspect the extracted (conditions, phases) pairs.
conditions_arr

In [None]:
# Flatten each event dict into one record keyed "<event name> <field>".
data = [
    {
        f"{event_name} {field}": field_value
        for event_name, fields in event.items()
        for field, field_value in fields.items()
    }
    for event in events
]

df = pd.DataFrame(data)

In [None]:
# Inspect the flattened event table.
df

In [None]:
# One row per (conditions, phases) pair; each cell starts out as a list.
rows = [
    {'conditions': conditions, 'phases': phases}
    for conditions, phases in conditions_arr
]

conditions_df = pd.DataFrame(rows)
# Collapse each list into a single "; "-separated string.
conditions_df['conditions'] = conditions_df['conditions'].map('; '.join)
conditions_df['phases'] = conditions_df['phases'].map('; '.join)
conditions_df

In [None]:
# Place the event columns and the condition columns side by side.
merged = pd.concat([df, conditions_df], axis="columns")
merged

In [None]:
# Tabulate the scraped pairs from `links` under the headers CAS and URL.
links_df = pd.DataFrame(links, columns=['CAS', 'URL'])
links_df

In [None]:
# Final table: the CAS/URL columns followed by the event and condition columns.
result = pd.concat([links_df, merged], axis="columns")
result

In [None]:
# Write the combined table to final.csv.
# NOTE(review): DataFrame.to_csv returns None when given a path, so
# `sample_output` is None after this line - confirm the name is not relied on.
sample_output = result.to_csv("final.csv")