# Libraries

In [1]:
import selenium 
import csv
import re
import pandas as pd
import ast
import numpy as np
import math

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

# Parser
- Convert original non-queryable names to queryable.

In [2]:
molecules_df = pd.read_csv('task_1_no_pub.csv')
molecules_arr = molecules_df['preferred'].tolist()
common_arr = molecules_df['common'].tolist()

common_arr = [ast.literal_eval(s) for s in common_arr]

pattern = re.compile(r'(\d)-(\d)')
pattern2 = re.compile(r'_')
pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# List to store updated molecule names
updated_molecules_arr = []
acc = 0

for molecule in molecules_arr:
    if isinstance(molecule, float) and (np.isnan(molecule) or math.isnan(molecule)):
        if common_arr[acc] == []:
            molecule_str = 'missing'
        else:
            molecule_str = common_arr[acc][0]
    
    else:
        molecule_str = molecule
    
    updated_molecule = pattern.sub(r'\1,\2', molecule_str)
    updated_molecule = pattern2.sub(r' ', updated_molecule)
    updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
    updated_molecules_arr.append(updated_molecule)
    
    acc += 1

updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

Unnamed: 0,updated_molecule
0,RADIUM RA-223 CATION
1,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE
2,DROPERIDOL LACTATE
3,FISH OIL
4,OMEGA-3 FATTY ACIDS
...,...
1127,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...
1128,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...
1129,NINTEDANIB ESYLATE HEMIHYDRATE
1130,INDIUM IN-111 IBRITUMOMAB TIUXETAN


# PubChem Query Result Scraper

In [3]:
def get_pubchem_url(chemical):
    return f'https://pubchem.ncbi.nlm.nih.gov/#query={chemical}'

def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def get_best_match(molecule, driver):
    url = "N/A"  # Default in case of failure
    result_type = "Not Found"  # Default result type

    driver.get(get_pubchem_url(molecule))
    try:
        WebDriverWait(driver, 6).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')))
        elements = driver.find_elements(By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')
        if elements:
            element = elements[0]
            url = element.get_attribute('href')
            action = element.get_attribute('data-action')
            if action == 'featured-result-link':
                result_type = 'FEATURED'
            else:
                result_type = 'RELEVANT'
    except (NoSuchElementException, TimeoutException):
        pass  # URL remains "N/A" and result_type remains "Not Found" if no match is found or if there's a timeout

    return molecule, url, result_type

def get_best_matches(parsed_molecules):
    best_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once
    try:
        for parsed_molecule in parsed_molecules:
            molecule, url, result_type = get_best_match(parsed_molecule, driver)
            best_matches.append((molecule, url, result_type))
    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return best_matches

In [4]:
best_matches = get_best_matches(updated_molecules_arr)

In [5]:
df = pd.DataFrame(best_matches, columns=['Molecule', 'Link', 'Result Type'])

In [6]:
df

Unnamed: 0,Molecule,Link,Result Type
0,RADIUM RA-223 CATION,https://pubchem.ncbi.nlm.nih.gov/compound/Radi...,FEATURED
1,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,https://pubchem.ncbi.nlm.nih.gov/compound/Feru...,FEATURED
2,DROPERIDOL LACTATE,https://pubchem.ncbi.nlm.nih.gov/compound/9956314,FEATURED
3,FISH OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Fish...,FEATURED
4,OMEGA-3 FATTY ACIDS,https://pubchem.ncbi.nlm.nih.gov/compound/Omeg...,FEATURED
...,...,...,...
1127,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT
1128,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/347...,RELEVANT
1129,NINTEDANIB ESYLATE HEMIHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/1626...,FEATURED
1130,INDIUM IN-111 IBRITUMOMAB TIUXETAN,https://pubchem.ncbi.nlm.nih.gov/substance/472...,RELEVANT


In [7]:
# df_merged = pd.concat([molecules_df, df], ignore_index=True, sort=False, axis=1)
df_merged = df
df_merged.columns = ['Parsed', 'Link', 'Result Type']
df_merged

Unnamed: 0,Parsed,Link,Result Type
0,RADIUM RA-223 CATION,https://pubchem.ncbi.nlm.nih.gov/compound/Radi...,FEATURED
1,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,https://pubchem.ncbi.nlm.nih.gov/compound/Feru...,FEATURED
2,DROPERIDOL LACTATE,https://pubchem.ncbi.nlm.nih.gov/compound/9956314,FEATURED
3,FISH OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Fish...,FEATURED
4,OMEGA-3 FATTY ACIDS,https://pubchem.ncbi.nlm.nih.gov/compound/Omeg...,FEATURED
...,...,...,...
1127,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT
1128,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/347...,RELEVANT
1129,NINTEDANIB ESYLATE HEMIHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/1626...,FEATURED
1130,INDIUM IN-111 IBRITUMOMAB TIUXETAN,https://pubchem.ncbi.nlm.nih.gov/substance/472...,RELEVANT


In [8]:
df_merged_csv = df_merged.to_csv('FINAL_MOLECULE_LINKS.csv', index = False) 

In [9]:
links_arr = df_merged["Link"].tolist()
links_arr

['https://pubchem.ncbi.nlm.nih.gov/compound/Radium%20RA-223%20cation',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Ferumoxytol%20Non-Stoichiometric%20Magnetite',
 'https://pubchem.ncbi.nlm.nih.gov/compound/9956314',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Fish%20Oil',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Omega-3%20Fatty%20Acids',
 'https://pubchem.ncbi.nlm.nih.gov/substance/483933599',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Interferon%20Alfa-N3',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Hepatitis%20B%20Virus%20HBsAg%20Surface%20Protein%20Antigen',
 'https://pubchem.ncbi.nlm.nih.gov/substance/481101625',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Anti-Inhibitor%20Coagulant%20Complex',
 'https://pubchem.ncbi.nlm.nih.gov/substance/472385917',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Estrogens,%20Esterified',
 'https://pubchem.ncbi.nlm.nih.gov/compound/23725082',
 'https://pubchem.ncbi.nlm.nih.gov/compound/702',
 'https://pubchem.ncbi.nlm.nih.gov/compound/Acacia%20Po

# PubChem CAS Scraper

In [10]:
def get_cas_numbers(link, driver):
    cas_numbers = {"CAS": "N/A", "Deprecated CAS": "N/A"}  # Default in case of failure

    try:
        driver.get(link)
        # Wait and locate the CAS number section
        try:
            cas_elements = WebDriverWait(driver, 10).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#CAS div.break-words')))
            cas_numbers["CAS"] = ', '.join([el.text.strip() for el in cas_elements])
        except (NoSuchElementException, TimeoutException):
            cas_numbers["CAS"] = "N/A"  # CAS number not found

        # Wait and locate the Deprecated CAS number section, if present
        try:
            deprecated_cas_elements = driver.find_elements(By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')
            # deprecated_cas_elements = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')))
            cas_numbers["Deprecated CAS"] = ', '.join([el.text.strip() for el in deprecated_cas_elements])
            
        except (NoSuchElementException, TimeoutException):
            cas_numbers["Deprecated CAS"] = "N/A"  # Deprecated CAS number not found
            
    except Exception as e:
        print(f"Error retrieving CAS numbers for {link}: {e}")

    return cas_numbers

def get_cas_numbers_concurrently(links):
    cas_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once

    try:
        for link in links:
            cas_matches.append(get_cas_numbers(link, driver))

    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return cas_matches      

In [11]:
cas_numbers = get_cas_numbers_concurrently(links_arr)

Error retrieving CAS numbers for N/A: Message: invalid argument
  (Session info: chrome-headless-shell=125.0.6422.142)
Stacktrace:
0   chromedriver                        0x000000010b9596b8 chromedriver + 6149816
1   chromedriver                        0x000000010b950cda chromedriver + 6114522
2   chromedriver                        0x000000010b3ddb91 chromedriver + 400273
3   chromedriver                        0x000000010b3c4098 chromedriver + 295064
4   chromedriver                        0x000000010b3c2c4f chromedriver + 289871
5   chromedriver                        0x000000010b3c2f3a chromedriver + 290618
6   chromedriver                        0x000000010b3e07b7 chromedriver + 411575
7   chromedriver                        0x000000010b46dda5 chromedriver + 990629
8   chromedriver                        0x000000010b44dcb2 chromedriver + 859314
9   chromedriver                        0x000000010b46d0db chromedriver + 987355
10  chromedriver                        0x000000010b44da5

In [12]:
df_cas = pd.DataFrame(cas_numbers, columns=['CAS', 'Deprecated CAS'])
df_cas

Unnamed: 0,CAS,Deprecated CAS
0,,
1,,
2,,
3,8016-13-5,
4,329042-31-1,
...,...,...
1127,,
1128,,
1129,959762-24-4,
1130,,


# Combining Molecule Links Dataframe with CAS Dataframe

In [13]:
final_df = pd.concat([df_merged, df_cas], ignore_index=True, sort=False, axis=1)

In [14]:
final_df

Unnamed: 0,0,1,2,3,4
0,RADIUM RA-223 CATION,https://pubchem.ncbi.nlm.nih.gov/compound/Radi...,FEATURED,,
1,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,https://pubchem.ncbi.nlm.nih.gov/compound/Feru...,FEATURED,,
2,DROPERIDOL LACTATE,https://pubchem.ncbi.nlm.nih.gov/compound/9956314,FEATURED,,
3,FISH OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Fish...,FEATURED,8016-13-5,
4,OMEGA-3 FATTY ACIDS,https://pubchem.ncbi.nlm.nih.gov/compound/Omeg...,FEATURED,329042-31-1,
...,...,...,...,...,...
1127,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT,,
1128,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/347...,RELEVANT,,
1129,NINTEDANIB ESYLATE HEMIHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/1626...,FEATURED,959762-24-4,
1130,INDIUM IN-111 IBRITUMOMAB TIUXETAN,https://pubchem.ncbi.nlm.nih.gov/substance/472...,RELEVANT,,


In [15]:
final_df = pd.concat([molecules_df, final_df], ignore_index=True, sort=False, axis=1)
final_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
0,https://drugs.ncats.io/drug/9H414A99MD,"['RADIUM RA-223 CATION', 'RADIUM, ISOTOPE OF M...",RADIUM RA-223 CATION,[],,,,,Algeta,ACHIRAL,...,"['Prostatic Neoplasms, Castration-Resistant', ...","['CA2875918A1', 'EP2836837A2', 'WO2008092954A2']",['https://www.ncbi.nlm.nih.gov/pubmed/18368304...,"['18368304', '23000088', '23653243']",,RADIUM RA-223 CATION,https://pubchem.ncbi.nlm.nih.gov/compound/Radi...,FEATURED,,
1,https://drugs.ncats.io/drug/CLH5FT6412,['FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE'],FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,[],,,,,,ACHIRAL,...,[],[],[],[],,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,https://pubchem.ncbi.nlm.nih.gov/compound/Feru...,FEATURED,,
2,https://drugs.ncats.io/drug/09NO5N37E0,"['DROPERIDOL LACTATE', 'DROPERIDOL LACTATE [WH...",DROPERIDOL LACTATE,[],,,,,Janssen,RACEMIC,...,['Postoperative Nausea and Vomiting'],['JP2002511777A'],['https://www.ncbi.nlm.nih.gov/pubmed/1167743'...,"['1167743', '1147302', '10411778', '11075569',...",,DROPERIDOL LACTATE,https://pubchem.ncbi.nlm.nih.gov/compound/9956314,FEATURED,,
3,https://drugs.ncats.io/drug/XGF7L72M0F,"['FISH OIL [WHO-DD]', 'FISH OIL [INCI]', 'FISH...",FISH OIL,['OMEGAVEN'],,,C10AX06,,,,...,[],[],[],[],D005395,FISH OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Fish...,FEATURED,8016-13-5,
4,https://drugs.ncats.io/drug/71M78END5S,"['OMEGA-3 FATTY ACIDS', 'OMEGA-3 POLYUNSATURAT...",OMEGA-3 FATTY ACIDS,[],,,,,,,...,[],[],[],[],D010743,OMEGA-3 FATTY ACIDS,https://pubchem.ncbi.nlm.nih.gov/compound/Omeg...,FEATURED,329042-31-1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,https://drugs.ncats.io/drug/CX39D2R810,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT,,
1128,https://drugs.ncats.io/drug/5I5HVC73I8,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/347...,RELEVANT,,
1129,https://drugs.ncats.io/drug/23O68KB9KV,['NINTEDANIB ESYLATE HEMIHYDRATE'],NINTEDANIB ESYLATE HEMIHYDRATE,[],,,,,Boehringer Ingelheim,,...,['Idiopathic Pulmonary Fibrosis'],"['20040176392', '7119093', 'JP2006524634A', 'J...",['https://www.ncbi.nlm.nih.gov/pubmed/21204634...,"['21204634', '26063212', '25862013']",,NINTEDANIB ESYLATE HEMIHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/1626...,FEATURED,959762-24-4,
1130,https://drugs.ncats.io/drug/S9U4ZR2W8V,"['INDIUM IN-111 IBRITUMOMAB TIUXETAN', 'IBRITU...",INDIUM IN-111 IBRITUMOMAB TIUXETAN,"['ZEVALIN INDIUM IN-111', '111INDIUM-ZEVALIN']",,,,,,,...,['Radiolabeled Immunoscintigraphy'],[],[],[],,INDIUM IN-111 IBRITUMOMAB TIUXETAN,https://pubchem.ncbi.nlm.nih.gov/substance/472...,RELEVANT,,


In [16]:
final_df.columns=['url', 'common', 'preferred', 'brand', 'inn', 'pubchem', 'who_atc', 'cas', 'originator', 'stereochemistry', 'formula', 'mw', 'investigational', 'us_market', 'us_market_year', 'us_prev_market', 'first_year_approval', 'fda_links', 'target_list', 'condition_list', 'patent_list', 'pubmed', 'pmid', 'mesh', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS']
# final_df.to_csv('margaret_missing_cas._identified.csv', index=False)

# Flagging Compound Types

In [17]:
final_df['Compound/Substance'] = final_df['Link'].apply(lambda x: 'COMPOUND' if 'compound' in str(x) else ('SUBSTANCE' if 'substance' in str(x) else 'N/A'))

In [18]:
final_df

Unnamed: 0,url,common,preferred,brand,inn,pubchem,who_atc,cas,originator,stereochemistry,...,patent_list,pubmed,pmid,mesh,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,https://drugs.ncats.io/drug/9H414A99MD,"['RADIUM RA-223 CATION', 'RADIUM, ISOTOPE OF M...",RADIUM RA-223 CATION,[],,,,,Algeta,ACHIRAL,...,"['CA2875918A1', 'EP2836837A2', 'WO2008092954A2']",['https://www.ncbi.nlm.nih.gov/pubmed/18368304...,"['18368304', '23000088', '23653243']",,RADIUM RA-223 CATION,https://pubchem.ncbi.nlm.nih.gov/compound/Radi...,FEATURED,,,COMPOUND
1,https://drugs.ncats.io/drug/CLH5FT6412,['FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE'],FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,[],,,,,,ACHIRAL,...,[],[],[],,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,https://pubchem.ncbi.nlm.nih.gov/compound/Feru...,FEATURED,,,COMPOUND
2,https://drugs.ncats.io/drug/09NO5N37E0,"['DROPERIDOL LACTATE', 'DROPERIDOL LACTATE [WH...",DROPERIDOL LACTATE,[],,,,,Janssen,RACEMIC,...,['JP2002511777A'],['https://www.ncbi.nlm.nih.gov/pubmed/1167743'...,"['1167743', '1147302', '10411778', '11075569',...",,DROPERIDOL LACTATE,https://pubchem.ncbi.nlm.nih.gov/compound/9956314,FEATURED,,,COMPOUND
3,https://drugs.ncats.io/drug/XGF7L72M0F,"['FISH OIL [WHO-DD]', 'FISH OIL [INCI]', 'FISH...",FISH OIL,['OMEGAVEN'],,,C10AX06,,,,...,[],[],[],D005395,FISH OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Fish...,FEATURED,8016-13-5,,COMPOUND
4,https://drugs.ncats.io/drug/71M78END5S,"['OMEGA-3 FATTY ACIDS', 'OMEGA-3 POLYUNSATURAT...",OMEGA-3 FATTY ACIDS,[],,,,,,,...,[],[],[],D010743,OMEGA-3 FATTY ACIDS,https://pubchem.ncbi.nlm.nih.gov/compound/Omeg...,FEATURED,329042-31-1,,COMPOUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,https://drugs.ncats.io/drug/CX39D2R810,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT,,,SUBSTANCE
1128,https://drugs.ncats.io/drug/5I5HVC73I8,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/347...,RELEVANT,,,SUBSTANCE
1129,https://drugs.ncats.io/drug/23O68KB9KV,['NINTEDANIB ESYLATE HEMIHYDRATE'],NINTEDANIB ESYLATE HEMIHYDRATE,[],,,,,Boehringer Ingelheim,,...,"['20040176392', '7119093', 'JP2006524634A', 'J...",['https://www.ncbi.nlm.nih.gov/pubmed/21204634...,"['21204634', '26063212', '25862013']",,NINTEDANIB ESYLATE HEMIHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/1626...,FEATURED,959762-24-4,,COMPOUND
1130,https://drugs.ncats.io/drug/S9U4ZR2W8V,"['INDIUM IN-111 IBRITUMOMAB TIUXETAN', 'IBRITU...",INDIUM IN-111 IBRITUMOMAB TIUXETAN,"['ZEVALIN INDIUM IN-111', '111INDIUM-ZEVALIN']",,,,,,,...,[],[],[],,INDIUM IN-111 IBRITUMOMAB TIUXETAN,https://pubchem.ncbi.nlm.nih.gov/substance/472...,RELEVANT,,,SUBSTANCE


In [19]:
# final_df['Original Molecule'] = final_df['Parsed Molecule']
# final_df = final_df[['Original Molecule', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS', 'Compound/Substance']]
final_df

Unnamed: 0,url,common,preferred,brand,inn,pubchem,who_atc,cas,originator,stereochemistry,...,patent_list,pubmed,pmid,mesh,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,https://drugs.ncats.io/drug/9H414A99MD,"['RADIUM RA-223 CATION', 'RADIUM, ISOTOPE OF M...",RADIUM RA-223 CATION,[],,,,,Algeta,ACHIRAL,...,"['CA2875918A1', 'EP2836837A2', 'WO2008092954A2']",['https://www.ncbi.nlm.nih.gov/pubmed/18368304...,"['18368304', '23000088', '23653243']",,RADIUM RA-223 CATION,https://pubchem.ncbi.nlm.nih.gov/compound/Radi...,FEATURED,,,COMPOUND
1,https://drugs.ncats.io/drug/CLH5FT6412,['FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE'],FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,[],,,,,,ACHIRAL,...,[],[],[],,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,https://pubchem.ncbi.nlm.nih.gov/compound/Feru...,FEATURED,,,COMPOUND
2,https://drugs.ncats.io/drug/09NO5N37E0,"['DROPERIDOL LACTATE', 'DROPERIDOL LACTATE [WH...",DROPERIDOL LACTATE,[],,,,,Janssen,RACEMIC,...,['JP2002511777A'],['https://www.ncbi.nlm.nih.gov/pubmed/1167743'...,"['1167743', '1147302', '10411778', '11075569',...",,DROPERIDOL LACTATE,https://pubchem.ncbi.nlm.nih.gov/compound/9956314,FEATURED,,,COMPOUND
3,https://drugs.ncats.io/drug/XGF7L72M0F,"['FISH OIL [WHO-DD]', 'FISH OIL [INCI]', 'FISH...",FISH OIL,['OMEGAVEN'],,,C10AX06,,,,...,[],[],[],D005395,FISH OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Fish...,FEATURED,8016-13-5,,COMPOUND
4,https://drugs.ncats.io/drug/71M78END5S,"['OMEGA-3 FATTY ACIDS', 'OMEGA-3 POLYUNSATURAT...",OMEGA-3 FATTY ACIDS,[],,,,,,,...,[],[],[],D010743,OMEGA-3 FATTY ACIDS,https://pubchem.ncbi.nlm.nih.gov/compound/Omeg...,FEATURED,329042-31-1,,COMPOUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,https://drugs.ncats.io/drug/CX39D2R810,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT,,,SUBSTANCE
1128,https://drugs.ncats.io/drug/5I5HVC73I8,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/347...,RELEVANT,,,SUBSTANCE
1129,https://drugs.ncats.io/drug/23O68KB9KV,['NINTEDANIB ESYLATE HEMIHYDRATE'],NINTEDANIB ESYLATE HEMIHYDRATE,[],,,,,Boehringer Ingelheim,,...,"['20040176392', '7119093', 'JP2006524634A', 'J...",['https://www.ncbi.nlm.nih.gov/pubmed/21204634...,"['21204634', '26063212', '25862013']",,NINTEDANIB ESYLATE HEMIHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/1626...,FEATURED,959762-24-4,,COMPOUND
1130,https://drugs.ncats.io/drug/S9U4ZR2W8V,"['INDIUM IN-111 IBRITUMOMAB TIUXETAN', 'IBRITU...",INDIUM IN-111 IBRITUMOMAB TIUXETAN,"['ZEVALIN INDIUM IN-111', '111INDIUM-ZEVALIN']",,,,,,,...,[],[],[],,INDIUM IN-111 IBRITUMOMAB TIUXETAN,https://pubchem.ncbi.nlm.nih.gov/substance/472...,RELEVANT,,,SUBSTANCE


# Export Dataframe as CSV

In [20]:
# final_df_csv = final_df.to_csv('pubchem_longer_wait.csv', index = False) 
final_df.to_csv('task_1_no_pub_identified.csv', index=False)

In [21]:
final_df

Unnamed: 0,url,common,preferred,brand,inn,pubchem,who_atc,cas,originator,stereochemistry,...,patent_list,pubmed,pmid,mesh,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,https://drugs.ncats.io/drug/9H414A99MD,"['RADIUM RA-223 CATION', 'RADIUM, ISOTOPE OF M...",RADIUM RA-223 CATION,[],,,,,Algeta,ACHIRAL,...,"['CA2875918A1', 'EP2836837A2', 'WO2008092954A2']",['https://www.ncbi.nlm.nih.gov/pubmed/18368304...,"['18368304', '23000088', '23653243']",,RADIUM RA-223 CATION,https://pubchem.ncbi.nlm.nih.gov/compound/Radi...,FEATURED,,,COMPOUND
1,https://drugs.ncats.io/drug/CLH5FT6412,['FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE'],FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,[],,,,,,ACHIRAL,...,[],[],[],,FERUMOXYTOL NON-STOICHIOMETRIC MAGNETITE,https://pubchem.ncbi.nlm.nih.gov/compound/Feru...,FEATURED,,,COMPOUND
2,https://drugs.ncats.io/drug/09NO5N37E0,"['DROPERIDOL LACTATE', 'DROPERIDOL LACTATE [WH...",DROPERIDOL LACTATE,[],,,,,Janssen,RACEMIC,...,['JP2002511777A'],['https://www.ncbi.nlm.nih.gov/pubmed/1167743'...,"['1167743', '1147302', '10411778', '11075569',...",,DROPERIDOL LACTATE,https://pubchem.ncbi.nlm.nih.gov/compound/9956314,FEATURED,,,COMPOUND
3,https://drugs.ncats.io/drug/XGF7L72M0F,"['FISH OIL [WHO-DD]', 'FISH OIL [INCI]', 'FISH...",FISH OIL,['OMEGAVEN'],,,C10AX06,,,,...,[],[],[],D005395,FISH OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Fish...,FEATURED,8016-13-5,,COMPOUND
4,https://drugs.ncats.io/drug/71M78END5S,"['OMEGA-3 FATTY ACIDS', 'OMEGA-3 POLYUNSATURAT...",OMEGA-3 FATTY ACIDS,[],,,,,,,...,[],[],[],D010743,OMEGA-3 FATTY ACIDS,https://pubchem.ncbi.nlm.nih.gov/compound/Omeg...,FEATURED,329042-31-1,,COMPOUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,https://drugs.ncats.io/drug/CX39D2R810,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/483...,RELEVANT,,,SUBSTANCE
1128,https://drugs.ncats.io/drug/5I5HVC73I8,['INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)...,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,[],,,,,,,...,[],[],[],,INFLUENZA A VIRUS A/CALIFORNIA/7/2009 (H1N1)-L...,https://pubchem.ncbi.nlm.nih.gov/substance/347...,RELEVANT,,,SUBSTANCE
1129,https://drugs.ncats.io/drug/23O68KB9KV,['NINTEDANIB ESYLATE HEMIHYDRATE'],NINTEDANIB ESYLATE HEMIHYDRATE,[],,,,,Boehringer Ingelheim,,...,"['20040176392', '7119093', 'JP2006524634A', 'J...",['https://www.ncbi.nlm.nih.gov/pubmed/21204634...,"['21204634', '26063212', '25862013']",,NINTEDANIB ESYLATE HEMIHYDRATE,https://pubchem.ncbi.nlm.nih.gov/compound/1626...,FEATURED,959762-24-4,,COMPOUND
1130,https://drugs.ncats.io/drug/S9U4ZR2W8V,"['INDIUM IN-111 IBRITUMOMAB TIUXETAN', 'IBRITU...",INDIUM IN-111 IBRITUMOMAB TIUXETAN,"['ZEVALIN INDIUM IN-111', '111INDIUM-ZEVALIN']",,,,,,,...,[],[],[],,INDIUM IN-111 IBRITUMOMAB TIUXETAN,https://pubchem.ncbi.nlm.nih.gov/substance/472...,RELEVANT,,,SUBSTANCE
