# Libraries

In [1]:
import selenium 
import csv
import re
import pandas as pd
import ast
import numpy as np
import math

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options

import time

# Parser
- Convert original non-queryable names to queryable.

In [2]:
molecules_df = pd.read_csv('unmatched_valid_margaret.csv')
molecules_df = molecules_df[molecules_df['pubchem'].notna()]
molecules_arr = molecules_df['preferred'].tolist()
common_arr = molecules_df['common'].tolist()

common_arr = [ast.literal_eval(s) for s in common_arr]

pattern = re.compile(r'(\d)-(\d)')
pattern2 = re.compile(r'_')
pattern3 = re.compile(r'(\d)-([a-zA-Z])')

# List to store updated molecule names
updated_molecules_arr = []
acc = 0

for molecule in molecules_arr:
    if isinstance(molecule, float) and (np.isnan(molecule) or math.isnan(molecule)):
        if common_arr[acc] == []:
            molecule_str = 'missing'
        else:
            molecule_str = common_arr[acc][0]
    
    else:
        molecule_str = molecule
    
    updated_molecule = pattern.sub(r'\1,\2', molecule_str)
    updated_molecule = pattern2.sub(r' ', updated_molecule)
    updated_molecule = pattern3.sub(r'\1 \2', updated_molecule)
    updated_molecules_arr.append(updated_molecule)
    
    acc += 1

updated_molecules_df = pd.DataFrame(updated_molecules_arr, columns=['updated_molecule'])
updated_molecules_df

Unnamed: 0,updated_molecule
0,LACTITOL
1,RIMEGEPANT
2,BEMPEDOIC ACID
3,AMISULPRIDE
4,REMIMAZOLAM
...,...
3644,CALCIUM ALGINATE
3645,POLYBENZARSOL
3646,HEXADIMETHRINE BROMIDE
3647,PEPPERMINT OIL


# PubChem Query Result Scraper

In [3]:
def get_pubchem_url(chemical):
    return f'https://pubchem.ncbi.nlm.nih.gov/#query={chemical}'
def setup_webdriver():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver

def get_best_match(molecule, driver):
    url = "N/A"  # Default in case of failure
    result_type = "Not Found"  # Default result type

    driver.get(get_pubchem_url(molecule))
    try:
        WebDriverWait(driver, 60).until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')))
        elements = driver.find_elements(By.CSS_SELECTOR, 'a[data-action="featured-result-link"], a[data-action="result-link"]')
        if elements:
            element = elements[0]
            url = element.get_attribute('href')
            action = element.get_attribute('data-action')
            if action == 'featured-result-link':
                result_type = 'FEATURED'
            else:
                result_type = 'RELEVANT'
    except (NoSuchElementException, TimeoutException):
        pass  # URL remains "N/A" and result_type remains "Not Found" if no match is found or if there's a timeout

    return molecule, url, result_type

def get_best_matches(parsed_molecules):
    best_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once
    try:
        for parsed_molecule in parsed_molecules:
            molecule, url, result_type = get_best_match(parsed_molecule, driver)
            best_matches.append((molecule, url, result_type))
    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return best_matches

In [4]:
best_matches = get_best_matches(updated_molecules_arr)

In [5]:
df = pd.DataFrame(best_matches, columns=['Molecule', 'Link', 'Result Type'])

In [6]:
df

Unnamed: 0,Molecule,Link,Result Type
0,LACTITOL,https://pubchem.ncbi.nlm.nih.gov/compound/157355,FEATURED
1,RIMEGEPANT,https://pubchem.ncbi.nlm.nih.gov/compound/5104...,FEATURED
2,BEMPEDOIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/1047...,FEATURED
3,AMISULPRIDE,https://pubchem.ncbi.nlm.nih.gov/compound/2159,FEATURED
4,REMIMAZOLAM,https://pubchem.ncbi.nlm.nih.gov/compound/9867812,FEATURED
...,...,...,...
3644,CALCIUM ALGINATE,https://pubchem.ncbi.nlm.nih.gov/compound/Calc...,FEATURED
3645,POLYBENZARSOL,https://pubchem.ncbi.nlm.nih.gov/compound/Poly...,FEATURED
3646,HEXADIMETHRINE BROMIDE,https://pubchem.ncbi.nlm.nih.gov/compound/Hexa...,FEATURED
3647,PEPPERMINT OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Pepp...,FEATURED


In [7]:
# df_merged = pd.concat([molecules_df, df], ignore_index=True, sort=False, axis=1)
df_merged = df
df_merged.columns = ['Parsed', 'Link', 'Result Type']
df_merged

Unnamed: 0,Parsed,Link,Result Type
0,LACTITOL,https://pubchem.ncbi.nlm.nih.gov/compound/157355,FEATURED
1,RIMEGEPANT,https://pubchem.ncbi.nlm.nih.gov/compound/5104...,FEATURED
2,BEMPEDOIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/1047...,FEATURED
3,AMISULPRIDE,https://pubchem.ncbi.nlm.nih.gov/compound/2159,FEATURED
4,REMIMAZOLAM,https://pubchem.ncbi.nlm.nih.gov/compound/9867812,FEATURED
...,...,...,...
3644,CALCIUM ALGINATE,https://pubchem.ncbi.nlm.nih.gov/compound/Calc...,FEATURED
3645,POLYBENZARSOL,https://pubchem.ncbi.nlm.nih.gov/compound/Poly...,FEATURED
3646,HEXADIMETHRINE BROMIDE,https://pubchem.ncbi.nlm.nih.gov/compound/Hexa...,FEATURED
3647,PEPPERMINT OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Pepp...,FEATURED


In [8]:
df_merged_csv = df_merged.to_csv('FINAL_MOLECULE_LINKS.csv', index = False) 

In [9]:
links_arr = df_merged["Link"].tolist()
links_arr

['https://pubchem.ncbi.nlm.nih.gov/compound/157355',
 'https://pubchem.ncbi.nlm.nih.gov/compound/51049968',
 'https://pubchem.ncbi.nlm.nih.gov/compound/10472693',
 'https://pubchem.ncbi.nlm.nih.gov/compound/2159',
 'https://pubchem.ncbi.nlm.nih.gov/compound/9867812',
 'https://pubchem.ncbi.nlm.nih.gov/compound/51039094',
 'https://pubchem.ncbi.nlm.nih.gov/compound/11319217',
 'https://pubchem.ncbi.nlm.nih.gov/compound/70957463',
 'https://pubchem.ncbi.nlm.nih.gov/compound/52938427',
 'https://pubchem.ncbi.nlm.nih.gov/compound/10869981',
 'https://pubchem.ncbi.nlm.nih.gov/compound/25145656',
 'https://pubchem.ncbi.nlm.nih.gov/compound/6917864',
 'https://pubchem.ncbi.nlm.nih.gov/compound/44139752',
 'https://pubchem.ncbi.nlm.nih.gov/compound/71584930',
 'https://pubchem.ncbi.nlm.nih.gov/compound/10127622',
 'https://pubchem.ncbi.nlm.nih.gov/compound/86705695',
 'https://pubchem.ncbi.nlm.nih.gov/compound/57327016',
 'https://pubchem.ncbi.nlm.nih.gov/compound/118023034',
 'https://pubchem

# PubChem CAS Scraper

In [10]:
def get_cas_numbers(link, driver):
    cas_numbers = {"CAS": "N/A", "Deprecated CAS": "N/A"}  # Default in case of failure

    try:
        driver.get(link)
        # Wait and locate the CAS number section
        try:
            cas_elements = WebDriverWait(driver, 60).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#CAS div.break-words')))
            cas_numbers["CAS"] = ', '.join([el.text.strip() for el in cas_elements])
        except (NoSuchElementException, TimeoutException):
            cas_numbers["CAS"] = "N/A"  # CAS number not found

        # Wait and locate the Deprecated CAS number section, if present
        try:
            deprecated_cas_elements = driver.find_elements(By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')
            # deprecated_cas_elements = WebDriverWait(driver, 3).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, 'section#Deprecated-CAS div.break-words')))
            cas_numbers["Deprecated CAS"] = ', '.join([el.text.strip() for el in deprecated_cas_elements])
            
        except (NoSuchElementException, TimeoutException):
            cas_numbers["Deprecated CAS"] = "N/A"  # Deprecated CAS number not found
            
    except Exception as e:
        print(f"Error retrieving CAS numbers for {link}: {e}")

    return cas_numbers

def get_cas_numbers_concurrently(links):
    cas_matches = []
    driver = setup_webdriver()  # Initialize the WebDriver once

    try:
        for link in links:
            cas_matches.append(get_cas_numbers(link, driver))

    finally:
        driver.quit()  # Make sure to quit the WebDriver

    return cas_matches      

In [11]:
cas_numbers = get_cas_numbers_concurrently(links_arr)

Error retrieving CAS numbers for N/A: Message: invalid argument
  (Session info: chrome-headless-shell=126.0.6478.126)
Stacktrace:
#0 0x5f203dc52c5a <unknown>
#1 0x5f203d935c71 <unknown>
#2 0x5f203d91c303 <unknown>
#3 0x5f203d91ab48 <unknown>
#4 0x5f203d91b1ca <unknown>
#5 0x5f203d938877 <unknown>
#6 0x5f203d9c4f25 <unknown>
#7 0x5f203d9a55c2 <unknown>
#8 0x5f203d9c4303 <unknown>
#9 0x5f203d9a5363 <unknown>
#10 0x5f203d975247 <unknown>
#11 0x5f203d975b9e <unknown>
#12 0x5f203dc1922b <unknown>
#13 0x5f203dc1d2d1 <unknown>
#14 0x5f203dc04ade <unknown>
#15 0x5f203dc1de32 <unknown>
#16 0x5f203dbe977f <unknown>
#17 0x5f203dc42618 <unknown>
#18 0x5f203dc427f0 <unknown>
#19 0x5f203dc51d8c <unknown>
#20 0x71bd4b894ac3 <unknown>

Error retrieving CAS numbers for N/A: Message: invalid argument
  (Session info: chrome-headless-shell=126.0.6478.126)
Stacktrace:
#0 0x5f203dc52c5a <unknown>
#1 0x5f203d935c71 <unknown>
#2 0x5f203d91c303 <unknown>
#3 0x5f203d91ab48 <unknown>
#4 0x5f203d91b1ca <unknown

In [12]:
df_cas = pd.DataFrame(cas_numbers, columns=['CAS', 'Deprecated CAS'])
df_cas

Unnamed: 0,CAS,Deprecated CAS
0,585-86-4,
1,"1289023-67-1, 1374024-48-2",
2,738606-46-7,
3,"71675-85-9, 53583-79-2",
4,308242-62-8,
...,...,...
3644,9005-35-0,"37228-92-5, 9019-42-5, 9019-43-6, 9060-20-2"
3645,54531-52-1,
3646,"28728-55-4, 9011-04-5","32036-84-3, 117848-85-8, 62766-74-9, 9011-04-5"
3647,8006-90-4,


# Combining Molecule Links Dataframe with CAS Dataframe

In [13]:
molecules_df

Unnamed: 0,cas,url,common,preferred,brand,inn,pubchem,who_atc,originator,stereochemistry,...,us_market_year,us_prev_market,first_year_approval,fda_links,target_list,condition_list,patent_list,pubmed,pmid,mesh
0,585-86-4,https://drugs.ncats.io/drug/L2B0WJF7ZY,"['LACTITOL ANHYDROUS', 'LACTOBIOSIT', 'MIRUHEN...",LACTITOL,['IMPORTAL'],6414.0,157355.0,A06AD12,"Senderens, J.B.",ABSOLUTE,...,['2020'],0,2020.0,['NDA211281'],[],"['Constipation', 'Hepatic Encephalopathy']","['JP2000201631A', 'JP2001511442A', 'JP20041617...",['https://www.ncbi.nlm.nih.gov/pubmed/19087388...,"['19087388', '16481971', '16498257', '16553741...",C014635
1,1289023-67-1,https://drugs.ncats.io/drug/997WVV895X,"['BHV-3000', 'RIMEGEPANT [INN]', 'RIMEGEPANT [...",RIMEGEPANT,[],9751.0,51049968.0,,Bristol-Myers Squibb,ABSOLUTE,...,['2020'],0,2020.0,['NDA212728'],['Calcitonin gene-related peptide type 1 recep...,['Migraine Disorders'],"['20110251223', '20120245356']",['https://www.ncbi.nlm.nih.gov/pubmed/26650258'],['26650258'],
2,738606-46-7,https://drugs.ncats.io/drug/1EJ6Z6Q368,"['PENTADECANEDIOIC ACID, 8-HYDROXY-2,2,14,14-T...",BEMPEDOIC ACID,[],9891.0,10472693.0,,Esperion Therapeutics,ACHIRAL,...,['2020'],0,2020.0,['NDA211617'],"['ATP-citrate synthase', 'AMPK alpha1$$alpha2']","['Hypercholesterolemia', 'Dyslipidemias']","['20050043278', '20070179120', '7335799', '781...",['https://www.ncbi.nlm.nih.gov/pubmed/1371749'],['1371749'],
3,71675-85-9,https://drugs.ncats.io/drug/8110R61I4U,"['AMISULPRIDE [EP]', 'AMISULPRIDE [WHO-DD]', '...",AMISULPRIDE,"['SULAMID', 'SOLIAN', 'DENIBAN', 'SOCIAN']",4960.0,2159.0,N05AL05,Sanofi,RACEMIC,...,['2020'],0,2020.0,['NDA209510'],"['Dopamine D3 receptor', 'Serotonin 7 (5-HT7) ...","['Schizophrenia', 'Psychotic symptoms']","['JP2001501192A', 'JP2002527464A', 'JP4178032B2']",['https://www.ncbi.nlm.nih.gov/pubmed/1354163'...,"['1354163', '11803729', '12693427', '12442883'...",
4,308242-62-8,https://drugs.ncats.io/drug/7V4A8U16MB,"['REMIMAZOLAM [INN]', 'REMIMAZOLAM [WHO-DD]']",REMIMAZOLAM,[],9232.0,9867812.0,,Glaxo Smith Kline,ABSOLUTE,...,['2020'],0,2020.0,['NDA212295'],['GABA-A receptor; benzodiazepine site'],[],[],[],[],C522201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,9005-35-0,https://drugs.ncats.io/drug/8P20S56HZI,"['ALGINATES: AMMONIUM, CALCIUM, POTASSIUM AND ...",CALCIUM ALGINATE,[],,44630049.0,B02BC08,"Moissan, H.",,...,,0,1921.0,[],[],"['Cystic Fibrosis', 'Hypocalcemia', 'Chronic O...","['20090030026', '20130174651', '3966773', '397...",['https://www.ncbi.nlm.nih.gov/pubmed/1220513'...,"['1220513', '11572466', '11513607', '11592964'...",
5012,9006-68-2,https://drugs.ncats.io/drug/8B8GHO5O27,"['POLYBENZARSOL [INN]', 'POLYBENZARSOL [MI]']",POLYBENZARSOL,['BENZODOL'],727.0,68716.0,,"Butlerov, A.",,...,,0,1921.0,[],['DNA'],['Warts'],"['3944600', '3954756', '3969409', '3976653', '...",['https://www.ncbi.nlm.nih.gov/pubmed/1597190'...,"['1597190', '14666255', '14656365', '14979077'...",
5014,9011-04-5,https://drugs.ncats.io/drug/4C905MSK4W,"['HEXADIMETHRINE BROMIDE [INN]', ""POLY(N,N,N',...",HEXADIMETHRINE BROMIDE,['POLYBRENE'],719.0,24769.0,,,,...,,1,1959.0,[],[],[],[],[],[],
5017,8006-90-4,https://drugs.ncats.io/drug/AV092KU4JH,"['PEPPERMINT OIL', 'PEPPERMINT OIL YAKIMA', 'P...",PEPPERMINT OIL,[],,6850741.0,,,,...,,0,1921.0,[],[],[],[],[],[],C015424


In [14]:
final_df = pd.concat([df_merged, df_cas], axis=1)

In [15]:
final_df.to_csv('task_2_unamtched_valid_cas_60s.csv', index=False)

In [16]:
final_df = pd.concat([molecules_df, final_df], axis=1)
final_df

Unnamed: 0,cas,url,common,preferred,brand,inn,pubchem,who_atc,originator,stereochemistry,...,condition_list,patent_list,pubmed,pmid,mesh,Parsed,Link,Result Type,CAS,Deprecated CAS
0,585-86-4,https://drugs.ncats.io/drug/L2B0WJF7ZY,"['LACTITOL ANHYDROUS', 'LACTOBIOSIT', 'MIRUHEN...",LACTITOL,['IMPORTAL'],6414.0,157355.0,A06AD12,"Senderens, J.B.",ABSOLUTE,...,"['Constipation', 'Hepatic Encephalopathy']","['JP2000201631A', 'JP2001511442A', 'JP20041617...",['https://www.ncbi.nlm.nih.gov/pubmed/19087388...,"['19087388', '16481971', '16498257', '16553741...",C014635,LACTITOL,https://pubchem.ncbi.nlm.nih.gov/compound/157355,FEATURED,585-86-4,
1,1289023-67-1,https://drugs.ncats.io/drug/997WVV895X,"['BHV-3000', 'RIMEGEPANT [INN]', 'RIMEGEPANT [...",RIMEGEPANT,[],9751.0,51049968.0,,Bristol-Myers Squibb,ABSOLUTE,...,['Migraine Disorders'],"['20110251223', '20120245356']",['https://www.ncbi.nlm.nih.gov/pubmed/26650258'],['26650258'],,RIMEGEPANT,https://pubchem.ncbi.nlm.nih.gov/compound/5104...,FEATURED,"1289023-67-1, 1374024-48-2",
2,738606-46-7,https://drugs.ncats.io/drug/1EJ6Z6Q368,"['PENTADECANEDIOIC ACID, 8-HYDROXY-2,2,14,14-T...",BEMPEDOIC ACID,[],9891.0,10472693.0,,Esperion Therapeutics,ACHIRAL,...,"['Hypercholesterolemia', 'Dyslipidemias']","['20050043278', '20070179120', '7335799', '781...",['https://www.ncbi.nlm.nih.gov/pubmed/1371749'],['1371749'],,BEMPEDOIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/1047...,FEATURED,738606-46-7,
3,71675-85-9,https://drugs.ncats.io/drug/8110R61I4U,"['AMISULPRIDE [EP]', 'AMISULPRIDE [WHO-DD]', '...",AMISULPRIDE,"['SULAMID', 'SOLIAN', 'DENIBAN', 'SOCIAN']",4960.0,2159.0,N05AL05,Sanofi,RACEMIC,...,"['Schizophrenia', 'Psychotic symptoms']","['JP2001501192A', 'JP2002527464A', 'JP4178032B2']",['https://www.ncbi.nlm.nih.gov/pubmed/1354163'...,"['1354163', '11803729', '12693427', '12442883'...",,AMISULPRIDE,https://pubchem.ncbi.nlm.nih.gov/compound/2159,FEATURED,"71675-85-9, 53583-79-2",
4,308242-62-8,https://drugs.ncats.io/drug/7V4A8U16MB,"['REMIMAZOLAM [INN]', 'REMIMAZOLAM [WHO-DD]']",REMIMAZOLAM,[],9232.0,9867812.0,,Glaxo Smith Kline,ABSOLUTE,...,[],[],[],[],C522201,REMIMAZOLAM,https://pubchem.ncbi.nlm.nih.gov/compound/9867812,FEATURED,308242-62-8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,,,,,,,,,,,...,,,,,,SERACTIDE ACETATE,https://pubchem.ncbi.nlm.nih.gov/compound/1189...,FEATURED,39295-97-1,
3641,,,,,,,,,,,...,,,,,,FOMIVIRSEN SODIUM,https://pubchem.ncbi.nlm.nih.gov/compound/Fomi...,FEATURED,160369-77-7,
3644,,,,,,,,,,,...,,,,,,CALCIUM ALGINATE,https://pubchem.ncbi.nlm.nih.gov/compound/Calc...,FEATURED,9005-35-0,"37228-92-5, 9019-42-5, 9019-43-6, 9060-20-2"
3645,,,,,,,,,,,...,,,,,,POLYBENZARSOL,https://pubchem.ncbi.nlm.nih.gov/compound/Poly...,FEATURED,54531-52-1,


In [17]:
final_df.columns=['url', 'common', 'preferred', 'brand', 'inn', 'pubchem', 'who_atc', 'cas', 'originator', 'stereochemistry', 'formula', 'mw', 'investigational', 'us_market', 'us_market_year', 'us_prev_market', 'first_year_approval', 'fda_links', 'target_list', 'condition_list', 'patent_list', 'pubmed', 'pmid', 'mesh', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS']
# final_df.to_csv('margaret_missing_cas._identified.csv', index=False)

# Flagging Compound Types

In [18]:
final_df['Compound/Substance'] = final_df['Link'].apply(lambda x: 'COMPOUND' if 'compound' in str(x) else ('SUBSTANCE' if 'substance' in str(x) else 'N/A'))

In [19]:
final_df

Unnamed: 0,url,common,preferred,brand,inn,pubchem,who_atc,cas,originator,stereochemistry,...,patent_list,pubmed,pmid,mesh,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,585-86-4,https://drugs.ncats.io/drug/L2B0WJF7ZY,"['LACTITOL ANHYDROUS', 'LACTOBIOSIT', 'MIRUHEN...",LACTITOL,['IMPORTAL'],6414.0,157355.0,A06AD12,"Senderens, J.B.",ABSOLUTE,...,"['JP2000201631A', 'JP2001511442A', 'JP20041617...",['https://www.ncbi.nlm.nih.gov/pubmed/19087388...,"['19087388', '16481971', '16498257', '16553741...",C014635,LACTITOL,https://pubchem.ncbi.nlm.nih.gov/compound/157355,FEATURED,585-86-4,,COMPOUND
1,1289023-67-1,https://drugs.ncats.io/drug/997WVV895X,"['BHV-3000', 'RIMEGEPANT [INN]', 'RIMEGEPANT [...",RIMEGEPANT,[],9751.0,51049968.0,,Bristol-Myers Squibb,ABSOLUTE,...,"['20110251223', '20120245356']",['https://www.ncbi.nlm.nih.gov/pubmed/26650258'],['26650258'],,RIMEGEPANT,https://pubchem.ncbi.nlm.nih.gov/compound/5104...,FEATURED,"1289023-67-1, 1374024-48-2",,COMPOUND
2,738606-46-7,https://drugs.ncats.io/drug/1EJ6Z6Q368,"['PENTADECANEDIOIC ACID, 8-HYDROXY-2,2,14,14-T...",BEMPEDOIC ACID,[],9891.0,10472693.0,,Esperion Therapeutics,ACHIRAL,...,"['20050043278', '20070179120', '7335799', '781...",['https://www.ncbi.nlm.nih.gov/pubmed/1371749'],['1371749'],,BEMPEDOIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/1047...,FEATURED,738606-46-7,,COMPOUND
3,71675-85-9,https://drugs.ncats.io/drug/8110R61I4U,"['AMISULPRIDE [EP]', 'AMISULPRIDE [WHO-DD]', '...",AMISULPRIDE,"['SULAMID', 'SOLIAN', 'DENIBAN', 'SOCIAN']",4960.0,2159.0,N05AL05,Sanofi,RACEMIC,...,"['JP2001501192A', 'JP2002527464A', 'JP4178032B2']",['https://www.ncbi.nlm.nih.gov/pubmed/1354163'...,"['1354163', '11803729', '12693427', '12442883'...",,AMISULPRIDE,https://pubchem.ncbi.nlm.nih.gov/compound/2159,FEATURED,"71675-85-9, 53583-79-2",,COMPOUND
4,308242-62-8,https://drugs.ncats.io/drug/7V4A8U16MB,"['REMIMAZOLAM [INN]', 'REMIMAZOLAM [WHO-DD]']",REMIMAZOLAM,[],9232.0,9867812.0,,Glaxo Smith Kline,ABSOLUTE,...,[],[],[],C522201,REMIMAZOLAM,https://pubchem.ncbi.nlm.nih.gov/compound/9867812,FEATURED,308242-62-8,,COMPOUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,,,,,,,,,,,...,,,,,SERACTIDE ACETATE,https://pubchem.ncbi.nlm.nih.gov/compound/1189...,FEATURED,39295-97-1,,COMPOUND
3641,,,,,,,,,,,...,,,,,FOMIVIRSEN SODIUM,https://pubchem.ncbi.nlm.nih.gov/compound/Fomi...,FEATURED,160369-77-7,,COMPOUND
3644,,,,,,,,,,,...,,,,,CALCIUM ALGINATE,https://pubchem.ncbi.nlm.nih.gov/compound/Calc...,FEATURED,9005-35-0,"37228-92-5, 9019-42-5, 9019-43-6, 9060-20-2",COMPOUND
3645,,,,,,,,,,,...,,,,,POLYBENZARSOL,https://pubchem.ncbi.nlm.nih.gov/compound/Poly...,FEATURED,54531-52-1,,COMPOUND


In [20]:
# final_df['Original Molecule'] = final_df['Parsed Molecule']
# final_df = final_df[['Original Molecule', 'Parsed Molecule', 'Link', 'Result Type', 'CAS', 'Deprecated CAS', 'Compound/Substance']]
final_df

Unnamed: 0,url,common,preferred,brand,inn,pubchem,who_atc,cas,originator,stereochemistry,...,patent_list,pubmed,pmid,mesh,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,585-86-4,https://drugs.ncats.io/drug/L2B0WJF7ZY,"['LACTITOL ANHYDROUS', 'LACTOBIOSIT', 'MIRUHEN...",LACTITOL,['IMPORTAL'],6414.0,157355.0,A06AD12,"Senderens, J.B.",ABSOLUTE,...,"['JP2000201631A', 'JP2001511442A', 'JP20041617...",['https://www.ncbi.nlm.nih.gov/pubmed/19087388...,"['19087388', '16481971', '16498257', '16553741...",C014635,LACTITOL,https://pubchem.ncbi.nlm.nih.gov/compound/157355,FEATURED,585-86-4,,COMPOUND
1,1289023-67-1,https://drugs.ncats.io/drug/997WVV895X,"['BHV-3000', 'RIMEGEPANT [INN]', 'RIMEGEPANT [...",RIMEGEPANT,[],9751.0,51049968.0,,Bristol-Myers Squibb,ABSOLUTE,...,"['20110251223', '20120245356']",['https://www.ncbi.nlm.nih.gov/pubmed/26650258'],['26650258'],,RIMEGEPANT,https://pubchem.ncbi.nlm.nih.gov/compound/5104...,FEATURED,"1289023-67-1, 1374024-48-2",,COMPOUND
2,738606-46-7,https://drugs.ncats.io/drug/1EJ6Z6Q368,"['PENTADECANEDIOIC ACID, 8-HYDROXY-2,2,14,14-T...",BEMPEDOIC ACID,[],9891.0,10472693.0,,Esperion Therapeutics,ACHIRAL,...,"['20050043278', '20070179120', '7335799', '781...",['https://www.ncbi.nlm.nih.gov/pubmed/1371749'],['1371749'],,BEMPEDOIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/1047...,FEATURED,738606-46-7,,COMPOUND
3,71675-85-9,https://drugs.ncats.io/drug/8110R61I4U,"['AMISULPRIDE [EP]', 'AMISULPRIDE [WHO-DD]', '...",AMISULPRIDE,"['SULAMID', 'SOLIAN', 'DENIBAN', 'SOCIAN']",4960.0,2159.0,N05AL05,Sanofi,RACEMIC,...,"['JP2001501192A', 'JP2002527464A', 'JP4178032B2']",['https://www.ncbi.nlm.nih.gov/pubmed/1354163'...,"['1354163', '11803729', '12693427', '12442883'...",,AMISULPRIDE,https://pubchem.ncbi.nlm.nih.gov/compound/2159,FEATURED,"71675-85-9, 53583-79-2",,COMPOUND
4,308242-62-8,https://drugs.ncats.io/drug/7V4A8U16MB,"['REMIMAZOLAM [INN]', 'REMIMAZOLAM [WHO-DD]']",REMIMAZOLAM,[],9232.0,9867812.0,,Glaxo Smith Kline,ABSOLUTE,...,[],[],[],C522201,REMIMAZOLAM,https://pubchem.ncbi.nlm.nih.gov/compound/9867812,FEATURED,308242-62-8,,COMPOUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,,,,,,,,,,,...,,,,,SERACTIDE ACETATE,https://pubchem.ncbi.nlm.nih.gov/compound/1189...,FEATURED,39295-97-1,,COMPOUND
3641,,,,,,,,,,,...,,,,,FOMIVIRSEN SODIUM,https://pubchem.ncbi.nlm.nih.gov/compound/Fomi...,FEATURED,160369-77-7,,COMPOUND
3644,,,,,,,,,,,...,,,,,CALCIUM ALGINATE,https://pubchem.ncbi.nlm.nih.gov/compound/Calc...,FEATURED,9005-35-0,"37228-92-5, 9019-42-5, 9019-43-6, 9060-20-2",COMPOUND
3645,,,,,,,,,,,...,,,,,POLYBENZARSOL,https://pubchem.ncbi.nlm.nih.gov/compound/Poly...,FEATURED,54531-52-1,,COMPOUND


# Export Dataframe as CSV

In [21]:
# final_df_csv = final_df.to_csv('pubchem_longer_wait.csv', index = False) 
final_df.to_csv('unmatched_valid_margaret_no_pubchem.csv', index=False)

In [22]:
final_df

Unnamed: 0,url,common,preferred,brand,inn,pubchem,who_atc,cas,originator,stereochemistry,...,patent_list,pubmed,pmid,mesh,Parsed Molecule,Link,Result Type,CAS,Deprecated CAS,Compound/Substance
0,585-86-4,https://drugs.ncats.io/drug/L2B0WJF7ZY,"['LACTITOL ANHYDROUS', 'LACTOBIOSIT', 'MIRUHEN...",LACTITOL,['IMPORTAL'],6414.0,157355.0,A06AD12,"Senderens, J.B.",ABSOLUTE,...,"['JP2000201631A', 'JP2001511442A', 'JP20041617...",['https://www.ncbi.nlm.nih.gov/pubmed/19087388...,"['19087388', '16481971', '16498257', '16553741...",C014635,LACTITOL,https://pubchem.ncbi.nlm.nih.gov/compound/157355,FEATURED,585-86-4,,COMPOUND
1,1289023-67-1,https://drugs.ncats.io/drug/997WVV895X,"['BHV-3000', 'RIMEGEPANT [INN]', 'RIMEGEPANT [...",RIMEGEPANT,[],9751.0,51049968.0,,Bristol-Myers Squibb,ABSOLUTE,...,"['20110251223', '20120245356']",['https://www.ncbi.nlm.nih.gov/pubmed/26650258'],['26650258'],,RIMEGEPANT,https://pubchem.ncbi.nlm.nih.gov/compound/5104...,FEATURED,"1289023-67-1, 1374024-48-2",,COMPOUND
2,738606-46-7,https://drugs.ncats.io/drug/1EJ6Z6Q368,"['PENTADECANEDIOIC ACID, 8-HYDROXY-2,2,14,14-T...",BEMPEDOIC ACID,[],9891.0,10472693.0,,Esperion Therapeutics,ACHIRAL,...,"['20050043278', '20070179120', '7335799', '781...",['https://www.ncbi.nlm.nih.gov/pubmed/1371749'],['1371749'],,BEMPEDOIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/1047...,FEATURED,738606-46-7,,COMPOUND
3,71675-85-9,https://drugs.ncats.io/drug/8110R61I4U,"['AMISULPRIDE [EP]', 'AMISULPRIDE [WHO-DD]', '...",AMISULPRIDE,"['SULAMID', 'SOLIAN', 'DENIBAN', 'SOCIAN']",4960.0,2159.0,N05AL05,Sanofi,RACEMIC,...,"['JP2001501192A', 'JP2002527464A', 'JP4178032B2']",['https://www.ncbi.nlm.nih.gov/pubmed/1354163'...,"['1354163', '11803729', '12693427', '12442883'...",,AMISULPRIDE,https://pubchem.ncbi.nlm.nih.gov/compound/2159,FEATURED,"71675-85-9, 53583-79-2",,COMPOUND
4,308242-62-8,https://drugs.ncats.io/drug/7V4A8U16MB,"['REMIMAZOLAM [INN]', 'REMIMAZOLAM [WHO-DD]']",REMIMAZOLAM,[],9232.0,9867812.0,,Glaxo Smith Kline,ABSOLUTE,...,[],[],[],C522201,REMIMAZOLAM,https://pubchem.ncbi.nlm.nih.gov/compound/9867812,FEATURED,308242-62-8,,COMPOUND
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3637,,,,,,,,,,,...,,,,,SERACTIDE ACETATE,https://pubchem.ncbi.nlm.nih.gov/compound/1189...,FEATURED,39295-97-1,,COMPOUND
3641,,,,,,,,,,,...,,,,,FOMIVIRSEN SODIUM,https://pubchem.ncbi.nlm.nih.gov/compound/Fomi...,FEATURED,160369-77-7,,COMPOUND
3644,,,,,,,,,,,...,,,,,CALCIUM ALGINATE,https://pubchem.ncbi.nlm.nih.gov/compound/Calc...,FEATURED,9005-35-0,"37228-92-5, 9019-42-5, 9019-43-6, 9060-20-2",COMPOUND
3645,,,,,,,,,,,...,,,,,POLYBENZARSOL,https://pubchem.ncbi.nlm.nih.gov/compound/Poly...,FEATURED,54531-52-1,,COMPOUND


In [23]:
molecules_df

Unnamed: 0,cas,url,common,preferred,brand,inn,pubchem,who_atc,originator,stereochemistry,...,us_market_year,us_prev_market,first_year_approval,fda_links,target_list,condition_list,patent_list,pubmed,pmid,mesh
0,585-86-4,https://drugs.ncats.io/drug/L2B0WJF7ZY,"['LACTITOL ANHYDROUS', 'LACTOBIOSIT', 'MIRUHEN...",LACTITOL,['IMPORTAL'],6414.0,157355.0,A06AD12,"Senderens, J.B.",ABSOLUTE,...,['2020'],0,2020.0,['NDA211281'],[],"['Constipation', 'Hepatic Encephalopathy']","['JP2000201631A', 'JP2001511442A', 'JP20041617...",['https://www.ncbi.nlm.nih.gov/pubmed/19087388...,"['19087388', '16481971', '16498257', '16553741...",C014635
1,1289023-67-1,https://drugs.ncats.io/drug/997WVV895X,"['BHV-3000', 'RIMEGEPANT [INN]', 'RIMEGEPANT [...",RIMEGEPANT,[],9751.0,51049968.0,,Bristol-Myers Squibb,ABSOLUTE,...,['2020'],0,2020.0,['NDA212728'],['Calcitonin gene-related peptide type 1 recep...,['Migraine Disorders'],"['20110251223', '20120245356']",['https://www.ncbi.nlm.nih.gov/pubmed/26650258'],['26650258'],
2,738606-46-7,https://drugs.ncats.io/drug/1EJ6Z6Q368,"['PENTADECANEDIOIC ACID, 8-HYDROXY-2,2,14,14-T...",BEMPEDOIC ACID,[],9891.0,10472693.0,,Esperion Therapeutics,ACHIRAL,...,['2020'],0,2020.0,['NDA211617'],"['ATP-citrate synthase', 'AMPK alpha1$$alpha2']","['Hypercholesterolemia', 'Dyslipidemias']","['20050043278', '20070179120', '7335799', '781...",['https://www.ncbi.nlm.nih.gov/pubmed/1371749'],['1371749'],
3,71675-85-9,https://drugs.ncats.io/drug/8110R61I4U,"['AMISULPRIDE [EP]', 'AMISULPRIDE [WHO-DD]', '...",AMISULPRIDE,"['SULAMID', 'SOLIAN', 'DENIBAN', 'SOCIAN']",4960.0,2159.0,N05AL05,Sanofi,RACEMIC,...,['2020'],0,2020.0,['NDA209510'],"['Dopamine D3 receptor', 'Serotonin 7 (5-HT7) ...","['Schizophrenia', 'Psychotic symptoms']","['JP2001501192A', 'JP2002527464A', 'JP4178032B2']",['https://www.ncbi.nlm.nih.gov/pubmed/1354163'...,"['1354163', '11803729', '12693427', '12442883'...",
4,308242-62-8,https://drugs.ncats.io/drug/7V4A8U16MB,"['REMIMAZOLAM [INN]', 'REMIMAZOLAM [WHO-DD]']",REMIMAZOLAM,[],9232.0,9867812.0,,Glaxo Smith Kline,ABSOLUTE,...,['2020'],0,2020.0,['NDA212295'],['GABA-A receptor; benzodiazepine site'],[],[],[],[],C522201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5011,9005-35-0,https://drugs.ncats.io/drug/8P20S56HZI,"['ALGINATES: AMMONIUM, CALCIUM, POTASSIUM AND ...",CALCIUM ALGINATE,[],,44630049.0,B02BC08,"Moissan, H.",,...,,0,1921.0,[],[],"['Cystic Fibrosis', 'Hypocalcemia', 'Chronic O...","['20090030026', '20130174651', '3966773', '397...",['https://www.ncbi.nlm.nih.gov/pubmed/1220513'...,"['1220513', '11572466', '11513607', '11592964'...",
5012,9006-68-2,https://drugs.ncats.io/drug/8B8GHO5O27,"['POLYBENZARSOL [INN]', 'POLYBENZARSOL [MI]']",POLYBENZARSOL,['BENZODOL'],727.0,68716.0,,"Butlerov, A.",,...,,0,1921.0,[],['DNA'],['Warts'],"['3944600', '3954756', '3969409', '3976653', '...",['https://www.ncbi.nlm.nih.gov/pubmed/1597190'...,"['1597190', '14666255', '14656365', '14979077'...",
5014,9011-04-5,https://drugs.ncats.io/drug/4C905MSK4W,"['HEXADIMETHRINE BROMIDE [INN]', ""POLY(N,N,N',...",HEXADIMETHRINE BROMIDE,['POLYBRENE'],719.0,24769.0,,,,...,,1,1959.0,[],[],[],[],[],[],
5017,8006-90-4,https://drugs.ncats.io/drug/AV092KU4JH,"['PEPPERMINT OIL', 'PEPPERMINT OIL YAKIMA', 'P...",PEPPERMINT OIL,[],,6850741.0,,,,...,,0,1921.0,[],[],[],[],[],[],C015424


In [24]:
final_df = pd.concat([df_merged, df_cas], ignore_index=True, sort=False, axis=1)
final_df

Unnamed: 0,0,1,2,3,4
0,LACTITOL,https://pubchem.ncbi.nlm.nih.gov/compound/157355,FEATURED,585-86-4,
1,RIMEGEPANT,https://pubchem.ncbi.nlm.nih.gov/compound/5104...,FEATURED,"1289023-67-1, 1374024-48-2",
2,BEMPEDOIC ACID,https://pubchem.ncbi.nlm.nih.gov/compound/1047...,FEATURED,738606-46-7,
3,AMISULPRIDE,https://pubchem.ncbi.nlm.nih.gov/compound/2159,FEATURED,"71675-85-9, 53583-79-2",
4,REMIMAZOLAM,https://pubchem.ncbi.nlm.nih.gov/compound/9867812,FEATURED,308242-62-8,
...,...,...,...,...,...
3644,CALCIUM ALGINATE,https://pubchem.ncbi.nlm.nih.gov/compound/Calc...,FEATURED,9005-35-0,"37228-92-5, 9019-42-5, 9019-43-6, 9060-20-2"
3645,POLYBENZARSOL,https://pubchem.ncbi.nlm.nih.gov/compound/Poly...,FEATURED,54531-52-1,
3646,HEXADIMETHRINE BROMIDE,https://pubchem.ncbi.nlm.nih.gov/compound/Hexa...,FEATURED,"28728-55-4, 9011-04-5","32036-84-3, 117848-85-8, 62766-74-9, 9011-04-5"
3647,PEPPERMINT OIL,https://pubchem.ncbi.nlm.nih.gov/compound/Pepp...,FEATURED,8006-90-4,
