In [5]:
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 23 17:17:45 2016
@author: Arnaud Devie
"""

#%% Data mining from Sigma Aldrich website
# Search URL by CAS number:
# http://www.sigmaaldrich.com/catalog/search?interface=CAS%20No.&term=1314-62-1&N=0&lang=en&region=US&focus=product&mode=mode+matchall
# On product page, Safety Information table, with H-statements, P-statements and PPE type

'\nCreated on Thu Jun 23 17:17:45 2016\n@author: Arnaud Devie\n'

In [6]:
#==============================================================================
# Libraries
#==============================================================================
import re
import os
import sys
import time
import pandas
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver

#==============================================================================
# Functions
#==============================================================================
def deblank(text):
    # Remove leading and trailing empty spaces
    return text.rstrip().lstrip()

def fixencoding(text):
    # Make string compatible with cp437 characters set (Windows console)
    return text.encode(encoding="cp437", errors="ignore").decode(encoding="utf-8", errors="ignore")

def deblankandcap(text):
    # Remove leading and trailing empty spaces, capitalize
    return text.rstrip().lstrip().capitalize()

def striphtml(text):
    # remove HTML tags from string (from: http://stackoverflow.com/a/3398894, John Howard)
    p = re.compile(r'<.*?>')
    return p.sub('', text)

def clean(text):
    # Deblank, fix encoding and strip HTML tags at once
    return striphtml(fixencoding(deblank(text)))

In [7]:
#==============================================================================
# Input
#==============================================================================
# Looking for info about chemical identified by CAS number ...
CASlist = list()
textfile = open('CAS-list.txt','r')
for line in textfile:
    CASlist.append(deblank(line.replace('\n','')))

textfile.close()

# Drop duplicates
CASlist = set(CASlist)

# Clean up
if '' in CASlist:
    CASlist.remove('')

display(CASlist)

{'110-71-4', '646-06-0'}

In [8]:
#%%
#==============================================================================
# Search patterns
#==============================================================================
Ppattern = '(P[0-9]{3}[0-9P\+]*)' # the letter P followed by 3 digits, including '+' combo
#Hpattern = 'H[0-9]{3}' # the letter H followed by 3 digits
Hpattern = '(H[0-9]{3}(?i)[ifd0-9H\+]*)' # the letter H followed by 3 digits, including '+' combo, case insensitive fd

# Parse H2P text file
# alternate syntax : with open('') as file:
textfile = open('H2P.txt', 'r')

# Initialize dictionary
H2P = dict()

for line in textfile:
    line = line.replace('\n','').replace('+ ','+') #.replace(',','')
    if re.match(Hpattern, line):
        hcode = re.match(Hpattern, line).group()
        H2P[hcode] = set(re.findall(Ppattern, line))

# Close textfile
textfile.close()

# Parse P-statements text file
textfile = open('P-statements.txt', 'r')

# Initialize dictionary
Pstatements = dict()

for line in textfile:
    line = line.replace('\n','').replace(' + ','+')
    if re.match(Ppattern, line):
        pcode = deblank(re.match(Ppattern, line).group())
        Pstatements[pcode] = deblank(line.split(pcode)[-1])

# Close textfile
textfile.close()

# Parse H-statements text file
textfile = open('H-statements.txt', 'r')

# Initialize dictionary
Hstatements = dict()

for line in textfile:
    line = line.replace('\n','').replace(' + ','+')
    if re.match(Hpattern, line):
        hcode = deblank(re.match(Hpattern, line).group())
        Hstatements[hcode] = deblank(line.split(hcode)[-1])

# Close textfile
textfile.close()

In [9]:
#==============================================================================
# Prevention, Response, Storage and Disposal P-statement from H-code
#==============================================================================
H2Prevention = dict()
H2Response = dict()
H2Storage = dict()
H2Disposal = dict()

for hcode in H2P:
    alist = H2Prevention.get(hcode,[])
    for pcode in H2P[hcode]:
        statement = Pstatements[pcode]
        if (pcode[1]=='2'): H2Prevention[hcode] = H2Prevention.get(hcode,[]); H2Prevention[hcode].append(statement)
        if (pcode[1]=='3'): H2Response[hcode]   = H2Response.get(hcode,[]); H2Response[hcode].append(statement)
        if (pcode[1]=='4'): H2Storage[hcode]    = H2Storage.get(hcode,[]); H2Storage[hcode].append(statement)
        if (pcode[1]=='5'): H2Disposal[hcode]   = H2Disposal.get(hcode,[]); H2Disposal[hcode].append(statement)

In [10]:
#%%
#==============================================================================
# Data mining Sigma Aldrich website
#==============================================================================

# Start Chrome instance
chromeOptions = webdriver.ChromeOptions()

if "SDS" not in os.listdir():
    os.mkdir("SDS")

prefs = {"download.default_directory" : os.path.join(os.getcwd(),"SDS"),
         "download.prompt_for_download" : False,
         "download.directory_upgrade" : True,
         "plugins.plugins_disabled" : ["Chrome PDF Viewer"]}
chromeOptions.add_experimental_option("prefs",prefs)
chromeOptions.add_argument("--disable-extensions")

# if 'win' in sys.platform: # Windows
#     chromedriver = os.path.join(os.getcwd(),'chromedriver','win32','chromedriver.exe')
# elif 'darwin' in sys.platform: # Mac OS
#     chromedriver = os.path.join(os.getcwd(),'chromedriver','mac32','chromedriver')
# elif 'linux' in sys.platform: # Linux
#     if sys.maxsize > 2**32: # 64-bit
#         chromedriver = os.path.join(os.getcwd(),'chromedriver','linux64','chromedriver')
#     else: # 32-bit
#         chromedriver = os.path.join(os.getcwd(),'chromedriver','linux32','chromedriver')

chromedriver = os.path.join(os.getcwd(),'chromedriver','chromedriver')
driver = webdriver.Chrome(executable_path=chromedriver, options=chromeOptions)
# driver.set_window_position(-2000, 0)


In [32]:
chemicals=list()
CASdict = dict()
badCAS = list()

CAS = '110-71-4'

chemical = dict()
URL = dict()
Name = ''

# Store CAS #
chemical['CAS'] = CAS
print(CAS)


# Webscraping search page
searchURL = r'http://www.sigmaaldrich.com/catalog/search?interface=CAS%20No.&term=[INSERT-HERE]&N=0&lang=en&region=US&focus=product&mode=mode+matchall'.replace('[INSERT-HERE]',CAS)
# searchURL
# searchURL = 'https://www.dataquest.io/blog/web-scraping-python-using-beautiful-soup/'
import requests
header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} 
# response = requests.get(url,headers=header)
webpage = requests.get(searchURL,headers=header)
webpage

# webpage = urllib.request.urlopen(searchURL).read()

soup = BeautifulSoup(webpage.content, "html.parser") # ===> Sigma is blocking BS4 scrape here
# product = soup.find("li", class_='productNumberValue')
# print(product)

# productSubURL = product.a.decode().split('"')[1]
# sds = soup.find("li", class_='msdsValue')
# pattern = '\'(\w*)\'' # any string between ''
# [country, language, productNumber, brand] = re.findall(pattern, sds.a.get('href'))
# properties = soup.find("ul", class_="nonSynonymProperties")
# formula = striphtml(properties.span.decode_contents())

# Webscraping product page
productURL = 'https://www.sigmaaldrich.com/GB/en/product/sial/259527'
# productURL = 'http://www.sigmaaldrich.com[INSERT-HERE]'.replace('[INSERT-HERE]', productSubURL)
webpage2 = requests.get(productURL,headers=header)
# webpage2 = urllib.request.urlopen(productURL).read()
soup2 = BeautifulSoup(webpage2.content, "html.parser")

# # Store URLs
# chemical['SearchURL'] = searchURL
# chemical['ProductURL'] = productURL
# chemical['ProductNumber'] = productNumber
# chemical['Brand'] = brand
# chemical['Formula'] = formula


# Name (compatible with cp437 characters set)
# Name = clean(soup2.find("h1", itemprop="name").decode_contents().split('\n')[1])
# chemical['Name'] = Name
# CASdict[CAS] = Name
# print(Name)

# Synonyms
try:
    Synonyms = [clean(synonym) for synonym in soup2.find("p", class_="synonym").findNext("strong").decode_contents().replace('\t','').replace('\n','').split(',')]
    chemical['Synonyms'] = Synonyms
except:
    print('No Synonyms listed for %s - %s' % (CAS, Name))

110-71-4
No Synonyms listed for 110-71-4 - 


In [22]:
searchURL


'http://www.sigmaaldrich.com/catalog/search?interface=CAS%20No.&term=110-71-4&N=0&lang=en&region=US&focus=product&mode=mode+matchall'

In [31]:
Name = clean(soup2.find("h1", itemprop="product-name").decode_contents().split('\n')[1])
Name

AttributeError: 'NoneType' object has no attribute 'decode_contents'

In [30]:
soup2.find_all('h1')

[<h1 class="MuiTypography-root jss203 MuiTypography-h1"><span id="product-name">1,2-Dimethoxyethane</span></h1>]

In [33]:
# print(soup2.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en">
 <head>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta charset="utf-8"/>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="#503191" name="theme-color"/>
  <script data-dtconfig="rid=RID_-777642224|rpid=-1336424617|domain=sigmaaldrich.com|reportUrl=/rb_bf68711lgo|app=49e38e2e60c8cd4b|dsss=1|rcdec=1209600000|featureHash=ICA27Vfhjqrux|vcv=2|rdnt=1|uxrgce=1|bp=3|srmcrv=10|cuc=9avov1ci|mdl=mdcc5=20|mel=100000|md=mdcc1=c_ga,mdcc2=dgclid,mdcc3=bnavigator.userAgent,mdcc4=bdocument.referrer,mdcc5=a#order-acknowledge-view-order ^rb span,mdcc6=ddclid,mdcc7=cGUID,mdcc8=a#onetrust-accept-btn-handler,mdcc9=a#onetrust-reject-all-handler,mdcc10=a#onetrust-close-btn-container,mdcc11=a#onetrust-pc-btn-handler|ssv=4|lastModification=1650642560652|dtVersion=10237220328075400|srmcrl=1|tp=500,50,0,1|uxdcw=1500|agentUri=/ruxitagentjs_ICA27Vfhjqrux_10237220328075400.js" src="/ruxitagentjs_ICA27Vfhjqrux_10237220328075400.js

In [None]:

# Initialize
chemicals=list()
CASdict = dict()
badCAS = list()

for CAS in CASlist:

    chemical = dict()
    URL = dict()
    Name = ''

    # Store CAS #
    chemical['CAS'] = CAS
    print(CAS)

    try:
        # Webscraping search page
        searchURL = r'http://www.sigmaaldrich.com/catalog/search?interface=CAS%20No.&term=[INSERT-HERE]&N=0&lang=en&region=US&focus=product&mode=mode+matchall'.replace('[INSERT-HERE]',CAS)
        webpage = urllib.request.urlopen(searchURL).read()
        soup = BeautifulSoup(webpage, "html.parser")
        product = soup.find("li", class_='productNumberValue')
        productSubURL = product.a.decode().split('"')[1]
        sds = soup.find("li", class_='msdsValue')
        pattern = '\'(\w*)\'' # any string between ''
        [country, language, productNumber, brand] = re.findall(pattern, sds.a.get('href'))
        properties = soup.find("ul", class_="nonSynonymProperties")
        formula = striphtml(properties.span.decode_contents())

        # Webscraping product page
        productURL = 'http://www.sigmaaldrich.com[INSERT-HERE]'.replace('[INSERT-HERE]', productSubURL)
        webpage2 = urllib.request.urlopen(productURL).read()
        soup2 = BeautifulSoup(webpage2, "html.parser")

        # Store URLs
        chemical['SearchURL'] = searchURL
        chemical['ProductURL'] = productURL
        chemical['ProductNumber'] = productNumber
        chemical['Brand'] = brand
        chemical['Formula'] = formula


        # Name (compatible with cp437 characters set)
        Name = clean(soup2.find("h1", itemprop="name").decode_contents().split('\n')[1])
        chemical['Name'] = Name
        CASdict[CAS] = Name
        print(Name)

        # Synonyms
        try:
            Synonyms = [clean(synonym) for synonym in soup2.find("p", class_="synonym").findNext("strong").decode_contents().replace('\t','').replace('\n','').split(',')]
            chemical['Synonyms'] = Synonyms
        except:
            print('No Synonyms listed for %s - %s' % (CAS, Name))

        # List of H-statements
        soloHpattern = '(H[0-9]{3}(?i)[ifd]*)'
        try:
            codes = re.findall(soloHpattern, soup2.find("div", class_="safetyRight", id="Hazard statements").findNext("a", class_="ALL").decode_contents())
            statements = [Hstatements[code] for code in codes]
            Hazards = dict(zip(codes, statements))
            chemical['Hazards'] =  Hazards
        except:
            print('No Hazards listed for %s - %s' % (CAS, Name))

        # List of P-statements
        soloPpattern = '(P[0-9]{3})'
        try:
            codes = re.findall(Ppattern, soup2.find("div", class_="safetyRight", id="Precautionary statements").findNext("a", class_="ALL").decode_contents().replace(' ',''))
            statements = [' '.join([Pstatements[solo] for solo in re.findall(soloPpattern,code)]) for code in codes]
            Precautions = dict(zip(codes, statements))
            chemical['Precautions'] =  Precautions
        except:
            print('No Precautions listed for %s - %s' % (CAS, Name))

        # List of supplemental (non-GHS) H-statements
        try:
            suppstatements = soup2.find("div", class_="safetyRight", id="Supplemental Hazard Statements").decode_contents().split(',')
            chemical['Supp. Hazards'] =  [deblank(s) for s in set(suppstatements) if deblank(s) is not '']
        except:
            print('No supp. Hazards listed for %s - %s' % (CAS, Name))

        # List of PPE
        try:
            PPElist = soup2.find("div", class_="safetyRight", id="Personal Protective Equipment").findAll("a", class_="ALL")
            PPE = [deblank(ppe.decode_contents())[0].upper() + deblank(ppe.decode_contents())[1:] for ppe in PPElist]
            chemical['PPE'] = PPE
        except:
            print('No PPE listed for %s - %s' % (CAS, Name))

        # Download SDS as PDF file
        sdsName = Name + " - SDS.pdf"
        sdsURL = os.path.join("SDS", sdsName)
        chemical['SDSfile'] = sdsURL

        if sdsName not in os.listdir('SDS'):

            driver.get("http://www.sigmaaldrich.com/MSDS/MSDS/DisplayMSDSPage.do?country=%s&language=en&productNumber=%s&brand=%s" %(country, productNumber, brand));
            print("Downloading SDS file", end='')

            timedout = False
            timeout = time.time()
            while ("PrintMSDSAction.pdf" not in os.listdir('SDS')) and not timedout:
                print(".", end='')
                timeout = time.time() - timeout
                timedout = (timeout>30)
                time.sleep(1)

            if timedout:
                print(" Timed Out! Could not get the file")
            else:
                print(" Done.")
                os.rename(os.path.join("SDS","PrintMSDSAction.pdf"), sdsURL)

        # Store chemical
        chemicals.append(chemical)

    except:
        badCAS.append(CAS)
        print('Could not process %s - %s' % (CAS, Name))
        e = sys.exc_info()[0]

# Close Chrome instance
driver.quit()

# Display
print('Processed %d chemicals out of %d CAS numbers received' % (len(chemicals),len(CASlist)))

if len(badCAS) > 0:
    print('Unable to process the following CAS numbers:')
    for cas in badCAS: print(cas)


110-71-4
