In [1]:
import io, os, sys, types

from IPython import get_ipython
from nbformat import read
from IPython.core.interactiveshell import InteractiveShell

def find_notebook(fullname, path=None):
    """find a notebook, given its fully qualified name and an optional path

    This turns "foo.bar" into "foo/bar.ipynb"
    and tries turning "Foo_Bar" into "Foo Bar" if Foo_Bar
    does not exist.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    for d in path:
        nb_path = os.path.join(d, name + ".ipynb")
        if os.path.isfile(nb_path):
            return nb_path
        # let import Notebook_Name find "Notebook Name.ipynb"
        nb_path = nb_path.replace("_", " ")
        if os.path.isfile(nb_path):
            return nb_path

class NotebookLoader(object):
    """Module Loader for Jupyter Notebooks"""
    def __init__(self, path=None):
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """import a notebook as a module"""
        path = find_notebook(fullname, self.path)

        print ("importing Jupyter notebook from %s" % path)

        # load the notebook object
        with io.open(path, 'r', encoding='utf-8') as f:
            nb = read(f, 4)


        # create the module and add it to sys.modules
        # if name in sys.modules:
        #    return sys.modules[name]
        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython
        sys.modules[fullname] = mod

        # extra work to ensure that magics that would affect the user_ns
        # actually affect the notebook module's ns
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            for cell in nb.cells:
                if cell.cell_type == 'code':
                    # transform the input to executable Python
                    code = self.shell.input_transformer_manager.transform_cell(cell.source)
                    # run the code in themodule
                    exec(code, mod.__dict__)
        finally:
            self.shell.user_ns = save_user_ns
        return mod

class NotebookFinder(object):
    """Module finder that locates Jupyter Notebooks"""
    def __init__(self):
        self.loaders = {}

    def find_module(self, fullname, path=None):
        nb_path = find_notebook(fullname, path)
        if not nb_path:
            return

        key = path
        if path:
            # lists aren't hashable
            key = os.path.sep.join(path)

        if key not in self.loaders:
            self.loaders[key] = NotebookLoader(path)
        return self.loaders[key]


sys.meta_path.append(NotebookFinder())

In [None]:
from urllib2 import urlopen, HTTPError, URLError
from urlparse import urlparse
from bs4 import BeautifulSoup
from strategy.ScraperContactUs import ScraperContactUs
from ExcelHandler import ExcelHandler

class Scraper(object):
    'Scraper to scrape the website'
    emails = 0
    domain = None
    
    def __init__(self, webpageUrl):
        self.webpageUrl = webpageUrl
        self.domain = self.getDomain(webpageUrl)
    
    def setWebpageUrl(self, webpageUrl):
        self.webpageUrl = webpageUrl
    
    def getWebpageUrl(self):
        return self.webpageUrl
    
    def makeSoup(self, webpageUrl = None):
        try:
            if (webpageUrl != None):
                # print webpageUrl
                page = urlopen(webpageUrl).read()
            else:
                page = urlopen(self.webpageUrl).read()
            soup = BeautifulSoup(page, "lxml")
            return soup
        except HTTPError, ex:
            print 'The server couldn\'t fulfill the request for %s.' % self.webpageUrl
            print 'Error code: ', ex.code
            return False
        except URLError, ex:
            print 'We failed to reach a server for %s.' % self.webpageUrl
            print 'Reason: ', ex.reason
            return False 
    
    def makeTextSoup(self, text):
        soup = BeautifulSoup(text, "lxml")
        return soup
    
    def getDomain(self, webpageUrl):
        parsedUri = urlparse(webpageUrl)
        return '{uri.scheme}://{uri.netloc}'.format(uri=parsedUri)


importing Jupyter notebook from strategy/ScraperContactUs.ipynb
importing Jupyter notebook from ExcelHandler.ipynb


In [None]:
# Read excel
# 05 - email
# l2 - website

wb = ExcelHandler('haha.xlsx')
ws = wb.getSheet('Sheet1')
rorieList = wb.getDataInColumns([5, 6])

print "processing Rorie List..."

for data in rorieList:
    try:
        if (data[0] == None and data[1] != None):
            scraper = Scraper(data[1])
            soup = scraper.makeSoup()
            if (soup == False):
                continue
            contactUs = ScraperContactUs(soup, scraper)

            # Get contact-us links
            contactUs.getContactLinks()
            # Get impressum links
            contactUs.getImpressumLinks()

            # check if the strategy worked i.e. the links were found
            if (contactUs.getLinkCount() > 0):
                contactUs.findEmail()
                if (contactUs.getEmailCount() > 0):
                    # save to database
                    print ', '.join(contactUs.emails)
    except:
        print "Unable to open this link: " + data[1]

print "done!"

processing Rorie List...
The server couldn't fulfill the request for http://oldwww.upol.cz/resources/jointlab/.
Error code:  404
The server couldn't fulfill the request for http://rco.upol.cz/index_en.html.
Error code:  404
Unable to open this link: http://www.pall.es
We failed to reach a server for http://www.pall.com.
Reason:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
We failed to reach a server for http://www.pall.com.
Reason:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
We failed to reach a server for http://www.pall.com.
Reason:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
We failed to reach a server for http://www.pall.com.
Reason:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
We failed to reach a server for http://www.pall.com.
Reason:  [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed (_ssl.c:661)
Unable to open this link: http://www.palomartechnologies.com
