# Email Crawler
Crawler script designed to download the staff directories of hospital websites and extract staff email addresses. To crawl a new hospital, extend the visitor class (`InstiSiteVisitor`) and implement the functions `filter_url` and `crawl_name_title_email_program`.

There is some trial and error involved in inspecting the DOM of the profile page to implement `crawl_name_title_email_program`.

In [1]:
import argparse
import html
import os
import re
import shutil
import sys
import urllib.request as urllib2
from collections import deque
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import numpy as np
import pandas as pd
import requests
import requests.exceptions
from bs4 import BeautifulSoup

class InstiSiteVisitor(object):
    '''
    Base class describing how to crawl one institution's staff directory.

    Subclasses override `filter_url` to choose which links are followed and
    `crawl_name_title_email_program` to extract a staff record from a page.
    '''

    def __init__(self, root, name, shortname):
        '''
        @root: the root URL to crawl.
        @name: institution name.
        @shortname: short identifier for the institution.
        '''
        self.root = root
        self.name = name
        self.shortname = shortname

    def print_info(self):
        '''Print the institution name, root URL and short name on one line.'''
        print(f'Name : {self.name}, root url : {self.root}'
              f', short name : {self.shortname}')

    def filter_url(self, link):
        '''
        @link: absolute URL found on a crawled page.
        Returns 1 when the link should be crawled and 0 otherwise. The base
        implementation rejects every link.
        '''
        return 0

    def crawl_name_title_email_program(self, soup):
        '''
        @soup : beautifulsoup object that represents the profile page of the doctor in the directory
        Returns [name, title, email, program]; the base implementation
        returns four empty strings.
        '''
        return ['', '', '', '']

    # NOTE: `name` and `root` below are shadowed on instances by the
    # attributes of the same name assigned in __init__, so `visitor.name` is
    # the string, not this method. They are kept only so that class-level
    # access (e.g. `InstiSiteVisitor.name(visitor)`) keeps working.
    def name(self):
        return self.name

    def root(self):
        return self.root

    def get_mailto_ref(self, soup):
        '''
        Yield [email, anchor] for every `mailto:` link found in @soup.

        @soup: object exposing `find_all("a")`, whose anchors expose `attrs`.
        The email is the part of the href after `mailto:` with any
        `?subject=...` query removed. Anchors without an href, and hrefs that
        merely contain the word "mailto" (e.g. `/mailto-us`), are skipped.
        '''
        for anchor in soup.find_all("a"):
            href = anchor.attrs.get("href") or ''
            scheme, colon, rest = href.strip().partition(':')
            # Only a real mailto scheme counts; matched case-insensitively so
            # "Mailto:" / "MAILTO:" are still picked up.
            if not colon or scheme.lower() != 'mailto':
                continue
            email = rest.split('?', 1)[0].strip()
            yield [email, anchor]

class BronxCareVisitor(InstiSiteVisitor):
    '''Visitor for the BronxCare Health System physician directory.'''

    # Bare "detail" URL with no physician behind it; never worth crawling.
    _DETAIL_INDEX_URL = 'https://www.bronxcare.org/physicians/find-a-physician/detail/'

    def __init__(self):
        super().__init__('https://www.bronxcare.org/our-services/pediatrics/',
                         'BronxCare Health System',
                         'bronxcare')

    def filter_url(self, link):
        '''
        @link: absolute URL found on a crawled page.
        Returns 1 for physician detail pages and 0 for the bare detail index;
        every other link is decided by the base class.
        '''
        # The exact-match exclusion has to run first: the substring test below
        # also matches the bare index URL.
        if link == self._DETAIL_INDEX_URL:
            return 0
        if 'www.bronxcare.org/physicians/find-a-physician/detail' in link:
            return 1
        return super().filter_url(link)

    def crawl_name_title_email_program(self, soup):
        '''
        @soup : beautifulsoup object that represents the profile page of the doctor in the directory
        Returns [name, title, email, program]. Name and title come from the
        <h3> inside the "description" div, split on the first comma; a heading
        without a comma yields an empty title. The email is the visible text
        of the first mailto anchor inside the "physician-app profile" div.
        Fields that cannot be found are left as empty strings.
        '''
        name = ''
        title = ''
        email = ''
        program = ''

        description = soup.find('div', {'class': 'description'})
        heading = description.find('h3') if description else None
        if heading:
            raw_name, _, raw_title = heading.text.partition(',')
            name = raw_name.strip()
            title = raw_title.strip()

        profile = soup.find('div', {'class': 'physician-app profile'})
        if profile:
            first_mailto = next(self.get_mailto_ref(profile), None)
            if first_mailto is not None:
                # Use the anchor's visible text, not the address in the href.
                email = first_mailto[1].text
        return [name, title, email, program]

class CincinattiVisitor(InstiSiteVisitor):
    '''Visitor for the Cincinnati Children's Hospital doctor search.'''

    def __init__(self):
        super().__init__('https://www.cincinnatichildrens.org/search/doctor-search?q=&start=0',
                         'Cincinnati Children\'s Hospital',
                         'cincinnati')

    def crawl_name_title_email_program(self, soup):
        '''
        @soup : beautifulsoup object that represents the profile page of the doctor in the directory
        Returns [name, title, email, program]. Name and title come from the
        "person-name" <h1>, split on the first comma and stripped of
        surrounding whitespace; the email is the address of the first mailto
        link on the page. Pages without that <h1> yield four empty strings.
        '''
        name = ""
        title = ""
        email = ""
        program = ""
        heading = soup.find("h1", {"class": "person-name"})
        if heading:
            raw_name, _, raw_title = heading.text.partition(",")
            # Strip so the title does not keep the space that follows the comma.
            name = raw_name.strip()
            title = raw_title.strip()
            email = next((address for address, _ in self.get_mailto_ref(soup)), "")
        return [name, title, email, program]

    def filter_url(self, link):
        '''
        @link: absolute URL found on a crawled page.
        Returns 1 for doctor bio pages and doctor-search result pages; every
        other link is decided by the base class.
        '''
        followed_prefixes = (
            'https://www.cincinnatichildrens.org/bio/',
            'https://www.cincinnatichildrens.org/search/doctor-search',
        )
        if any(prefix in link for prefix in followed_prefixes):
            return 1
        return super().filter_url(link)

class UFL_HSCJ_Visitor(InstiSiteVisitor):
    '''Visitor for the University of Florida (hscj.ufl.edu) pediatrics faculty list.'''

    def __init__(self):
        super().__init__('https://hscj.ufl.edu/pediatrics/faculty.aspx',
                         'University of Florida',
                         'ufl_hscj')

    def filter_url(self, link):
        '''
        @link: absolute URL found on a crawled page.
        Returns 1 for directory bio pages; every other link is decided by the
        base class.
        '''
        if 'https://hscj.ufl.edu/directory/bio/' in link:
            return 1
        return super().filter_url(link)

    def crawl_name_title_email_program(self, soup):
        '''
        @soup : beautifulsoup object that represents the profile page of the doctor in the directory
        Returns [name, title, email, program]. Name and title come from the
        <h1> inside the "titleDiv" div, split on the first comma; a heading
        without a comma yields an empty title. The email is taken from the
        last mailto link whose enclosing div carries the "infoLeft" class.
        Fields that cannot be found are left as empty strings.
        '''
        name = ""
        title = ""
        email = ""
        program = ""

        title_div = soup.find("div", {"class": "titleDiv"})
        heading = title_div.h1 if title_div else None
        if heading:
            raw_name, _, raw_title = heading.text.partition(",")
            name = raw_name.strip()
            title = raw_title.strip()

        for address, anchor in self.get_mailto_ref(soup):
            parent_div = anchor.find_parent("div")
            # A mailto link may sit outside any div, or in a div with no class.
            if parent_div is None:
                continue
            if "infoLeft" in parent_div.attrs.get("class", []):
                email = address
        return [name, title, email, program]

class BostonChildrenVisitor(InstiSiteVisitor):
    '''Visitor for the Boston Children's Hospital directory (extraction unfinished).'''

    def __init__(self):
        super().__init__('http://www.childrenshospital.org/directory#sort='\
                         'relevancy&f:_3BD935D7-4C32-43AD-B7CA-5F366D2450F8=[Doctor]',
                         'Boston Children\'s Hospital',
                         'boston_children')

    def filter_url(self, link):
        '''
        @link: absolute URL found on a crawled page.
        Returns 1 for physician pages and directory listing URLs; every other
        link is decided by the base class.
        '''
        if link.find('http://www.childrenshospital.org/directory/physicians/') >= 0:
            return 1
        if link.find('http://www.childrenshospital.org/directory#') >= 0:
            return 1
        return super().filter_url(link)

    def crawl_name_title_email_program(self, soup):
        '''
        @soup : beautifulsoup object that represents the profile page of the doctor in the directory
        Hook called by the crawler; delegates to `crawl_name_title_email_boston`
        so the institution-specific code is actually used.
        '''
        return self.crawl_name_title_email_boston(soup)

    def crawl_name_title_email_boston(self, soup):
        '''
        @soup : beautifulsoup object that represents the profile page of the doctor in the directory
        Returns [name, title, email, program]. Extraction is not implemented
        yet: the "doctor-info" div is only printed for inspection and all four
        fields are returned empty.
        '''
        name = ""
        title = ""
        email = ""
        program = ""
        div = soup.find("div", {"class":"doctor-info col-xs-12 col-sm-8"})
        if div:
            # Debug aid while the DOM of the profile page is being worked out.
            print(div)

        return [name, title, email, program]

class Crawler(object):
    '''
    Breadth-first crawler that collects staff records for one institution.

    The institution visitor decides which links are followed (`filter_url`)
    and how a record is extracted from a page
    (`crawl_name_title_email_program`).
    '''

    def __init__(self, df, insti_visitor, max_pages_to_crawl, crawler_timeout_s):
        '''
        @df: DataFrame the extracted records are appended to.
        @insti_visitor: visitor object related to the institution; its `root`
                        is a string containing the (comma separated) URLs to crawl
        @max_pages_to_crawl: max pages after which the crawler should stop crawling
        @crawler_timeout_s: timeout for the get requests
        '''
        self.df = df
        self.insti_visitor = insti_visitor
        self.urls = self.insti_visitor.root.split(',')
        self.max_pages_to_crawl = max_pages_to_crawl
        self.timeout = crawler_timeout_s

    @staticmethod
    def _resolve_link(page_url, href):
        '''Return @href as an absolute URL, resolved against @page_url.'''
        return urljoin(page_url, href)

    def _fetch(self, url):
        '''Download @url and return its parsed soup, or None if the request failed.'''
        try:
            response = requests.get(url, timeout=self.timeout)
        except (requests.exceptions.MissingSchema,
                requests.exceptions.ConnectionError,
                requests.exceptions.InvalidURL,
                requests.exceptions.Timeout):
            print('Skipping : ' + url)
            return None
        return BeautifulSoup(response.text, "lxml")

    def _store_rows(self, rows):
        '''Append the collected record dicts in @rows to self.df.'''
        if not rows:
            return
        found = pd.DataFrame(rows)
        if self.df.empty:
            # Keep the caller's column order without concatenating an empty frame.
            columns = list(self.df.columns)
            columns += [c for c in found.columns if c not in columns]
            self.df = found.reindex(columns=columns)
        else:
            self.df = pd.concat([self.df, found], ignore_index=True)

    def crawl(self):
        '''
        Visit the root URLs and the links accepted by the visitor, up to
        `max_pages_to_crawl` URLs, and return the DataFrame with one row per
        page for which the visitor extracted a name.

        Pages that yield a record are not scanned for further links. URLs
        whose request fails are skipped and not retried.
        '''
        queue = deque(self.urls)
        # Every URL ever enqueued; gives O(1) duplicate checks.
        seen = set(self.urls)
        rows = []
        pages_left = self.max_pages_to_crawl

        try:
            while queue and pages_left > 0:
                pages_left -= 1
                url = queue.popleft()
                print('Processing : ' + url)

                soup = self._fetch(url)
                if soup is None:
                    continue

                name, title, email, program = \
                    self.insti_visitor.crawl_name_title_email_program(soup)
                if name:
                    rows.append({'Name': name,
                                 'Title': title,
                                 'Email': email,
                                 'Program': program,
                                 'Institution': self.insti_visitor.name})
                    continue

                # find and process all the anchors in the document
                for anchor in soup.find_all("a"):
                    href = anchor.attrs.get("href")
                    if not href:
                        continue
                    link = self._resolve_link(url, href)
                    if link in seen:
                        continue
                    if self.insti_visitor.filter_url(link) == 1:
                        seen.add(link)
                        queue.append(link)
        finally:
            # Keep whatever was collected even if the crawl is interrupted.
            self._store_rows(rows)
        return self.df

# Config setup for the script

In [2]:
# Pick the visitor based on the institution you want to crawl from!
insti = CincinattiVisitor()
# Passed to Crawler below as the cap on how many URLs it processes in one run.
max_pages_to_crawl = 3

In [3]:
# Crawl the selected institution and write the collected records to a CSV
# file named emails_<shortname>.csv in the user's Desktop folder.
output_columns = ['Name', 'Title', 'Email', 'Program', 'Institution']
df = pd.DataFrame(columns=output_columns)
crawler = Crawler(df, insti, max_pages_to_crawl, crawler_timeout_s=5)
df = crawler.crawl()
# drop_duplicates returns a new frame; rebind so the de-duplicated rows are
# the ones written out.
df = df.drop_duplicates()
emails_output_folder = os.path.join(str(Path.home()), "Desktop")
emails_output = os.path.join(emails_output_folder,
                             'emails_' + insti.shortname + '.csv')
df.to_csv(emails_output)

Processing : https://www.cincinnatichildrens.org/search/doctor-search?q=&start=0
Processing : https://www.cincinnatichildrens.org/bio/a/claire-aarnio-peterson
Processing : https://www.cincinnatichildrens.org/bio/a/katherine-abell
