# Email Scraper

In [None]:
import grequests
from bs4 import BeautifulSoup
import csv
import re
import json
from urllib.parse import urlparse, unquote

Read the list of URLs to scrape.

In [None]:
# Location of the JSON file containing the URLs to scrape.
IN_FILE_DIR = './in/'
IN_FILE_NAME = 'in.json'

# JSON text is UTF-8 by specification; state the encoding explicitly so the
# file is decoded the same way regardless of the platform's default locale.
with open(IN_FILE_DIR + IN_FILE_NAME, encoding='utf-8') as f:
    urls = json.load(f)

print(f"Total urls: {len(urls):0}")
# Bare expression: shown as the cell's output when run in a notebook.
urls

Scrape email addresses and phone numbers from anchor tags with `mailto:` and `tel:` links.

In [None]:
# Compiled once so repeated validation calls reuse the same pattern object.
# The top-level-domain class is [A-Za-z]: inside a character class '|' is a
# literal pipe (not alternation), so it must not be listed there.
_EMAIL_PATTERN = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,7}\b')

def is_valid_email_address(string):
  """Return True if *string*, taken as a whole, looks like an email address.

  The check is purely syntactic: a local part, an '@', a domain, and a
  final dot-separated label of 2 to 7 ASCII letters.

  Args:
    string: Candidate email address.

  Returns:
    True when the entire string matches the pattern, False otherwise.
  """
  return _EMAIL_PATTERN.fullmatch(string) is not None

def is_valid_phone_number(string):
  """Placeholder phone-number validator; currently accepts every input.

  Args:
    string: Candidate phone number (not inspected yet).

  Returns:
    Always True, because validation has not been implemented.
  """
  # TODO: implement real validation; until then no candidate is rejected.
  return True

def deduplicate_list_preserve_order(seq):
  """Return a new list with each distinct item of *seq* kept once.

  Items appear in the order of their first occurrence. Items must be
  hashable.
  """
  seen = set()
  unique_items = []
  for item in seq:
    if item in seen:
      continue
    seen.add(item)
    unique_items.append(item)
  return unique_items

def extract_email_address(soup):
  """Collect the email addresses referenced by 'mailto:' anchors in a page.

  Args:
    soup: Parsed document; must provide find_all() as BeautifulSoup does.

  Returns:
    List of distinct, syntactically valid email addresses in the order
    they were first encountered.
  """
  # URI schemes are case-insensitive, so 'MAILTO:' links are matched too.
  anchors_with_mailto_links = soup.find_all(
    'a', href=re.compile(r'^mailto:', re.IGNORECASE))

  email_addresses = []
  for anchor in anchors_with_mailto_links:
    # Only the path is used, so a '?subject=...' query or '#fragment' is ignored.
    parsed_link = urlparse(anchor['href'], "mailto")

    # A mailto link may list several comma-separated recipients; validate
    # each one individually instead of rejecting the combined string.
    for candidate in parsed_link.path.split(','):
      # Split before decoding so a percent-encoded comma stays inside its address.
      email = unquote(candidate).strip()
      if is_valid_email_address(email):
        email_addresses.append(email)

  return deduplicate_list_preserve_order(email_addresses)

def extract_phone_numbers(soup):
  """Collect the phone numbers referenced by 'tel:' anchors in a page.

  Args:
    soup: Parsed document; must provide find_all() as BeautifulSoup does.

  Returns:
    List of distinct phone-number strings accepted by
    is_valid_phone_number(), in the order they were first encountered.
  """
  # URI schemes are case-insensitive, so 'TEL:' links are matched too.
  anchors_with_tel_links = soup.find_all(
    'a', href=re.compile(r'^tel:', re.IGNORECASE))

  phone_numbers = []
  for anchor in anchors_with_tel_links:
    parsed_link = urlparse(anchor['href'], "tel")

    # Percent-decode (e.g. '%20', '%2B') so the stored number is the readable
    # form, consistent with how email addresses are decoded.
    phone = unquote(parsed_link.path).strip()

    if is_valid_phone_number(phone):
      phone_numbers.append(phone)

  return deduplicate_list_preserve_order(phone_numbers)

In [None]:
# Lazily build one GET request per URL; nothing is sent until imap consumes it.
requests = (grequests.get(u) for u in urls)

# One dict per fetched page: title, final URL, and the contacts found on it.
contacts = []
# size=10 caps the number of requests in flight at once.
# NOTE(review): imap presumably yields responses as they complete (not in
# input order) and skips requests that failed -- confirm against grequests docs.
for response in grequests.imap(requests, size=10):
  soup = BeautifulSoup(response.content, 'html.parser')

  email_addresses = extract_email_address(soup)
  phone_numbers = extract_phone_numbers(soup)

  # soup.title is None for pages without a <title> element; guard against it
  # so one such page does not abort the whole run with AttributeError.
  title_tag = soup.title

  contacts.append({
    "title":  title_tag.string if title_tag is not None else None,
    "source": response.url,
    "emails": email_addresses,
    "phones": phone_numbers,
  })

# Bare expression: shown as the cell's output when run in a notebook.
contacts

Write the scraped contact info to a CSV file.

In [None]:
# Destination of the CSV report (the directory must already exist).
OUT_FILE_DIR = './out/'
OUT_FILE_NAME = 'out.csv'

# newline='' is required by the csv module so that line endings (including
# newlines embedded in quoted fields such as page titles) are written exactly
# as the writer produces them. An explicit encoding keeps non-ASCII titles
# from failing on platforms whose default encoding cannot represent them.
with open(OUT_FILE_DIR + OUT_FILE_NAME, 'w', newline='', encoding='utf-8') as f:
  fieldnames = [
    'Title',
    'Source',
    'Email',
    'Phone',
  ]
  writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator='\n', delimiter=',')
  writer.writeheader()

  for contact in contacts:
    # Only the first email / phone found on each page is exported; an empty
    # cell is written when the page had none.
    writer.writerow({
      'Title': contact['title'],
      'Source': contact['source'],
      'Email': contact['emails'][0] if contact['emails'] else "",
      'Phone': contact['phones'][0] if contact['phones'] else "",
    })