This code scrapes individual Craigslist posts, extracting a contact email address and other metadata.

It reads in the post URLs to be scraped from a CVS file, and saves the extracted metadata into another CSV file. See also the [data flow overview](https://docs.google.com/presentation/d/1ug_iXh5ZUFRYexmZSmq_uq3TI7Yvbh03kgC_SBAobmU/edit)

In [1]:
import socks
import socket
import urllib
from urllib import request
from selenium import webdriver

from bs4 import BeautifulSoup
import json
import re

import time
from time import sleep
import datetime

import pandas as pd
import numpy as np
from pathlib import Path
from random import random

In [2]:
# file to read in post URLs from: 
posts_metadata_from_lists_file_name = '../../data/raw/posts_metadata_from_lists.csv'
# file to save metadata extracted from individual posts to:
posts_metadata_from_individual_posts_file_name = '../../data/raw/posts_metadata_from_individual_posts.csv'

In [3]:
if not Path(posts_metadata_from_individual_posts_file_name).is_file():
    # initialize file with headers on first use:
    !echo URL,contact_email,updated,geo_position,removed,request_datetime > {posts_metadata_from_individual_posts_file_name}
else:
    previous_metadata_from_individual_posts = \
    pd.read_csv(posts_metadata_from_individual_posts_file_name,
                parse_dates = ['updated','request_datetime'])

In [4]:
previous_metadata_from_individual_posts

Unnamed: 0,URL,contact_email,updated,geo_position,removed,request_datetime
0,https://dallas.craigslist.org/dal/roo/d/dallas...,272356b7306d36a4b8113cda9dfd3167@hous.craigsli...,2021-11-15 18:47:31-06:00,32.875600;-96.749500,,2021-11-30 20:41:59+00:00
1,https://boston.craigslist.org/gbs/roo/d/brookl...,2d9663b34c1a38c4aa265d5ae933e1dd@hous.craigsli...,2021-11-29 10:02:22-05:00,42.348543;-71.116607,,2021-11-30 20:42:27+00:00
2,https://phoenix.craigslist.org/cph/roo/d/phoen...,5ce96b653037368db512d27dc9ec5a24@hous.craigsli...,2021-11-21 13:53:55-07:00,33.476443;-111.996001,,2021-11-30 20:42:54+00:00


In [5]:
# connect via Tor (requires Tor Browser running)
# adapted from https://stackoverflow.com/a/64700370
profile = webdriver.FirefoxProfile()
myProxy = "localhost:9150"
ip, port = myProxy.split(':')
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.socks', ip)
profile.set_preference('network.proxy.socks_port', int(port))
driver = webdriver.Firefox(firefox_profile=profile)

# double-check that we got a Tor IP:
# load Tor check JSON result, circumventing the inbuilt JSON viewer of Firefox
# (cf. https://stackoverflow.com/a/37121878 )
driver.get('view-source:https://check.torproject.org/api/ip')
torcheck = json.loads(driver.find_element_by_tag_name("pre").text)
if not torcheck['IsTor']:
    raise ConnectionError('Looks like we are not using the Tor proxy')

In [6]:
# NB: Selenium might leave behind temporary browser profiles 
# in the /tmp folder after each run.
# delete those if they take up too much space

In [7]:
def get_post_metadata(post_url, maxattempts=3):
    """
    Attempts to retrieve the anonymous contact email address 
    and other public metadata for a Craiglist ad
    Returns '' if unsuccessful after trying $maxattempts times 
    """
    
    driver = webdriver.Firefox(firefox_profile=profile)

    driver.implicitly_wait(1)
    # NB might still want to look reusing the browsing context (instead of
    # resetting driver = ...) but delete cookies, local storage etc.
    # delete cookies for this domain only:
    # driver.delete_all_cookies()
    
    contact_email = ''
    removed = ''
    attempts = 0
    
    
    while (contact_email == '') and (removed == '') and (attempts < maxattempts):
    
        attempts +=1
        print('Attempt',attempts,'for',post_url,'...')
        
        driver.get(post_url) # todo: add error handling for broken post URL
        
        
        # detect whether post was flagged for removal 
        try:
            removed = driver.find_element_by_class_name("removed").text
        except:
            removed = ''
        
        try:
            updated = driver.find_element_by_class_name("postinginfos").\
                find_element_by_class_name("postinginfo.reveal").\
                find_element_by_class_name("date.timeago").\
            get_attribute("datetime")
        except:
            updated = ''
        # todo: abandon if updated more than x days ago?
       
        
        try: 
            geo_position = driver.find_element_by_xpath("//meta[@name='geo.position']").get_attribute("content")
        except:
            geo_position = ''

        
        try:
            
            sleep(3+3*random()) # in addition to driver.implicitly_wait
            
            try:
                # click "reply" button:
                js = 'document.getElementsByClassName("reply-button").item(0).click()'
                driver.execute_script(js)
                # use https://www.selenium.dev/selenium/docs/api/dotnet/html/M_OpenQA_Selenium_Remote_RemoteWebElement_Click.htm 
                # instead of JS?
                
                # click "show email" button 
                # (which may appear with some delay after clicking "reply"):
                sleep(3+2*random()) # in addition to driver.implicitly_wait
                js = 'document.getElementsByClassName("show-email").item(0).click()'
                driver.execute_script(js)
            
            except: 
                # click different version of the "reply" button that also shows up:
                
                try:
                    js = 'document.getElementsByClassName("reply-info js-only").item(0).click()'
                    driver.execute_script(js)
                except:
                    try:
                        js = 'document.getElementsByClassName("reply-button js-only").item(0).click()'
                        driver.execute_script(js)
                    except:
                        pass

                
                # click "show email" button 
                # (which may appear with some delay after clicking "reply"):
                sleep(3+2*random()) # in addition to driver.implicitly_wait
                js = 'document.getElementsByClassName("reply-info js-only").item(0).getElementsByClassName("show-email").item(0).click()'
                driver.execute_script(js)
            
           
            # get email address from popup:
            js = 'return document.getElementsByClassName("anonemail").item(0).value'
            sleep(2+1*random()) # in addition to driver.implicitly_wait
            try: 
                contact_email = driver.execute_script(js)
            except: # for some reason this sometimes fails on first try (only), 
                # so try once more:
                sleep(3+1*random()) # in addition to driver.implicitly_wait
                contact_email = driver.execute_script(js)
                        
        except:
            print('Error executing "'+js+'"')
            # todo: detect presence of captcha (via class="h-captcha"...),
            # wait longer if there is one
            if attempts < maxattempts:
                sleep(11*attempts+5*random())
    
    if removed != '':
        print('Looks like the post has been removed ("'+removed+'").')
        print('We\'ll return an empty result')
        print('')
    elif contact_email == '':
        print('Giving up on extracting contact email after',\
              attempts,'unsuccessful attempts :(')
        print()
        # NB: this might also be because the ad offers only a  
        # phone number for contact, e.g. 
        # https://newyork.craigslist.org/brk/roo/d/brooklyn-enormous-lovely-bushwick-room/7414087734.html
    else:        
        print('success')
        print()
    
    
    return contact_email, updated, geo_position, removed

In [8]:
# import post URLs to be scraped:
if not Path(posts_metadata_from_lists_file_name).is_file():
    raise FileNotFoundError(posts_metadata_from_lists_file_name+' not found')
posts_metadata_from_lists = \
    pd.read_csv(posts_metadata_from_lists_file_name,
                parse_dates = ['posted'])

In [9]:
# spot-check import result:
posts_metadata_from_lists.sample(10)

Unnamed: 0,index,area_string,posted,neighborhood,post title,number_bedrooms,sqft,URL,price,requested
1377,1377,sfbay,2021-11-29 07:42:00,(Milpitas south bay area ),welcome to our humble aboad! three bedroom home,3.0,,https://sfbay.craigslist.org/sby/roo/d/milpita...,1000,2021-11-29 09h33m55s
1211,1211,phoenix,2021-11-28 21:17:00,(Surprise west valley ),Beautiful room within Surprise!,1.0,,https://phoenix.craigslist.org/wvl/roo/d/surpr...,750,2021-11-29 09h33m42s
2232,2232,denver,2021-11-27 12:24:00,(Mission Viejo ),Room 4 rent $600 all,1.0,,https://denver.craigslist.org/roo/d/aurora-roo...,600,2021-11-29 09h35m30s
1982,1982,sandiego,2021-11-28 14:15:00,(San Diego south san diego county ),Room for rent / Cuarto en renta,1.0,,https://sandiego.craigslist.org/ssd/roo/d/room...,650,2021-11-29 09h35m03s
1761,1761,seattle,2021-11-28 14:48:00,( seattle ),Private Room and Bath for Rent In Seattle - Mo...,2.0,1100.0,https://seattle.craigslist.org/see/roo/d/seatt...,401,2021-11-29 09h34m35s
709,709,washingtondc,2021-11-28 20:18:00,(Near Northeast / H Street Corridor district ...,Sunny room available in historic row house nea...,1.0,,https://washingtondc.craigslist.org/doc/roo/d/...,1400,2021-11-29 09h32m35s
673,673,washingtondc,2021-11-29 06:36:00,"(Herndon, VA. northern virginia )",*TWO ROOMS FOR PROFESSIONAL FEMALE ROOMMATE PL...,,,https://washingtondc.craigslist.org/nva/roo/d/...,1000,2021-11-29 09h32m35s
436,436,dallas,2021-11-26 17:02:00,(Little elm mid cities ),Room for rent,,,https://dallas.craigslist.org/mdf/roo/d/little...,0,2021-11-29 09h32m08s
120,120,losangeles,2021-11-29 09:27:00,(Fairfax/Melrose Ave central LA 213/323 ),Modern Hollywood 2 bedroom 2 blocks from Grove,2.0,,https://losangeles.craigslist.org/lac/roo/d/lo...,1500,2021-11-29 09h31m41s
1387,1387,sfbay,2021-11-29 07:13:00,(marina / cow hollow),Private Bedroom December Rental + Parking 4 be...,4.0,3000.0,https://sfbay.craigslist.org/sfc/roo/d/san-fra...,1995,2021-11-29 09h33m55s


In [10]:
%%time
# test run for a sample of posts
post_urls_sample = posts_metadata_from_lists.URL.sample(10)

post_urls = []
contact_emails = []
updateds = []
geo_positions = []
request_datetimes = []
removeds = []

for post_url in post_urls_sample:
    print(post_url)
    
    if post_url in previous_metadata_from_individual_posts.URL.unique():
        print('...already scraped earlier!') 
        # todo: decide if we rather want to do this deduplication later  
        # in the emailer script
    else:
        post_urls.append(post_url)
        
        contact_email, updated, geo_position, removed = get_post_metadata(post_url)
        contact_emails.append(contact_email)
        updateds.append(updated)
        geo_positions.append(geo_position)
        removeds.append(removed)
        
        request_datetime = datetime.datetime.utcnow().\
                            strftime('%Y-%m-%d %H:%M:%S+0000')
        # timezone format consistent with "updated" field
        request_datetimes.append(request_datetime)    


metadata_from_individual_posts = pd.DataFrame({
    'URL': post_urls,
    'contact_email': contact_emails,
    'updated': updateds,
    'geo_position': geo_positions,
    'removed': removeds,
    'request_datetime': request_datetimes})

with open(posts_metadata_from_individual_posts_file_name, 'a') \
    as posts_metadata_from_individual_posts_file:
    metadata_from_individual_posts.to_csv(\
        posts_metadata_from_individual_posts_file, \
        header=False, index=False) 
    # don't save index, as we append to existing data

https://losangeles.craigslist.org/lac/roo/d/los-angeles-modern-hollywood-bedroom/7414109879.html
Attempt 1 for https://losangeles.craigslist.org/lac/roo/d/los-angeles-modern-hollywood-bedroom/7414109879.html ...
success

https://houston.craigslist.org/roo/d/houston-room-for-rent/7408465767.html
Attempt 1 for https://houston.craigslist.org/roo/d/houston-room-for-rent/7408465767.html ...
success

https://losangeles.craigslist.org/sgv/roo/d/san-dimas-2000-month-free-room-for-live/7413947408.html
Attempt 1 for https://losangeles.craigslist.org/sgv/roo/d/san-dimas-2000-month-free-room-for-live/7413947408.html ...
Error executing "document.getElementsByClassName("reply-info js-only").item(0).getElementsByClassName("show-email").item(0).click()"
Looks like the post has been removed ("This posting has been flagged for removal. [?]").
We'll return an empty result

https://denver.craigslist.org/roo/d/golden-bedroom-in-golden-utilities-and/7409966920.html
Attempt 1 for https://denver.craigslist.o

In [11]:
# check result:
metadata_from_individual_posts

Unnamed: 0,URL,contact_email,updated,geo_position,removed,request_datetime
0,https://losangeles.craigslist.org/lac/roo/d/lo...,ec746338f34037afaa0434a568d72033@hous.craigsli...,2021-11-29T09:23:21-0800,34.078683;-118.358803,,2021-11-30 20:57:56+0000
1,https://houston.craigslist.org/roo/d/houston-r...,373e656ee0103219831038e0f1297667@hous.craigsli...,2021-11-15T16:48:26-0600,29.860200;-95.581700,,2021-11-30 20:58:21+0000
2,https://losangeles.craigslist.org/sgv/roo/d/sa...,,,,This posting has been flagged for removal. [?],2021-11-30 20:58:53+0000
3,https://denver.craigslist.org/roo/d/golden-bed...,256c7418fd4b3ecc8314210b18620634@hous.craigsli...,2021-11-19T06:04:05-0700,39.767871;-105.204464,,2021-11-30 20:59:23+0000
4,https://seattle.craigslist.org/see/roo/d/seatt...,1e91aa3df86f3a9d9599143f211a47e0@hous.craigsli...,2021-11-17T13:42:58-0800,47.663300;-122.302200,,2021-11-30 21:00:01+0000
5,https://baltimore.craigslist.org/roo/d/gwynn-o...,,2021-11-24T14:19:13-0500,39.323040;-76.722501,,2021-11-30 21:01:41+0000
6,https://tampa.craigslist.org/psc/roo/d/port-ri...,ad3edd30f50938dfa1d1d25cd962a6a8@hous.craigsli...,2021-11-22T17:21:39-0500,28.301100;-82.692700,,2021-11-30 21:02:12+0000
7,https://philadelphia.craigslist.org/roo/d/phil...,,2021-11-21T17:00:30-0500,39.992500;-75.113300,,2021-11-30 21:03:54+0000
8,https://losangeles.craigslist.org/sgv/roo/d/po...,,2021-11-28T22:19:19-0800,34.041111;-117.716789,,2021-11-30 21:05:33+0000
9,https://chicago.craigslist.org/nch/roo/d/chica...,352ce556b2c2399c92af5b8cb681d6ab@hous.craigsli...,2021-10-30T12:15:32-0500,41.946300;-87.806100,,2021-11-30 21:06:04+0000


In [12]:
# todo: set a specific user agent?
# cf. driver.get('http://whatsmyuseragent.org/')

In [14]:
# when excluding previously scraped post URLs from posts_metadata_from_individual_posts_file_name ,
# keep those where scraping failed (empty contact_email field)