This code extracts the URLs of new Craigslist posts from list pages such as https://sfbay.craigslist.org/search/roo?hasPic=1&availabilityMode=0 , together with some additional metadata about each post available on the list page. It saves this data to a CSV file for further processing, in particular the scraping of the individual post pages.

See also [data flow overview](https://docs.google.com/presentation/d/1ug_iXh5ZUFRYexmZSmq_uq3TI7Yvbh03kgC_SBAobmU/edit)

In [1]:
import socks
import socket
import urllib
from urllib import request

from bs4 import BeautifulSoup
import json
import re
from pathlib import Path

import time
import datetime
from time import sleep

import pandas as pd
import numpy as np

In [2]:
# Connect via Tor (requires Tor Browser running): every socket created from
# here on is routed through the SOCKS5 proxy expected at localhost:9150.
# https://stackoverflow.com/questions/31777692/python3-requests-with-sock5-proxy
socks.set_default_proxy(socks.SOCKS5, "localhost", 9150)
socket.socket = socks.socksocket

# Double-check that we got a Tor IP. The response is used as a context manager
# so that the underlying connection is closed once the body has been read.
with request.urlopen('https://check.torproject.org/api/ip') as r:
    torcheck = json.loads(r.read())

# Abort early rather than scrape from a non-anonymized connection.
if not torcheck['IsTor']:
    raise ConnectionError('Looks like we are not using the Tor proxy')

In [3]:
# todo: set an HTTP request header?

In [4]:
# Craigslist subdomains of the regions to scrape; region list as compiled at
# https://docs.google.com/spreadsheets/d/1w9jFJ77-RNfbTJ5faHQTuyi9H5SggZAchUaxMpvfPGo/edit#gid=955170884&range=D4:D21
# The trailing comment on each entry names the corresponding metro area.
area_strings = [
    'newyork',       # New York
    'losangeles',    # Los Angeles
    'chicago',       # Chicago
    'dallas',        # Dallas
    'houston',       # Houston
    'washingtondc',  # Washington DC
    'philadelphia',  # Philadelphia
    'miami',         # Miami
    'atlanta',       # Atlanta
    'boston',        # Boston
    'phoenix',       # Phoenix
    'sfbay',         # San Francisco
    'inlandempire',  # Riverside
    'detroit',       # Detroit
    'seattle',       # Seattle
    'minneapolis',   # Minneapolis
    'sandiego',      # San Diego
    'tampa',         # Tampa
    'denver',        # Denver (CO)
    'baltimore',     # Baltimore
]


In [5]:
# CSV file that collects the list-page metadata of all scraped posts.
posts_metadata_file_name = '../../data/raw/posts_metadata_from_lists.csv'

# Header line of that file. Data rows are appended elsewhere without a header,
# so the file has to be seeded with it exactly once.
posts_metadata_header = 'area_string,posted,neighborhood,post title,number_bedrooms,sqft,URL,price,requested'

# Initialize the file with headers on first use. This is done in plain Python
# instead of an IPython `!echo` shell escape so that it does not depend on the
# shell in use, and so that a missing target directory raises an error here
# instead of failing silently.
if not Path(posts_metadata_file_name).is_file():
    with open(posts_metadata_file_name, 'w', encoding='utf-8') as header_file:
        header_file.write(posts_metadata_header + '\n')

In [6]:
# Read in the current ad list of each region and store the metadata of each ad.
# adapted from https://towardsdatascience.com/web-scraping-craigslist-a-complete-tutorial-c41cea4f4981 :
# cf. https://gist.githubusercontent.com/rileypredum/3852250bd79afa0abad9689b6ffd8675/raw/cbf1a5236eac88ef05bebd0f13f4caa0b1d4925a/loop_craigslist.py


def _parse_housing(post):
    """Return ``(bedroom_count, sqft)`` read from the 'housing' span of a post.

    ``post`` is one result row as returned by BeautifulSoup. The span text is
    split on whitespace and interpreted by the number of tokens:

    * first token contains 'ft2': only a square footage is given
    * more than two tokens: token 0 is the bedroom count, token 2 the sqft
    * exactly two tokens: only a bedroom count is given

    A value that is not present is returned as ``np.nan``. The bedroom count
    is returned as a string with 'br' removed, the square footage as an int.
    """
    housing = post.find('span', class_='housing')
    if housing is None:
        return np.nan, np.nan

    housing_text = housing.text
    tokens = housing_text.split()
    # An empty span carries no information (and would break the indexing below).
    if not tokens:
        return np.nan, np.nan

    # The first element is the square footage; [:-3] strips the 'ft2' suffix.
    if 'ft2' in tokens[0]:
        return np.nan, int(tokens[0][:-3])

    # Bedroom count followed by a separator and the square footage.
    if len(tokens) > 2:
        return housing_text.replace("br", "").split()[0], int(tokens[2][:-3])

    # Bedroom count but no square footage.
    if len(tokens) == 2:
        return housing_text.replace("br", "").split()[0], np.nan

    return np.nan, np.nan


# One list per output column; all of them grow by one entry per post.
post_area_string = []
post_timing = []
post_hoods = []
post_title_texts = []
bedroom_counts = []
sqfts = []
post_links = []
post_prices = []
request_timing = []

for area_string in area_strings:

    # Search for posts in that region's "rooms & shares" which include an image:
    section_url = 'https://'+area_string+'.craigslist.org/search/roo?hasPic=1&availabilityMode=0'
    # The response is closed as soon as the page has been read.
    with request.urlopen(section_url) as r:
        # Timezone-aware replacement for the deprecated datetime.utcnow();
        # the formatted string is unchanged.
        request_datetime = datetime.datetime.now(datetime.timezone.utc).\
                                strftime('%Y-%m-%d %H:%M:%S+0000')
        page_html = r.read()
    html_soup = BeautifulSoup(page_html, 'html.parser')
    # get the macro container for the housing posts:
    posts = html_soup.find_all('li', class_= 'result-row')
    print(area_string+': '+str(len(posts)),'posts found')

    for post in posts:

        post_area_string.append(area_string)
        request_timing.append(request_datetime)

        # Posting date, taken from the 'datetime' attribute of the time element.
        # NB: doesn't carry time zone information on the website,
        # but seems to be in the time zone of the area (e.g. PST for sfbay)
        # The update times on individual posts (scraped in the other script)
        # do contain timezone information
        post_datetime = post.find('time', class_= 'result-date')['datetime']
        post_timing.append(post_datetime)

        # Neighborhood; a post without a neighborhood span is recorded as
        # missing instead of aborting the whole run with an AttributeError.
        hood_tag = post.find('span', class_= 'result-hood')
        post_hood = hood_tag.text if hood_tag is not None else np.nan
        post_hoods.append(post_hood)

        # Title text and post link both come from the title anchor.
        post_title = post.find('a', class_='result-title hdrlnk')
        post_title_text = post_title.text
        post_title_texts.append(post_title_text)
        post_link = post_title['href']
        post_links.append(post_link)

        # Price: strip surrounding whitespace, remove the currency symbol and
        # thousands separator, and turn it into an int. The pattern is a raw
        # string so that it contains no invalid string escape sequences.
        post_price = int(re.sub(r'[$,]', '', post.a.text.strip()))
        post_prices.append(post_price)

        # Bedroom count and square footage (np.nan where not given).
        bedroom_count, sqft = _parse_housing(post)
        bedroom_counts.append(bedroom_count)
        sqfts.append(sqft)

    # Pause between regions to keep the request rate low.
    time.sleep(10)


posts_metadata = pd.DataFrame({
    'area_string': post_area_string,
    'posted': post_timing,
    'neighborhood': post_hoods,
    'post title': post_title_texts,
    'number bedrooms': bedroom_counts,
    'sqft': sqfts,
    'URL': post_links,
    'price': post_prices,
    'requested': request_timing})

# This appends to the file if it already exists, without attempting deduplication
# of entries (this will happen later in the process).
# Not saving the index, as it would create inconsistencies with existing data.
# newline='' lets pandas control the line endings itself (otherwise every row
# is followed by a blank line on Windows); the encoding is pinned so that
# non-ASCII post titles are written the same way on every platform.
with open(posts_metadata_file_name, 'a', newline='', encoding='utf-8') as posts_meta_data_file:
    posts_metadata.to_csv(posts_meta_data_file, header=False, index=False)
# An alternative would be to save the data in a separate file on each run:
# posts_metadata.to_csv(path_or_buf = area_string + ' posts_metadata ' + request_datetimestring+'.csv')

newyork: 120 posts found
losangeles: 120 posts found
chicago: 120 posts found
dallas: 120 posts found
houston: 120 posts found
washingtondc: 120 posts found
philadelphia: 120 posts found
miami: 120 posts found
atlanta: 120 posts found
boston: 120 posts found
phoenix: 120 posts found
sfbay: 120 posts found
inlandempire: 120 posts found
detroit: 120 posts found
seattle: 120 posts found
minneapolis: 120 posts found
sandiego: 120 posts found
tampa: 120 posts found
denver: 120 posts found
baltimore: 120 posts found


In [7]:
# todo: retrieve earlier search result pages too?

In [8]:
# Quick and dirty estimate of posting frequencies per region
# (NB: doesn't consider DST changes or daily/weekly seasonality).
# For each region, the time span between the oldest and the newest post on the
# list page is scaled by (n+1)/n and the number of posts is divided by it.
for area_string in area_strings:
    areaposts = posts_metadata.loc[posts_metadata['area_string'] == area_string]
    num_posts = len(areaposts.posted)

    # min()/max() below raise ValueError on an empty selection.
    if num_posts == 0:
        print(area_string+': no posts, cannot estimate posting frequency')
        continue

    # 'posted' holds strings of the form 'YYYY-MM-DD HH:MM', which sort
    # chronologically, so min/max on the raw strings give oldest/newest.
    minposted_dt = datetime.datetime.strptime(min(areaposts.posted), '%Y-%m-%d %H:%M')
    maxposted_dt = datetime.datetime.strptime(max(areaposts.posted), '%Y-%m-%d %H:%M')
    duration_estimate_hours = ((maxposted_dt - minposted_dt).total_seconds() / 3600) \
                                    * (num_posts+1)/(num_posts)

    # All posts share one timestamp (e.g. a single post): no rate can be
    # derived, and dividing by the zero duration would raise.
    if duration_estimate_hours <= 0:
        print(area_string+': '+str(num_posts)+' posts with identical timestamps, '
              'cannot estimate posting frequency')
        continue

    print(area_string+': '+str(num_posts)+' posts in '+\
          str(round(duration_estimate_hours,1))+' hours ('+
          str(round(num_posts/duration_estimate_hours,1))+' posts/hour)')
 

newyork: 120 posts in 5.1 hours (23.6 posts/hour)
losangeles: 120 posts in 5.4 hours (22.2 posts/hour)
chicago: 120 posts in 32.0 hours (3.8 posts/hour)
dallas: 120 posts in 127.8 hours (0.9 posts/hour)
houston: 120 posts in 97.7 hours (1.2 posts/hour)
washingtondc: 120 posts in 16.5 hours (7.3 posts/hour)
philadelphia: 120 posts in 76.1 hours (1.6 posts/hour)
miami: 120 posts in 43.8 hours (2.7 posts/hour)
atlanta: 120 posts in 55.2 hours (2.2 posts/hour)
boston: 120 posts in 22.2 hours (5.4 posts/hour)
phoenix: 120 posts in 221.8 hours (0.5 posts/hour)
sfbay: 120 posts in 3.1 hours (38.8 posts/hour)
inlandempire: 120 posts in 98.0 hours (1.2 posts/hour)
detroit: 120 posts in 730.8 hours (0.2 posts/hour)
seattle: 120 posts in 22.1 hours (5.4 posts/hour)
minneapolis: 120 posts in 239.6 hours (0.5 posts/hour)
sandiego: 120 posts in 26.7 hours (4.5 posts/hour)
tampa: 120 posts in 269.5 hours (0.4 posts/hour)
denver: 120 posts in 75.7 hours (1.6 posts/hour)
baltimore: 120 posts in 170.8 hours (0.7 posts/hour)

In [9]:
# Check results: column summary followed by a random sample of ten rows.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
posts_metadata.info()
posts_metadata.sample(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   area_string      2400 non-null   object 
 1   posted           2400 non-null   object 
 2   neighborhood     2400 non-null   object 
 3   post title       2400 non-null   object 
 4   number bedrooms  1318 non-null   object 
 5   sqft             800 non-null    float64
 6   URL              2400 non-null   object 
 7   price            2400 non-null   int64  
 8   requested        2400 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 168.9+ KB
None


Unnamed: 0,area_string,posted,neighborhood,post title,number bedrooms,sqft,URL,price,requested
366,dallas,2021-11-30 12:56,(Mesquite dallas ),Rooms For Rent,,,https://dallas.craigslist.org/dal/roo/d/mesqui...,600,2021-11-30 21:23:53+0000
216,losangeles,2021-11-30 09:22,(Los Feliz central LA 213/323 ),Female room share With other female,2.0,,https://losangeles.craigslist.org/lac/roo/d/lo...,550,2021-11-30 21:23:23+0000
1377,sfbay,2021-11-30 11:40,(los gatos),Huge Master Suite On One Acre Estate With City...,,4500.0,https://sfbay.craigslist.org/sby/roo/d/huge-ma...,1500,2021-11-30 21:25:55+0000
8,newyork,2021-11-30 15:27,(Far Rockaway queens ),1 room in a 2 br apt,,,https://newyork.craigslist.org/que/roo/d/far-r...,850,2021-11-30 21:23:09+0000
1072,atlanta,2021-11-28 12:02,(Snellville 78 and Rosebud Rd otp east ),Nice Clean Master Room w Private Bathroom & Ne...,,200.0,https://atlanta.craigslist.org/eat/roo/d/grays...,625,2021-11-30 21:25:14+0000
610,washingtondc,2021-11-30 15:27,(Mt Pleasant district of columbia ),Large room available in fun Mt Pleasant group ...,7.0,,https://washingtondc.craigslist.org/doc/roo/d/...,1200,2021-11-30 21:24:24+0000
516,houston,2021-11-29 14:49,(Houston ),Recamara Privada en Renta,3.0,1500.0,https://houston.craigslist.org/roo/d/houston-r...,650,2021-11-30 21:24:07+0000
536,houston,2021-11-29 07:43,( ),Large Private Room for Rent with Shared Bathroom,,,https://houston.craigslist.org/roo/d/houston-l...,450,2021-11-30 21:24:07+0000
1834,minneapolis,2021-11-28 20:28,(Woodbury washington co / WI ),Lower Level Townhome - Utilities Included,,,https://minneapolis.craigslist.org/wsh/roo/d/s...,895,2021-11-30 21:27:03+0000
496,houston,2021-11-30 09:48,(Spring/The Woodlands ),Large bedroom (newer home),,3000.0,https://houston.craigslist.org/roo/d/spring-la...,730,2021-11-30 21:24:07+0000
