In [3]:
import pandas as pd
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
import requests

In [5]:
# Search page for apartments ("apa") in the "eby" area of sfbay.craigslist.org.
url_base = 'http://sfbay.craigslist.org/search/eby/apa'
# Query-string filters; requests encodes this dict into the URL for us.
params = dict(bedrooms=1, is_furnished=1)
# Without a timeout a stalled connection would hang the kernel indefinitely.
rsp = requests.get(url_base, params=params, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page further down.
rsp.raise_for_status()

In [6]:
# Note that requests automatically built the full URL, appending `params`
# as the query string:
print(rsp.url)

http://sfbay.craigslist.org/search/eby/apa?bedrooms=1&is_furnished=1


In [7]:
# Peek at the first 500 characters of the response body.
print(rsp.text[:500])

﻿<!DOCTYPE html>

<html class="no-js"><head>
    <title>SF bay area apartments / housing rentals  - craigslist</title>

    <meta name="description" content="SF bay area apartments / housing rentals  - craigslist">
    <meta http-equiv="X-UA-Compatible" content="IE=Edge"/>
    <link rel="canonical" href="https://sfbay.craigslist.org/search/eby/apa">
    <link rel="alternate" type="application/rss+xml" href="https://sfbay.craigslist.org/search/eby/apa?bedrooms=1&amp;format=rss&amp;is_furnished=1"


In [8]:
from bs4 import BeautifulSoup as bs4

# BeautifulSoup can quickly parse our text; the second argument names the
# parser to use, so we tell it explicitly that we're giving it HTML.
html = bs4(rsp.text, 'html.parser')

# BeautifulSoup makes it easy to look through a document: `prettify()` returns
# the markup re-indented one tag per line (only the first 1000 chars are shown).
print(html.prettify()[:1000])

﻿
<!DOCTYPE html>
<html class="no-js">
 <head>
  <title>
   SF bay area apartments / housing rentals  - craigslist
  </title>
  <meta content="SF bay area apartments / housing rentals  - craigslist" name="description">
   <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
   <link href="https://sfbay.craigslist.org/search/eby/apa" rel="canonical">
    <link href="https://sfbay.craigslist.org/search/eby/apa?bedrooms=1&amp;format=rss&amp;is_furnished=1" rel="alternate" title="RSS feed for craigslist | SF bay area apartments / housing rentals  - craigslist " type="application/rss+xml">
     <link href="https://sfbay.craigslist.org/search/eby/apa?s=100&amp;bedrooms=1&amp;is_furnished=1" rel="next">
      <meta content="width=device-width,initial-scale=1" name="viewport">
       <link href="//www.craigslist.org/styles/cl.css?v=a1714916751d26ee0ae47420332c611b" media="all" rel="stylesheet" type="text/css">
        <link href="//www.craigslist.org/styles/search.css?v=61ca3f27c9d23e2186e4d

In [9]:
# find_all will pull every entry that fits your search criteria.
# Note that we pass the criteria as an `attrs` dictionary: "class" is a
# reserved word in Python, so the attribute name has to be given as a string key.
apts = html.find_all('p', attrs={'class': 'row'})
# Number of matching <p class="row"> elements found on this page.
print(len(apts))

100


In [10]:
# Take a single listing row (index 15 looks like an arbitrary example) and
# print its markup so we can see which tags hold the fields we care about.
this_appt = apts[15]
print(this_appt.prettify())

<p class="row" data-pid="5638001416" data-repost-of="5398236661">
 <a class="i gallery" data-ids="1:00S0S_2D8zdVAIBEd,1:00e0e_4mwvkM2sgDU,1:00707_1rNpTlqaMFg,1:00U0U_hoja41xlkeH,1:00A0A_8OkZtt9Pqao,1:00B0B_5YlgyqeVlZv,1:00f0f_6Dbeee9Gp4M,1:00T0T_2Cz8NusmInf,1:00T0T_6I5H6bR4C9X,1:00h0h_fMKLyLavCyI" href="/eby/apa/5638001416.html">
 </a>
 <span class="txt">
  <span class="pl">
   <span class="icon icon-star" role="button">
    <span class="screen-reader-text">
     <? __("favorite this post") ?>
    </span>
   </span>
   <time datetime="2016-06-27 01:40" title="Mon 27 Jun 01:40:07 AM">
    Jun 27
   </time>
   <a class="hdrlnk" data-id="5638001416" href="/eby/apa/5638001416.html">
    <span id="titletextonly">
     furnished rooms for rent
    </span>
   </a>
  </span>
  <span class="l2">
   <span class="price">
    $1600
   </span>
   <span class="housing">
    / 1br -
   </span>
   <span class="pnr">
    <small>
     (berkeley)
    </small>
    <span class="px">
     <span class="p">
 

In [11]:
# So now we'll pull out a couple of things we might be interested in.
# It looks like the "housing" span contains bedroom/size information, so
# we'll pull that. Note that `findAll` returns a list; since there's only
# one matching entry in this row's HTML, we just take the first item.
size = this_appt.findAll(attrs={'class': 'housing'})[0].text
print(size)

/ 1br - 


In [12]:
def find_size_and_brs(size):
    """Parse a listing's "housing" text into (square footage, bedroom count).

    The text is a ' - '-separated list of fields such as ``'/ 1br - '`` or
    ``'/ 2br - 850ft2 - '``.  A field ending in ``br`` is the number of
    bedrooms and a field ending in ``ft2`` is the size; the fields may appear
    in any order and unrelated fields are ignored.

    Parameters
    ----------
    size : str
        Raw text of the housing element.

    Returns
    -------
    tuple of (float, float)
        ``(this_size, n_brs)``.  Either value is NaN when the corresponding
        field is absent or its number cannot be parsed, so an empty or
        unexpected string yields ``(nan, nan)`` instead of raising.
    """
    this_size = float('nan')
    n_brs = float('nan')
    # Drop the leading "/ " and trailing " - " decoration before splitting.
    for field in size.strip('/- \t\r\n').split(' - '):
        field = field.strip()
        try:
            if field.endswith('br'):
                n_brs = float(field[:-len('br')])
            elif field.endswith('ft2'):
                this_size = float(field[:-len('ft2')])
        except ValueError:
            # Malformed number (e.g. "xbr"): leave that value as NaN so one
            # odd listing does not abort a whole scrape.
            pass
    return this_size, n_brs
# Parse the example listing's housing text into a numeric size and bedroom count.
this_size, n_brs = find_size_and_brs(size)

In [13]:
# Now we'll also pull a few other things from the same listing row:
# the posting time, read from the <time> tag's `datetime` attribute...
this_time = this_appt.find('time')['datetime']
this_time = pd.to_datetime(this_time)
# ...the price, with the leading "$" stripped before converting to float...
this_price = float(this_appt.find('span', {'class': 'price'}).text.strip('$'))
# ...and the title, which is the text of the "hdrlnk" anchor.
this_title = this_appt.find('a', attrs={'class': 'hdrlnk'}).text

In [14]:
# Now we've got the size, n_bedrooms, time of listing, price, and title;
# print them one per line, in that order.
print('\n'.join([str(i) for i in [this_size, n_brs, this_time, this_price, this_title]]))


nan
1.0
2016-06-27 01:40:00
1600.0
furnished rooms for rent


In [15]:
# Area prefixes that get substituted into the search URL path
# (.../search/<prefix>/apa) when we scrape each area in turn.
loc_prefixes = ['eby', 'nby', 'sfc', 'sby', 'scz']

In [16]:
def find_prices(results):
    """Return the asking price of every listing row as a list of floats.

    Each row is searched for a ``<span class="price">``; its text has the
    leading/trailing ``$`` stripped and is converted to ``float``.  Rows with
    no price span contribute ``np.nan`` so the output stays aligned with the
    input, one value per row.
    """
    def _price_of(row):
        tag = row.find('span', {'class': 'price'})
        return np.nan if tag is None else float(tag.text.strip('$'))

    return [_price_of(row) for row in results]

def find_times(results):
    """Return the posting time of every listing row as a list.

    Parameters
    ----------
    results : iterable
        Listing rows; each must support ``find('time')`` and return either
        ``None`` or a tag whose ``datetime`` attribute holds the timestamp.

    Returns
    -------
    list
        One entry per row, in order: a pandas ``Timestamp`` parsed from the
        ``datetime`` attribute, or ``pd.NaT`` when the row has no ``<time>``
        tag or the tag lacks that attribute.
    """
    times = []
    # Iterate over the rows we were given, not a notebook-level global.
    for rw in results:
        time_tag = rw.find('time')
        stamp = time_tag.get('datetime') if time_tag is not None else None
        if stamp is not None:
            times.append(pd.to_datetime(stamp))
        else:
            # NaT is pandas' missing-value marker for datetimes.
            times.append(pd.NaT)
    return times

In [17]:
# Now loop through all of this and store the results.
# For every area prefix we request the result pages at offsets 0, 100 and 200,
# parse each listing row, and keep one DataFrame per page.
results = []  # We'll store the data here
# Careful with this...too many queries == your IP gets banned temporarily
search_indices = np.arange(0, 300, 100)
for loc in loc_prefixes:
    print(loc)  # function-call form runs on both Python 2 and Python 3
    for i in search_indices:
        url = 'http://sfbay.craigslist.org/search/{0}/apa'.format(loc)
        # The timeout keeps a stalled connection from hanging the notebook;
        # raise_for_status() surfaces HTTP errors instead of parsing an error page.
        resp = requests.get(url, params={'bedrooms': 1, 's': i}, timeout=30)
        resp.raise_for_status()
        txt = bs4(resp.text, 'html.parser')
        apts = txt.findAll(attrs={'class': "row"})
        if not apts:
            # No listings at this offset: zip(*[]) below could not be unpacked,
            # and higher offsets for this area would be empty as well.
            break

        # Find the size of all entries (rows without a "housing" tag get NaNs)
        housing_tags = [rw.find(attrs={'class': 'housing'}) for rw in apts]
        sizes_brs = [(np.nan, np.nan) if tag is None
                     else find_size_and_brs(tag.text)
                     for tag in housing_tags]
        sizes, n_brs = zip(*sizes_brs)  # This unzips into 2 vectors

        # Find the title and link
        title = [rw.find('a', attrs={'class': 'hdrlnk'}).text
                 for rw in apts]
        links = [rw.find('a', attrs={'class': 'hdrlnk'})['href']
                 for rw in apts]

        # Find the time and price
        time = [pd.to_datetime(rw.find('time')['datetime']) for rw in apts]
        price = find_prices(apts)

        # We'll create a dataframe to store all the data. Building it from a
        # dict of columns (rather than one mixed numpy array) lets each column
        # keep its own dtype instead of everything becoming `object`.
        col_names = ['time', 'price', 'size', 'brs', 'title', 'link']
        df = pd.DataFrame({'time': time,
                           'price': price,
                           'size': list(sizes),
                           'brs': list(n_brs),
                           'title': title,
                           'link': links},
                          columns=col_names)
        df = df.set_index('time')

        # Add the location variable to all entries
        df['loc'] = loc
        results.append(df)

# Finally, concatenate all the results
results = pd.concat(results, axis=0)

eby
nby
sfc
sby
scz


In [18]:
# We'll make sure that the right columns are represented numerically.
# `pd.to_numeric` replaces the deprecated `convert_objects`; errors='coerce'
# keeps the same behaviour of turning unparseable values into NaN.
results[['price', 'size', 'brs']] = (
    results[['price', 'size', 'brs']].apply(pd.to_numeric, errors='coerce')
)


  from ipykernel import kernelapp as app


In [19]:
# And there you have it -- a preview of the first few rows of the combined table:
results.head()

Unnamed: 0_level_0,price,size,brs,title,link,loc
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-06-27 11:19:00,2318.0,786.0,1.0,Walk to the Lake and Park!,/eby/apa/5656321646.html,eby
2016-06-27 11:19:00,2500.0,850.0,2.0,2 Bed/1 Bath Apartment Home Coming Available! ...,/eby/apa/5614049056.html,eby
2016-06-27 11:19:00,2995.0,1450.0,1.0,OPEN HOUSE TODAY 11AM-2PM!!! 1BED/1BATH LIVE/W...,/eby/apa/5631998326.html,eby
2016-06-27 11:18:00,1625.0,936.0,2.0,Bob lives here. Don't you want to be like Bob?...,/eby/apa/5656301177.html,eby
2016-06-27 11:18:00,3284.0,498.0,2.0,Two Bedroom Apartment Ready for Move In!,/eby/apa/5656318863.html,eby
