In [1]:
#https://towardsdatascience.com/web-scraping-craigslist-a-complete-tutorial-c41cea4f4981
#Importing all of the required packages
from requests import get
from bs4 import BeautifulSoup as soup
import numpy as np
from time import sleep
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
import pandas as pd

In [2]:
#Search-results page to scrape (sfbay / sfc / apa search, hasPic=1 filter)
url = ("https://sfbay.craigslist.org/search/sfc/apa"
       "?hasPic=1")

In [3]:
#download the first results page; the timeout stops a stalled connection from
#hanging the kernel, and raise_for_status() surfaces an HTTP error here instead
#of letting an error page be parsed as if it were search results
response = get(url, timeout=30)
response.raise_for_status()
page_soup = soup(response.text, "html.parser")

#get the html container of the housing posts
posts = page_soup.find_all('li', class_= 'result-row')

In [4]:
#1. Neighbourhood of the first listing
hood = posts[0].find(name='span', class_='result-hood')
#trim the surrounding whitespace, then delete every '(' and ')' character
hood.text.strip().translate(str.maketrans('', '', '()'))

'nob hill'

In [5]:
#2. Price of the first listing, displayed as the raw text of its price tag
price = posts[0].find(name='span', class_='result-price')
price.get_text()

'$4200'

In [6]:
#3. and 4. Let's get bedrooms  and sqft
bedding = posts[0].find('span', class_ = 'housing')

#start from empty strings so both names are always defined, even when the
#housing tag is missing, empty, or in a format that matches neither suffix
number_bedroom = ''
sqft = ''

if bedding is not None:
    #split() with no argument drops every run of spaces/newlines, leaving
    #tokens such as ['1br', '-', '600ft2', '-']
    for token in bedding.text.split():
        if token.endswith('br') and not number_bedroom:
            number_bedroom = token
        elif token.endswith('ft2') and not sqft:
            sqft = token

print(number_bedroom, sqft)

1br 


In [8]:
#5. Posting timestamp of the first listing, read from the <time> tag's
#   'datetime' attribute
datetime = posts[0].find(name='time', class_='result-date').attrs['datetime']
datetime

'2019-04-26 14:56'

In [11]:
#6. Title text of the first listing (text of its title link)
post_title = posts[0].find(name='a', class_='result-title hdrlnk').get_text()
post_title

'Furnished 1BR Apartment | 50 Joice Street | Nob Hill'

In [12]:
#7. URL of the first listing, taken from the href of its title link
post_link = posts[0].find(name='a', class_='result-title hdrlnk').attrs['href']
post_link

'https://sfbay.craigslist.org/sfc/apa/d/san-francisco-furnished-1br-apartment/6875599248.html'

In [13]:
#Build the list of page offsets to loop over
results_num = page_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text)
#offsets advance in steps of 120 (0, 120, 240, ...). The stop value is
#exclusive, so using results_total itself (not results_total+1) avoids asking
#for one extra, empty page when the total is an exact multiple of 120
pages = np.arange(0, results_total, 120)
pages

array([   0,  120,  240,  360,  480,  600,  720,  840,  960, 1080, 1200,
       1320, 1440, 1560, 1680, 1800, 1920, 2040, 2160, 2280, 2400, 2520,
       2640, 2760, 2880, 3000])

In [14]:
#reset the page counter and create one empty list per scraped field;
#the scraping loop appends to these
iterations = 0

post_timing = list()
post_hoods = list()
post_title_texts = list()
bedroom_counts = list()
sqfts = list()
post_links = list()
post_prices = list()

In [15]:
def parse_housing(housing_text):
    """Split a listing's housing blurb into ``(number_bedroom, sqft)`` strings.

    The blurb is split on whitespace (e.g. ``"1br - 600ft2 -"`` becomes
    ``['1br', '-', '600ft2', '-']``). The first token ending in ``br`` is
    returned as the bedroom count and the first ending in ``ft2`` as the floor
    area. A field that is not present is returned as ``''``, so the function
    always yields exactly one value per field.
    """
    number_bedroom, sqft = '', ''
    for token in housing_text.split():
        if token.endswith('br') and not number_bedroom:
            number_bedroom = token
        elif token.endswith('ft2') and not sqft:
            sqft = token
    return number_bedroom, sqft


for page in pages:

    #get requests; "s" is the result offset taken from the pages array
    response = get("https://sfbay.craigslist.org/search/sfc/apa?"
                   + "s="
                   + str(page)
                   + "&hasPic=1",
                   timeout=30) #do not hang forever on a stalled connection

    #random pause so requests are not sent one right after the other
    sleep(randint(1,5))

    #throw warning for status codes that are not 200
    #(iterations counts finished pages, so the current request is iterations + 1)
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(iterations + 1, response.status_code))

    #define the html text
    page_html = soup(response.text, 'html.parser')

    #define the posts
    posts = page_html.find_all('li', class_= 'result-row')

    for post in posts:
        #listings without a neighbourhood tag are skipped entirely
        hood_tag = post.find('span', class_ = 'result-hood')
        if hood_tag is None:
            continue

        #posting date
        post_datetime = post.find('time', class_= 'result-date')['datetime']

        #neighborhood (raw tag text)
        post_hood = hood_tag.text

        #title text and link both come from the title anchor
        post_title = post.find('a', class_='result-title hdrlnk')
        post_title_text = post_title.text
        post_link = post_title['href']

        #price: strip whitespace, the currency symbol and thousands separators
        #before converting; a missing or non-numeric price is stored as None
        #(pandas is expected to show it as NaN) instead of aborting the scrape
        price_tag = post.find('span', class_ = 'result-price')
        price_text = '' if price_tag is None else price_tag.text.strip().replace("$", "").replace(",", "")
        post_price = int(price_text) if price_text.isdigit() else None

        #bedrooms and square footage; '' for whichever is not listed
        bedding = post.find('span', class_ = 'housing')
        number_bedroom, sqft = parse_housing('' if bedding is None else bedding.text)

        #append every field together so all seven lists keep the same length
        post_timing.append(post_datetime)
        post_hoods.append(post_hood)
        post_title_texts.append(post_title_text)
        post_links.append(post_link)
        post_prices.append(post_price)
        bedroom_counts.append(number_bedroom)
        sqfts.append(sqft)

    iterations += 1
    print("Page " + str(iterations) + " scraped")

print("\n")
print("All scraped")

Page 1 scraped
Page 2 scraped
Page 3 scraped
Page 4 scraped
Page 5 scraped
Page 6 scraped
Page 7 scraped
Page 8 scraped
Page 9 scraped
Page 10 scraped
Page 11 scraped
Page 12 scraped
Page 13 scraped
Page 14 scraped
Page 15 scraped
Page 16 scraped
Page 17 scraped
Page 18 scraped
Page 19 scraped
Page 20 scraped
Page 21 scraped
Page 22 scraped
Page 23 scraped
Page 24 scraped
Page 25 scraped
Page 26 scraped


All scraped


In [19]:
#assemble the scraped lists into one table, one column per field
listing_columns = {
    'posted': post_timing,
    'neighborhood': post_hoods,
    'post title': post_title_texts,
    'number bedrooms': bedroom_counts,
    'sqft': sqfts,
    'URL': post_links,
    'price': post_prices,
}
sf_apts = pd.DataFrame(listing_columns)

In [20]:
#info() prints its summary itself and returns None, so wrapping it in print()
#only adds a stray "None" line to the output
sf_apts.info()
sf_apts.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2956 entries, 0 to 2955
Data columns (total 7 columns):
posted             2956 non-null object
neighborhood       2956 non-null object
post title         2956 non-null object
number bedrooms    2956 non-null object
sqft               2956 non-null object
URL                2956 non-null object
price              2956 non-null int64
dtypes: int64(1), object(6)
memory usage: 161.7+ KB
None


Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price
0,2019-04-26 14:42,(mission district),Mission Dolores Park Views/240 Cumberland/W/D ...,1br,,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,4195
1,2019-04-26 14:41,(marina / cow hollow),OPEN SUN 2:30PM-3:15PM 3BR/2BA Quiet House w/...,3br,,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,5795
2,2019-04-26 14:41,(laurel hts / presidio),2 bedroom beach apartment overlooking Baker Beach,2br,1200ft2,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,3790
3,2019-04-26 14:40,(inner sunset / UCSF),Inner sunset/1290 20thAve.&Irving/Available Now!!,1br,547ft2,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,3095
4,2019-04-26 14:40,(russian hill),Russian Hill | 2129 Hyde Street | Furnished 2B...,2br,,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,6500
5,2019-04-26 14:39,(SOMA / south beach),"3 Month Furnished 1BR, 2BA SOMA LOFT Available...",1br,906ft2,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,3900
6,2019-04-26 14:38,(mission district),Mission Dolores Park/3875 18th&Sanchez/Muni/Av...,2br,,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,3895
7,2019-04-26 14:37,(downtown / civic / van ness),Downtown/1050 Post&Polk/Remodeled/Available Now!!,1br,,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,2795
8,2019-04-26 14:35,(russian hill),Russian Hill | 2123 Hyde Street | Furnished 3B...,3br,,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,9500
9,2019-04-26 14:35,(SOMA / south beach),"Electric Car Charging Stations, Washer and dry...",1br,,https://sfbay.craigslist.org/sfc/apa/d/san-fra...,3685


In [56]:
#write the scraped listings to a comma-separated, UTF-8 encoded file
sf_apts.to_csv(path_or_buf='sfapts.csv', encoding='utf-8', sep=',')

'3 New Asian Calligraphy Brushes Bamboo with natural bristles'