In [1]:
# blog post tutorial followed: 
# https://towardsdatascience.com/web-scraping-craigslist-a-complete-tutorial-c41cea4f4981

In [2]:
#import get to call a get request on the site
from requests import get

#get the first page of the boston car prices
#get only clean titles, with pictures, for Honda Civic

response = get('https://boston.craigslist.org/search/gbs/cta?hasPic=1&max_price=15000&auto_make_model=honda+civic&min_auto_year=2012&auto_paint=2&auto_paint=4&auto_paint=8&auto_title_status=1&auto_transmission=2') 

from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser') # Me: in class we use XLML

#get the macro-container for the housing posts
posts = html_soup.find_all('li', class_= 'result-row')
print(type(posts)) #to double check that I got a ResultSet
print(len(posts)) #to double check I got 120 (elements/page)

<class 'bs4.element.ResultSet'>
51


In [3]:
# In order to scale this, make sure to work in the following way: 
# grab the first post and all the variables you want from it, 
# make sure you know how to access each of them for one post 
# before you loop the whole page, and lastly, make sure you 
# successfully scraped one page before adding the loop 
# that goes through all pages.

In [4]:
post_one = posts[0]
post_one

<li class="result-row" data-pid="7039560108" data-repost-of="7016732935">
<a class="result-image gallery" data-ids="1:00H0H_cyPNwERD5O3,1:00i0i_hKbUOeHVqS9,1:00z0z_hylPL9k18Av,1:00202_icS92dU8zai,1:00909_itJxaVEER7h,1:00G0G_ki93Ak3nghD,1:00O0O_41KH1bTyDhm,1:00h0h_gNScth3x6j1,1:00u0u_5OAKgcjICkd,1:00f0f_dGZ8sxkan9O,1:00B0B_lyaXLtnmfT0,1:00505_28sNDYXSP0L,1:00909_5HGLFIJIhgG,1:00H0H_4hMhOzx1EkY,1:00f0f_5BPbRedPBtp,1:00707_6k2GkgBAuI3,1:00404_hdPdpB90iwu" href="https://boston.craigslist.org/gbs/ctd/d/new-town-2019-honda-civic-lx-8500-miles/7039560108.html">
<span class="result-price">$14995</span>
</a>
<p class="result-info">
<span class="icon icon-star" role="button">
<span class="screen-reader-text">favorite this post</span>
</span>
<time class="result-date" datetime="2019-12-16 13:49" title="Mon 16 Dec 01:49:59 PM">Dec 16</time>
<a class="result-title hdrlnk" data-id="7039560108" href="https://boston.craigslist.org/gbs/ctd/d/new-town-2019-honda-civic-lx-8500-miles/7039560108.html">2019

In [5]:
# the price of the post is easy to grab
post_one_price = post_one.a.text
post_one_price = post_one_price.strip() #removes white spaces before and after a string
post_one_price

'$14995'

In [6]:
post_one_time = post_one.time.text
post_one_datetime = post_one.time['datetime']
#post_one_datetime = post_one_datetime['datetime']
post_one_time, post_one_datetime

('Dec 16', '2019-12-16 13:49')

In [7]:
post_one_url = post_one.a['href']
post_one_url

'https://boston.craigslist.org/gbs/ctd/d/new-town-2019-honda-civic-lx-8500-miles/7039560108.html'

In [8]:
post_one_title = post_one.find('a', class_="result-title hdrlnk").text
post_one_title

'2019 HONDA CIVIC LX 8500 MILES ONE OWNER FACTORY SERVICED AND WARRANTY'

In [9]:
post_one_hood = post_one.find('span', class_="result-hood").text.strip().lstrip("(").rstrip(")")
post_one_hood

'BRIGHTON'

In [10]:
#find the total number of posts to find the limit of the pagination
results_num = html_soup.find('div', class_= 'search-legend')
results_total = int(results_num.find('span', class_='totalcount').text)
results_total

24

In [11]:
#build out the loop
#avoid throttling by not sending too many requests one after the other
from time import sleep
import re
from random import randint 
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

In [12]:
#each page has 119 posts so each new page is defined as follows: s=120, 
#s=240, s=360, and so on. So we need to step in size 120 in the np.arange function
pages = np.arange(0, results_total+1, 120)
iterations = 0
post_prices = []
post_timing = []
post_links = []
post_title_texts = []
post_hoods = []

In [13]:
for page in pages:
    
    #get request
    response = get("https://boston.craigslist.org/search/gbs/cta?" 
                   + "s=" #the parameter for defining the page number 
                   + str(page) #the page number in the pages array from earlier
                   + "&hasPic=1"
                   + "&auto_make_model=honda+civic&auto_title_status=1")

    sleep(randint(1,5))
     
    #throw warning for status codes that are not 200
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(requests, response.status_code))
        
    #define the html text
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    #define the posts
    posts = html_soup.find_all('li', class_= 'result-row')
        
    #extract data item-wise
    for post in posts:

        if post.find('span', class_ = 'result-hood') is not None:

            #posting date
            #grab the datetime element 0 for date and 1 for time
            post_datetime = post.find('time', class_= 'result-date')['datetime']
            post_timing.append(post_datetime)

            #neighborhoods
            post_hood = post.find('span', class_= 'result-hood').text
            post_hoods.append(post_hood)

            #title text
            post_title = post.find('a', class_='result-title hdrlnk')
            post_title_text = post_title.text
            post_title_texts.append(post_title_text)

            #post link
            post_link = post_title['href']
            post_links.append(post_link)
            
            #removes the \n whitespace from each side, removes the currency symbol, and 
            #turns it into an int
            post_price = int(post.a.text.strip().replace("$", "")) 
            post_prices.append(post_price)
            
#             if post.find('span', class_ = 'housing') is not None:
                
#                 #if the first element is accidentally square footage
#                 if 'ft2' in post.find('span', class_ = 'housing').text.split()[0]:
                    
#                     #make bedroom nan
#                     bedroom_count = np.nan
#                     bedroom_counts.append(bedroom_count)
                    
#                     #make sqft the first element
#                     sqft = int(post.find('span', class_ = 'housing').text.split()[0][:-3])
#                     sqfts.append(sqft)
                    
#                 #if the length of the housing details element is more than 2
#                 elif len(post.find('span', class_ = 'housing').text.split()) > 2:
                    
#                     #therefore element 0 will be bedroom count
#                     bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
#                     bedroom_counts.append(bedroom_count)
                    
#                     #and sqft will be number 3, so set these here and append
#                     sqft = int(post.find('span', class_ = 'housing').text.split()[2][:-3])
#                     sqfts.append(sqft)
                    
#                 #if there is num bedrooms but no sqft
#                 elif len(post.find('span', class_ = 'housing').text.split()) == 2:
                    
#                     #therefore element 0 will be bedroom count
#                     bedroom_count = post.find('span', class_ = 'housing').text.replace("br", "").split()[0]
#                     bedroom_counts.append(bedroom_count)
                    
#                     #and sqft will be number 3, so set these here and append
#                     sqft = np.nan
#                     sqfts.append(sqft)                    
                
#                 else:
#                     bedroom_count = np.nan
#                     bedroom_counts.append(bedroom_count)
                
#                     sqft = np.nan
#                     sqfts.append(sqft)
                
#             #if none of those conditions catch, make bedroom nan, this won't be needed    
#             else:
#                 bedroom_count = np.nan
#                 bedroom_counts.append(bedroom_count)
                
#                 sqft = np.nan
#                 sqfts.append(sqft)
#             #    bedroom_counts.append(bedroom_count)
                
#             #    sqft = np.nan
#             #    sqfts.append(sqft)
                
    iterations += 1
    print("Page " + str(iterations) + " scraped successfully!")

print("\n")

print("Scrape complete!")

Page 1 scraped successfully!


Scrape complete!


In [14]:
# The code below creates the dataframe from the lists of values!
import pandas as pd

cars = pd.DataFrame({'posted': post_timing,
                       'neighborhood': post_hoods,
                       'post title': post_title_texts,
                        'URL': post_links,
                       'price': post_prices})
print(cars.info())
cars.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
posted          32 non-null object
neighborhood    32 non-null object
post title      32 non-null object
URL             32 non-null object
price           32 non-null int64
dtypes: int64(1), object(4)
memory usage: 1.3+ KB
None


Unnamed: 0,posted,neighborhood,post title,URL,price
0,2019-12-16 13:49,(BRIGHTON),2019 HONDA CIVIC LX 8500 MILES ONE OWNER FACTO...,https://boston.craigslist.org/gbs/ctd/d/new-to...,14995
1,2019-12-16 07:11,($1000down$74week/ToplineImport.com),2016 Honda Civic EX 2.0 Sedan/40k/You are APPR...,https://boston.craigslist.org/gbs/ctd/d/haverh...,0
2,2019-12-15 00:50,"(NEWTON, MA)",2017 HONDA CIVIC LX SEDAN ONE OWNER 35k MILES ...,https://boston.craigslist.org/gbs/ctd/d/west-n...,13200
3,2019-12-15 00:44,"(NEWTON, MA)",2016 HONDA CIVIC LX SEDAN ONE OWNER 38k MILES ...,https://boston.craigslist.org/gbs/ctd/d/west-n...,12500
4,2019-12-14 13:27,(BRIGHTON),2019 HONDA CIVIC LX 8500 MILES ONE OWNER FACTO...,https://boston.craigslist.org/gbs/ctd/d/new-to...,14995
5,2019-12-13 15:07,(BRIGHTON),2019 HONDA CIVIC LX 8500 MILES ONE OWNER FACTO...,https://boston.craigslist.org/gbs/ctd/d/new-to...,14995
6,2019-12-13 07:25,(Everett),Honda civic EXL,https://boston.craigslist.org/gbs/cto/d/everet...,6000
7,2019-12-12 11:55,(Attleboro),2012 Honda Civic LX 4dr Sedan 5A 147244 Miles,https://boston.craigslist.org/gbs/ctd/d/attleb...,6500
8,2019-12-11 17:35,(Imotobank),2012 Honda Civic LX 4dr Sedan 5A,https://boston.craigslist.org/gbs/ctd/d/south-...,6975
9,2019-12-11 17:31,(Imotobank),2014 Honda Civic EX 2dr Coupe CVT,https://boston.craigslist.org/gbs/ctd/d/south-...,10975


In [15]:
pd.DataFrame.to_csv(cars,'cars.csv', index=False)