In [12]:
import requests
from bs4 import BeautifulSoup
import sys
import pandas as pd
import numpy as np
import re
import csv

In [18]:
# create csvfile
header = ['title', 'rental price','bedrooms','bathrooms','Area (SF)','latitude,longitude', \
          'cats are OK - purrr','dogs are OK - wooof','w/d in unit', 'wd in unit', \
             'laundry in bldg', 'laundry on site', 'attached garage',  \
             'detached garage', 'carport', 'furnished', 'no smoking', 'wheelchair accessible' ]
with open('SFrentals.csv','w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)

In [2]:
def fetch_search_results(query=None, minAsk=None, maxAsk=None, bedrooms=None, s=None):
    search_params = {
        key: val for key, val in locals().items() if val is not None
    }
    if not search_params:
        raise ValueError("No valid keywords")

    base = 'https://sfbay.craigslist.org/search/sfc/apa'
    resp = requests.get(base, params=search_params, timeout=3)
    resp.raise_for_status()  # <- no-op if status==200
    return resp.content, resp.encoding

# keywords: query=keyword+values+here
# price: minAsk=NNN maxAsk=NNN
# bedrooms: bedrooms=N

# s: page number in incremetn of 120, 
# s = None for page 1, s = 120 for page 2, s = 240 for page 3

In [252]:
# search for listings based on inputted criterias: 
x = fetch_search_results('apartment',100,15000,2,960)

# Search #1: x = fetch_search_results('apartment',1000,15000,3,240) # this only returned 96 listings as oppose to 120
# Search #2: x = fetch_search_results('apartment',1000,15000,3,120)
# Search #3: x = fetch_search_results('apartment',1000,15000,3)
# Search #4: x = fetch_search_results('apartment',1000,15000,2)
# switched to VPN 
# Search #5: x = fetch_search_results('apartment',1000,15000,2,120)
# note there is a difference of 48 rows at this check point

# Search #6: x = fetch_search_results('apartment',1000,15000,2,240)
# Search #7: x = fetch_search_results('apartment',1000,15000,2,360)
# Search #8: x = fetch_search_results('apartment',1000,15000,2,480)
# Search #9: x = fetch_search_results('apartment',1000,15000,2,600)
# Search #10: x = fetch_search_results('apartment',1000,15000,1)
# Search #11: x = fetch_search_results('apartment',1000,15000,1,120)
# Search #12: x = fetch_search_results('apartment',1000,15000,1,240)
# Search #13: x = fetch_search_results('apartment',1000,15000,1,360)
# Search #14: x = fetch_search_results('apartment',1000,15000,1,480)
# Search #15: x = fetch_search_results('apartment',1000,15000,1,600)
# Search #16: x = fetch_search_results('apartment',1000,15000,4) # note this returned 154 links!
# Search #17: x = fetch_search_results('apartment',1000,15000,4,120) # note this returned 0
# Search #18: x = fetch_search_results('apartment',1000,15000,5) # returend 34 links!
# Search #19: x = fetch_search_results('apartment',1000,15000,6) # returned 6 links!
# Search #20: x = fetch_search_results('apartment',1000,20000,7) # returend 3 links
# Search #21: x = fetch_search_results('apartment',1000,30000,8) # returned 1 link
# Search #22: x = fetch_search_results('apartment',1000,30000,1,720) 
# Search #23: x = fetch_search_results('apartment',1000,30000,1,840)
# Search #24: x = fetch_search_results('apartment',1000,30000,1,960)
# Search #25: x = fetch_search_results('apartment',100,1000,1) # returned 128 links
# Search #26: x = fetch_search_results('apartment',100,1000,1,120) #returned 0
# Search #27: x = fetch_search_results('apartment',100,1000,2) # returned 120
# Search #28: x = fetch_search_results('apartment',100,15000,2,720)
# Search #29: x = fetch_search_results('apartment',100,15000,2,840)
# Search #30: x = fetch_search_results('apartment',100,15000,2,960)

In [253]:
# choose first element of the returned list for the html text
htmlstring = str(x[0])

In [254]:
# convert to lxml
soup = BeautifulSoup(htmlstring,'lxml')

In [255]:
# select links of each listing
links = []
for link in soup.find_all('a', {'class': 'result-image'}):
    links.append(link.get('href'))

In [256]:
upper_limit = len(links)
print(len(links))

120


In [257]:
# create dictionary of keys for categorical features
dict_keys = ('cats are OK - purrr','dogs are OK - wooof','w/d in unit', 'wd in unit', \
             'laundry in bldg', 'laundry on site', 'attached garage',  \
             'detached garage', 'carport', 'furnished', 'no smoking', 'wheelchair accessible')
categories = {key: 0 for key in dict_keys}
print(len(categories))
print(categories)


12
{'cats are OK - purrr': 0, 'dogs are OK - wooof': 0, 'w/d in unit': 0, 'wd in unit': 0, 'laundry in bldg': 0, 'laundry on site': 0, 'attached garage': 0, 'detached garage': 0, 'carport': 0, 'furnished': 0, 'no smoking': 0, 'wheelchair accessible': 0}


In [258]:
# initiate empty list for csv file
csv_rows_list = []

# initiate empty list to hold each row in csv file
csv_rows = []

# created counter for skipped rows (deleted postings on Craiglist)
skipped_row_count = 0


# iterate through each link to extract relevant info
for i in range(upper_limit):
#     print(links[i])   #CHECK LINK
    
    info = requests.get(links[i])
    soup = BeautifulSoup(info.content,'lxml')
    

    # returns the full title description of listing, which is not exactly necessary
    for title in soup.find_all(id='titletextonly'):  
        csv_rows.append(title.text)
        
    
    # returns rent price/month
    for rent in soup.find_all(class_='price'):  
        dollars = int(rent.text.strip('$'))    
        csv_rows.append(dollars)
        

    # returns number of bedrooms as int, number of bathrooms as float, square footage as int
    bed_bath_categories = []
    shared_bubble = soup.find_all(class_='shared-line-bubble')
    for bed_bath in shared_bubble:  
        ary = bed_bath.text.replace(" ","").split("/")
        if "available" in ary[0]:
            pass
        else:
            bed_bath_categories.append(ary)
    bed_bath_categories = [item for item in bed_bath_categories if item]
    
    
    # append number of beds to csv_rows
    try: 
        beds=bed_bath_categories[0][0]
        num_beds = [int(i) for i in re.findall(r'\d+', beds)][0] 
        csv_rows.append(num_beds)
    except IndexError:
        csv_rows.append('')
        print('skipped beds')  # Check for absence of value --> indicator of deleted posting
    
    # append number of bath to csv_rows
    try: 
        baths=bed_bath_categories[0][1]
        num_baths = [float(i) for i in re.findall(r'\d*\.?\d+', baths)][0]
        csv_rows.append(num_baths)
    except IndexError:
        csv_rows.append('')    
        print('skipped baths')  # Check for absence of value --> indicator of deleted posting
        
    # append sqft to csv_rows
    try: 
        if len(bed_bath_categories)>1:
            area = bed_bath_categories[1][0]
            num_area = [int(i) for i in re.findall(r'\d+', area)][0]
            csv_rows.append(num_area)
        else:
            csv_rows.append('')
    except IndexError:
        csv_rows.append('')    
        print('skipped area')  # Check for absence of value --> indicator of deleted posting
     

    # returns latitude and longitude
    location_attrs = {'data-latitude': True, 'data-longitude': True}
    listings = soup.find_all(id='map', attrs=location_attrs)
    try: 
        for listing in listings:
            location = {key: listing.attrs.get(key, '') for key in location_attrs}
        lat = location['data-latitude']
        long = location['data-longitude']
        latlong = (float(lat),float(long))
        csv_rows.append(latlong)
    except IndexError:
        csv_rows.append('')
    
    
    # returns 1 or 0 for the category features
    text_list=[]
    for misc in soup.find_all('p', class_='attrgroup'):   
        text = misc.text
        text_list.append(text)
    
    # select last element of list to exclude redundant info
    correct_text = text_list[-1]
    factors = correct_text.replace('/','').split('\n')
    factors = [i.replace('\n','') for i in factors if len(i)>0]

    # set values to 1 if factors is in the dictionary, "categories"
    for string in factors:
        if string in dict_keys:
            categories[string]=1
    [csv_rows.append(val) for val in list(categories.values())]

    # Append csv_rows to csv_rows_list, and check for deleted postings
    if len(csv_rows) == 18:
        csv_rows_list.append(csv_rows)
        print(len(csv_rows))
    else:
        print('skipped row')
        skipped_row_count += 1  # counter for deleted postings
    
    # reset csv_rows & the dictionary, "categories"
    csv_rows = []
    categories = {key: 0 for key in dict_keys}

print(csv_rows_list)
print(skipped_row_count)

18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
[['Beautiful 2 or 3 bedroom flat avalible', 4900, 2, 1.0, 1100, (37.761078, -122.440985), 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0], ['1255 10th Av/Irving Apt remod 2BR 1BA new wood fl liv rm kitc pkg', 3200, 2, 1.0, 800, (37.76399, -122.467383), 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0], ['Step into the foyer that seamlessly flows $$720', 720, 2, 2.0, '', (33.2407, -117.3025), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ['Victorian Flat - Great Location', 5900, 3, 1.5, 1200, (37.781732, -122.437623), 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], ['$2450 sunset 日落区楼上2房1厅1厨', 2450, 2, 1.0, '', (37.781732, -122.437623), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], ['Gor

In [259]:
# append to csv file
with open('SFrentals.csv', 'a') as csvfile:
    writer = csv.writer(csvfile)
    for line in csv_rows_list:
        writer.writerow(line)
