# Concept
Input: directory to save html files
START_URL = 'https://sfbay.craigslist.org/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1'

For San Francisco, parse all 37 neighborhood codes
For each hood
    For each search page
        Make a request
        Parse all listing links
        For each listing_link
            Request the listing_link and save into a file with the filename being hash of listing url




In [2]:
!pip install htmlmin

[33mYou are using pip version 9.0.1, however version 22.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import requests
import bs4
import hashlib
from datetime import datetime
import time
import random
import os
import csv
import htmlmin
from bs4 import BeautifulSoup
import re 
import pandas as pd



# Inputs

In [4]:
ROOT_URL = 'https://sfbay.craigslist.org'
START_URL = ROOT_URL + '/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1'
PARENT_DIR = '/Users/pandabear/springboard/CapstoneTwoProject/data/raw'
DEBUG = True

# Functions

In [13]:
def get_file_name(title):
    return hashlib.sha256(title.encode('utf-8')).hexdigest()


def get_directory_name():
    return datetime.today().replace(microsecond=0).isoformat()


def create_directory():
    directory = get_directory_name()
    path = os.path.join(PARENT_DIR, directory)
    os.mkdir(path)
    return path


def get_neighborhood_url(neighborhood_code):
    return START_URL + '&nh={}'.format(neighborhood_code)


def get_neighborhoods(url):
    neighborhood_codes = []
    r = requests.get(url)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    for neighborhood_input in soup.find_all('input', {'name':'nh'}):
        neighborhood_codes.append((neighborhood_input.get('value')))
    return neighborhood_codes[:3] if DEBUG else neighborhood_codes


def next_search_page(page):
    r = requests.get(page)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    next_page_element = soup.find('a', class_="button next")
    return ROOT_URL + next_page_element.get('href')


# Extracts all pages of listing urls for a given search url
def get_listings(search_url):
    listing_urls = []
    while search_url != ROOT_URL:
        r = requests.get(search_url)
        r.raise_for_status()
        if r.status_code != 200:
            warn('Request: {}; Status code: {}'.format(r, r.status_code))

        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        for a in soup.find_all("a", class_="result-title hdrlnk"):
            listing_url = a.get('href')
            listing_urls.append(listing_url)

        # Add a delay between requests
        time.sleep(random.uniform(0.5, 1.5))
        
        # Check if there is another search page
        search_url = next_search_page(search_url)
    return listing_urls[:5] if DEBUG else listing_urls


def get_all_listings():
    listing_urls = []
    for neighborhood in get_neighborhoods(START_URL):
        search_url = get_neighborhood_url(neighborhood)
        neighborhood_listing_urls = get_listings(search_url)
        listing_urls.extend(neighborhood_listing_urls)
        print('Found {} urls for neighborhood {}'.format(len(neighborhood_listing_urls), neighborhood))
    return listing_urls


# Saves html from each listing to file
def save_html_to_file(url, file_path):

    # Get the html of an individual listing page
    r = requests.get(url)
    
    r.raise_for_status()
    if r.status_code != 200:
        warn('Request: {}; Status code: {}'.format(r, response.status_code))

    listing_path = os.path.join(file_path, get_file_name(url))
    html = r.text

    # Minify html file (reduces file size by 24%)
    minified = htmlmin.minify(html)

    # Save the html to file
    with open(listing_path, 'w', encoding='utf-8') as f:
        f.write(minified)

# Scraping

In [14]:
listing_urls = get_all_listings()
# print(listing_urls)

Found 5 urls for neighborhood 149
Found 5 urls for neighborhood 110
Found 5 urls for neighborhood 3
['https://sfbay.craigslist.org/sfc/apa/d/san-francisco-beautiful-and-spacious/7519517482.html', 'https://sfbay.craigslist.org/sfc/apa/d/sunnyvale-extra-storage-electric/7510821773.html', 'https://sfbay.craigslist.org/sfc/apa/d/millbrae-most-advanced-beautiful-home/7519481159.html', 'https://sfbay.craigslist.org/sfc/apa/d/san-francisco-spacious-2bd-conv-3bd-1ba/7519480696.html', 'https://sfbay.craigslist.org/sfc/apa/d/san-jose-2b1ba-apartment-home-with/7519474108.html', 'https://sfbay.craigslist.org/sfc/apa/d/san-francisco-community-bbqs-2nd-floor/7519444371.html', 'https://sfbay.craigslist.org/sfc/apa/d/san-francisco-energy-efficient-and/7519362122.html', 'https://sfbay.craigslist.org/sfc/apa/d/san-francisco-popcorn-daily-bench-press/7519287978.html', 'https://sfbay.craigslist.org/sfc/apa/d/san-francisco-remodeled-live-work-or/7510997742.html', 'https://sfbay.craigslist.org/sfc/apa/d/san

In [15]:
file_path = create_directory()
for i,listing_url in enumerate(listing_urls):
    save_html_to_file(listing_url, file_path)
    time.sleep(random.uniform(0.5, 1.5))
    if i % 5 == 0:
        print('Downloaded {} urls'.format(i))

Downloaded 0 urls
Downloaded 5 urls
Downloaded 10 urls


# Debugging

In [31]:
file_path1 = '/Users/pandabear/springboard/CapstoneTwoProject/data/raw/2022-08-09T20:26:20/'
file_path2 = '3f0a4b85bba92fd03801b14e4711c9d0a14069a1086779e39d289cdf07cd851a'
with open(file_path1+file_path2,'r') as f:
    print(f.read())

<!DOCTYPE html><html> <head><meta charset=UTF-8><meta http-equiv=X-UA-Compatible content="IE=Edge"><meta name=viewport content="width=device-width,initial-scale=1"><meta property=og:site_name content=craigslist><meta name=twitter:card content=preview><meta property=og:title content="Popcorn Daily, Bench Press, Free WIFI - apts/housing for rent -..."><meta name=description content="Welcome to Waterbend - our amazing pet-friendly Community! Welcome to Waterbend San Francisco Call Now - x 7 OR Text 7 to to contact our team. This is a 2 Bedroom, 2 Bath, approximately 946 Sq. Ft...."><meta property=og:description content="Welcome to Waterbend - our amazing pet-friendly Community! Welcome to Waterbend San Francisco Call Now - x 7 OR Text 7 to to contact our team. This is a 2 Bedroom, 2 Bath, approximately 946 Sq. Ft...."><meta property=og:image content=https://images.craigslist.org/00d0d_9QuXXjKvfARz_0eQ0ax_600x450.jpg><meta property=og:url content=https://sfbay.craigslist.org/sfc/apa/d/san-