# Concept
Input: the directory in which to save the downloaded HTML files
START_URL = 'https://sfbay.craigslist.org/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1'

For San Francisco, parse all 37 neighborhood codes
For each neighborhood
    For each search page
        Make a request
        Parse all listing links
        For each listing_link
            Request the listing_link and save the HTML to a file whose name is the SHA-256 hash of the listing URL




In [2]:
!pip install htmlmin

[33mYou are using pip version 9.0.1, however version 22.2.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [3]:
import requests
import bs4
import hashlib
from datetime import datetime
import time
import random
import os
import csv
import htmlmin
from bs4 import BeautifulSoup
import re 
import pandas as pd



# Inputs

In [32]:
# Scheme and host of the Craigslist site being crawled; relative hrefs found
# in the pages are appended to this value.
ROOT_URL = 'https://sfbay.craigslist.org'
# Search-results URL the crawl starts from; '&nh=<code>' is appended to it to
# restrict the search to one neighborhood.
START_URL = '{}/search/sfc/apa?availabilityMode=0&bundleDuplicates=1&hasPic=1'.format(ROOT_URL)
# Directory under which every run creates its own timestamped sub-directory.
PARENT_DIR = '/Users/pandabear/springboard/CapstoneTwoProject/data/raw'
# When True, get_neighborhoods() keeps only the first 3 codes and
# get_listings() only the first 5 listing URLs.
DEBUG = True

# Functions

In [38]:
def get_file_name(title):
    """Return the hexadecimal SHA-256 digest of *title* (UTF-8 encoded).

    The digest is used as a file name, so equal inputs always map to the
    same name.
    """
    digest = hashlib.sha256()
    digest.update(title.encode('utf-8'))
    return digest.hexdigest()


def get_directory_name():
    """Return the current local date and time as 'YYYY-MM-DDTHH:MM:SS'.

    The value has one-second resolution (no microseconds) and is used as
    the name of a run's output directory.
    """
    now = datetime.today()
    return now.strftime('%Y-%m-%dT%H:%M:%S')


def create_directory():
    """Create a timestamped output directory under PARENT_DIR and return its path.

    The directory name comes from get_directory_name(). Missing parent
    directories (PARENT_DIR itself included) are created as well, so the
    first run on a fresh machine does not fail.

    Returns:
        The path of the newly created directory.

    Raises:
        FileExistsError: if a directory with the same timestamp already exists.
    """
    path = os.path.join(PARENT_DIR, get_directory_name())
    # makedirs (without exist_ok) also creates PARENT_DIR when it is missing
    # but still refuses to silently reuse an existing run directory.
    os.makedirs(path)
    return path


def get_neighborhood_url(neighborhood_code):
    """Return START_URL restricted to one neighborhood.

    The code is added to the query string as an extra 'nh' parameter.
    """
    return '{}&nh={}'.format(START_URL, neighborhood_code)


def get_neighborhoods(url):
    """Fetch *url* and return the neighborhood codes found on the page.

    The codes are the 'value' attributes of every <input> element whose
    name is 'nh', in document order. When DEBUG is true only the first
    three codes are returned.

    Raises whatever requests.get / raise_for_status raise on a failed
    request.
    """
    response = requests.get(url)
    response.raise_for_status()
    page = bs4.BeautifulSoup(response.text, 'html.parser')

    codes = [
        nh_input.get('value')
        for nh_input in page.find_all('input', attrs={'name': 'nh'})
    ]

    if DEBUG:
        return codes[:3]
    return codes


def next_search_page(page):
    """Fetch the search page at *page* and return the URL of the following page.

    The next page is taken from the href of the <a class="button next">
    element. When the page has no such element, or the element has no or
    an empty href, ROOT_URL itself is returned; get_listings() treats that
    value as "no more pages".

    Args:
        page: URL of the current search-results page.

    Returns:
        ROOT_URL + href of the next-page link, or ROOT_URL when there is none.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            response status is an HTTP error.
    """
    r = requests.get(page)
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    next_page_element = soup.find('a', class_="button next")
    # Guard against a missing link/attribute instead of failing with
    # AttributeError (find() returned None) or TypeError (href is None).
    href = next_page_element.get('href') if next_page_element is not None else None
    return ROOT_URL + (href or '')


# Extracts all pages of listing urls for a given search url
def get_listings(search_url):
    """Collect the listing URLs from every result page of a search.

    Starting at *search_url*, each result page is downloaded once; the
    hrefs of all <a class="result-title hdrlnk"> elements are collected and
    the <a class="button next"> link of the same page is followed until
    there is no further page.

    Args:
        search_url: URL of the first search-results page. Passing ROOT_URL
            yields an empty result because ROOT_URL marks "no more pages".

    Returns:
        List of listing URLs in page order; only the first five when DEBUG
        is true.

    Raises:
        requests.exceptions.RequestException: if a request fails or a
            response status is an HTTP error.
    """
    listing_urls = []
    while search_url != ROOT_URL:
        r = requests.get(search_url)
        r.raise_for_status()
        if r.status_code != 200:
            print('Request: {}; Status code: {}'.format(r, r.status_code))

        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        listing_urls.extend(
            a.get('href') for a in soup.find_all("a", class_="result-title hdrlnk")
        )

        # Add a delay between requests
        time.sleep(random.uniform(0.5, 1.5))

        # In DEBUG mode only the first five URLs are returned, so there is
        # no point in crawling further pages once we have them.
        if DEBUG and len(listing_urls) >= 5:
            break

        # Check if there is another search page. The link is read from the
        # page that was just parsed rather than downloading it a second time.
        next_link = soup.find('a', class_="button next")
        next_href = next_link.get('href') if next_link is not None else None
        if not next_href:
            # Missing link or empty href: this was the last page.
            break
        search_url = ROOT_URL + next_href
    return listing_urls[:5] if DEBUG else listing_urls


def get_all_listings():
    """Return the listing URLs of every neighborhood found on START_URL.

    For each neighborhood code the neighborhood-specific search is crawled
    with get_listings() and a one-line summary of the number of URLs found
    is printed. The result is one flat list, neighborhoods in the order
    get_neighborhoods() returned them.
    """
    collected = []
    for code in get_neighborhoods(START_URL):
        found = get_listings(get_neighborhood_url(code))
        print('Found {} urls for neighborhood {}'.format(len(found), code))
        collected.extend(found)
    return collected


# Saves html from each listing to file
def save_html_to_file(url, file_path, timeout=30):
    """Download one listing page and store its minified HTML on disk.

    The file is written into the directory *file_path*; its name is
    get_file_name(url). A failed download is reported on stdout and
    skipped, so one bad listing does not abort a long crawl.

    Args:
        url: URL of the listing page.
        file_path: existing directory the file is written into.
        timeout: seconds to wait for the server before giving up
            (passed to requests.get).

    Returns:
        None.
    """
    # Get the html of an individual listing page
    try:
        r = requests.get(url, timeout=timeout)
        # Treat HTTP error responses as failed downloads, like the other
        # fetch helpers do, instead of saving an error page as a listing.
        r.raise_for_status()
    except requests.exceptions.RequestException:
        # Only request-related failures are handled here; a bare `except`
        # would also swallow KeyboardInterrupt and make the loop hard to stop.
        print('Failed to download url: {}'.format(url))
        return

    listing_path = os.path.join(file_path, get_file_name(url))
    html = r.text

    # Minify html file (reduces file size by 24%)
    minified = htmlmin.minify(html)

    # Save the html to file
    with open(listing_path, 'w', encoding='utf-8') as f:
        f.write(minified)

# Scraping

In [36]:
# Crawl the search results of every neighborhood and keep the listing URLs
# in a notebook-level variable for the download cell.
listing_urls = get_all_listings()
# print(listing_urls)

Found 165 urls for neighborhood 149
Found 96 urls for neighborhood 110
Found 66 urls for neighborhood 3
Found 138 urls for neighborhood 4
Found 34 urls for neighborhood 5
Found 307 urls for neighborhood 6
Found 62 urls for neighborhood 7
Found 174 urls for neighborhood 8
Found 33 urls for neighborhood 9
Found 35 urls for neighborhood 11
Found 385 urls for neighborhood 12
Found 119 urls for neighborhood 13
Found 180 urls for neighborhood 14
Found 165 urls for neighborhood 15
Found 63 urls for neighborhood 16
Found 25 urls for neighborhood 10
Found 457 urls for neighborhood 20
Found 97 urls for neighborhood 24
Found 168 urls for neighborhood 17
Found 365 urls for neighborhood 18
Found 546 urls for neighborhood 19
Found 136 urls for neighborhood 21
Found 159 urls for neighborhood 22
Found 195 urls for neighborhood 23
Found 21 urls for neighborhood 164
Found 340 urls for neighborhood 25
Found 191 urls for neighborhood 26
Found 178 urls for neighborhood 27
Found 1416 urls for neighborhood 1

In [39]:
# Create this run's timestamped output directory, then download every
# collected listing into it.
file_path = create_directory()
for i,listing_url in enumerate(listing_urls):
    save_html_to_file(listing_url, file_path)
    # Random 0.5-1.5 s pause between requests.
    time.sleep(random.uniform(0.5, 1.5))
    # Progress line every 100 URLs; i is the zero-based index, so the first
    # line printed reads "Downloaded 0 urls".
    if i % 100 == 0:
        print('Downloaded {} urls'.format(i))

Downloaded 0 urls
Downloaded 100 urls
Downloaded 200 urls
Downloaded 300 urls
Downloaded 400 urls
Downloaded 500 urls
Downloaded 600 urls
Failed to download url: https://sfbay.craigslist.org/sfc/apa/d/san-francisco-new-building-new/7515409220.html


KeyboardInterrupt: 

# Debugging

In [50]:
# Sanity check: print the length (in characters) of every file in one
# hard-coded, timestamped output directory.
path = '/Users/pandabear/springboard/CapstoneTwoProject/data/raw/2022-08-09T22:52:55'
file_list = os.listdir(path)
for file in file_list:
    with open(os.path.join(path, file), 'r', encoding='utf-8') as f:
        x = f.read()
        print(len(x))

29430
27679
22333
31346
23182
21543
24471
22618
23935
26045
22653
23867
32088
24088
25397
21129
23172
32295
24283
24576
22212
31099
32509
22205
20325
29443
30064
33536
22258
21520
27283
31572
19356
16761
20071
24500
25484
25705
23294
22953
24541
28749
22202
30719
19088
33523
25908
24944
20677
23044
22337
30362
32453
21517
28124
24431
22315
23948
22561
23187
29907
32507
23374
22353
21945
24783
22351
24773
31214
23745
24873
26588
24082
26733
30453
24909
23190
30421
23077
25794
23363
31638
23262
32094
24910
30680
22312
23012
23027
23926
20897
29490
22241
27702
23163
23994
29493
32170
28736
22617
26254
22741
27728
28438
25848
30986
32737
26010
22320
25795
18630
24854
22015
22780
23039
22215
30904
29324
25618
28280
24960
22750
30921
22326
25410
26153
22260
28476
24785
23775
23715
31170
29329
23301
22163
28149
19422
16451
20657
26854
23288
23078
23139
27728
24057
22315
28215
26304
23932
25109
19841
31944
24574
21714
32487
18400
21209
27343
24638
22029
23325
23828
24399
18359
22167
22465
2719

In [49]:
# Show what a freshly generated directory name looks like.
print(get_directory_name())

2022-08-10T08:44:53
