In [None]:
## Craigslist Scrapping Tool

## First script scrapes from first page
## Second script scrapes from all pages
## WARNING the second script can get you blocked pretty quickly

In [2]:
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

URL = 'https://knoxville.craigslist.org/search/cta'

# Set up the webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # This runs Chrome in headless mode (no GUI).
driver = webdriver.Chrome(options=options)

driver.get(URL)

# Wait for the page to fully load (adjust the timeout as needed)
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.gallery-card')))

# Get the final HTML content
html = driver.page_source

# Parse the HTML using BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
results = []

# Extracting car details
for listing in soup.find_all('div', class_='gallery-card'):
    car_details = {}
    
    car_details["title"] = listing.find('span', class_='label').text if listing.find('span', class_='label') else "No title"
    car_details["price"] = listing.find('span', class_='priceinfo').text if listing.find('span', class_='priceinfo') else "No price"
    
    meta = listing.find("div", class_='meta')
    if meta:
        meta_parts = meta.text.split('·')
        if len(meta_parts) >= 3:
            car_details["miles"] = meta_parts[1].strip().replace(' mi', '')  # This should give something like "113k"
        else:
            car_details["miles"] = "No Miles"
    else:
        car_details["miles"] = "No Miles"

    if car_details["price"] != "No price":
        results.append(car_details)

"""# Save to CSV
with open('results.csv', 'w', newline='') as csvfile:
    fieldnames = ['title', 'price', 'miles']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for car in results:
        writer.writerow(car)
"""
print(f"Saved {len(results)} results to results.csv")



driver.quit()


35 mins ago·5,546mi
41 mins ago·113k mi·Oak Ridge, TN
46 mins ago·72k mi·Call *(865) 672-0650* to Confirm Availability Instantly
50 mins ago·109k mi·+ Tennessee Auto Network
50 mins ago·162k mi·+ Tennessee Auto Network
54 mins ago·285k mi·Athens, TN
1h ago·42k mi·** NO DOC FEES **
1h ago·228k mi·Blaine
1h ago·277k mi·knoxville
2h ago·116k mi·Call *(865) 672-0650* to Confirm Availability Instantly
3h ago·240k mi·Call *(865) 672-0650* to Confirm Availability Instantly
4h ago·300k mi·Knoxville
4h ago·90k mi·Knoxville
10/4·113k mi·Volvo XC90
10/4·87k mi·ROGERSVILLE
10/4·213k mi·West Knox / Downtown
10/4·108k mi·Knoxville, TN
10/4·133k mi·Harriman
10/4·46k mi·4121 Lexington Road Paris, KY 40361
10/4·78k mi·Harriman
10/4·12k mi·4121 Lexington Road Paris, KY 40361
10/4·1,180mi·4121 Lexington Road Paris, KY 40361
10/4·204k mi
10/4·47k mi·4121 Lexington Road Paris, KY 40361
10/4·6,651mi·4121 Lexington Road Paris, KY 40361
10/4·162mi·4121 Lexington Road Paris, KY 40361
10/4·32k mi·4121 Lexington

In [18]:
## Check the max page length 

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

BASE_URL = 'https://knoxville.craigslist.org/search/cta#search=1~gallery~{}~100'

# Set up the webdriver
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # This runs Chrome in headless mode (no GUI).
driver = webdriver.Chrome(options=options)

results = []
page_num = 0
last_html = None

max_page = 13

while page_num <= max_page:
    URL = BASE_URL.format(page_num)
    driver.get(URL)

    # Wait for the page to fully load (adjust the timeout as needed)
    wait = WebDriverWait(driver, 10)
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.gallery-card')))

    
    # Parse the HTML using BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Check if the page is empty (based on whether we find gallery cards)
    if not soup.find_all('div', class_='gallery-card'):
        break
        
    # page = soup.find_all('span', class_="cl-page-number")

    # Extracting car details
    for listing in soup.find_all('div', class_='gallery-card'):
        title = listing.find('span', class_='label').text if listing.find('span', class_='label') else "No title"
        price = listing.find('span', class_='priceinfo').text if listing.find('span', class_='priceinfo') else "No price"
        
        if price != "No price":
            results.append(f"Title: {title} | Price: {price}")

    
    # Move to the next page
    page_num += 1
    print(page_num)

print(len(results))
for result in results:
    print(result)

driver.quit()


1
2
3
4
5
6
7
8
9
10
11
12
13
14
1470
Title: 2015 Ford F-250 Super Duty | Price: $17,995
Title: 2006 Toyota Sequoia | Price: $8,995
Title: 2014 Honda Ridgeline 4x4 | Price: $14,495
Title: 2012 ford expedition el | Price: $8,500
Title: moles | Price: $10,000,999,999
Title: 2014 Infiniti Q60 Convertible IPL 2dr Convertible | Price: $29,990
Title: F-250 Lariat Extended Cab Diesel | Price: $10,000
Title: 1991 Corvette Convertible | Price: $13,950
Title: 1964 Ford Galaxie 500 Reduced to $23,950 | Price: $35,950
Title: 2015 TOYOTA CAMRY 4DR SDN I4 AUTO SE Text Offers/Trades 865-250-8927 | Price: $12,900
Title: 2016 TOYOTA TACOMA DOUBLE CAB TRD Offroad 4x4 Lets Trade Text Offers 865-250 | Price: $23,900
Title: 2017 Toyota Highlander Limited V6 AWD 3rd Row Loaded With Options Lets Trade | Price: $22,900
Title: 2019 Ford F250 LARIAT 4WD Crew Cab Clean Truck Lets Trade Text Offers 865-25 | Price: $39,900
Title: 2014 Toyota Highlander AWD 4dr V6 Limited Lets Trade Text Offers 865-250-892 | Price: