In [1]:
import concurrent.futures
import os
from urllib.parse import urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

class CarsScraper:
    """Scrape used-car listings from cars.com search result pages.

    For each category (passed to the site as the ``makes[]`` query value) the
    scraper requests one or more result pages, extracts title, mileage and
    price from every ``div.vehicle-details`` element, and writes the records
    to ``<category>_used_cars.csv`` and ``<category>_used_cars.json``.

    Args:
        base_url: Stored on the instance for callers that read it back; the
            request URL is always built from ``RESULTS_URL``.
        categories: Iterable of make names to scrape.
        max_pages: Either a mapping ``{category: page_count}`` (categories
            missing from the mapping default to 1 page) or a single int that
            applies to every category.
        zip_code: Zip code sent as the ``zip`` query parameter.
        timeout: Seconds to wait on each HTTP request before giving up.
    """

    # Search endpoint every results-page URL is built on.
    RESULTS_URL = "https://www.cars.com/shopping/results/"
    # requests waits forever unless a timeout is given, so always pass one.
    DEFAULT_TIMEOUT = 30

    def __init__(self, base_url, categories, max_pages, zip_code='02140',
                 timeout=DEFAULT_TIMEOUT):
        self.base_url = base_url
        self.categories = categories
        self.max_pages = max_pages
        self.zip_code = zip_code
        self.timeout = timeout
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def send_request(self, url):
        """GET ``url`` and return the response, or ``None`` on any failure.

        The status is printed for every response. HTTP error statuses and
        transport errors (including timeouts) are reported and turned into
        ``None`` rather than raised, so one bad page does not abort a run.
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.timeout
            )
            if response.status_code == 200:
                print(f"[200 OK] {url}")
            else:
                print(f"[{response.status_code}] Failed to fetch {url}")
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            print(f"[ERROR] {url} -> {e}")
            return None

    def parse_page(self, response):
        """Return the parsed HTML of a 200 response, otherwise ``None``."""
        if response is not None and response.status_code == 200:
            return BeautifulSoup(response.text, 'html.parser')
        return None

    @staticmethod
    def _text_or_none(element):
        """Return the stripped text of ``element``, or ``None`` if absent."""
        # Compare against None explicitly: the result of a failed find() is
        # None, and truthiness of a found element should not matter here.
        return element.text.strip() if element is not None else None

    def extract_car_info(self, car_element, category):
        """Build one listing record from a ``vehicle-details`` element.

        Every record carries the same four keys; a field whose element is
        not found is ``None``. This keeps the schema stable even when a
        listing lacks a title, mileage or price.

        Returns:
            dict with keys ``category``, ``model_year``, ``mileage``,
            ``price``.
        """
        title_elem = car_element.find('h2', class_='title')
        mileage_elem = car_element.find('div', class_='mileage')
        price_elem = car_element.find('span', class_='primary-price')
        return {
            'category': category,
            'model_year': self._text_or_none(title_elem),
            'mileage': self._text_or_none(mileage_elem),
            'price': self._text_or_none(price_elem),
        }

    def _resolve_max_pages(self, category, max_pages=None):
        """Return the page count to scrape for ``category``.

        An explicit truthy ``max_pages`` wins; otherwise the instance
        setting is used, looked up per category when it is a mapping.
        """
        if max_pages:
            return max_pages
        if hasattr(self.max_pages, 'get'):
            return self.max_pages.get(category, 1)
        # A bare int (or None) configured for all categories.
        return self.max_pages or 1

    def _build_url(self, category, page):
        """Return the search URL for one results page of ``category``."""
        params = [
            ('dealer_id', ''),
            ('include_shippable', 'true'),
            ('keyword', ''),
            ('list_price_max', ''),
            ('list_price_min', ''),
            ('makes[]', category),
            ('maximum_distance', 10),
            ('mileage_max', ''),
            ('monthly_payment', ''),
            ('page', page),
            ('page_size', 100),
            ('sort', 'best_match_desc'),
            ('stock_type', 'used'),
            ('year_max', ''),
            ('year_min', ''),
            ('zip', self.zip_code),
        ]
        # urlencode escapes spaces, '&', etc. in the values; the brackets are
        # marked safe so the 'makes[]' key keeps its literal form.
        return f"{self.RESULTS_URL}?{urlencode(params, safe='[]')}"

    def scrape_category(self, category, max_pages=None):
        """Scrape up to ``max_pages`` result pages for one category.

        Args:
            category: Make name to search for.
            max_pages: Optional override of the configured page count.

        Returns:
            List of record dicts as produced by ``extract_car_info``. Pages
            that fail to download or parse are skipped.
        """
        max_pages = self._resolve_max_pages(category, max_pages)
        car_info_list = []

        print(f"\n[SCRAPING] Category: {category} | Pages: {max_pages}")

        for page in range(1, max_pages + 1):
            response = self.send_request(self._build_url(category, page))
            soup = self.parse_page(response)
            if soup is None:
                continue

            car_elements = soup.find_all('div', class_='vehicle-details')

            # Progress bar for number of car elements found
            for car_element in tqdm(car_elements, desc=f"{category} (cars)", leave=True):
                car_info_list.append(self.extract_car_info(car_element, category))

        return car_info_list

    def save_data(self, data, output_dir, category):
        """Write ``data`` as CSV and JSON files inside ``output_dir``.

        The directory is created if needed. Files are named
        ``<category>_used_cars.csv`` and ``<category>_used_cars.json``.
        """
        os.makedirs(output_dir, exist_ok=True)
        df = pd.DataFrame(data)
        csv_path = os.path.join(output_dir, f"{category}_used_cars.csv")
        json_path = os.path.join(output_dir, f"{category}_used_cars.json")
        df.to_csv(csv_path, index=False)
        df.to_json(json_path, orient="records", indent=2)
        print(f"Saved CSV and JSON for {category}")

    def scrape_all(self, output_dir="output"):
        """Scrape every configured category and save each one's results."""
        for category in tqdm(self.categories, desc="All Categories", position=0):
            data = self.scrape_category(category)
            self.save_data(data, output_dir, category)


In [None]:
# Search URL for zip 02140 with no make selected. It is not passed to the
# scraper below, which is configured through its keyword arguments instead.
url = (
    "https://www.cars.com/shopping/results/"
    "?dealer_id=&include_shippable=true&keyword="
    "&list_price_max=&list_price_min="
    "&makes[]="
    "&maximum_distance=10&mileage_max=&monthly_payment="
    "&page=1&page_size=100"
    "&sort=best_match_desc"
    "&stock_type=used"
    "&year_max=&year_min="
    "&zip=02140"
)

# One results page per make.
categories = ['bmw']
max_pages = dict.fromkeys(categories, 1)

scraper = CarsScraper(
    base_url="",
    categories=categories,
    max_pages=max_pages,
    zip_code='02140',
)
# Writes <make>_used_cars.csv / .json into the given directory.
scraper.scrape_all(output_dir="used_car_data_02140")


All Categories:   0%|          | 0/1 [00:00<?, ?it/s]


[SCRAPING] Category: bmw | Pages: 1
