In [1]:
import json
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [2]:
# (heading text on the page, key stored in the result dict) for each feature list.
_FEATURE_SECTIONS = (
    ("Options & packages", 'options_and_packages'),
    ("Popular features", 'popular_features'),
    ("Standard features", 'standard_features'),
)


def _parse_overview(soup, car_data):
    """Copy the facts listed in the ``div.row.pt-3`` container into ``car_data``."""
    container = soup.select_one("div.row.pt-3")
    if container is None:
        return

    for detail in container.select("div.flex.items-center"):
        text = detail.get_text(separator=" ", strip=True)
        if ":" in text:
            # "Label: value" -> snake_case key derived from the label.
            key, value = text.split(":", 1)
            car_data[key.strip().lower().replace(" ", "_")] = value.strip()
        # The branches below only run when the text has no colon, so the
        # value is whatever follows the bare label.
        elif "VIN" in text:
            car_data['vin'] = text.partition("VIN")[2].strip()
        elif "Stock Number" in text:
            car_data['stock_number'] = text.partition("Stock Number")[2].strip()
        elif "miles" in text:
            car_data['mileage'] = text.strip()
        elif "Listed" in text:
            car_data['listed_since'] = text.strip()


def _parse_feature_lists(soup, car_data):
    """Store each feature section as one comma-separated string in ``car_data``."""
    for heading, key in _FEATURE_SECTIONS:
        header = soup.find('h2', string=heading)
        if header is None:
            continue
        items_div = header.find_next("div")
        if items_div is None:
            # Heading present but nothing after it: leave the key unset.
            continue
        car_data[key] = ", ".join(
            item.get_text(strip=True)
            for item in items_div.find_all("div", class_="flex items-center")
        )


def _parse_price_section(soup, car_data):
    """Copy price line items, price ranges and the description from ``#usedPriceGraph``."""
    price_section = soup.find('div', {'id': 'usedPriceGraph'})
    if price_section is None:
        return

    for item in price_section.select('div[data-test="usedListingPriceGraphLineItem"]'):
        label = item.get("data-test-item")
        text = item.get_text(separator="|", strip=True)
        if label and "|" in text:
            # Drop the first fragment (the visible label) and keep the rest;
            # any further separators are turned back into spaces so a line
            # with more than two fragments no longer raises.
            value = text.partition("|")[2].replace("|", " ")
            car_data[label.lower().replace(" ", "_")] = value.strip()

    for bar in price_section.select('div[data-test="priceRangeIconAndRange"]'):
        quality = bar.get("data-test-item")
        range_tag = bar.find("p")
        if quality and range_tag:
            car_data[f"price_range_{quality.lower()}"] = range_tag.text.strip()

    description = price_section.find('div', {'data-test': 'usedListingPriceGraphDescription'})
    if description is not None:
        car_data["price_description"] = description.get_text(strip=True)


def _parse_seller_notes(soup, car_data):
    """Store the text of the ``see-more`` div that follows the "Seller Notes" heading."""
    seller_notes_header = soup.find('h2', string="Seller Notes")
    if seller_notes_header is None:
        return
    seller_div = seller_notes_header.find_next('div', class_='see-more')
    if seller_div is not None:
        car_data['seller_notes'] = seller_div.get_text(separator=" ", strip=True)


def extract_car_details(link, timeout=30):
    """Download one listing page and return its details as a flat dict.

    The result always contains ``'url'``. Every other key is added only when
    the matching element is found in the page, so the set of keys varies from
    listing to listing.

    Extraction is best-effort: a failed download or a parsing error in one
    section is printed and skipped instead of raised, and the sections parsed
    successfully are still returned.

    Args:
        link: URL of the listing page.
        timeout: Seconds to wait for the server before giving up on the
            request.

    Returns:
        dict: ``{'url': link, ...}`` with the scraped fields as strings.
    """
    car_data = {'url': link}

    try:
        response = requests.get(link, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Error fetching {link}: {e}")
        return car_data

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each section is guarded separately so an unexpected layout in one of
    # them does not discard the others. The order fixes the key order.
    section_parsers = (
        _parse_overview,
        _parse_feature_lists,
        _parse_price_section,
        _parse_seller_notes,
    )
    for parse_section in section_parsers:
        try:
            parse_section(soup, car_data)
        except Exception as e:
            print(f"❌ Error extracting from {link}: {e}")

    return car_data


In [3]:
def scrape_all_car_details(json_path, output_json, output_csv):
    """Scrape every listing named in a links file and save the results.

    Reads a JSON list of ``{"url": ...}`` entries from ``json_path``, calls
    ``extract_car_details`` on each URL, then writes the collected dicts to
    ``output_json`` and, as a table, to ``output_csv``.

    A listing whose extraction raises is reported and kept as a row holding
    only its URL, so one bad page cannot abort the run before anything is
    written. Entries without a ``"url"`` value are skipped.

    Args:
        json_path: Path of the JSON links file to read.
        output_json: Path of the JSON results file to write.
        output_csv: Path of the CSV results file to write.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        car_links = json.load(f)

    results = []
    for entry in tqdm(car_links, desc="Scraping car details"):
        link = entry.get("url")
        if not link:
            # Nothing to fetch for this entry.
            continue
        try:
            car_info = extract_car_details(link)
        except Exception as e:
            # Keep going: the results are only written after the loop, so an
            # uncaught error here would throw away everything scraped so far.
            print(f"❌ Error extracting from {link}: {e}")
            car_info = {"url": link}
        results.append(car_info)

    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"✅ Saved car details to:\n- {output_json}\n- {output_csv}")


def scrape_city_state(city, state, max_pages=2, timeout=30):
    """Collect listing links for one location, save them, then scrape each listing.

    Requests result pages ``1..max_pages`` for the location, gathers the
    ``href`` of every ``a[data-test="cardLinkCover"]`` element, and writes the
    links to ``./data/truecar_links_<city>_<state>.json`` and ``.csv``. It then
    calls ``scrape_all_car_details`` to produce
    ``./data/truecar_details_<city>_<state>.json`` and ``.csv``.

    A result page that cannot be fetched is reported and skipped. A listing
    URL that appears on several pages is kept once, in first-seen order.

    Args:
        city: City name; lower-cased and hyphenated to build the URL slug.
        state: State abbreviation; lower-cased for the URL slug.
        max_pages: Number of result pages to request.
        timeout: Seconds to wait for each result page before giving up.
    """
    city_slug = city.lower().replace(" ", "-")
    state_slug = state.lower()
    location_key = f"{city_slug}_{state_slug}"

    base_url = "https://www.truecar.com"
    headers = {'User-Agent': 'Mozilla/5.0'}

    all_links = []
    seen_urls = set()
    os.makedirs("./data", exist_ok=True)

    for i in tqdm(range(1, max_pages + 1), desc=f"{location_key} - Page"):
        url = f"{base_url}/used-cars-for-sale/listings/location-{city_slug}-{state_slug}/?page={i}"
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            # Skip this page but keep the links gathered from the others.
            print(f"❌ Error fetching {url}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        for tag in soup.select('a[data-test="cardLinkCover"]'):
            relative_link = tag.get("href")
            if not relative_link:
                continue
            # urljoin copes with hrefs that are already absolute or that lack
            # a leading slash, which plain concatenation would mangle.
            listing_url = urljoin(base_url, relative_link)
            if listing_url in seen_urls:
                continue
            seen_urls.add(listing_url)
            all_links.append({"url": listing_url})

    json_path = f"./data/truecar_links_{location_key}.json"
    csv_path = f"./data/truecar_links_{location_key}.csv"
    output_json = f"./data/truecar_details_{location_key}.json"
    output_csv = f"./data/truecar_details_{location_key}.csv"

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(all_links, f, indent=2)
    # Naming the column keeps the header row even when no links were found.
    pd.DataFrame(all_links, columns=["url"]).to_csv(csv_path, index=False)

    print(f"✅ Saved {len(all_links)} links for {city.title()}, {state.upper()}")
    scrape_all_car_details(json_path=json_path, output_json=output_json, output_csv=output_csv)

In [None]:
if __name__ == "__main__":
    # Locations to crawl, one (city, state abbreviation) pair per line.
    cities = [
        ("Boston", "MA"),
        ("Austin", "TX"),
        ("San Francisco", "CA"),
    ]
    # Each location is crawled in turn: up to 200 result pages of links,
    # followed by a detail scrape of every link found.
    for city, state in cities:
        scrape_city_state(city=city, state=state, max_pages=200)


boston_ma - Page: 100%|██████████| 200/200 [06:00<00:00,  1.80s/it]


✅ Saved 6459 links for Boston, MA


Scraping car details: 100%|██████████| 6459/6459 [2:10:36<00:00,  1.21s/it]  


✅ Saved car details to:
- ./data/truecar_details_boston_ma.json
- ./data/truecar_details_boston_ma.csv


austin_tx - Page: 100%|██████████| 200/200 [05:59<00:00,  1.80s/it]


✅ Saved 6459 links for Austin, TX


Scraping car details: 100%|██████████| 6459/6459 [2:09:25<00:00,  1.20s/it]  


✅ Saved car details to:
- ./data/truecar_details_austin_tx.json
- ./data/truecar_details_austin_tx.csv


san-francisco_ca - Page: 100%|██████████| 200/200 [05:42<00:00,  1.71s/it]


✅ Saved 6459 links for San Francisco, CA


Scraping car details:  36%|███▋      | 2353/6459 [48:02<1:24:09,  1.23s/it]