In [1]:
import json
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def extract_car_links(listing_url, max_pages=1):
    """Collect listing-detail URLs from paginated search pages and save them to disk.

    Requests pages 1..max_pages of ``listing_url``, gathers the ``href`` of every
    ``a[data-test="cardLinkCover"]`` element, resolves it against
    ``https://www.truecar.com`` and writes the unique URLs to
    ``./data/truecar_links.json`` and ``./data/truecar_links.csv``.

    Args:
        listing_url: Search-results URL; a ``page=<n>`` query parameter is appended.
        max_pages: Number of result pages to request, starting at page 1.

    Returns:
        None. The links are persisted to the two files named above.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_links = []
    seen_urls = set()

    # Use "&" only when the URL already carries a query string; otherwise the
    # page parameter has to start the query with "?".
    separator = "&" if "?" in listing_url else "?"

    for page in tqdm(range(1, max_pages + 1), desc="Extracting car links"):
        paginated_url = f"{listing_url}{separator}page={page}"
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            response = requests.get(paginated_url, headers=headers, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            # Best effort: report the failed page and keep the links gathered so far.
            print(f"❌ Error fetching {paginated_url}: {e}")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for tag in soup.select('a[data-test="cardLinkCover"]'):
            relative_link = tag.get("href")
            if not relative_link:
                # An anchor without an href cannot be turned into a URL.
                continue
            # urljoin copes with both site-relative and already-absolute hrefs.
            full_link = urljoin("https://www.truecar.com", relative_link)
            if full_link in seen_urls:
                # The same listing can show up on several pages; keep it once so
                # it is not scraped repeatedly downstream.
                continue
            seen_urls.add(full_link)
            all_links.append({"url": full_link})

    # Make sure the output directory exists before writing, so the crawl's
    # results are not lost to a FileNotFoundError at the very end.
    os.makedirs("./data", exist_ok=True)

    # Save to JSON
    with open("./data/truecar_links.json", "w") as f:
        json.dump(all_links, f, indent=2)

    # Save to CSV (explicit column so the header is written even with zero links)
    pd.DataFrame(all_links, columns=["url"]).to_csv("./data/truecar_links.csv", index=False)

    print(f"✅ Saved {len(all_links)} links to truecar_links.json and truecar_links.csv")


In [2]:
def extract_car_details(link):
    """Download one listing page and extract its details into a flat dict.

    The result always contains ``'url'``. Further keys are added for whatever
    is found on the page: overview fields, the "Options & packages",
    "Popular features" and "Standard features" lists, the ``usedPriceGraph``
    price block, and the "Seller Notes" text.

    Extraction is best effort: a failed request or a section that cannot be
    parsed is reported with a printed message and skipped, so the caller always
    gets a dict back and the other sections are still collected.

    Args:
        link: Absolute URL of the listing page.

    Returns:
        dict mapping field names to the extracted strings.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    car_data = {'url': link}

    try:
        # A timeout keeps one stalled connection from hanging a long scrape.
        response = requests.get(link, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"❌ Error extracting from {link}: {e}")
        return car_data

    soup = BeautifulSoup(response.text, 'html.parser')

    # Each section is parsed in isolation so a surprise in one part of the
    # page does not discard the data from the others.
    section_parsers = (
        _parse_overview,
        _parse_feature_sections,
        _parse_price_section,
        _parse_seller_notes,
    )
    for parse_section in section_parsers:
        try:
            parse_section(soup, car_data)
        except Exception as e:
            print(f"❌ Error extracting from {link}: {e}")

    return car_data


def _parse_overview(soup, car_data):
    """Fill car_data from the ``div.flex.items-center`` rows inside ``div.row.pt-3``."""
    container = soup.select_one("div.row.pt-3")
    if container is None:
        return

    for detail in container.select("div.flex.items-center"):
        text = detail.get_text(separator=" ", strip=True)
        if ":" in text:
            # "Label: value" rows become a snake_case key with the value.
            key, value = text.split(":", 1)
            car_data[key.strip().lower().replace(" ", "_")] = value.strip()
        # The branches below are only reached when the row has no colon, so
        # the label is removed by its bare text rather than by "Label:".
        elif "VIN" in text:
            car_data['vin'] = text.split("VIN", 1)[1].strip()
        elif "Stock Number" in text:
            car_data['stock_number'] = text.split("Stock Number", 1)[1].strip()
        elif "miles" in text:
            car_data['mileage'] = text.strip()
        elif "Listed" in text:
            car_data['listed_since'] = text.strip()


def _section_items(soup, heading):
    """Return the item texts under the <h2> titled ``heading``, or None if it is absent."""
    header = soup.find('h2', string=heading)
    if header is None:
        return None

    block = header.find_next("div")
    if block is None:
        # Heading present but nothing follows it: report an empty list.
        return []

    texts = (item.get_text(strip=True) for item in block.find_all("div", class_="flex items-center"))
    return [text for text in texts if text]


def _parse_feature_sections(soup, car_data):
    """Store each feature list found on the page as one comma-separated string."""
    sections = (
        ("Options & packages", 'options_and_packages'),
        ("Popular features", 'popular_features'),
        ("Standard features", 'standard_features'),
    )
    for heading, key in sections:
        items = _section_items(soup, heading)
        if items is not None:
            car_data[key] = ", ".join(items)


def _parse_price_section(soup, car_data):
    """Fill car_data from the ``usedPriceGraph`` block: line items, ranges, description."""
    price_section = soup.find('div', {'id': 'usedPriceGraph'})
    if price_section is None:
        return

    # List and average price
    for item in price_section.select('div[data-test="usedListingPriceGraphLineItem"]'):
        label = item.get("data-test-item")
        parts = item.get_text(separator="|", strip=True).split("|")
        if label and len(parts) > 1:
            # The first fragment is the visible label; everything after it is
            # the value, even when the markup splits it into several pieces.
            car_data[label.lower().replace(" ", "_")] = " ".join(parts[1:]).strip()

    # Price Quality Bars (e.g., Excellent, Great, Fair, High)
    for bar in price_section.select('div[data-test="priceRangeIconAndRange"]'):
        quality = bar.get("data-test-item")
        range_tag = bar.find("p")
        if quality and range_tag:
            car_data[f"price_range_{quality.lower()}"] = range_tag.text.strip()

    # Summary sentence
    description = price_section.find('div', {'data-test': 'usedListingPriceGraphDescription'})
    if description:
        car_data["price_description"] = description.get_text(strip=True)


def _parse_seller_notes(soup, car_data):
    """Store the text of the ``see-more`` div that follows the "Seller Notes" heading."""
    seller_notes_header = soup.find('h2', string="Seller Notes")
    if seller_notes_header is None:
        return

    seller_div = seller_notes_header.find_next('div', class_='see-more')
    if seller_div:
        car_data['seller_notes'] = seller_div.get_text(separator=" ", strip=True)

def scrape_all_car_details(json_path="./data/truecar_links.json", output_json="./data/truecar_details.json", output_csv="./data/truecar_details.csv"):
    """Scrape every listing URL stored in a links file and save the details.

    Reads a JSON list of ``{"url": ...}`` entries from ``json_path``, calls
    ``extract_car_details`` on each URL, and writes the collected dicts to
    ``output_json`` and, as a table, to ``output_csv``.

    Args:
        json_path: Path of the JSON file holding the listing URLs.
        output_json: Destination path for the details as JSON.
        output_csv: Destination path for the details as CSV.

    Returns:
        None. The results are persisted to the two output files.
    """
    with open(json_path, "r") as f:
        car_links = json.load(f)

    results = []
    for entry in tqdm(car_links, desc="Scraping car details"):
        link = entry["url"]
        try:
            car_info = extract_car_details(link)
        except Exception as e:
            # One bad listing must not throw away everything scraped so far;
            # keep a URL-only row so the failure is visible in the output.
            print(f"❌ Error extracting from {link}: {e}")
            car_info = {"url": link}
        results.append(car_info)

    # Create missing output directories up front so a long scrape is not lost
    # to a FileNotFoundError at save time.
    for path in (output_json, output_csv):
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    # Save to JSON
    with open(output_json, "w") as f:
        json.dump(results, f, indent=2)

    # Save to CSV
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)

    print(f"✅ Saved car details to {output_json} and {output_csv}")


In [3]:
# Search to crawl: the Boston, MA listings page, with a query asking for
# used stock and page_size=100.
listing_url = (
    "https://www.truecar.com/used-cars-for-sale/listings/location-boston-ma/"
    "?stock_type=used&page_size=100"
)

# Step 1: walk the result pages and write the listing URLs (JSON + CSV).
extract_car_links(listing_url=listing_url, max_pages=100)

# Step 2: read those URLs back from JSON and write the scraped details (JSON + CSV).
scrape_all_car_details()


Extracting car links: 100%|██████████| 100/100 [02:42<00:00,  1.63s/it]


✅ Saved 3159 links to truecar_links.json and truecar_links.csv


Scraping car details: 100%|██████████| 3159/3159 [1:02:19<00:00,  1.18s/it]


✅ Saved car details to ./data/truecar_details.json and ./data/truecar_details.csv
