In [1]:
# Landing URL of the cars.com search results (no page parameter); every
# filter is left empty so the search is unrestricted.
src = (
    "https://www.cars.com/shopping/results/"
    "?dealer_id="
    "&include_shippable=false"
    "&keyword="
    "&list_price_max="
    "&list_price_min="
    "&makes[]="
    "&maximum_distance=all"
    "&mileage_max="
    "&monthly_payment="
    "&page_size=20"
    "&sort=best_match_desc"
    "&stock_type=all"
    "&year_max="
    "&year_min="
    "&zip="
)

In [2]:
# Leading portion of the results URL: everything that precedes the
# "&page=<n>" parameter when a paginated URL is assembled.
main_part_src = (
    "https://www.cars.com/shopping/results/"
    "?dealer_id="
    "&include_shippable=false"
    "&keyword="
    "&list_price_max="
    "&list_price_min="
    "&makes[]="
    "&maximum_distance=all"
    "&mileage_max="
    "&monthly_payment="
)

In [3]:
# Trailing portion of the results URL: everything that follows the
# "&page=<n>" parameter when a paginated URL is assembled.
second_part_src = (
    "&page_size=20"
    "&sort=best_match_desc"
    "&stock_type=all"
    "&year_max="
    "&year_min="
    "&zip="
)

In [4]:
# Query-string key for the page number; the numeric page is appended to it.
mid_part_src_static = "&page="

In [6]:
import csv
import json
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime
import time

# ---------------------------------------------------------------------------
# cars.com scraper: walks every results page, opens each listing and writes
# one CSV row per listing to 'vehicle_data.csv'.
# ---------------------------------------------------------------------------

# Patterns are compiled once here instead of once per listing inside the loop.
# Primary price such as "$12,345" or "$12,345.00" inside the price section.
_PRICE_RE = re.compile(r'class="primary-price">(\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?)</span>')
# <dt>label</dt><dd>value</dd> pairs of the "basics" section.
_BASICS_PAIR_RE = re.compile(r'<dt>(.*?)</dt>\s*<dd>(.*?)</dd>', re.DOTALL)
# <dt>group</dt><dd><ul class="vehicle-features-list">...</ul></dd> groups.
_FEATURE_GROUP_RE = re.compile(
    r'<dt>(.*?)<\/dt>\s*<dd>\s*<ul class="vehicle-features-list">(.*?)<\/ul>\s*<\/dd>',
    re.DOTALL,
)
_FEATURE_ITEM_RE = re.compile(r'<li>(.*?)<\/li>')
_ALL_FEATURES_ITEM_RE = re.compile(r'<div class="all-features-item">(.*?)<\/div>')
# Basics entries that are deliberately left out of the exported JSON.
_SKIPPED_BASICS = frozenset({"MPG", "VIN", "Stock #"})


def _max_page_number(pagination_links):
    """Return the largest page number advertised by the pagination links (min 1)."""
    highest = 1
    for link in pagination_links:
        try:
            highest = max(highest, int(link.get_attribute("phx-value-page")))
        except (TypeError, ValueError):
            # TypeError: attribute missing (get_attribute returned None);
            # ValueError: attribute present but not an integer.
            continue
    return highest


def _collect_listing_links(driver):
    """Return the href of the first <a> in every vehicle card on the current page.

    Raises whatever WebDriverWait raises if no card appears within 20 seconds.
    """
    cards = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "vehicle-card"))
    )
    links = []
    for card in cards:
        try:
            href = card.find_element(By.TAG_NAME, "a").get_attribute("href")
        except Exception:
            # Card without a usable link: skip it, keep the rest of the page.
            continue
        if href:
            links.append(href)
    return links


def _text_or_na(driver, class_name, wait_seconds=0):
    """Return the text of the first element with *class_name*, or "N/A".

    When *wait_seconds* is non-zero the element is awaited for up to that long.
    """
    try:
        if wait_seconds:
            element = WebDriverWait(driver, wait_seconds).until(
                EC.presence_of_element_located((By.CLASS_NAME, class_name))
            )
        else:
            element = driver.find_element(By.CLASS_NAME, class_name)
        return element.text
    except Exception:
        return "N/A"


def _extract_price(driver):
    """Return the primary price string from the price section, or "N/A"."""
    try:
        html = driver.find_element(By.CLASS_NAME, "price-section").get_attribute('outerHTML')
        match = _PRICE_RE.search(html)
    except Exception:
        return "N/A"
    return match.group(1) if match else "N/A"


def _extract_basics_json(driver):
    """Return the basics section as a JSON object string, or "N/A"."""
    try:
        html = driver.find_element(By.CLASS_NAME, "basics-section").get_attribute('outerHTML')
        basics = {}
        for label, value in _BASICS_PAIR_RE.findall(html):
            key = label.strip()
            if key not in _SKIPPED_BASICS:
                basics[key] = value.strip()
        return json.dumps(basics, indent=4)
    except Exception:
        return "N/A"


def _extract_features_json(driver):
    """Return the features section as a JSON object string, or "N/A"."""
    try:
        html = driver.find_element(By.CLASS_NAME, "features-section").get_attribute("outerHTML")
        features = {
            group.strip(): _FEATURE_ITEM_RE.findall(items_html)
            for group, items_html in _FEATURE_GROUP_RE.findall(html)
        }
        try:
            # NOTE(review): class name mixes "_" and "-"; confirm against the live markup.
            all_html = driver.find_element(
                By.CLASS_NAME, "all_features-section"
            ).get_attribute("outerHTML")
            features["All Features"] = _ALL_FEATURES_ITEM_RE.findall(all_html)
        except Exception:
            # The "all features" block is optional; keep the grouped features.
            pass
        return json.dumps(features, indent=4)
    except Exception:
        return "N/A"


def _scrape_listing(driver, link):
    """Open *link* and return its listing-specific CSV fields as a dict."""
    driver.get(link)
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Values are evaluated top to bottom; the first lookup waits for the
    # detail page to render, the later ones query the DOM directly.
    return {
        'Advertisement Link': link,
        'Timestamp': timestamp,
        'New or Used': _text_or_na(driver, "new-used", wait_seconds=10),
        'Title': _text_or_na(driver, "listing-title"),
        'Mileage': _text_or_na(driver, "listing-mileage"),
        'Price': _extract_price(driver),
        'Basics': _extract_basics_json(driver),
        'Features': _extract_features_json(driver),
    }


# Start a Chrome session (Selenium resolves the driver binary itself).
driver = webdriver.Chrome()
try:
    # Open the first page of the cars.com search results.
    driver.get(src)

    # Create a CSV file to store the extracted data
    with open('vehicle_data.csv', mode='w', newline='', encoding='utf-8') as csv_file:
        fieldnames = [
            'Page Number Max', 'Current Page Number', 'Source Page Link', 'Advertisement Link',
            'Timestamp', 'New or Used', 'Title', 'Mileage', 'Price', 'Basics', 'Features'
        ]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()

        # Determine how many result pages exist from the pagination widget.
        pagination_links = driver.find_elements(
            By.XPATH, "//spark-pagination//a[contains(@id, 'pagination-direct-link-')]"
        )
        max_page = _max_page_number(pagination_links)
        print(f"Max page number: {max_page}")

        try:
            # Iterate through each page
            for current_page in range(1, max_page + 1):

                if (current_page % 10) == 0:
                    # Restart the browser every 10th page (presumably to limit
                    # resource growth / throttling in long runs — confirm).
                    driver.quit()
                    time.sleep(10)
                    driver = webdriver.Chrome()

                # Build and load the URL of the current results page.
                src = main_part_src + mid_part_src_static + str(current_page) + second_part_src
                print(f'\n\n Page Number is {current_page} \n')
                driver.get(src)

                # Collect the links first: navigating to a listing invalidates
                # the card elements of the results page.
                vehicle_links = _collect_listing_links(driver)

                # Visit each listing and write one CSV row for it.
                for link in vehicle_links:
                    row = _scrape_listing(driver, link)
                    row['Page Number Max'] = max_page
                    row['Current Page Number'] = current_page
                    row['Source Page Link'] = src
                    writer.writerow(row)
        except Exception as e:
            # Any page-level failure stops the crawl; rows written so far are kept.
            print(f"An error occurred: {e}")
finally:
    # Close the browser even on early failures or Ctrl-C.
    driver.quit()


Max page number: 500


 Page Number is 2 



 Page Number is 3 



 Page Number is 4 



 Page Number is 5 



 Page Number is 6 



 Page Number is 7 



 Page Number is 8 



 Page Number is 9 



 Page Number is 10 



 Page Number is 11 



 Page Number is 12 



 Page Number is 13 



 Page Number is 14 



 Page Number is 15 



 Page Number is 16 



 Page Number is 17 



 Page Number is 18 



 Page Number is 19 



 Page Number is 20 



 Page Number is 21 



 Page Number is 22 



 Page Number is 23 



 Page Number is 24 



 Page Number is 25 



 Page Number is 26 



 Page Number is 27 



 Page Number is 28 



 Page Number is 29 



 Page Number is 30 



 Page Number is 31 



 Page Number is 32 



 Page Number is 33 



 Page Number is 34 



 Page Number is 35 



 Page Number is 36 



 Page Number is 37 



 Page Number is 38 



 Page Number is 39 



 Page Number is 40 



 Page Number is 41 



 Page Number is 42 



 Page Number is 43 



 Page Number is 44 