In [None]:
! pip install requests bs4 lxml scrapy selenium



In [None]:
import os
import requests
import csv
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re

from lxml import html
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

In [None]:
def setup_driver():
    """Create and return a new Selenium Firefox WebDriver instance."""
    return webdriver.Firefox()

In [None]:
def fetch_sitemap(url, wait_seconds=20):
    """Load ``url`` in a fresh Firefox session and return the page source.

    Args:
        url: Address of the sitemap to load.
        wait_seconds: Seconds to sleep after navigation before the page source
            is read. Defaults to 20, the delay this function has always used.

    Returns:
        The value of ``driver.page_source`` after the wait.

    The browser session is always shut down, even when navigation raises.
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Fixed pause before reading the page; presumably gives redirects or
        # anti-bot interstitials time to finish -- TODO confirm.
        time.sleep(wait_seconds)
        return driver.page_source
    finally:
        # Without this, a failed navigation would leave a Firefox process behind.
        driver.quit()


def parse_sitemap(sitemap_content):
    """Return the URLs listed in the ``<loc>`` elements of a sitemap document.

    Args:
        sitemap_content: XML text that uses the sitemaps.org 0.9 namespace.

    Returns:
        A list of URL strings in document order. Whitespace around each URL
        is stripped and empty ``<loc>`` elements are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    root = ET.fromstring(sitemap_content)
    loc_tag = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
    urls = []
    for elem in root.findall(loc_tag):
        # elem.text is None for an empty element, and pretty-printed sitemaps
        # put newlines/indentation around the URL; neither is a usable URL.
        text = (elem.text or '').strip()
        if text:
            urls.append(text)
    return urls

In [None]:
def fetch_and_parse(url, driver):
    """Navigate ``driver`` to ``url`` and parse the leading part of the page.

    Only the first 65% of the page source (by character count) is handed to
    BeautifulSoup's ``html.parser``; the remainder is discarded.
    """
    driver.get(url)
    page_source = driver.page_source
    cutoff = int(len(page_source) * 0.65)
    truncated_source = page_source[:cutoff]
    return BeautifulSoup(truncated_source, 'html.parser')


def is_valid_url(url):
    """Decide whether ``url`` is an absolute URL worth keeping as an image.

    Returns False when the URL cannot be parsed, when its path ends in
    ``.png``, ``.svg`` or ``.gif`` (case-insensitive), or when its path
    contains ``logo``. Otherwise returns True only if both a scheme and a
    network location are present.
    """
    rejected_suffixes = ('.png', '.svg', '.gif')
    try:
        parts = urlparse(url)
    except ValueError:
        return False

    lowered_path = parts.path.lower()
    if lowered_path.endswith(rejected_suffixes) or 'logo' in lowered_path:
        return False

    return bool(parts.scheme) and bool(parts.netloc)


def get_highest_resolution_image(srcset):
    """Return the URL of the widest candidate in an HTML ``srcset`` value.

    Args:
        srcset: The raw ``srcset`` attribute string, i.e. comma-separated
            ``"<url> <width>w"`` candidates.

    Returns:
        The URL whose width descriptor is the largest positive integer, or
        None when no candidate carries a usable ``w`` descriptor (for example
        when only density descriptors such as ``2x`` are present).
    """
    best_width = 0
    best_url = None
    for candidate in srcset.split(","):
        # split() with no argument tolerates tabs, newlines and repeated
        # spaces between the URL and its descriptor.
        parts = candidate.split()
        if len(parts) != 2 or not parts[1].endswith("w"):
            continue
        try:
            width = int(parts[1][:-1])  # drop the trailing 'w'
        except ValueError:
            # Malformed descriptor (e.g. "1.5w" or a bare "w"): ignore this
            # candidate instead of aborting the whole page.
            continue
        if width > best_width:
            best_width = width
            best_url = parts[0]
    return best_url


def extract_vin_and_images(soup, base_url, driver, VIN_X_PATH=None, IMAGE_CONTAINER_X_PATH=None):
    """Pull the first VIN-like token and the image URLs out of a parsed page.

    Args:
        soup: Parsed page; must provide ``stripped_strings`` and
            ``find_all('img')`` (a BeautifulSoup document does).
        base_url: ``scheme://host`` prefix used to absolutize root-relative
            image paths.
        driver: Accepted for interface compatibility; not used here.
        VIN_X_PATH: Accepted for interface compatibility; not used here.
        IMAGE_CONTAINER_X_PATH: Accepted for interface compatibility; not
            used here.

    Returns:
        A ``(vin, images)`` tuple. ``vin`` is the first 17-character match of
        the VIN character set found in the page text, or None. ``images`` is
        a single ``', '``-joined string of the unique image URLs that pass
        ``is_valid_url``, in document order (empty string when there are none).
    """
    vin_pattern = re.compile(r'[A-HJ-NPR-Z0-9]{17}')
    vin = None
    for text in soup.stripped_strings:
        match = vin_pattern.search(text)
        if match:
            vin = match.group()
            break
    print(f"VIN: {vin}")

    scheme = urlparse(base_url).scheme

    def absolutize(link):
        # Protocol-relative links ("//cdn.host/x.jpg") already name their
        # host; they only need a scheme, not the whole base_url.
        if link.startswith('//'):
            return f"{scheme}:{link}" if scheme else link
        if link.startswith('/'):
            return base_url + link
        return link

    # dict keys act as an insertion-ordered set, so the result is
    # de-duplicated but still deterministic.
    images = {}
    for img in soup.find_all('img'):
        candidates = [img.get('src') or img.get('data-src')]
        srcset = img.get('srcset')
        if srcset:
            candidates.append(get_highest_resolution_image(srcset))

        for candidate in candidates:
            if not candidate:
                continue
            absolute = absolutize(candidate)
            if is_valid_url(absolute):
                images[absolute] = None

    # Built after the loop so a page with no <img> tags yields an empty list
    # instead of an unbound local.
    highest_res_images = list(images)
    print(f"Images: {highest_res_images}")
    return vin, ', '.join(highest_res_images)

def create_directory(path):
    """Create ``path`` (and any missing parents) if it does not exist yet.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: For other filesystem failures (e.g. an empty path).
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists first.
    os.makedirs(path, exist_ok=True)

# Serialises updates to the shared results dict and rewrites of the JSON file:
# main() runs process_url concurrently from a thread pool.
_RESULTS_LOCK = threading.Lock()


def process_url(url, parsed_url, website_name, data, VIN_X_PATH=None, IMAGE_CONTAINER_X_PATH=None):
    """Scrape one page and persist the accumulated results.

    Args:
        url: Page to scrape.
        parsed_url: ``urlparse`` result of the site URL; its scheme and netloc
            form the base for root-relative image links.
        website_name: Directory in which ``inventory_car_data.json`` is written.
        data: Shared results dict; ``data['vins'][vin]`` is set when a VIN is
            found on the page.
        VIN_X_PATH: Forwarded to ``extract_vin_and_images``.
        IMAGE_CONTAINER_X_PATH: Forwarded to ``extract_vin_and_images``.

    The whole ``data`` dict is re-serialised to the JSON file on every call,
    whether or not this page produced a VIN.
    """
    driver = setup_driver()
    try:
        soup = fetch_and_parse(url, driver)
        vin, images = extract_vin_and_images(
            soup,
            base_url=f"{parsed_url.scheme}://{parsed_url.netloc}",
            driver=driver,
            VIN_X_PATH=VIN_X_PATH,
            IMAGE_CONTAINER_X_PATH=IMAGE_CONTAINER_X_PATH,
        )
    finally:
        # One browser is started per URL; always close it so failures do not
        # accumulate orphaned Firefox processes.
        driver.quit()

    # Hold the lock across both the mutation and the dump: json.dump iterates
    # the dict, and concurrent writers would otherwise truncate each other's
    # output file.
    with _RESULTS_LOCK:
        if vin:
            data['vins'][vin] = {'scraped_images_url': images}

        with open(os.path.join(website_name, 'inventory_car_data.json'), 'w') as f:
            json.dump(data, f, indent=4)

In [None]:
def main(url, VIN_X_PATH=None, IMAGE_CONTAINER_X_PATH=None):
    """Scrape VINs and image URLs for every inventory page of a site.

    The site's ``https://<host>/sitemap.xml`` is fetched, its URLs are
    filtered down to likely inventory pages, the selection is written to
    ``<host>/inventory_car_urls.txt``, and each page is processed on a pool
    of five worker threads (results land in ``<host>/inventory_car_data.json``
    via ``process_url``).

    A sitemap URL is kept when its path does not end in ``/`` and either
    * has more than one path segment, or
    * has a single segment that contains a digit and at least as many
      hyphens as the first path segment of the input ``url``.

    Args:
        url: Any page URL on the target site (used for the host name and for
            the hyphen threshold above).
        VIN_X_PATH: Passed through to ``process_url``.
        IMAGE_CONTAINER_X_PATH: Passed through to ``process_url``.

    Errors while fetching or filtering the sitemap are printed, not raised;
    per-page errors are printed and do not stop the remaining pages.
    """
    parsed_url = urlparse(url)
    website_name = parsed_url.netloc
    create_directory(website_name)
    sitemap_url = f'https://{website_name}/sitemap.xml'

    # Hyphen threshold for single-segment paths. Falls back to 0 when the
    # input URL has no path, where indexing segment [1] would raise.
    input_segments = parsed_url.path.split('/')
    min_hyphens = input_segments[1].count('-') if len(input_segments) > 1 else 0

    try:
        sitemap_content = fetch_sitemap(sitemap_url)
        all_urls = parse_sitemap(sitemap_content)

        inventory_urls = []
        for candidate in all_urls:
            if not candidate:
                # Empty <loc> entries carry no URL to visit.
                continue
            segments = urlparse(candidate).path.split('/')
            if segments[-1] == '':
                continue
            if len(segments) > 2:
                inventory_urls.append(candidate)
            elif (len(segments) == 2
                  and segments[1].count('-') >= min_hyphens
                  and any(char.isdigit() for char in segments[1])):
                inventory_urls.append(candidate)

        with open(os.path.join(website_name, 'inventory_car_urls.txt'), 'w') as f:
            f.write(", \n".join(inventory_urls))

        data = {
            # Fixed string key; previously the host name itself was used as
            # the key, producing {"<host>": "<host>"} in the output.
            'website_name': website_name,
            'vins': {}
        }

        with ThreadPoolExecutor(max_workers=5) as executor:
            future_to_url = {
                executor.submit(process_url, page_url, parsed_url, website_name,
                                data, VIN_X_PATH, IMAGE_CONTAINER_X_PATH): page_url
                for page_url in inventory_urls
            }

            for future in as_completed(future_to_url):
                page_url = future_to_url[future]
                try:
                    future.result()
                except Exception as exc:
                    print(f"{page_url} generated an exception: {exc}")

    except Exception as e:
        print("An error occurred:", str(e))


# Entry point: run the scraper against one sample vehicle-details URL.
if __name__ == "__main__":
    # Example XPath overrides, currently disabled; main() is called below
    # without them, so both parameters keep their default of None.
    # VIN_X_PATH = """//*[@id="dealr-page"]/div/div/div/div/div/div/div[4]/div/div/div/div/div[1]/div[2]"""
    # IMAGE_CONTAINER_X_PATH = """//*[@id="dealr-page"]/div/div/div/div/div/div/div[3]/div/div[1]"""

    # main() takes the host name from this URL and uses its first path
    # segment as the hyphen-count threshold when filtering sitemap entries.
    website_input_url = 'https://www.extremecarcenter.com/details/used-2012-buick-verano/98039731'
    main(website_input_url)