In [1]:
! pip install requests bs4 lxml scrapy selenium



In [2]:
import json
import os
import re
import threading
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from selenium import webdriver

In [3]:
def setup_driver():
    """Create a Firefox WebDriver with default options.

    Returns:
        The object produced by ``webdriver.Firefox()``. Nothing here closes
        it, so the caller has to call ``quit()`` when finished.
    """
    return webdriver.Firefox()

In [4]:
def fetch_sitemap(url):
    """Load ``url`` in a fresh browser session and return the page source.

    Args:
        url: Address of the sitemap to open.

    Returns:
        The driver's ``page_source`` as read after a fixed 5 second wait.

    Raises:
        Whatever ``driver.get`` raises; the browser is still shut down first.
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Fixed pause before reading the page (presumably to let it finish
        # loading/rendering).
        time.sleep(5)
        return driver.page_source
    finally:
        # Runs on success and on failure, so a navigation error no longer
        # leaves a browser process behind.
        driver.quit()


def parse_sitemap(sitemap_content):
    """Extract the URLs listed in a sitemap XML document.

    Args:
        sitemap_content: Sitemap XML in a form ``ET.fromstring`` accepts.

    Returns:
        list[str]: The text of every ``<loc>`` descendant in the
        sitemaps.org 0.9 namespace, in document order. Surrounding
        whitespace is stripped and empty ``<loc>`` elements are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    loc_path = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
    root = ET.fromstring(sitemap_content)

    urls = []
    for elem in root.findall(loc_path):
        # elem.text is None for an empty element and may carry the newlines /
        # indentation of pretty-printed XML; neither is usable as a URL.
        text = (elem.text or '').strip()
        if text:
            urls.append(text)
    return urls

In [5]:
def fetch_and_parse(url):
    """Load ``url`` in a fresh browser session and parse the resulting HTML.

    Args:
        url: Address of the page to open.

    Returns:
        ``BeautifulSoup(content, 'html.parser')`` where ``content`` is the
        driver's ``page_source`` read after a fixed 10 second wait.

    Raises:
        Whatever ``driver.get`` raises; the browser is still shut down first.
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Fixed pause before reading the page (presumably to let scripts
        # finish populating it).
        time.sleep(10)
        content = driver.page_source
    finally:
        # Runs on success and on failure, so a navigation error no longer
        # leaves a browser process behind.
        driver.quit()
    return BeautifulSoup(content, 'html.parser')


def is_valid_url(url):
    """Tell whether ``url`` is an absolute URL that passes the image filters.

    A URL is rejected when its path ends in ``.png``, ``.svg`` or ``.gif``
    (case-insensitive), when its path contains ``logo`` (case-insensitive),
    when it lacks a scheme or a network location, or when ``urlparse``
    raises ``ValueError`` for it.

    Args:
        url: The URL string to check.

    Returns:
        bool: True only if none of the rejection rules applies.
    """
    excluded_suffixes = ('.png', '.svg', '.gif')
    try:
        parts = urlparse(url)
        lowered_path = parts.path.lower()
        if lowered_path.endswith(excluded_suffixes) or 'logo' in lowered_path:
            return False
        return bool(parts.scheme) and bool(parts.netloc)
    except ValueError:
        return False


def extract_vin_and_images(soup):
    """Pull a VIN candidate and the gallery image URLs out of a parsed page.

    The VIN is the first run of 17 characters drawn from ``A-H``, ``J-N``,
    ``P``, ``R-Z`` and digits found in the page's text strings. Image URLs
    are taken from ``<img src>`` inside every ``<section>`` whose id matches
    ``vdp-photos-dealershipPhotoGallery``; each one is filtered through
    ``is_valid_url``, cut at the first ``?`` and has every ``x<digits>``
    token removed.

    Args:
        soup: Parsed page exposing ``stripped_strings`` and ``find_all``.

    Returns:
        tuple: ``(vin, images)`` where ``vin`` is a string or None and
        ``images`` is a list of unique URLs in document order.
    """
    vin_pattern = re.compile(r'[A-HJ-NPR-Z0-9]{17}')
    gallery_id = re.compile(r'vdp-photos-dealershipPhotoGallery.*')
    # Removes every "x<digits>" token from the URL (presumably a size
    # suffix - verify against the site's image URL scheme).
    size_token = re.compile(r'x\d+')

    vin = None
    for text in soup.stripped_strings:
        match = vin_pattern.search(text)
        if match:
            vin = match.group()
            break
    print(f"VIN: {vin}")

    # A dict is used as an ordered set. De-duplicating AFTER normalization
    # matters: two variants of the same photo collapse to one URL once the
    # query string and the x<digits> token are removed.
    unique_images = {}
    for section in soup.find_all('section', id=gallery_id):
        for img in section.find_all('img'):
            src = img.get('src')
            if not src or not is_valid_url(src):
                continue
            normalized = size_token.sub('', src.split('?')[0])
            unique_images.setdefault(normalized, None)

    images = list(unique_images)
    print(f"Images: {images}")
    return vin, images


def create_directory(path):
    """Make sure the directory ``path`` exists, creating parents as needed.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first: another thread/process creating the directory in between would
    # otherwise make makedirs fail.
    os.makedirs(path, exist_ok=True)


# Serializes updates to the shared results dict and rewrites of the JSON file.
# main() submits process_url to a thread pool, so several calls run at once.
_DATA_LOCK = threading.Lock()


def process_url(url, website_name, data):
    """Scrape one inventory page and persist the accumulated results.

    Args:
        url: Page to fetch.
        website_name: Directory that receives ``inventory_car_data.json``.
        data: Shared results dict; when a VIN is found, ``data['vins'][vin]``
            is set to ``{'scraped_images_url': images}``.

    The JSON file is rewritten from ``data`` on every call, whether or not
    a VIN was found.
    """
    # The slow part (browser fetch + parsing) stays outside the lock so the
    # workers still overlap.
    soup = fetch_and_parse(url)
    vin, images = extract_vin_and_images(soup)

    output_path = os.path.join(website_name, 'inventory_car_data.json')
    with _DATA_LOCK:
        # Mutating the dict and dumping it must be one atomic step: otherwise
        # json.dump can iterate the dict while another thread inserts into
        # it, and two threads can write the same file simultaneously.
        if vin:
            data['vins'][vin] = {'scraped_images_url': images}

        with open(output_path, 'w') as f:
            json.dump(data, f, indent=4)

In [6]:
def main(url):
    """Scrape every inventory page listed in a dealer site's sitemap.

    Args:
        url: A URL on the target site. Its host names the output directory
            and the sitemap location; the hyphen count of its first path
            segment is the minimum required of single-segment sitemap URLs.

    Side effects:
        Creates a directory named after the host, writes
        ``inventory_car_urls.txt`` into it, and runs ``process_url`` for each
        selected URL on a pool of 5 threads. Errors are printed, not raised.
    """
    parsed_url = urlparse(url)
    website_name = parsed_url.netloc
    create_directory(website_name)
    sitemap_url = f'https://{website_name}/sitemap-inventory-sincro.xml'

    # Computed once instead of per sitemap entry. An input URL without a
    # path has no first segment; fall back to 0 rather than IndexError.
    reference_segments = parsed_url.path.split('/')
    min_hyphens = reference_segments[1].count('-') if len(reference_segments) > 1 else 0

    try:
        sitemap_content = fetch_sitemap(sitemap_url)
        all_urls = parse_sitemap(sitemap_content)

        inventory_urls = []
        for raw_url in all_urls:
            # Tolerate missing or whitespace-padded sitemap entries.
            candidate = (raw_url or '').strip()
            if not candidate:
                continue

            segments = urlparse(candidate).path.split('/')
            if segments[-1] == '':
                # Empty path or trailing slash: not treated as a vehicle page.
                continue

            if len(segments) > 2:
                # Two or more path segments: always kept.
                inventory_urls.append(candidate)
            elif (
                len(segments) == 2
                and segments[1].count('-') >= min_hyphens
                and any(char.isdigit() for char in segments[1])
            ):
                # Single segment: kept only if it has at least as many
                # hyphens as the reference URL's and contains a digit.
                inventory_urls.append(candidate)

        with open(os.path.join(website_name, 'inventory_car_urls.txt'), 'w') as f:
            f.write(", \n".join(inventory_urls))

        data = {
            "website_name": website_name,
            'vins': {}
        }

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = {
                executor.submit(process_url, page_url, website_name, data): page_url
                for page_url in inventory_urls
            }

        # Leaving the with-block waits for every worker. submit() stores a
        # worker's exception on its future, so surface it here instead of
        # letting it vanish.
        for future, page_url in futures.items():
            error = future.exception()
            if error is not None:
                print(f"Failed to process {page_url}: {error}")

    except Exception as e:
        print("An error occurred:", str(e))


if __name__ == "__main__":
    # Example vehicle page; main() takes the site host and the hyphen
    # threshold for single-segment sitemap URLs from it.
    website_input_url = (
        'https://www.spurrchevrolet.com/VehicleDetails/'
        'new-2024-Chevrolet-Silverado_3500_HD-Crew_Cab_Standard_Box_4_Wheel_Drive_High_Country-BROCKPORT-NY/'
        '5803164470'
    )
    main(website_input_url)

VIN: 3GNKBJR45RS227020
Images: ['https://inv.assets.ansira.net/5/7/6/33067454675.jpg', 'https://inv.assets.ansira.net/8/3/3/33094559338.jpg', 'https://inv.assets.ansira.net/3/8/6/33067454683.jpg', 'https://inv.assets.ansira.net/3/2/3/33094559323.jpg', 'https://inv.assets.ansira.net/9/2/3/33094559329.jpg', 'https://inv.assets.ansira.net/7/7/5/33027917577.jpg', 'https://inv.assets.ansira.net/5/3/3/33094559335.jpg', 'https://inv.assets.ansira.net/0/4/3/33094559340.jpg', 'https://inv.assets.ansira.net/9/7/6/33067454679.jpg', 'https://inv.assets.ansira.net/1/3/3/33094559331.jpg', 'https://inv.assets.ansira.net/9/7/5/33027917579.jpg', 'https://inv.assets.ansira.net/6/8/6/33067454686.jpg', 'https://inv.assets.ansira.net/0/7/6/33067454670.jpg', 'https://inv.assets.ansira.net/1/7/6/33067454671.jpg', 'https://inv.assets.ansira.net/5/8/6/33067454685.jpg', 'https://inv.assets.ansira.net/6/2/3/33094559326.jpg', 'https://inv.assets.ansira.net/2/4/3/33094559342.jpg', 'https://inv.assets.ansira.net/8/