In [36]:
! pip install requests bs4 lxml scrapy selenium



In [37]:
import os
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
import requests
import json

In [38]:
def fetch_sitemap(url, timeout=30):
    """Download a sitemap document and return its body as text.

    Args:
        url: Absolute URL of the sitemap to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled host.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses so an HTML error page is never handed to
    # the XML parser as if it were a sitemap.
    response.raise_for_status()
    return response.text


def parse_sitemap(sitemap_content):
    """Extract every ``<loc>`` URL from a sitemap XML document.

    Args:
        sitemap_content: The sitemap XML as a string (or bytes).

    Returns:
        A list of URL strings in document order. Surrounding whitespace is
        stripped and empty ``<loc>`` elements are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    root = ET.fromstring(sitemap_content)
    loc_path = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'

    urls = []
    for elem in root.findall(loc_path):
        # ``elem.text`` is None for an empty element and frequently carries
        # newlines/indentation in pretty-printed sitemaps.
        location = (elem.text or '').strip()
        if location:
            urls.append(location)
    return urls

In [39]:
def fetch_and_parse(url, timeout=30):
    """Download a page and parse it into a ``BeautifulSoup`` tree.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled host.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        built-in ``html.parser``.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    content = response.text
    return BeautifulSoup(content, 'html.parser')


def is_valid_url(url):
    """Return True when ``url`` is an absolute URL that is not filtered out.

    A URL is rejected when:
      * ``urlparse`` raises ``ValueError`` for it,
      * its path ends in ``.png``, ``.svg`` or ``.gif`` (case-insensitive),
      * its path contains ``logo`` (case-insensitive), or
      * it has no scheme or no network location.
    """
    try:
        parts = urlparse(url)
    except ValueError:
        return False

    lowered_path = parts.path.lower()
    has_excluded_suffix = lowered_path.endswith(('.png', '.svg', '.gif'))
    if has_excluded_suffix or 'logo' in lowered_path:
        return False

    # Both a scheme and a host are required, i.e. relative URLs are rejected.
    return bool(parts.scheme) and bool(parts.netloc)


def extract_vin_and_images(soup):
    """Find the first VIN-like token and the lazy-loaded image URLs on a page.

    Args:
        soup: Parsed page; must provide ``stripped_strings`` and
            ``find_all('img')`` (as a ``BeautifulSoup`` object does).

    Returns:
        A ``(vin, images)`` tuple. ``vin`` is the first 17-character token
        made only of the characters allowed by the pattern below, or ``None``
        when no such token is found. ``images`` is a sorted list of the
        distinct ``data-src`` values of ``<img>`` tags that pass
        ``is_valid_url``.
    """
    # The look-arounds require the 17 characters to stand alone, so that a
    # slice of a longer alphanumeric token is not reported as a VIN.
    vin_pattern = re.compile(r'(?<![A-Za-z0-9])[A-HJ-NPR-Z0-9]{17}(?![A-Za-z0-9])')

    vin = None
    for text in soup.stripped_strings:
        match = vin_pattern.search(text)
        if match:
            vin = match.group()
            break
    print(f"VIN: {vin}")

    images = set()
    for img in soup.find_all('img'):
        # Only the ``data-src`` attribute is read; ``src`` is ignored.
        src = img.get('data-src')
        if src and is_valid_url(src):
            images.add(src)
    print(f"Images: {images}")

    # Sort so that the returned list is stable from one run to the next
    # (set iteration order is not).
    return vin, sorted(images)


def create_directory(path):
    """Ensure that directory ``path`` exists, creating parents as needed.

    Calling it for a directory that already exists is a no-op.

    Raises:
        OSError: If the directory cannot be created (for example ``path`` is
            empty or already exists as a regular file).
    """
    # ``exist_ok=True`` avoids the check-then-create race of testing
    # ``os.path.exists`` first.
    os.makedirs(path, exist_ok=True)


def process_url(url, website_name, data):
    """Scrape one page, record its result in ``data`` and rewrite the JSON file.

    Args:
        url: Page to download and inspect.
        website_name: Directory in which ``inventory_car_data.json`` is written.
        data: Accumulator dict with a ``'vins'`` mapping; updated in place
            when a VIN is found on the page.

    The JSON file is rewritten on every call, whether or not a VIN was found.
    """
    page = fetch_and_parse(url)
    found_vin, image_urls = extract_vin_and_images(page)

    if found_vin:
        vin_entry = {'scraped_images_url': image_urls}
        data['vins'][found_vin] = vin_entry

    output_path = os.path.join(website_name, 'inventory_car_data.json')
    with open(output_path, 'w') as json_file:
        json.dump(data, json_file, indent=4)

In [40]:
def _is_inventory_url(candidate_url, min_hyphens):
    """Heuristically decide whether a sitemap URL is an inventory page."""
    if not candidate_url:
        return False
    try:
        components = urlparse(candidate_url).path.split('/')
    except ValueError:
        # Malformed sitemap entry: skip it instead of aborting the whole run.
        return False

    if components[-1] == '':
        # Empty paths and paths ending in "/" are skipped.
        return False
    if len(components) > 2:
        # Anything nested at least two levels deep is accepted as-is.
        return True
    # Single-segment paths must contain a digit and at least as many hyphens
    # as the first path segment of the URL given to ``main``.
    return (
        len(components) >= 2
        and components[1].count('-') >= min_hyphens
        and any(char.isdigit() for char in components[1])
    )


def main(url):
    """Scrape the inventory pages listed in a site's ``sitemap.xml``.

    Creates a directory named after the site's host, writes the selected
    URLs to ``inventory_car_urls.txt`` inside it, and then calls
    ``process_url`` for each of them.

    Args:
        url: An absolute URL on the target site. Its host selects the site;
            the hyphen count of its first path segment (0 when there is
            none) is the minimum required of single-segment sitemap URLs.

    Errors are reported with ``print`` rather than raised. A failure on one
    page is reported and the remaining pages are still processed.
    """
    parsed_url = urlparse(url)
    website_name = parsed_url.netloc
    if not website_name:
        # Without a host there is neither a directory name nor a sitemap URL.
        print("An error occurred:", f"no host found in URL {url!r}")
        return

    create_directory(website_name)

    # Honour the scheme of the input URL; fall back to https when it has none.
    scheme = parsed_url.scheme or 'https'
    sitemap_url = f'{scheme}://{website_name}/sitemap.xml'

    # 'https://host' has an empty path, so there may be no first segment.
    input_components = parsed_url.path.split('/')
    min_hyphens = input_components[1].count('-') if len(input_components) > 1 else 0

    try:
        sitemap_content = fetch_sitemap(sitemap_url)
        all_urls = parse_sitemap(sitemap_content)

        inventory_urls = [
            candidate for candidate in all_urls
            if _is_inventory_url(candidate, min_hyphens)
        ]

        with open(os.path.join(website_name, 'inventory_car_urls.txt'), 'w') as f:
            f.write(", \n".join(inventory_urls))

        data = {
            "website_name": website_name,
            'vins': {}
        }

        for inventory_url in inventory_urls:
            try:
                process_url(inventory_url, website_name, data)
            except Exception as e:
                # Best-effort crawl: report the failing page and keep going.
                print(f"An error occurred while processing {inventory_url}:", str(e))

    except Exception as e:
        print("An error occurred:", str(e))


# Entry point: run the scraper when this cell/script is executed directly.
if __name__ == "__main__":
    # main() takes the site host (and from it the sitemap location) from this URL.
    website_input_url = 'https://almva.com/'
    main(website_input_url)

VIN: SALRU2RV5L2442179
Images: {'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918822529720.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918752200279.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918777013801.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918840141515.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918833340556.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918758796049.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918909789660.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918767853247.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918787862657.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918842535660.jpg', 'https://imagescdn.dealercarsearch.com/Media/15119/21117921/638475918901542639.jpg', 'https://imagescdn.dealercarsearc