In [8]:
! pip install requests bs4 lxml



In [9]:

import os
import requests
import csv
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re

In [10]:


def fetch_sitemap(url):
    response = requests.get(url)
    response.raise_for_status()
    return response.text


def parse_sitemap(sitemap_content):
    root = ET.fromstring(sitemap_content)
    return [elem.text for elem in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')]


def filter_urls(urls, pattern):
    return [url for url in urls if pattern in url]


def fetch_and_parse(url):
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False


def extract_vin_and_images(soup):
    
    vin = soup.find('input', {'name': 'vin'})['value'] if soup.find(
        'input', {'name': 'vin'}) else 'No VIN found'
    images = [img['src'] for img in soup.select(
        '#media1-app-root img[src]') if is_valid_url(img['src'])]
    return vin, ', '.join(images)


def create_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)


def download_image(image_url, save_path):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(image_url, stream=True,
                                headers=headers, timeout=10)
        if response.status_code == 200:
            with open(save_path, 'wb') as out_file:
                out_file.write(response.content)
            # print(f"Downloaded {image_url} to {save_path}")
        else:
            # print(
            #     f"Failed to download {image_url} - Status code: {response.status_code}")
            pass
    except requests.exceptions.RequestException as e:
        print(f"An error occurred while downloading {image_url}: {str(e)}")


def main(website_url):
    parsed_url = urlparse(website_url)
    website_name = parsed_url.netloc
    base_directory = create_directory(website_name)
    sitemap_url = f'https://{website_name}/sitemap.xml'
    try:
        sitemap_content = fetch_sitemap(sitemap_url)
        all_urls = parse_sitemap(sitemap_content)
        used_car_urls = filter_urls(all_urls, 'used/')
        new_car_urls = filter_urls(all_urls, 'new/')

        # Save filtered URLs
        with open(os.path.join(website_name, 'used_car_urls.txt'), 'w') as f:
            f.writelines("\n".join(used_car_urls))
        with open(os.path.join(website_name, 'new_car_urls.txt'), 'w') as f:
            f.writelines("\n".join(new_car_urls))

        # Process each used car URL for images and VIN
        data = []
        for url in used_car_urls:
            soup = fetch_and_parse(url)
            vin, images = extract_vin_and_images(soup)
            if images:  # Ensure there are valid images
                # Adjust path for data subfolder
                vin_directory = os.path.join(website_name, 'data', vin)
                create_directory(vin_directory)
                for index, image_url in enumerate(images.split(', ')):
                    image_name = f"{index}_{os.path.basename(urlparse(image_url).path)}"
                    save_path = os.path.join(vin_directory, image_name)
                    download_image(image_url, save_path)
                data.append((vin, images))

            # Save data to CSV
            if data:
                with open(os.path.join(website_name, 'vin_image_data.csv'), 'w', newline='', encoding='utf-8') as file:
                    writer = csv.writer(file)
                    writer.writerow(['VIN', 'Image URLs'])
                    for vin, images in data:
                        writer.writerow([vin, images])
    except Exception as e:
        print("An error occurred:", str(e))


if __name__ == "__main__":
    website_input_url = 'https://www.sewelllexus-dallas.com/'
    main(website_input_url)

VIN found: B1BGU1MMHF5WRB9VW
VIN found: JTHKD5BH0F2216858
VIN found: 58ADA1C17RU044659
VIN found: 58ADA1C19RU045764
An error occurred: [Errno 2] No such file or directory: 'www.sewelllexus-dallas.com\\data\\58ADA1C19RU045764\\10_0fd4ee08ca74b5ea25b2c79ccc564ef4x.jpg'
