In [1]:
! pip install requests bs4 lxml scrapy selenium



In [2]:
import os
import requests
import csv
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re

from lxml import html
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

In [3]:
def setup_driver():
    """Create and return a new Selenium Firefox WebDriver.

    Each call starts a separate browser session; the caller is responsible
    for shutting it down.
    """
    return webdriver.Firefox()

In [4]:
def fetch_sitemap(url, wait_seconds=20):
    """Load ``url`` in a fresh browser session and return the page source.

    Args:
        url: Address of the sitemap to load.
        wait_seconds: How long to sleep after navigation before reading the
            page source. Defaults to 20 seconds.

    Returns:
        The ``page_source`` string reported by the driver.

    The driver is always quit, even when navigation or reading the page
    source raises, so a failed fetch does not leave a browser running.
    """
    driver = setup_driver()
    try:
        driver.get(url)
        # Fixed pause to let the page finish loading before it is read.
        time.sleep(wait_seconds)
        # NOTE(review): page_source is the browser's serialisation of the
        # document; confirm it is still well-formed XML for parse_sitemap.
        return driver.page_source
    finally:
        driver.quit()


def parse_sitemap(sitemap_content):
    """Extract the URLs listed in a sitemap document.

    Args:
        sitemap_content: Sitemap XML as a string (or bytes).

    Returns:
        A list of the text of every ``<loc>`` element in the
        ``http://www.sitemaps.org/schemas/sitemap/0.9`` namespace, in document
        order. Surrounding whitespace is stripped and empty ``<loc>`` elements
        are skipped, so callers never receive ``None`` or padded URLs.

    Raises:
        xml.etree.ElementTree.ParseError: If the content is not well-formed XML.
    """
    root = ET.fromstring(sitemap_content)
    loc_path = './/{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
    urls = []
    for elem in root.findall(loc_path):
        # elem.text is None for an empty element; pretty-printed sitemaps
        # often wrap the URL in newlines and indentation.
        location = (elem.text or '').strip()
        if location:
            urls.append(location)
    return urls

In [5]:
def fetch_and_parse(url, driver, wait_seconds=30, content_fraction=0.65, close_driver=True):
    """Load ``url`` with ``driver`` and parse the leading part of the page.

    Args:
        url: Page to load.
        driver: Selenium WebDriver used for navigation.
        wait_seconds: Seconds to sleep after navigation before reading the
            page source. Defaults to 30.
        content_fraction: Fraction of the page source (counted in characters
            from the start) that is handed to the parser. Defaults to 0.65.
            NOTE(review): presumably this trims trailing page sections that
            are not part of the listing -- confirm the intent.
        close_driver: When true (the default), the driver's current window is
            closed before returning, including when loading fails. Pass
            ``False`` to keep the page open for further driver queries.

    Returns:
        A ``BeautifulSoup`` tree built with ``html.parser`` from the retained
        prefix of the page source.
    """
    try:
        driver.get(url)
        # Fixed pause so client-side rendering can finish before the read.
        time.sleep(wait_seconds)
        content = driver.page_source
    finally:
        # Close the window on failure too, so an error does not leave it open.
        if close_driver:
            driver.close()

    kept_length = int(len(content) * content_fraction)
    return BeautifulSoup(content[:kept_length], 'html.parser')


def is_valid_url(url):
    """Decide whether ``url`` should be kept as a scraped image URL.

    Returns ``False`` when the lower-cased path ends in ``.png``, ``.svg`` or
    ``.gif``, when it contains one of the excluded keywords, or when parsing
    raises ``ValueError``. Otherwise returns ``True`` only if the URL has both
    a scheme and a network location.
    """
    excluded_suffixes = ('.png', '.svg', '.gif')
    excluded_keywords = ('logo', 'icon', 'spinner', 'loading', 'facebook')

    try:
        parsed = urlparse(url)
        lowered_path = parsed.path.lower()

        if lowered_path.endswith(excluded_suffixes):
            return False
        if any(keyword in lowered_path for keyword in excluded_keywords):
            return False

        # Relative references (no scheme or no host) are rejected.
        return bool(parsed.scheme) and bool(parsed.netloc)
    except ValueError:
        return False


def get_highest_resolution_image(srcset):
    """Return the URL of the widest candidate in an ``srcset`` value.

    Args:
        srcset: Comma-separated ``"<url> <width>w"`` candidates.

    Returns:
        The URL with the largest positive width descriptor, or ``None`` when
        ``srcset`` is empty or no candidate has a usable ``w`` descriptor
        (for example density-only ``1x``/``2x`` lists).

    Candidates may be separated from their descriptor by any run of
    whitespace (spaces, tabs, newlines). Candidates whose width is not an
    integer are skipped rather than raising.
    """
    if not srcset:
        return None

    best_width = 0
    best_url = None
    for candidate in srcset.split(","):
        # split() with no argument collapses any whitespace run, so
        # multi-line or double-spaced srcset values are still recognised.
        parts = candidate.split()
        if len(parts) != 2 or not parts[1].endswith("w"):
            continue
        try:
            width = int(parts[1][:-1])  # drop the trailing 'w'
        except ValueError:
            # Malformed descriptor (e.g. "1.5w"): ignore this candidate only.
            continue
        if width > best_width:
            best_width, best_url = width, parts[0]
    return best_url


def _absolute_image_url(raw_url, base_url, strip_query=True):
    """Normalise one image reference, returning ``None`` if it should be skipped.

    Optionally drops the query string, resolves protocol-relative (``//host/...``)
    and root-relative (``/path``) references against ``base_url``, and keeps
    the result only when ``is_valid_url`` accepts it.
    """
    if not raw_url:
        return None
    candidate = raw_url.split('?')[0] if strip_query else raw_url
    if candidate.startswith('//'):
        # Protocol-relative reference: reuse the scheme of the page's site.
        scheme = urlparse(base_url).scheme or 'https'
        candidate = f"{scheme}:{candidate}"
    elif candidate.startswith('/'):
        candidate = base_url + candidate
    return candidate if is_valid_url(candidate) else None


def extract_vin_and_images(soup, base_url, driver, VIN_X_PATH=None, IMAGE_CONTAINER_X_PATH=None):
    """Extract a VIN and the image URLs for one vehicle page.

    Args:
        soup: Parsed page; used for the regex VIN search and ``<img>`` scan.
        base_url: ``scheme://host`` prefix used to absolutise relative image URLs.
        driver: Selenium WebDriver. Only queried when an XPath is supplied, in
            which case it must still have the page loaded.
        VIN_X_PATH: Optional XPath of the element whose text is the VIN. When
            omitted, the first 17-character run of ``A-H, J-N, P, R-Z, 0-9``
            found in the page's text nodes is used.
        IMAGE_CONTAINER_X_PATH: Optional XPath of an element whose ``<img>``
            descendants are collected. When omitted, every ``<img>`` in
            ``soup`` is scanned (``src``/``data-src`` plus the widest
            ``srcset`` candidate).

    Returns:
        ``(vin, images)`` where ``vin`` is a string or ``None`` and ``images``
        is a sorted list of unique absolute image URLs.

    Both results are also printed.
    """
    images = set()

    if VIN_X_PATH:
        vin = driver.find_element(by=By.XPATH, value=VIN_X_PATH).text
    else:
        vin = None
        vin_pattern = re.compile(r'[A-HJ-NPR-Z0-9]{17}')
        # Scan the page's text nodes and stop at the first match.
        for text in soup.stripped_strings:
            match = vin_pattern.search(text)
            if match:
                vin = match.group()
                break
    print(f"VIN: {vin}")

    if IMAGE_CONTAINER_X_PATH:
        container = driver.find_element(by=By.XPATH, value=IMAGE_CONTAINER_X_PATH)
        for img in container.find_elements(By.TAG_NAME, 'img'):
            resolved = _absolute_image_url(img.get_attribute('src'), base_url)
            if resolved:
                images.add(resolved)
    else:
        for img in soup.find_all('img'):
            # An <img> may carry neither attribute; the helper treats a
            # missing value as "skip" instead of failing on None.
            resolved = _absolute_image_url(img.get('src') or img.get('data-src'), base_url)
            if resolved:
                images.add(resolved)

            srcset = img.get('srcset')
            if srcset:
                # srcset candidates keep their query string, unlike src.
                resolved = _absolute_image_url(
                    get_highest_resolution_image(srcset), base_url, strip_query=False)
                if resolved:
                    images.add(resolved)

    print(f"Images: {images}")
    # Sorted so the returned list does not depend on set iteration order.
    return vin, sorted(images)

# def extract_vin_and_images(soup):
#     # Extract VIN
#     vin = None

#     if not vin:
#         vin_pattern = re.compile(r'[A-HJ-NPR-Z0-9]{17}')
#         for script in soup.find_all('script'):
#             if script.string:
#                 match = vin_pattern.search(script.string)
#                 if match:
#                     vin = match.group()
#                     break
#     print(f"VIN: {vin}")

#     # Extract images
#     images = set()

#     for img in soup.find_all('img'):
#         if img.has_attr('src'):
#             img_url = img['src']
#             if is_valid_url(img_url):
#                 images.add(img_url)
#     print(f"Images: {images}")
#     return vin, ', '.join(images)


def create_directory(path):
    """Create ``path`` (and any missing parents) if it does not already exist.

    Uses ``exist_ok=True`` so there is no window between checking for the
    directory and creating it; calling this repeatedly is safe.

    Raises:
        OSError: If the directory cannot be created (for example, ``path``
            exists but is not a directory).
    """
    os.makedirs(path, exist_ok=True)


# def download_image(image_url, save_path):
#     """Download an image only if it's not already downloaded."""
#     headers = {
#         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
#     try:
#         response = requests.get(image_url, stream=True,
#                                 headers=headers, timeout=10)
#         if response.status_code == 200:
#             with open(save_path, 'wb') as out_file:
#                 out_file.write(response.content)
#             # print(f"Downloaded {image_url} to {save_path}")
#         else:
#             # print(
#             #     f"Failed to download {image_url} - Status code: {response.status_code}")
#             pass
#     except requests.exceptions.RequestException as e:
#         print(f"An error occurred while downloading {image_url}: {str(e)}")


# def normalize_filenames(files):
#     """Strip numerical prefixes from filenames to handle existing checks."""
#     normalized = set()
#     # Matches any numbers followed by an underscore at the start
#     pattern = re.compile(r"^\d+_")
#     for file in files:
#         normalized_name = pattern.sub('', file)  # Remove the prefix
#         normalized.add(normalized_name)
#     return normalized


# Guards the shared results dict and its JSON file: process_url runs on several
# worker threads that all update the same dict and rewrite the same file.
_INVENTORY_DATA_LOCK = threading.Lock()


def process_url(url, parsed_url, website_name, data, VIN_X_PATH=None, IMAGE_CONTAINER_X_PATH=None):
    """Scrape one vehicle page and persist the accumulated results.

    Args:
        url: Page to scrape.
        parsed_url: ``urlparse`` result whose scheme and netloc form the base
            URL for relative image paths.
        website_name: Directory that holds ``inventory_car_data.json``.
        data: Shared results dict; ``data['vins'][vin]`` is set when a VIN is
            found on the page.
        VIN_X_PATH, IMAGE_CONTAINER_X_PATH: Accepted for interface
            compatibility but not forwarded to the extractor.
            NOTE(review): fetch_and_parse closes the browser window before
            extraction runs, so XPath lookups against ``driver`` are not
            expected to work from here -- confirm before wiring them through.

    Side effects:
        Starts a browser session (always quit before returning) and rewrites
        ``<website_name>/inventory_car_data.json`` with the current ``data``.
    """
    driver = setup_driver()
    try:
        soup = fetch_and_parse(url, driver)
        vin, images = extract_vin_and_images(
            soup,
            base_url=f"{parsed_url.scheme}://{parsed_url.netloc}",
            driver=driver,
        )
    finally:
        # Always end the session; closing a window alone does not stop the
        # driver process. A cleanup failure must not mask the scrape result.
        try:
            driver.quit()
        except Exception as exc:
            print(f"Failed to quit driver for {url}: {exc}")

    output_path = os.path.join(website_name, 'inventory_car_data.json')
    # Update and dump under one lock so no thread serialises the dict while
    # another is inserting into it, and file writes never interleave.
    with _INVENTORY_DATA_LOCK:
        if vin:
            data['vins'][vin] = {'scraped_images_url': images}

        with open(output_path, 'w') as f:
            json.dump(data, f, indent=4)

In [6]:
def _looks_like_inventory_url(page_url, reference_dash_count):
    """Return True when a sitemap URL's path shape suggests a vehicle page.

    URLs whose path ends in ``/`` (or is empty) are rejected. Paths with two
    or more segments are accepted. A single-segment path is accepted only if
    that segment contains a digit and at least ``reference_dash_count`` dashes.
    """
    components = urlparse(page_url).path.split('/')
    if components[-1] == '':
        return False
    if len(components) > 2:
        return True
    return (
        len(components) >= 2
        and components[1].count('-') >= reference_dash_count
        and any(char.isdigit() for char in components[1])
    )


def main(url, VIN_X_PATH=None, IMAGE_CONTAINER_X_PATH=None):
    """Scrape VINs and image URLs for every inventory page of a dealer site.

    Args:
        url: Any page URL on the target site. Its host selects the sitemap
            (``https://<host>/sitemap.xml``) and names the output directory;
            the dash count of its first path segment is the threshold used
            when filtering single-segment sitemap URLs.
        VIN_X_PATH, IMAGE_CONTAINER_X_PATH: Optional XPaths passed on to
            ``process_url`` for each page.

    Side effects:
        Creates ``<host>/``, writes ``<host>/inventory_car_urls.txt`` and lets
        the workers maintain ``<host>/inventory_car_data.json``. Pages are
        processed on up to five threads. Errors are reported with ``print``
        rather than raised.
    """
    parsed_url = urlparse(url)
    website_name = parsed_url.netloc
    create_directory(website_name)
    sitemap_url = f'https://{website_name}/sitemap.xml'
    try:
        sitemap_content = fetch_sitemap(sitemap_url)
        all_urls = parse_sitemap(sitemap_content)

        # An input URL without a path has no first segment; fall back to a
        # threshold of 0 instead of failing with an IndexError.
        input_segments = parsed_url.path.split('/')
        reference_dash_count = input_segments[1].count('-') if len(input_segments) > 1 else 0

        inventory_urls = []
        for candidate in all_urls:
            # Sitemap entries can be empty or padded with whitespace.
            candidate = (candidate or '').strip()
            if candidate and _looks_like_inventory_url(candidate, reference_dash_count):
                inventory_urls.append(candidate)

        with open(os.path.join(website_name, 'inventory_car_urls.txt'), 'w') as f:
            f.write(", \n".join(inventory_urls))

        # Fixed 'website_name' key so the JSON has the same schema for every
        # site, instead of a key that is itself the host name.
        data = {
            'website_name': website_name,
            'vins': {}
        }

        with ThreadPoolExecutor(max_workers=5) as executor:
            future_to_url = {
                executor.submit(process_url, page_url, parsed_url, website_name,
                                data, VIN_X_PATH, IMAGE_CONTAINER_X_PATH): page_url
                for page_url in inventory_urls
            }

            for future in as_completed(future_to_url):
                page_url = future_to_url[future]
                try:
                    future.result()
                except Exception as exc:
                    # One failing page must not stop the remaining pages.
                    print(f"{page_url} generated an exception: {exc}")

    except Exception as e:
        print("An error occurred:", str(e))


if __name__ == "__main__":
    # Optional per-site XPath overrides (currently disabled). main() accepts them
    # as its VIN_X_PATH / IMAGE_CONTAINER_X_PATH arguments.
    # VIN_X_PATH = """//*[@id="dealr-page"]/div/div/div/div/div/div/div[4]/div/div/div/div/div[1]/div[2]"""
    # IMAGE_CONTAINER_X_PATH = """//*[@id="dealr-page"]/div/div/div/div/div/div/div[3]/div/div[1]"""

    # main() uses this URL's host to locate the sitemap and name the output
    # directory, and its first path segment as the reference when filtering
    # single-segment sitemap URLs.
    website_input_url = 'https://www.johnsonsdalby.com.au/used-cars/for-sale/nissan/d23/2019/sl/navara-4x4-2.3-dsl-man-dc-sl/2924472'
    main(website_input_url)

VIN: MMAJNKB40BD016288
Images: {'https://imotorcarsearch.s3.amazonaws.com/vehicles/medium/2624981_1001148_3_ea47bf8880373f5a.jpg', 'https://imotorcarsearch.s3.amazonaws.com/vehicles/medium/2624981_1001148_5_2e2319db0771c1e2.jpg', 'https://imotorcarsearch.s3.amazonaws.com/vehicles/small/2624981_1001148_10_f48cd258eee98738.jpg', 'https://imotorcarsearch.s3.amazonaws.com/vehicles/small/2624981_1001148_1_04a51d8d03a3ff6e.jpg', 'https://imotorcarsearch.s3.amazonaws.com/vehicles/medium/2624981_1001148_10_f48cd258eee98738.jpg', 'https://imotorcarsearch.s3.amazonaws.com/vehicles/medium/2624981_1001148_7_072197fe969b9944.jpg', 'https://api.mapbox.com/styles/v1/mapbox/streets-v11/static/url-https%3A%2F%2Fwww.johnsonsdalby.com.au%2Fcarsearch_v3%2Fimages%2Fmap-marker.png(151.2620192,-27.1826758)/151.2620192,-27.1826758,13/340x200@2x', 'https://imotorcarsearch.s3.amazonaws.com/vehicles/small/2624981_1001148_12_eb59f59996146ef3.jpg', 'https://imotorcarsearch.s3.amazonaws.com/vehicles/medium/2624981_