In [110]:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow import keras
from keras import layers, models
from PIL import Image
import numpy as np
import os
import random
import tensorflow as tf

IMAGE_SIZE = (48, 48)
CONTRAST_THRESHOLD = 4.5
OUTPUT_DIR = "scraped_images"
SCREENSHOT_PATH = "full_page_screenshot.png"
FINAL_SCREENSHOT_PATH = "final_screenshot.png"

chrome_driver_path = '/opt/homebrew/bin/chromedriver'
targetElements = ['h1']

def __init__(self, model=None):
    self.model = model
    self.driver = self._initialize_webdriver()
    self.selector_map = {}
    self.X = []
    self.y = []

def _initialize_webdriver(self):
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)
    
    return driver

def find_unique_xpath(self, element, soup):
    parts = []
    current = element

    while current and current.name != '[document]':  # Traverse up to the root
        if current.has_attr('id'):
            parts.insert(0, f"//{current.name}[@id='{current['id']}']")
            break  # IDs are unique; no need to go further
        
        if current.has_attr('class'):
            class_attr = ' '.join(current['class'])  # Classes are space-separated
            siblings = current.find_previous_siblings(name=current.name, class_=class_attr)
            index = len(siblings) + 1  # XPath indices are 1-based
            parts.insert(0, f"{current.name}[contains(@class, '{class_attr}')][{index}]")        
        else:
            siblings = current.find_previous_siblings(name=current.name)
            index = len(siblings) + 1  # XPath indices are 1-based
            parts.insert(0, f"{current.name}[{index}]")

        current = current.parent  # Move to the parent element

    # Combine all parts into a full XPath
    xpath = "/" + "/".join(parts)
    
    return xpath.replace("///", "//")

def calculate_contrast(self, image):
    """Calculate contrast ratio using WCAG guidelines."""
    grayscale_image = image.convert('L')
    pixel_values = np.array(grayscale_image)
    min_intensity = pixel_values.min()
    max_intensity = pixel_values.max()
    
    l1 = (max_intensity + 0.05) / 255
    l2 = (min_intensity + 0.05) / 255
    contrast_ratio = (l1 + 0.05) / (l2 + 0.05) if l1 > l2 else (l2 + 0.05) / (l1 + 0.05)
    
    return contrast_ratio

# def scrape_links(self, url, visited, max_pages=MAX_PAGES):
#     """Crawl the web starting from a URL, returning a list of links."""
#     to_visit = urls
#     links = []

#     while to_visit and len(links) < max_pages:
#         current_url = to_visit.pop(0)
        
#         if current_url in visited:
#             continue
#         try:
#             response = requests.get(url, timeout=3)
            
#             if response.status_code == 200:
#                 soup = BeautifulSoup(self.driver.page_source, "html.parser")
#                 visited.add(current_url)
#                 links.append(current_url)
#                 # Add new links to the queue                    
#                 for anchor in soup.find_all("a", href=True):
#                     href = anchor["href"]
#                     if href.startswith("http"):
#                         to_visit.append(href)
#         except Exception as e:
#             print(f"Failed to process {current_url}: {e}")
#     return links

def screenshot_element(self, element):    
    """Takes a screenshot of the target element and saves it."""
    element_location = element.location
    element_size = element.size
    screenshot = Image.open(SCREENSHOT_PATH)
    screenshot_width, screenshot_height = screenshot.size
    
    element_left = element_location['x']
    element_top = element_location['y']
    element_right = min(screenshot_width, element_left + element_size['width']) 
    element_bottom = min(element_top + element_size['height'], screenshot_height)
    
    # Crop the image using Pillow
    cropped_image = screenshot.crop((element_left, element_top, element_right, element_bottom))

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    
    element_screenshot_path = os.path.join(OUTPUT_DIR, f"screenshot_{random.randint(1, 1000000)}.png")
    cropped_image.save(element_screenshot_path)
    return element_screenshot_path

def preprocess_image(self, image_path):
    """Resize and preprocess image for CNN."""
    image = Image.open(image_path).convert('L')
    image = image.resize((48, 48))
    return np.array(image)

def fetch_html(self, url):
    """Fetch the HTML of a webpage."""
    try:
        self.driver.get(url)
        time.sleep(3)
        response = requests.get(url, timeout=3)
        response.raise_for_status()  # Will raise an exception for HTTP errors
        return self.driver.page_source
    except RequestException as e:
        print(f"Error fetching {url}: {e}")

def is_element_visible(self, element):
    try:        
        # Check if the element is displayed using is_displayed()
        if element.is_displayed():
            # Check if the element is within the viewport using JavaScript
            is_in_viewport = self.driver.execute_script("""
                var elem = arguments[0];
                var rect = elem.getBoundingClientRect();
                return (
                    rect.top >= 0 &&
                    rect.left >= 0 &&
                    rect.bottom <= (window.innerHeight || document.documentElement.clientHeight) &&
                    rect.right <= (window.innerWidth || document.documentElement.clientWidth)
                );
            """, element)
            
            if is_in_viewport:
                return True
            else:
                return False
        else:
            return False
    
    except TimeoutException:
        print("Timeout: Element not found or not visible within the timeout period.")
        return False
    except NoSuchElementException:
        print("Error: Element not found.")
        return False
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        return False

def scrape_site(self, link):        
    try:
        html_content = self.fetch_html(link)
        if html_content:
            soup = BeautifulSoup(html_content, "html.parser")

            # Set window size to current width, and max height
            window_size = self.driver.get_window_size()                            
            full_page_height = self.driver.execute_script("return document.documentElement.scrollHeight;") 
            self.driver.set_window_size(window_size['width'],full_page_height)
            self.driver.save_screenshot(SCREENSHOT_PATH) 
            
            screenshot = Image.open(SCREENSHOT_PATH)
            screenshot_width, screenshot_height = screenshot.size
            
            ratio = screenshot_width / screenshot_height    
            screenshot = screenshot.resize((window_size['width'], int(window_size['width'] / ratio)))
            screenshot.save(SCREENSHOT_PATH)

            # Extract target elements from body
            for element in soup.find_all(targetElements):
                text = element.get_text(strip=True)
                if text:
                    try:
                        selector = self.find_unique_xpath(element, soup)
                                                    
                        if (not selector):
                            continue
                        
                        driverElement = self.driver.find_element(By.XPATH, selector)
                        
                        if driverElement: 
                            is_visible = self.is_element_visible(driverElement)
                            
                            if (not is_visible):
                                continue
                                
                            screenshot_path = self.screenshot_element(driverElement)
                            image = self.preprocess_image(screenshot_path)                        
                            contrast_ratio = self.calculate_contrast(Image.fromarray(image.squeeze()))
                            self.X.append(image)
                            self.y.append(1 if contrast_ratio >= CONTRAST_THRESHOLD else 0)
                            self.selector_map[len(self.X) - 1] = selector
                    except Exception as e:
                        print(f"{e}")
    except Exception as e:
        print(f"Error processing {link}: {e}")

def scrape_and_process_data(self):
    """Scrape and process data into X and y."""
    visited_urls = set()
    scraped_links = self.scrape_links(START_URL, visited_urls)

    for link in scraped_links:
        self.scrape_site(link)

    print(self.y)
        
    self.X = np.array(self.X)
    self.y = np.array(self.y)

def train_model(self):
    """Train the CNN model."""
    X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
    smote = SMOTE(sampling_strategy='auto', k_neighbors=1)
    X_flattened = X_train.reshape(X_train.shape[0], -1)
    X_resampled, y_resampled = smote.fit_resample(X_flattened, y_train)  
    X_shaped = X_resampled.reshape(X_resampled.shape[0], *IMAGE_SIZE, 1)

    self.model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(*IMAGE_SIZE, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(2, activation='softmax')
    ])

    self.model.compile(optimizer='adam',
                       loss='sparse_categorical_crossentropy',
                       metrics=['accuracy'])

    self.model.fit(X_shaped, y_resampled, epochs=20, validation_data=(X_test, y_test))
    loss, accuracy = self.model.evaluate(X_test, y_test)
    print(f"Loss: {loss}, Accuracy: {accuracy}")

def close_driver(self):
    """Close the web driver after processing."""
    self.driver.quit()

def export_model(self, path):
    """Export the trained model to a file."""
    self.model.export(path)

def cleanup(self):
    os.remove(SCREENSHOT_PATH)
    try:
        for filename in os.listdir(OUTPUT_DIR):
            file_path = os.path.join(OUTPUT_DIR, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
        print("All files removed successfully.")
    except Exception as e:
        print(f"Error: {e}")

scrape_and_process_data()
train_model()
export_model("HTMLConvolutional")
close_driver()
cleanup()

Message: no such element: Unable to locate element: {"method":"xpath","selector":"/html[1]/body[1]/div[contains(@class, 'main-container homepage main-content-bg top-space')][1]/div[contains(@class, 'main-category-container flex')][1]/div[contains(@class, 'main-category-holder')][1]/div[contains(@class, 'top-box-holder flex')][1]/div[contains(@class, 'right-part')][1]/div[contains(@class, 'other-news-holder flex')][1]/a[contains(@class, 'news-item-container vijesti-text-hover scale-img-hover last-item')][1]/div[contains(@class, 'news-item-holder flex')][1]/h3[contains(@class, 'title')][1]"}
  (Session info: chrome=130.0.6723.117); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
0   chromedriver                        0x0000000104d0d5dc cxxbridge1$str$ptr + 3653648
1   chromedriver                        0x0000000104d05e3c cxxbridge1$str$ptr + 3623024
2   chromedriver             

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.5388 - loss: 19.7895 - val_accuracy: 0.5417 - val_loss: 0.9290
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.6850 - loss: 0.9184 - val_accuracy: 0.8333 - val_loss: 0.3868
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9503 - loss: 0.1649 - val_accuracy: 0.7917 - val_loss: 0.9017
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.8644 - loss: 0.2716 - val_accuracy: 0.8333 - val_loss: 0.2470
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.9539 - loss: 0.1622 - val_accuracy: 0.8333 - val_loss: 0.3931
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.9807 - loss: 0.1131 - val_accuracy: 0.8750 - val_loss: 0.2147
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

INFO:tensorflow:Assets written to: HTMLConvolutional/assets


Saved artifact at 'HTMLConvolutional'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 48, 48, 1), dtype=tf.float32, name='keras_tensor_200')
Output Type:
  TensorSpec(shape=(None, 2), dtype=tf.float32, name=None)
Captures:
  15328404944: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328405328: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328405520: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328406480: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328406672: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328407248: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328407440: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328408016: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328408208: TensorSpec(shape=(), dtype=tf.resource, name=None)
  15328408784: TensorSpec(shape=(), dtype=tf.resource, name=None)
All files removed successfully.
