In [99]:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow import keras
from keras import layers, models
from PIL import Image
import numpy as np
import os
import random
import tensorflow as tf

# Scrape configuration
MAX_PAGES = 1
START_URL = "https://www.nytimes.com/timeswire"  # Replace with a seed URL
IMAGE_SIZE = (48, 48)
CONTRAST_THRESHOLD = 4.5
OUTPUT_DIR = "scraped_images"
SCREENSHOT_PATH = "full_page_screenshot.png"
FINAL_SCREENSHOT_PATH = "final_screenshot.png"
chrome_driver_path = '/opt/homebrew/bin/chromedriver'
targetElements = ['h1', 'h3']

class CustomModel:
    def __init__(self, model=None):
        self.model = model
        self.driver = self._initialize_webdriver()
        self.css_selector_map = {}
        self.X = []
        self.y = []

    def _initialize_webdriver(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)
        
        return driver

    def get_css_selector(self, element):
        """Generates a unique CSS selector for a given HTML element."""
        selector = element.name
        if element.get('id'):
            selector += f'#{element["id"]}'
        if element.get('class'):
            selector += '.' + '.'.join(element["class"])
        for attribute, value in element.attrs.items():
            if attribute not in ['id', 'class']:
                selector += f'[{attribute}="{value}"]'
        return selector

    def calculate_contrast(self, image):
        """Calculate contrast ratio using WCAG guidelines."""
        grayscale_image = image.convert('L')
        pixel_values = np.array(grayscale_image)
        min_intensity = pixel_values.min()
        max_intensity = pixel_values.max()
        
        l1 = (max_intensity + 0.05) / 255
        l2 = (min_intensity + 0.05) / 255
        contrast_ratio = (l1 + 0.05) / (l2 + 0.05) if l1 > l2 else (l2 + 0.05) / (l1 + 0.05)
        
        return contrast_ratio

    def scrape_links(self, url, visited, max_pages=MAX_PAGES):
        """Crawl the web starting from a URL, returning a list of links."""
        to_visit = [url]
        links = []
        while to_visit and len(links) < max_pages:
            current_url = to_visit.pop(0)
            if current_url in visited:
                continue
            try:
                response = requests.get(url, timeout=3)
                if response.status_code == 200:
                    soup = BeautifulSoup(self.driver.page_source, "html.parser")
                    visited.add(current_url)
                    links.append(current_url)
                    # Add new links to the queue
                    for anchor in soup.find_all("a", href=True):
                        href = anchor["href"]
                        if href.startswith("http"):
                            to_visit.append(href)
            except Exception as e:
                print(f"Failed to process {current_url}: {e}")
        return links[:max_pages]

    def screenshot_element(self, element):    
        """Takes a screenshot of the target element and saves it."""
        element_location = element.location
        element_size = element.size
        screenshot = Image.open(SCREENSHOT_PATH)
        screenshot_width, screenshot_height = screenshot.size
        
        element_left = element_location['x']
        element_top = element_location['y']
        element_right = min(screenshot_width, element_left + element_size['width']) 
        element_bottom = min(element_top + element_size['height'], screenshot_height)
        
        # Crop the image using Pillow
        cropped_image = screenshot.crop((element_left, element_top, element_right, element_bottom))

        if not os.path.exists(OUTPUT_DIR):
            os.makedirs(OUTPUT_DIR)
        
        element_screenshot_path = os.path.join(OUTPUT_DIR, f"screenshot_{random.randint(1, 1000000)}.png")
        cropped_image.save(element_screenshot_path)
        return element_screenshot_path

    def preprocess_image(self, image_path):
        """Resize and preprocess image for CNN."""
        image = Image.open(image_path).convert('L')
        image = image.resize((48, 48))
        return np.array(image)

    def fetch_html(self, url):
        """Fetch the HTML of a webpage."""
        try:
            self.driver.get(url)
            time.sleep(3)
            response = requests.get(url, timeout=3)
            response.raise_for_status()  # Will raise an exception for HTTP errors
            return self.driver.page_source
        except RequestException as e:
            print(f"Error fetching {url}: {e}")

    def scrape_and_process_data(self):
        """Scrape and process data into X and y."""
        visited_urls = set()
        scraped_links = self.scrape_links(START_URL, visited_urls)

        for link in scraped_links:
            try:
                html_content = self.fetch_html(link)
                if html_content:
                    soup = BeautifulSoup(html_content, "html.parser")

                    # Set window size to current width, and max height
                    window_size = self.driver.get_window_size()                            
                    full_page_height = self.driver.execute_script("return document.documentElement.scrollHeight;") 
                    self.driver.set_window_size(window_size['width'],full_page_height)
                    self.driver.save_screenshot(SCREENSHOT_PATH) 
                    
                    screenshot = Image.open(SCREENSHOT_PATH)
                    screenshot_width, screenshot_height = screenshot.size
                    
                    ratio = screenshot_width / screenshot_height    
                    screenshot = screenshot.resize((window_size['width'], int(window_size['width'] / ratio)))
                    screenshot.save(SCREENSHOT_PATH)

                    # Extract target elements from body
                    for element in soup.find('body').find_all(targetElements):
                        text = element.get_text(strip=True)
                        if text:
                            try:
                                css_selector = self.get_css_selector(element)
                                driverElements = self.driver.find_elements(By.CSS_SELECTOR, css_selector)

                                if driverElements:
                                    for driverElement in driverElements:                               
                                        screenshot_path = self.screenshot_element(driverElement)
                                        image = self.preprocess_image(screenshot_path)                        
                                        contrast_ratio = self.calculate_contrast(Image.fromarray(image.squeeze()))
                                        self.X.append(image)
                                        self.y.append(1 if contrast_ratio >= CONTRAST_THRESHOLD else 0)
                                        self.css_selector_map[len(self.X) - 1] = css_selector
                            except Exception as e:
                                print(f"Error processing element with selector {css_selector}: {e}")
            except Exception as e:
                print(f"Error processing {link}: {e}")

        self.X = np.array(self.X)
        self.y = np.array(self.y)

    def train_model(self):
        """Train the CNN model."""
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.2, random_state=42)
        smote = SMOTE(sampling_strategy='auto', k_neighbors=3)
        X_flattened = X_train.reshape(X_train.shape[0], -1)
        X_resampled, y_resampled = smote.fit_resample(X_flattened, y_train)  
        X_shaped = X_resampled.reshape(X_resampled.shape[0], *IMAGE_SIZE, 1)

        self.model = models.Sequential([
            layers.Conv2D(32, (3, 3), activation='relu', input_shape=(*IMAGE_SIZE, 1)),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(128, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Flatten(),
            layers.Dense(128, activation='relu'),
            layers.Dense(2, activation='softmax')
        ])

        self.model.compile(optimizer='adam',
                           loss='sparse_categorical_crossentropy',
                           metrics=['accuracy'])

        self.model.fit(X_shaped, y_resampled, epochs=10, validation_data=(X_test, y_test))
        loss, accuracy = self.model.evaluate(X_test, y_test)
        print(f"Loss: {loss}, Accuracy: {accuracy}")

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, 48, 48, 1], dtype=tf.float32)])
    def custom_infer(self, elements): 
        predictions = self.model(elements)
        predicted_classes = tf.argmax(predictions, axis=1)
        prediction_results = []

        for idx, prediction in enumerate(predictions):
            css_selector = self.css_selector_map.get(idx)
            if css_selector:
                prediction_results.append({
                    "css_selector": css_selector,
                    "prediction": int(predicted_classes[idx])
                })
                if int(predicted_clases[idx]) == 0:  # Low contrast
                    self.driver.execute_script(
                        "arguments[0].style.border = '3px solid red';",
                        self.driver.find_element(By.CSS_SELECTOR, css_selector)
                    )
        self.driver.save_screenshot(FINAL_SCREENSHOT_PATH)
        return prediction_results

    def __call__(self, elements):
        """Override the __call__ method to run custom inference."""
        return self.custom_infer(elements)

    def close_driver(self):
        """Close the web driver after processing."""
        self.driver.quit()

    def export_model(self, path):
        """Export the trained model to a file."""
        self.model.export(path)

# Usage Example:
custom_model = CustomModel()
custom_model.scrape_and_process_data()
custom_model.train_model()
custom_model.export_model("HTMLConvolutional")
custom_model.close_driver()


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - accuracy: 0.5646 - loss: 39.9752 - val_accuracy: 0.2381 - val_loss: 11.7979
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.5396 - loss: 6.6786 - val_accuracy: 0.7619 - val_loss: 2.3776
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.5177 - loss: 3.6081 - val_accuracy: 0.4762 - val_loss: 0.9126
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.6438 - loss: 1.2807 - val_accuracy: 0.2381 - val_loss: 1.9804
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.6115 - loss: 0.7549 - val_accuracy: 0.9524 - val_loss: 0.3340
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step - accuracy: 0.8760 - loss: 0.2726 - val_accuracy: 0.9524 - val_loss: 0.3159
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

INFO:tensorflow:Assets written to: HTMLConvolutional/assets


Saved artifact at 'HTMLConvolutional'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 48, 48, 1), dtype=tf.float32, name='keras_tensor_470')
Output Type:
  TensorSpec(shape=(None, 2), dtype=tf.float32, name=None)
Captures:
  14892226448: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892227984: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892227792: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892228752: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892228560: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892229520: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892229328: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892230288: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892230096: TensorSpec(shape=(), dtype=tf.resource, name=None)
  14892231056: TensorSpec(shape=(), dtype=tf.resource, name=None)
