In [90]:
import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from tensorflow import keras
from keras import layers, models
from keras.export import ExportArchive
from keras.preprocessing.image import load_img, img_to_array
from PIL import Image
import numpy as np
import os
import random

# Scrape configuration
MAX_PAGES = 1
START_URL = "https://www.nytimes.com/timeswire"  # Replace with a seed URL
IMAGE_SIZE = (48, 48)
CONTRAST_THRESHOLD = 4.5
OUTPUT_DIR = "scraped_images"
SCREENSHOT_PATH = "full_page_screenshot.png"
SCREENSHOT_PATH = "full_page_screenshot.png"

OUTPUT_DIR = "scraped_images"
os.makedirs(OUTPUT_DIR, exist_ok=True)

chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

driver_path = '/opt/homebrew/bin/chromedriver'
driver = webdriver.Chrome(service=Service(driver_path), options=chrome_options)

targetElements = ['h1', 'h3']

def get_css_selector(element):
    selector = element.name
    
    if element.get('id'):
        selector += f'#{element["id"]}'
    
    if element.get('class'):
        selector += '.' + '.'.join(element["class"])
    
    for attribute, value in element.attrs.items():
        if attribute not in ['id', 'class']:
            selector += f'[{attribute}="{value}"]'
    
    return selector

def calculate_contrast(image):
    """Calculate contrast ratio using WCAG guidelines."""
    grayscale_image = image.convert('L')
    pixel_values = np.array(grayscale_image)

    min_intensity = pixel_values.min()
    max_intensity = pixel_values.max()
    
    l1 = (max_intensity + 0.05) / 255
    l2 = (min_intensity + 0.05) / 255
    
    contrast_ratio = (l1 + 0.05) / (l2 + 0.05) if l1 > l2 else (l2 + 0.05) / (l1 + 0.05)
    
    return contrast_ratio

def scrape_links(url, visited, max_pages=MAX_PAGES):
    """Crawl the web starting from a URL, returning a list of links."""
    to_visit = [url]
    links = []

    while to_visit and len(links) < max_pages:
        current_url = to_visit.pop(0)
        if current_url in visited:
            continue
        
        try:
            driverResponse = driver.get(url)
            response = requests.get(url, timeout=3)
            
            if response.status_code == 200:
                soup = BeautifulSoup(driver.page_source, "html.parser")
                visited.add(current_url)
                links.append(current_url)
                
                # Add new links to the queue
                for anchor in soup.find_all("a", href=True):
                    href = anchor["href"]
                    if href.startswith("http"):
                        to_visit.append(href)
        except Exception as e:
            print(f"Failed to process {current_url}: {e}")
    
    return links[:max_pages]


def screenshot_element(driver, element):    
    element_location = element.location
    element_size = element.size
    
    screenshot = Image.open(SCREENSHOT_PATH)
    screenshot_width, screenshot_height = screenshot.size
    
    element_left = element_location['x']
    element_top = element_location['y']
    element_right = min(screenshot_width, element_left + element_size['width']) 
    element_bottom = min(element_top + element_size['height'], screenshot_height)
    
    # Crop the image using Pillow
    cropped_image = screenshot.crop((element_left, element_top, element_right, element_bottom))

    # Step 4: Save the cropped screenshot
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    
    element_screenshot_path = os.path.join(OUTPUT_DIR, f"screenshot_{random.randint(1, 1000000)}.png")
    cropped_image.save(element_screenshot_path)

    # Return the path of the cropped screenshot
    return element_screenshot_path

def preprocess_image(image_path):
    """Resize and preprocess image for CNN."""
    image = Image.open(image_path).convert('L')
    image = image.resize((48,48))
    
    return np.array(image)

def fetch_html(url):
    try:
        driverResponse = driver.get(url)
        time.sleep(3)
        response = requests.get(url, timeout=3)
        response.raise_for_status()  # Will raise an exception for HTTP errors
        return driver.page_source
    except RequestException as e:
        print(f"Error fetching {url}: {e}")

# Scrape and process data
visited_urls = set()
scraped_links = scrape_links(START_URL, visited_urls)

X, y = [], []
css_selector_map = {}

for link in scraped_links:
    try:
        html_content = fetch_html(link)
        if html_content:
            soup = BeautifulSoup(html_content, "html.parser")

            # Set window size to current width, and max height
            window_size = driver.get_window_size()                            
            full_page_height = driver.execute_script("return document.documentElement.scrollHeight;") 
            driver.set_window_size(window_size['width'],full_page_height)
            driver.save_screenshot(SCREENSHOT_PATH) 
            
            screenshot = Image.open(SCREENSHOT_PATH)
            screenshot_width, screenshot_height = screenshot.size
            
            ratio = screenshot_width / screenshot_height    
            screenshot = screenshot.resize((window_size['width'], int(window_size['width'] / ratio)))
            screenshot.save(SCREENSHOT_PATH)
    
            # Extract target elements from body
            for element in soup.find('body').find_all(targetElements):
                text = element.get_text(strip=True)

                # Only if the element contains text
                if text:
                    try:
                        css_selector = get_css_selector(element)
                        driverElements = driver.find_elements(By.CSS_SELECTOR, css_selector)
                
                        if driverElements:                        
                            for driverElement in driverElements:                               
                                screenshot_path = screenshot_element(driver, driverElement)
                                image = preprocess_image(screenshot_path)                        
                                contrast_ratio = calculate_contrast(Image.fromarray(image.squeeze()))
                                X.append(image)
                                y.append(1 if contrast_ratio >= CONTRAST_THRESHOLD else 0)
                                css_selector_map[len(X) - 1] = css_selector  # Index of the current image in X
                    except Exception as e:
                            print(f"Error processing element with selector {css_selector}: {e}")
                else: continue
        else: continue
    except Exception as e:
        print(f"Error processing {link}: {e}")

print(y)

[1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0]


In [91]:
# Convert lists to numpy arrays
X = np.array(X)
y = np.array(y)

# Train-test split and balance the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

class_distribution = Counter(y_train)

# Get the number of samples in the minority class
min_class = min(class_distribution, key=class_distribution.get)
min_class_samples = class_distribution[min_class]

smote = SMOTE(sampling_strategy='auto', k_neighbors=min(3, min_class_samples))

X_flattened = X_train.reshape(X_train.shape[0], -1)
X_resampled, y_resampled = smote.fit_resample(X_flattened, y_train)  
X_shaped = X_resampled.reshape(X_resampled.shape[0], *IMAGE_SIZE, 1)
                     
# Define the CNN
cnn_model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(*IMAGE_SIZE, 1)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(2, activation='softmax')
])

cnn_model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Train the CNN
cnn_model.fit(X_shaped, y_resampled, epochs=10, validation_data=(X_test, y_test))

# Evaluate the CNN
loss, accuracy = cnn_model.evaluate(X_test, y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")

def custom_infer(elements): 
    predictions = cnn_model(elements)

    # Step 2: Get predicted classes (for binary classification)
    predicted_classes = tf.argmax(predictions, axis=1)
    
    # Initialize a list to store the predictions with css selectors
    prediction_results = []
    
    # Iterate through the predictions and map them to CSS selectors
    for idx, prediction in enumerate(predictions):
        css_selector = css_selector_map.get(idx)  # Get the CSS selector for this index
        
        if css_selector:
            # Append the prediction with the css_selector to the result list
            prediction_results.append({
                "css_selector": css_selector,
                "prediction": int(predicted_classes[idx])
            })
            
            if int(predicted_classes[idx]) == 0:  # Low contrast (adjust this condition as needed)
                driver.execute_script(
                    "arguments[0].style.border = '3px solid red';",
                    driver.find_element(By.CSS_SELECTOR, css_selector)
                )
        
    driver.save_screenshot(FINAL_SCREENSHOT_PATH)  # Save the screenshot
    return prediction_results

driver.quit()

cnn_model.export("HTMLConvolutional")

serving_model = tf.saved_model.load("HTMLConvolutional")
serving_model.__call__

export_archive = ExportArchive()
export_archive.track(model)
export_archive.add_endpoint(
    name="serve",
    fn=custom_infer,
    input_signature=[tf.TensorSpec(shape=[None, 48, 48, 1], dtype=tf.float32)],
)
print(export_archive)
export_archive.write_out("HTMLConvolutional2")

# # Step 3: Generate the confusion matrix
# conf_matrix = confusion_matrix(y_test, predicted_classes)

# # Step 4: Display the confusion matrix
# disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix)
# disp.plot(cmap=plt.cm.Blues)
# plt.title('Confusion Matrix')
# plt.show()

# # Print out the confusion matrix
# print('Confusion Matrix:\n', conf_matrix)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 84ms/step - accuracy: 0.4477 - loss: 28.9915 - val_accuracy: 0.4762 - val_loss: 0.8394
Epoch 2/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.5273 - loss: 3.1803 - val_accuracy: 0.4762 - val_loss: 4.0148
Epoch 3/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.4648 - loss: 3.9782 - val_accuracy: 0.8571 - val_loss: 0.4054
Epoch 4/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.6492 - loss: 0.7675 - val_accuracy: 0.5714 - val_loss: 0.5553
Epoch 5/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.5781 - loss: 0.5055 - val_accuracy: 0.8095 - val_loss: 0.3606
Epoch 6/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.8414 - loss: 0.3317 - val_accuracy: 0.9524 - val_loss: 0.4070
Epoch 7/10
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

OperatorNotAllowedInGraphError: Iterating over a symbolic `tf.Tensor` is not allowed. You can attempt the following resolutions to the problem: If you are running in Graph mode, use Eager execution mode or decorate this function with @tf.function. If you are using AutoGraph, you can try decorating this function with @tf.function. If that does not work, then you may be using an unsupported feature or your source code may not be visible to AutoGraph. See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/autograph/g3doc/reference/limitations.md#access-to-source-code for more information.