In [None]:
!pip install mysql-connector-python

import os
import pathlib
import sqlite3
import tarfile
import unicodedata
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import tensorflow as tf
from PIL import Image
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.activations import sigmoid, tanh
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input, decode_predictions
from tensorflow.keras.layers import Dense, Flatten, Input, Dropout, BatchNormalization, Multiply, Add, Reshape, TextVectorization
from tensorflow.keras.metrics import Precision, Recall, F1Score
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras.utils import plot_model

In [None]:
# Connection settings are read from the environment so credentials never have
# to be stored in the notebook. Unset variables fall back to the original
# empty placeholders.
connection = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", ""),
    user=os.environ.get("MYSQL_USER", ""),
    password=os.environ.get("MYSQL_PASSWORD", ""),
    database=os.environ.get("MYSQL_DATABASE", "")
)

# Joins every image row with its page and collapses identical image/context
# combinations; context_frequency is the number of collapsed rows.
# NOTE(review): i.page_id, i.file_name and all p.* columns are selected but are
# neither aggregated nor listed in GROUP BY. MySQL rejects such a query when
# ONLY_FULL_GROUP_BY is enabled, and otherwise returns the value of an
# arbitrary row per group -- confirm this is intended.
query = """
SELECT
    i.page_id,
    i.hash,
    i.file_name,
    i.alt_text,
    i.image_title,
    i.image_caption,
    i.width,
    i.height,
    i.contains_transparency,
    i.wrapped_element,
    i.semantic_context,
    i.is_link,
    i.is_button,
    i.file_format,
    i.is_decorative,
    i.headline_above_image,
    p.title,
    p.meta_description,
    p.top_headline,
    p.word_count,
    p.image_count,
    p.external_link_count,
    p.internal_link_count,
    COUNT(*) AS context_frequency
FROM
    image i
JOIN
    page p ON i.page_id = p.id
GROUP BY
    i.hash,
    i.alt_text,
    i.image_title,
    i.image_caption,
    i.width,
    i.height,
    i.contains_transparency,
    i.wrapped_element,
    i.semantic_context,
    i.headline_above_image,
    i.is_link,
    i.is_button,
    i.file_format,
    i.is_decorative;
"""

# All rows are fetched into memory as dictionaries (column name -> value), so
# the cursor and the connection can be released as soon as the fetch is done,
# even if the query fails.
try:
    cursor = connection.cursor(dictionary=True)
    try:
        cursor.execute(query)
        image_data = cursor.fetchall()
    finally:
        cursor.close()
finally:
    connection.close()

In [None]:
dataset_url = ""
# get_file returns the local path of the downloaded archive (not a directory);
# force_download=True fetches it again on every run.
data_dir = tf.keras.utils.get_file(origin=dataset_url, extract=False, force_download=True)

gz_path = pathlib.Path(data_dir)
# Drop the last suffix of the downloaded file name.
tar_path = gz_path.with_suffix('')
# replace() overwrites an archive left behind by a previous run on every
# platform, whereas rename() raises on Windows when the target already exists.
gz_path.replace(tar_path)

with tarfile.open(tar_path, "r") as tar:
    # The archive comes from the network: use the "data" extraction filter
    # where the running Python provides it, so members cannot escape the
    # target directory. Older interpreters fall back to the plain call.
    if hasattr(tarfile, "data_filter"):
        tar.extractall(path=tar_path.parent, filter="data")
    else:
        tar.extractall(path=tar_path.parent)

# Count the extracted files, excluding sub-directories and the archive itself
# (which lives in the same directory and would otherwise match the pattern).
all_images = [
    path for path in tar_path.parent.glob('*.*')
    if path.is_file() and path != tar_path
]
image_count = len(all_images)
print("Anzahl der Bilder:", image_count)

def load_image(file_name, target_size=(256, 256)):
    """Load an extracted image as an RGB PIL image of a fixed size.

    Args:
        file_name: File name relative to the directory that holds the
            extracted archive (``tar_path.parent``).
        target_size: ``(width, height)`` tuple passed to ``Image.resize``.

    Returns:
        The resized RGB ``PIL.Image.Image``, or ``None`` when the file does
        not exist.
    """
    image_path = tar_path.parent / file_name
    if not image_path.exists():
        return None

    # The context manager closes the underlying file handle once the pixel
    # data has been copied into the converted/resized image that is returned.
    with Image.open(image_path) as source_image:
        rgb_image = source_image
        if rgb_image.mode != 'RGB':
            rgb_image = rgb_image.convert('RGB')
        return rgb_image.resize(target_size)

**PARAMETERS**

In [None]:
# Spatial size in pixels: images are resized to it and the ResNet50 backbone
# and image input layer are built with it.
image_height, image_width = 256, 256
# Number of training epochs run for each cross-validation fold.
epochs = 15
# Mini-batch size passed to model.fit.
batch_size = 32

**DATA HANDLING & PRE-PROCESSING**

Step 1: Aggregation (one record per image hash)

In [None]:
def _most_common(values):
    """Return the most frequent item of `values` (first seen wins ties)."""
    return Counter(values).most_common(1)[0][0]


def _mean_of(entries, key):
    """Return the arithmetic mean of `entry[key]` over `entries`."""
    return np.mean([entry[key] for entry in entries])


# Group the query rows by image hash in one pass instead of rescanning the
# whole result set for every hash.
entries_by_hash = {}
for entry in image_data:
    entries_by_hash.setdefault(entry['hash'], []).append(entry)

# Sorted, de-duplicated hashes; fixes the order of aggregated_data.
unique_hashes = np.unique(list(entries_by_hash))
aggregated_data = []

for hash_value in unique_hashes:
    entries_for_hash = entries_by_hash[hash_value]
    # Fields that are not aggregated are taken from the first row of the group.
    first_entry = entries_for_hash[0]

    aggregated_data.append({
        'hash': hash_value,
        # textual features: most frequent value across all rows of the hash
        'alt_text': _most_common([entry['alt_text'] for entry in entries_for_hash]),
        'image_title': first_entry['image_title'],
        'image_caption': first_entry['image_caption'],
        'width': first_entry['width'],
        'height': first_entry['height'],
        'file_format': first_entry['file_format'],
        'file_name': first_entry['file_name'],
        'contains_transparency': first_entry['contains_transparency'],
        'wrapped_element': _most_common([entry['wrapped_element'] for entry in entries_for_hash]),
        'semantic_context': first_entry['semantic_context'],
        'headline_above_image': _most_common([entry['headline_above_image'] for entry in entries_for_hash]),
        'title': first_entry['title'],
        'meta_description': first_entry['meta_description'],
        'top_headline': first_entry['top_headline'],
        # numeric features: page statistics are averaged, frequencies summed
        'word_count': _mean_of(entries_for_hash, 'word_count'),
        'image_count': _mean_of(entries_for_hash, 'image_count'),
        'context_frequency': np.sum([entry['context_frequency'] for entry in entries_for_hash]),
        'is_link': first_entry['is_link'],
        'is_button': first_entry['is_button'],
        'external_link_count': _mean_of(entries_for_hash, 'external_link_count'),
        'internal_link_count': _mean_of(entries_for_hash, 'internal_link_count'),
        'is_decorative': first_entry['is_decorative']
    })

Step 2: Z-score normalization of the numerical features

In [None]:
# Columns that are standardised to zero mean and unit variance.
numerical_features = ['width', 'height', 'word_count', 'image_count', 'external_link_count', 'internal_link_count', 'context_frequency']

# One row per aggregated record, one column per numerical feature; a feature
# missing from a record counts as 0.
numerical_data = np.array([
    [record.get(name, 0) for name in numerical_features]
    for record in aggregated_data
])

# NOTE(review): the scaler is fitted on every record before the K-fold split,
# so validation folds contribute to the statistics -- confirm this is acceptable.
scaler = StandardScaler()
numerical_data_scaled = scaler.fit_transform(numerical_data)

# Write the scaled values back into the records, replacing the raw values.
for record, scaled_row in zip(aggregated_data, numerical_data_scaled):
    record.update(zip(numerical_features, scaled_row))

Step 3: Text vectorization

In [None]:
def normalize_unicode(text):
    """Normalise a string to NFKC and remove control/surrogate characters.

    Characters of Unicode category ``Cc`` (control) and ``Cs`` (surrogate) are
    removed. Control characters that are whitespace (newline, tab, carriage
    return, ...) are replaced by a single space instead of being dropped, so
    that words on adjacent lines are not glued together into one token.

    Args:
        text: The string to clean; ``None`` is treated as an empty string.

    Returns:
        The cleaned string.
    """
    if text is None:
        return ''

    normalized_text = unicodedata.normalize('NFKC', text)
    cleaned_chars = []
    for char in normalized_text:
        if unicodedata.category(char) in ('Cc', 'Cs'):
            # Keep the word boundary that a whitespace control character marks.
            if char.isspace():
                cleaned_chars.append(' ')
            continue
        cleaned_chars.append(char)
    return ''.join(cleaned_chars)

# Free-text columns whose contents build the TF-IDF vocabulary.
text_columns = [
    'alt_text',
    'image_title',
    'image_caption',
    'headline_above_image',
    'title',
    'meta_description',
    'top_headline',
    'file_name',
]

# Flat corpus: one cleaned string per (record, text column); missing values
# become empty strings.
text_data = []
for entry in aggregated_data:
    for column in text_columns:
        value = entry[column]
        # A plain str() is enough here: an encode/decode round trip through
        # UTF-8 changes nothing for valid text and raises UnicodeEncodeError on
        # lone surrogates before normalize_unicode gets the chance to drop them.
        raw_text = str(value) if value is not None else ''
        text_data.append(normalize_unicode(raw_text))

# Learn the vocabulary and the TF-IDF weights from the whole corpus.
vectorize_layer = TextVectorization(output_mode="tf_idf")
vectorize_layer.adapt(text_data)


Step 4: Create the model inputs (images, metadata, labels) and one-hot encode the categorical features

In [None]:
def extract_combined_metadata(entry):
    """Collect the image- and page-level metadata of a record as a flat list.

    Args:
        entry: Mapping with the (optional) keys listed below.

    Returns:
        A list with, in this order: width, height, contains_transparency,
        is_link, is_button, context_frequency, word_count, image_count,
        external_link_count, internal_link_count. A key that is missing or
        whose value is ``None`` is replaced by its default (``0`` for counts
        and sizes, ``False`` for flags), so the list never contains ``None``.
    """
    feature_defaults = (
        # image-level metadata
        ('width', 0),
        ('height', 0),
        ('contains_transparency', False),
        ('is_link', False),
        ('is_button', False),
        ('context_frequency', 0),
        # page-level metadata
        ('word_count', 0),
        ('image_count', 0),
        ('external_link_count', 0),
        ('internal_link_count', 0),
    )

    features = []
    for name, default in feature_defaults:
        value = entry.get(name)
        features.append(default if value is None else value)
    return features


# Known categories of the categorical features and their one-hot positions.
file_formats = ['svg', 'png', 'jpeg', 'webp']
file_format_mapping = {name: idx for idx, name in enumerate(file_formats)}

semantic_contexts = [None, 'article', 'aside', 'footer', 'header', 'main', 'nav', 'section']
semantic_context_mapping = {context: idx for idx, context in enumerate(semantic_contexts)}

wrapped_elements = ['a', 'article', 'aside', 'div', 'figure', 'li', 'p', 'picture', 'span', 'strong']
wrapped_element_mapping = {element: idx for idx, element in enumerate(wrapped_elements)}

# Text columns that are TF-IDF vectorised, in the order they are appended to
# the metadata vector.
text_feature_columns = [
    'headline_above_image',
    'title',
    'meta_description',
    'top_headline',
    'alt_text',
    'image_title',
    'image_caption',
    'file_name',
]


def one_hot(value, mapping):
    """Return a one-hot list of length len(mapping); all zeros for unknown values."""
    vector = [0] * len(mapping)
    index = mapping.get(value)
    if index is not None:
        vector[index] = 1
    return vector


image_arrays = []
metadata_arrays = []
label_arrays = []
for entry in aggregated_data:
    # Normalise the format once so the file name, the svg check and the
    # one-hot encoding all agree; a missing format yields a non-existent file
    # name and the record is skipped below.
    file_format = (entry['file_format'] or '').lower()
    # Records with format 'svg' are looked up under a .png file name.
    stored_extension = 'png' if file_format == 'svg' else file_format
    file_name = f"{entry['hash']}.{stored_extension}"

    # PIL expects (width, height). load_image already returns an RGB image of
    # the requested size, so no further conversion or resizing is needed.
    image = load_image(file_name, target_size=(image_width, image_height))
    if image is None:
        continue

    image_arrays.append(preprocess_input(img_to_array(image)))

    # Metadata vector: numeric/boolean features, then the one-hot blocks, then
    # the flattened TF-IDF vectors of all text columns.
    metadata = extract_combined_metadata(entry)
    metadata.extend(one_hot(file_format, file_format_mapping))
    metadata.extend(one_hot(entry['semantic_context'], semantic_context_mapping))
    metadata.extend(one_hot(entry['wrapped_element'], wrapped_element_mapping))

    # Apply the same cleaning that was used when the vectorizer was adapted.
    text_features = [
        normalize_unicode(str(entry[column]) if entry[column] is not None else '')
        for column in text_feature_columns
    ]
    vectorized_text = vectorize_layer(text_features)
    metadata.extend(tf.reshape(vectorized_text, [-1]).numpy().tolist())

    metadata_arrays.append(metadata)
    label_arrays.append(entry['is_decorative'])

# X_images holds the image tensors and X_final_metadata the metadata vectors,
# which is how the training cell consumes them. X_metadata is kept as an alias
# of the metadata matrix.
X_images = np.array(image_arrays)
X_metadata = np.array(metadata_arrays)
X_final_metadata = X_metadata
y_labels = np.array(label_arrays)

# Report shapes instead of dumping a full (very long) metadata vector.
print("Images:", X_images.shape, "Metadata:", X_final_metadata.shape, "Labels:", y_labels.shape)

**MODEL**

In [None]:
def MetaBlock(V, U):
    """Modulate image features with metadata features.

    Two separate Dense + BatchNormalization projections of ``U`` to the size
    of ``V``'s last axis are reshaped to ``(1, V_dim)`` and used as a
    multiplicative and an additive term:
    ``sigmoid(tanh(V)) * t1 + t2``.

    Args:
        V: Image feature tensor.
        U: Metadata feature tensor.

    Returns:
        The modulated feature tensor.
    """
    V_dim = V.shape[-1]

    def project_metadata():
        # Dense -> BatchNormalization -> Reshape, with fresh layers per call.
        projected = Dense(V_dim)(U)
        projected = BatchNormalization()(projected)
        return Reshape((1, V_dim))(projected)

    scale = project_metadata()
    shift = project_metadata()

    scaled = Multiply()([sigmoid(tanh(V)), scale])
    return Add()([scaled, shift])

def build_model(metadata_dim):
    """Build and compile the two-input (image + metadata) classifier.

    Args:
        metadata_dim: Length of one metadata vector.

    Returns:
        A compiled Keras ``Model`` with inputs ``[image, metadata]`` and a
        single sigmoid output, tracking accuracy, precision and recall.
    """
    # Frozen ImageNet ResNet50 used as feature extractor.
    resnet = tf.keras.applications.ResNet50(weights='imagenet', include_top=False, input_shape=(image_height, image_width, 3))
    for layer in resnet.layers:
        layer.trainable = False

    # Image Input and Feature Extraction
    image_input = Input(shape=(image_height, image_width, 3))
    x = resnet(image_input)
    x = Flatten()(x)

    # Metadata Input and Dense Layers
    metadata_input = Input(shape=(metadata_dim,))
    y = Dense(128, activation='relu')(metadata_input)
    y = Dropout(0.4)(y)
    y = Dense(64, activation='relu')(y)

    # Combined Features
    modulated_features = MetaBlock(x, y)
    modulated_features = BatchNormalization()(modulated_features)

    z = Dense(64, activation='relu')(modulated_features)
    z = BatchNormalization()(z)
    z = Dropout(0.4)(z)
    z = Dense(32, activation='relu')(z)
    z = BatchNormalization()(z)
    z = Dropout(0.4)(z)

    # Output Layer; the Flatten collapses the output to one value per sample.
    output = Dense(1, activation='sigmoid')(z)
    output = Flatten()(output)

    model = Model(inputs=[image_input, metadata_input], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', Precision(name='precision_m'), Recall(name='recall_m')])
    return model


def final_epoch_mean(per_fold_histories):
    """Average the last-epoch value of a metric across all folds."""
    return np.mean([fold_history[-1] for fold_history in per_fold_histories])


k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Per-fold metric curves: one list (one value per epoch) for every fold.
all_train_accuracy = []
all_val_accuracy = []
all_train_loss = []
all_val_loss = []
all_train_precision = []
all_val_precision = []
all_train_recall = []
all_val_recall = []
# Kept for compatibility; no F1 metric is compiled, so it is never filled.
all_val_f1_score = []

for fold_no, (train_idx, val_idx) in enumerate(kf.split(X_images, y_labels), start=1):
    print(f'Training on fold {fold_no}/{k_folds}...')

    # A new model is built per fold; clearing the Keras session first releases
    # the global state left behind by the previous fold's model.
    K.clear_session()

    X_train_images, X_val_images = X_images[train_idx], X_images[val_idx]
    X_train_metadata, X_val_metadata = X_final_metadata[train_idx], X_final_metadata[val_idx]
    y_train, y_val = y_labels[train_idx], y_labels[val_idx]

    model = build_model(X_train_metadata.shape[1])

    history = model.fit(
        [X_train_images, X_train_metadata], y_train,
        validation_data=([X_val_images, X_val_metadata], y_val),
        epochs=epochs, batch_size=batch_size
    )
    all_train_accuracy.append(history.history['accuracy'])
    all_val_accuracy.append(history.history['val_accuracy'])
    all_train_loss.append(history.history['loss'])
    all_val_loss.append(history.history['val_loss'])
    all_train_precision.append(history.history['precision_m'])
    all_val_precision.append(history.history['val_precision_m'])
    all_train_recall.append(history.history['recall_m'])
    all_val_recall.append(history.history['val_recall_m'])

# Flattened curves: the epochs of all folds, one fold after the other.
all_results = {
    'Train Accuracy': np.concatenate(all_train_accuracy),
    'Validation Accuracy': np.concatenate(all_val_accuracy),
    'Train Loss': np.concatenate(all_train_loss),
    'Validation Loss': np.concatenate(all_val_loss),
    'Train Precision': np.concatenate(all_train_precision),
    'Validation Precision': np.concatenate(all_val_precision),
    'Train Recall': np.concatenate(all_train_recall),
    'Validation Recall': np.concatenate(all_val_recall),
}

print(all_results)

# Cross-validation summary: validation metric after the last epoch of each
# fold, averaged over the folds.
print(f'Average accuracy over {k_folds} folds: {final_epoch_mean(all_val_accuracy)}')
print(f'Average precision over {k_folds} folds: {final_epoch_mean(all_val_precision)}')
print(f'Average recall over {k_folds} folds: {final_epoch_mean(all_val_recall)}')
print(f'Average loss over {k_folds} folds: {final_epoch_mean(all_val_loss)}')