In [1]:
import csv
from scipy.special import expit
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, recall_score
import numpy as np
import re
import scipy.stats as stats
import time
import json
from collections import Counter
import random
import itertools
from PIL import Image
import os
import shutil
import datetime
import matplotlib.pyplot as plt
import openpyxl
import matplotlib.image as mpimg
from matplotlib.colors import LinearSegmentedColormap
import pickle
from scipy.stats import spearmanr

In [None]:
from diffusers import AutoPipelineForText2Image
import torch
from diffusers import StableDiffusionXLImg2ImgPipeline
from diffusers.utils import load_image
# Configure the PyTorch CUDA caching allocator through its environment variable.
# NOTE(review): 'expandable_segments:True' is presumably meant to reduce memory fragmentation
# during image generation and has to be set before the first CUDA allocation — confirm.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

## 1. Notwendige Daten aus MSCOCO extrahieren
Im Folgenden werden aus dem MSCOCO-Datensatz die Daten extrahiert, die für diese Arbeit notwendig sind.
Dazu zählen alle inhaltlichen Annotationen zu den Bildern: 
1. Things (klar abgrenzbare Objekte im Bild)
2. Stuff (nicht eindeutig isolierbares Material im Bild)
3. Captions (Gesamtbeschreibung des Bildes).

Folgende Ordnerstruktur ist notwendig /mscoco-2017/annotations/ mit folgenden Dateien (cocodataset.org):
- captions_train2017.json
- captions_val2017.json
- instances_train2017.json
- instances_val2017.json
- stuff_train2017.json
- stuff_val2017.json

In [2]:
# defining a function to extract the necessary data from the MSCOCO dataset

def extractData(captionsFilePath, thingsFilePath, stuffFilePath) :
    """Collect captions, thing classes and stuff classes per image from MSCOCO annotation files.

    Args:
        captionsFilePath: Path to a COCO captions JSON file; its 'images' and 'annotations'
            lists are read.
        thingsFilePath: Path to a COCO instances JSON file; its 'categories' and
            'annotations' lists are read.
        stuffFilePath: Path to a COCO stuff JSON file with the same layout as the instances file.

    Returns:
        dict mapping each image id to a dict with the keys "imgName", "height", "width",
        "things", "stuff" and "captions". "things" and "stuff" hold one class name per
        annotation, so a class appears as often as it is annotated in the image.

    Raises:
        KeyError: if a thing/stuff annotation refers to an image that is not listed in the
            captions file (e.g. when files of different splits are mixed).
    """
    # STEP-1: pairing images with their captions and storing them in 'data'
    with open(captionsFilePath) as f:
        annFile = json.load(f)

    captions_dict = {}
    for ann in annFile['annotations']:
        captions_dict.setdefault(ann['image_id'], []).append(ann['caption'])

    data = {}
    for img in annFile['images']:
        imgId = img['id']
        data[imgId] = {
            "imgName" : img['file_name'],
            "height" : img['height'],
            "width" : img['width'],
            "things" : [],
            "stuff" : [],
            # an image without any caption annotation gets an empty list instead of a KeyError
            "captions" : captions_dict.get(imgId, [])
        }

    def add_class_names(annotationFilePath, target_key):
        # Append the class name of every annotation in the file to data[image_id][target_key].
        with open(annotationFilePath) as f:
            annFile = json.load(f)
        # pairs each category id with its description
        class_dict = {category['id']: category['name'] for category in annFile['categories']}
        for ann in annFile['annotations']:
            data[ann['image_id']][target_key].append(class_dict.get(ann['category_id']))

    # STEP-2: adding thing classes (things that are shown in the image)
    add_class_names(thingsFilePath, 'things')

    # STEP-3: adding stuff classes (stuff that is shown in the image)
    add_class_names(stuffFilePath, 'stuff')

    print('Created ' + str(len(data)) + ' image-description pairs.')
    return data

In [None]:
# extracting the data and storing it in a JSON file

def useSubDataset(abbreveation):
    """Return the annotation file paths of one MSCOCO 2017 split.

    Args:
        abbreveation: 'val' or 'train'.

    Returns:
        List [captions path, instances path, stuff path], in the argument order of extractData.

    Raises:
        ValueError: for any other abbreviation (previously None was returned, which only
            failed later when the result was unpacked).
    """
    if abbreveation not in ('val', 'train'):
        raise ValueError("Unknown sub dataset '" + str(abbreveation) + "', expected 'val' or 'train'.")
    return [
        'mscoco-2017/annotations/' + kind + '_' + abbreveation + '2017.json'
        for kind in ('captions', 'instances', 'stuff')
    ]

# Build the working dataset from the validation split. The training split can be merged in
# by enabling the two commented-out parts below.
valData = extractData(*useSubDataset('val'))
#trainData = extractData(*useSubDataset('train'))
data = valData #| trainData

# keep only the images that are 640 px wide and 480 px high
filtered_data = {
    img_id: entry
    for img_id, entry in data.items()
    # entries lacking either size key are dropped as well
    if 'height' in entry and 'width' in entry
    and entry['height'] == 480 and entry['width'] == 640
}
data = filtered_data

print(f'Total dataset contains {len(data)} image-description pairs.')

# persist the filtered dataset for the following steps
with open('data.json', 'w') as out_file:
    json.dump(data, out_file, indent=2)

### 1.1 Promptbasis für die Bildgenerierung auswählen
Alle stuff- und things-Annotationen sowie die 5 Captions zu jedem Bild liegen nun in der data.json Datei vor.
Aus diesen Annotationen wird nun für jedes Bild eine lange und eine kurze Promptbasis ausgewählt, aus denen im nächsten Schritt die Prompts für die Bildgenerierung erzeugt werden.

In [2]:
# choosing a short and a long prompt base for each image

with open('data.json') as f:
    data = json.load(f)

# Average caption length (in words) over the whole dataset, rounded to an integer.
# The sum is divided by the actual number of captions rather than assuming exactly five
# captions per image; an empty dataset yields 0 instead of a ZeroDivisionError.
caption_word_counts = [
    len(caption.split())
    for content in data.values()
    for caption in content['captions']
]
if caption_word_counts:
    average_caption_length = round(sum(caption_word_counts) / len(caption_word_counts))
else:
    average_caption_length = 0

# choosing the long prompt base for each img based on its proximity to the average_caption_length in the whole dataset
def sort_by_proximity(strings, standard_length):
    """Return a new list of `strings` ordered by closeness of their word count to `standard_length`.

    Strings with the same distance are ordered by ascending word count; strings that are
    equal in both respects keep their original relative order.
    """
    def word_count(text):
        return len(text.split())

    ranked = list(strings)
    ranked.sort(key=lambda text: (abs(word_count(text) - standard_length), word_count(text)))
    return ranked

for key, content in data.items():
    # long prompt base: the caption whose word count is closest to the dataset average
    captions_by_proximity = sort_by_proximity(content['captions'], average_caption_length)
    # an image without captions gets an empty long prompt base instead of raising IndexError
    long_prompt_base = captions_by_proximity[0] if captions_by_proximity else ''

    # short prompt base: the class (thing or stuff) that is annotated most often in the image
    class_counter = Counter(content['things'] + content['stuff'])
    if class_counter:
        short_prompt_base = class_counter.most_common()[0][0]
    else:
        # No class annotation at all: the former fallback, random.choice on the empty class
        # list, always raised IndexError. Use the long prompt base so that every image still
        # gets a usable short prompt base.
        short_prompt_base = long_prompt_base

    content['prompt_base'] = {'long': long_prompt_base, 'short': short_prompt_base}

# write the dataset, now including the prompt bases, back to data.json
with open('data.json', 'w') as json_file:
    json.dump(data, json_file, indent=2)


## 2. Generierung der Bilder
Im Folgenden werden zu jedem referenzierten MSCOCO-Bild in der data.json Datei zwei Bilder generiert.
Dafür wird jeweils die kurze und die lange Promptbasis genutzt.

Nun muss zunächst der [Fooocus-API Server](https://github.com/mrhan1993/Fooocus-API/) gestartet werden. 

Wird der Fooocus-API-Server über das HdM Deeplearning Cluster betrieben, gibt es SSH-Probleme bei Serveranfragen mit der Python-Requests-Bibliothek. Diese Probleme treten nicht auf, wenn man die Anfragen über einen Browser wie z. B. Chrome schickt. Aus diesem Grund wird im Folgenden ein Workaround mit dem Selenium-WebDriver für Chrome definiert. Dieser muss nicht ausgeführt werden, wenn das Skript lokal auf einem PC ausgeführt wird. Dann können die Anfragen vermutlich mit der Requests-Bibliothek getätigt werden.

- fooocus_api_server_adress: die Adresse, unter der der API-Server erreichbar ist
- credentials: Login-Passwort für den aktuellen Job im IAAI-Launcher

In [2]:
# The login secret is read from a local `credentials` module instead of being written into the notebook.
from credentials import fooocus_login
# Base URL of the Fooocus-API server; adjust it to the address of your own server/job.
fooocus_api_server_adress = 'https://cerebro.mi.hdm-stuttgart.de/pm074_42695/proxy/8888/'
credentials = fooocus_login

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time

# Chrome flags for running without a display
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-gpu")
options.add_argument("--disable-software-rasterizer")

# Start Chrome with a driver binary installed by webdriver_manager and open the server page.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get(fooocus_api_server_adress)

# Log in: type the password into the password field and submit it with RETURN.
# NOTE(review): there is no explicit wait for the page to load before or after the login — confirm
# that the element is reliably present when this line runs.
input_field = driver.find_element(By.XPATH, "//input[@class='password'][@type='password']")
input_field.send_keys(credentials)
input_field.send_keys(Keys.RETURN)

def generate_fooocus_img_via_chrome(prompt, negative_prompt=''):
    """Start an asynchronous Fooocus text-to-image job from inside the Selenium Chrome session.

    Args:
        prompt: Text prompt for the image.
        negative_prompt: Accepted for interface compatibility but not sent to the server
            (the field was disabled in the request). It now defaults to '' because the
            callers in this notebook pass the prompt only.

    Returns:
        The decoded JSON response of the server; callers read its 'job_id' entry.
    """
    # Serialise with json.dumps so that quotes, backslashes or line breaks in the prompt can
    # neither break nor inject code into the JavaScript that runs in the browser.
    request_body = json.dumps({
        "prompt": prompt,
        "base_model_name": "juggernautXL_version6Rundiffusion.safetensors",
        "style_selections": [],
        "refiner_model_name": "Realistic_Vision_V6.0_NV_B1.safetensors",
        "refiner_switch": 0.4,
        "aspect_ratios_selection": "640*480",
        "async_process": True,
    })
    # Join host and path with exactly one slash, whether or not the configured address ends
    # with one (the former template produced '//v1/...').
    endpoint = fooocus_api_server_adress.rstrip('/') + '/v1/generation/text-to-image'

    # Values are handed over through `arguments` instead of being pasted into the script text.
    # NOTE(review): like before, this relies on the WebDriver resolving the returned Promise
    # before handing the value back to Python — confirm for the driver version in use.
    js_code = """
    const endpoint = arguments[0];
    const requestBody = arguments[1];
    return new Promise((resolve, reject) => {
        fetch(endpoint, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json'
            },
            body: requestBody
        })
        .then(response => {
            if (!response.ok) {
                throw new Error('Network response was not ok');
            }
            return response.json();
        })
        .then(data => resolve(data))
        .catch(error => reject(error));
    });
    """
    return driver.execute_script(js_code, endpoint, request_body)

def get_img_id_from_fooocus_job(job_id):
    """Query the Fooocus-API for the current state of a generation job.

    Args:
        job_id: Id of the job as returned when the generation was started.

    Returns:
        The decoded JSON response of 'v1/generation/query-job'; callers read its
        'job_result' entry.
    """
    # One slash between host and path, whether or not the configured address ends with one.
    endpoint = fooocus_api_server_adress.rstrip('/') + '/v1/generation/query-job'

    # The endpoint and the job id are passed through `arguments` rather than concatenated
    # into the script text, so unusual characters cannot break the JavaScript.
    # `arguments` is read at the top level of the script because a nested regular function
    # would shadow it with its own (empty) argument list.
    js_code = """
    const url = new URL(arguments[0]);
    url.search = new URLSearchParams({
        job_id: arguments[1],
        require_step_preview: "false"
    }).toString();

    return new Promise((resolve, reject) => {
        fetch(url, {
            method: "GET",
            headers: {
                "Accept": "application/json"
            }
        })
        .then(response => {
            if (!response.ok) {
                throw new Error("Network response was not ok");
            }
            return response.json();
        })
        .then(data => resolve(data))
        .catch(error => reject(error));
    });
    """
    return driver.execute_script(js_code, endpoint, str(job_id))

def safe_fooocus_img(job, filename,
                     source_dir='/home/stud/p/pm074/Fooocus-API/outputs/files',
                     target_dir='/home/stud/p/pm074/bachelorarbeit/imgs/mixed',
                     poll_interval=5, timeout=None):
    """Wait until a Fooocus job has a result and copy the generated image into the image folder.

    Args:
        job: Id of the Fooocus job.
        filename: File name the image gets inside `target_dir`.
        source_dir: Output folder of the Fooocus-API server. The default is the path used on
            the original system; change it according to your system.
        target_dir: Folder the image is copied to (same remark as for `source_dir`).
        poll_interval: Seconds to wait between two status queries.
        timeout: Maximum number of seconds to wait; None (default) waits indefinitely.

    Raises:
        TimeoutError: if `timeout` is set and no result arrived in time.
        RuntimeError: if the job finished with an empty result list.
        OSError: if the image file cannot be copied (the former shell `cp` only printed a message).
    """
    started = time.monotonic()
    job_result = get_img_id_from_fooocus_job(job)['job_result']
    while job_result is None:
        if timeout is not None and time.monotonic() - started > timeout:
            raise TimeoutError('Fooocus job ' + str(job) + ' returned no result within ' + str(timeout) + ' seconds.')
        time.sleep(poll_interval)
        job_result = get_img_id_from_fooocus_job(job)['job_result']

    if not job_result:
        raise RuntimeError('Fooocus job ' + str(job) + ' finished without a result.')

    # The first 28 characters of the result URL are cut off to get the path below the output folder.
    # NOTE(review): 28 is the length of e.g. 'http://127.0.0.1:8888/files/' — confirm that the
    # server always reports URLs with a prefix of exactly this length.
    relative_path = job_result[0]['url'][28:]
    # shutil.copy instead of a shell `cp`: file names containing spaces are handled correctly.
    shutil.copy(os.path.join(source_dir, relative_path.lstrip('/')), os.path.join(target_dir, filename))


In [22]:
def select_ai_model(model):
    """Return the text-to-image callable for the requested generator.

    Args:
        model: 'sdxl' loads the Stable Diffusion XL base pipeline in float16 onto the GPU;
            'fooocus' returns the function that sends requests to the Fooocus-API server.

    Returns:
        The loaded diffusers pipeline or `generate_fooocus_img_via_chrome`.

    Raises:
        ValueError: for an unknown model name (previously None was returned, which only
            failed later when the result was called).
    """
    if model == 'sdxl':
        # release cached GPU memory before the pipeline is loaded
        torch.cuda.empty_cache()
        return AutoPipelineForText2Image.from_pretrained(
            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
        ).to("cuda")
    elif model == 'fooocus':
        return generate_fooocus_img_via_chrome
    else:
        raise ValueError("Unknown model '" + str(model) + "', expected 'sdxl' or 'fooocus'.")

def select_imgs_from_mscoco(amount_or_list):
    """Select image entries from data.json.

    Args:
        amount_or_list: Either a list of image ids, or a pair (first, last) of positions in
            the key order of data.json; both positions are inclusive.

    Returns:
        dict mapping the selected image ids (as strings, the key type of data.json) to their entries.

    Raises:
        KeyError: if a requested image id is not contained in data.json.
    """
    with open('data.json') as f:
        data = json.load(f)

    if isinstance(amount_or_list, list):
        # JSON object keys are always strings, so integer ids are converted before the lookup
        selected_ids = [str(img_id) for img_id in amount_or_list]
    else:
        first, last = amount_or_list[0], amount_or_list[1]
        selected_ids = list(data.keys())[first:(last + 1)]
        print(str(len(selected_ids)) + ' imgs selected from MSCOCO: ' + str(selected_ids))

    unknown_ids = [img_id for img_id in selected_ids if img_id not in data]
    if unknown_ids:
        raise KeyError('Image ids not found in data.json: ' + str(unknown_ids))

    return {img_id: data[img_id] for img_id in selected_ids}

def generate_imgs(amount_prompt_or_mscocolist, model):
    """Generate images either for entries of data.json or for one free-text prompt.

    Args:
        amount_prompt_or_mscocolist: A string is used directly as the prompt of a single
            image. Anything else is passed to select_imgs_from_mscoco (list of image ids or
            an inclusive (first, last) position pair); for every selected image one image is
            generated from its short and one from its long prompt base.
        model: 'sdxl' or 'fooocus', see select_ai_model.

    SDXL images are written to 'imgs/mixed/'; Fooocus images are stored by safe_fooocus_img.
    Images of a free-text prompt are named after the current time.
    """
    is_free_prompt = isinstance(amount_prompt_or_mscocolist, str)
    if not is_free_prompt:
        selected_imgs = select_imgs_from_mscoco(amount_prompt_or_mscocolist)

    pipeline_text2image = select_ai_model(model)

    # img generation with prompt
    if is_free_prompt:
        # The timestamp contains neither spaces nor colons: a space split the file name into
        # two arguments of the shell copy used for Fooocus images.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        if model == 'fooocus':
            # the empty negative prompt is passed explicitly because the Fooocus function expects two arguments
            job = pipeline_text2image(amount_prompt_or_mscocolist, '')['job_id']
            safe_fooocus_img(job, timestamp + '-fooocus.png')
        else:
            img = pipeline_text2image(prompt=amount_prompt_or_mscocolist).images[0]
            img.save('imgs/mixed/' + timestamp + '-' + model + '.jpg')
        return

    # img generation with mscoco base
    for img_id, img in selected_imgs.items():
        prompts = (
            ('short', 'photo of ' + img['prompt_base']['short']),
            ('long', img['prompt_base']['long']),
        )
        for prompt_kind, prompt in prompts:
            if model == 'fooocus':
                job = pipeline_text2image(prompt, '')['job_id']
                current_filepath = str(img_id) + '-' + prompt_kind + '-fooocus.png'
                safe_fooocus_img(job, current_filepath)
            else:
                # generated at 1152x864 and then scaled down to the 640x480 size of the COCO images
                generated_img = pipeline_text2image(prompt=prompt, height=864, width=1152).images[0]
                generated_img = generated_img.resize((640,480), Image.LANCZOS)
                current_filepath = 'imgs/mixed/' + str(img_id) + '-' + prompt_kind + '-' + str(model) + '.jpg'
                generated_img.save(current_filepath)
            print(current_filepath + ' done.')

In [None]:
# Generate a short-prompt and a long-prompt SDXL image for the data.json entries at
# positions 0 to 1060; the pair is handed to select_imgs_from_mscoco as a position range.
generate_imgs((0, 1060), 'sdxl')

In [None]:
def copy_coco_imgs(origin_directory, destination_directory, imgs):
    """Copy COCO images, addressed by their image id, into another folder.

    Args:
        origin_directory: Folder with the COCO images, named '<id padded to 12 digits>.jpg'.
        destination_directory: Target folder; it is created if it does not exist.
        imgs: Iterable of image ids (strings or integers).

    Raises:
        OSError: if an image cannot be copied (the former shell `cp` only printed a message).
    """
    os.makedirs(destination_directory, exist_ok=True)
    for img in imgs:
        # str() allows integer ids, which have no zfill method
        file_name = str(img).zfill(12) + '.jpg'
        shutil.copy(os.path.join(origin_directory, file_name), destination_directory)
# Ids of the COCO images to copy; the list is empty here, so the call below copies nothing
# until ids are filled in.
img_set = []
copy_coco_imgs('mscoco-2017/images/val2017', 'img-dataset/coco/', img_set)

## 3. Aufbau der Studiendaten

Die Bilder in der Studie sollen ein möglichst breites Spektrum an verschiedenen Bildinhalten darstellen, um möglichst große externe Validität (Generalisierbarkeit) herzustellen.

Zunächst wird die Zuordnung Bild-ID – Klasse, für alle Bilder im aktuellen COCO-Subdatensatz (alle 640x480 Bilder) erstellt. Da jeder Klasse mehrere Bilder zugeordnet werden können, wird nun für jede Klasse eine Bild-ID zufällig ausgewählt.
Jede Bild-ID ist verbunden mit einem short- und einem long-Prompt.

Da COCO 80 Bildklassen beinhaltet entsteht so eine Liste von 80 Bild-IDs.

In [2]:
def select_random_sample_of_all_classes():
    """Draw one random image for every class that occurs as short prompt base in data.json.

    Returns:
        List of (image id, class) tuples, one per class, in the order in which the classes
        first appear in data.json. The list is also printed.
    """
    with open('data.json') as f:
        data = json.load(f)

    # group the (image id, class) pairs of the current coco subset by class
    elements_by_class = {}
    for img_id, content in data.items():
        coco_class = content['prompt_base']['short']
        elements_by_class.setdefault(coco_class, []).append((img_id, coco_class))

    # select a random img from every class
    id_list = [random.choice(candidates) for candidates in elements_by_class.values()]

    print(id_list)
    return id_list

Da die Studie ebenfalls 80 generierte KI Bilder enthalten soll, kann jedes Bild vollständig unterschiedliche Inhalte haben.
Auf Grundlage der Bild IDs werden nun zufällig 20 Fooocus-SP, 20 Fooocus-LP, 20 SDXL-SP und 20 SDXL-LP Bilder ausgewählt.

Jedes Bild ist nun eines, das auf Grundlage vollständig unterschiedlicher Prompts generiert wurde.
Damit gleichen sich keine zwei Bilder inhaltlich.

In [None]:
# selection of ai generated imgs
# One random image id per class, in random order.
# NOTE(review): the random generator is not seeded, so every run selects different images.
id_list = select_random_sample_of_all_classes()
random.shuffle(id_list)

# NOTE(review): the later splitting step reads generated images from
# 'img-dataset/study/study-imgs-generated/' — confirm how the files get from here to there.
study_imgs_dir = 'img-dataset/study-imgs/'
os.makedirs(study_imgs_dir, exist_ok=True)

# (first position, end position (exclusive), source folder, file name suffix):
# 20 images per generator/prompt combination. The last slice ends at 80; the former check
# `range(40, 81)` copied a 21st SDXL long-prompt image whenever the sample had more than 80 entries.
selection_plan = [
    (0, 20, 'img-dataset/fooocus-r/', '-short-fooocus.png'),
    (20, 40, 'img-dataset/fooocus-r/', '-long-fooocus.png'),
    (40, 60, 'img-dataset/sdxl/', '-short-sdxl.jpg'),
    (60, 80, 'img-dataset/sdxl/', '-long-sdxl.jpg'),
]
for first, end, source_dir, suffix in selection_plan:
    for img in id_list[first:end]:
        # img is an (image id, class) tuple; a missing file raises instead of being skipped silently
        shutil.copy(source_dir + img[0] + suffix, study_imgs_dir)

In [None]:
# selection of coco imgs
id_list = select_random_sample_of_all_classes()
random.shuffle(id_list)
for index, img in enumerate(id_list):
    if index % 2 == 0:
        ! cp img-dataset/coco/{img[0].zfill(12) + '.jpg'} img-dataset/study/study-imgs-coco/

In [None]:
# splitting imgs into study-group-1 imgs and study-group-2 imgs
coco_imgs = os.listdir('img-dataset/study/study-imgs-coco/')
generated_imgs = os.listdir('img-dataset/study/study-imgs-generated/')

def split_imgs_into_2_groups(imgs, origin):
    """Copy the given files alternately into the folders of study group 1 and study group 2.

    Args:
        imgs: File names inside `origin`; even positions go to group 1, odd positions to group 2.
        origin: Folder that contains the files.
    """
    group_dirs = ('img-dataset/study/study-group-1/', 'img-dataset/study/study-group-2/')
    for group_dir in group_dirs:
        os.makedirs(group_dir, exist_ok=True)
    for index, img in enumerate(imgs):
        # shutil.copy raises on failure, whereas the shell `cp` only printed a message
        shutil.copy(os.path.join(origin, img), group_dirs[index % 2])

random.shuffle(coco_imgs)
split_imgs_into_2_groups(coco_imgs, 'img-dataset/study/study-imgs-coco')

# sort the generated images by generator and prompt type, based on markers in the file name
sdxl_short = []
sdxl_long = []
fooocus_short = []
fooocus_long = []
buckets_by_marker = (
    ('short-sdxl', sdxl_short),
    ('long-sdxl', sdxl_long),
    ('short-fooocus', fooocus_short),
    ('long-fooocus', fooocus_long),
)

for img in generated_imgs:
    for marker, bucket in buckets_by_marker:
        if marker in img:
            bucket.append(img)
            break  # first matching marker wins; files without any marker are ignored

# Each category is shuffled and split on its own, so that both study groups get (nearly)
# the same number of images per generator/prompt combination.
# Plain loops are used because only the side effects matter here.
for imgs in (sdxl_short, sdxl_long, fooocus_short, fooocus_long):
    random.shuffle(imgs)
for imgs in (sdxl_short, sdxl_long, fooocus_short, fooocus_long):
    split_imgs_into_2_groups(imgs, 'img-dataset/study/study-imgs-generated')

# resizing every sdxl img with LANCZOS algorithm to match the other imgs size
study_groups = ['img-dataset/study/study-group-1/', 'img-dataset/study/study-group-2/']

for group in study_groups:
    for img_name in os.listdir(group):
        if 'sdxl' in img_name:
            img_path = group + img_name
            # the context manager closes the source file before it is overwritten
            with Image.open(img_path) as img:
                resized_img = img.resize((640, 480), Image.LANCZOS)
            resized_img.save(img_path)

In [None]:
# create a random order of the imgs for each poll/ study-group 
def create_random_order(directory):
    """Give the files of a folder a random order by prefixing their names with a running number.

    Every regular file directly inside `directory` is renamed to '<number>_<old name>'. The
    numbers start at 1 and are zero-padded to the number of digits of the file count.
    Sub-folders are left untouched. Running the function again adds a further prefix to the
    already renamed files.
    """
    files = []
    for entry in os.listdir(directory):
        if os.path.isfile(os.path.join(directory, entry)):
            files.append(entry)
    random.shuffle(files)

    number_width = len(str(len(files)))

    # the position in the shuffled list becomes the leading number
    for position, filename in enumerate(files, start=1):
        new_filename = str(position).zfill(number_width) + '_' + filename
        shutil.move(os.path.join(directory, filename), os.path.join(directory, new_filename))
        print(f"Renamed '{filename}' to '{new_filename}'")

In [None]:
# Randomise the image order of both study groups. Run this cell only once per folder,
# because every run prefixes the file names with another number.
study_groups = ['img-dataset/study/study-group-1/', 'img-dataset/study/study-group-2/']
for group in study_groups:
    create_random_order(group)

In [17]:
# splitting study imgs into areas for the area selection task
# The area grid that is laid over every study image.
overlay = Image.open('img-dataset/study/areas.png')

for group in ['img-dataset/study/study-group-1/', 'img-dataset/study/study-group-2/']:
    # only regular files directly inside the group folder are processed
    files = []
    for entry in os.listdir(group):
        if os.path.isfile(os.path.join(group, entry)):
            files.append(entry)

    for img_name in files:
        img = Image.open(group + img_name)
        # Image.blend needs both images in the same mode; on a mismatch both are converted
        # to RGB (the converted overlay is kept for the following images)
        if img.mode != overlay.mode:
            overlay = overlay.convert("RGB")
            img = img.convert("RGB")
        # alpha 0.4: the result consists of 60 % overlay and 40 % study image
        blended_image = Image.blend(overlay, img, alpha=0.4)
        blended_image.save(os.path.join(group, 'img-areas', img_name))

## 4. Testen der Bilder mit dem Detection Model
Nun wird jedes Bild vom Detection-Modell überprüft. Das Ergebnis sind Logit-Werte. Ein positiver Wert bedeutet, dass das Bild als KI-generiert erkannt wurde, ein negativer Wert, dass es als echtes Bild identifiziert wurde.

In [2]:
# list all locations of all files to be checked by the model in a csv file
def list_files(directory):
    """Return the paths of all files below `directory`, relative to it, in random order.

    The paths were formerly shortened by a fixed five characters, which is only correct for
    a directory called 'imgs'; os.path.relpath gives the same result for 'imgs' and works
    for any other directory as well.
    """
    file_names = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            file_names.append(os.path.relpath(os.path.join(root, file), directory))
    random.shuffle(file_names)
    return file_names

def write_to_csv(file_names, csv_file):
    """Write the file names into a one-column CSV file with the header 'src'.

    Args:
        file_names: Iterable of file names, one per CSV row.
        csv_file: Path of the CSV file to create; an existing file is overwritten.
    """
    rows = [['src']]
    rows.extend([name] for name in file_names)
    with open(csv_file, 'w', newline='') as handle:
        csv.writer(handle).writerows(rows)

# Collect all files below 'imgs' (in random order) and write their paths to img_locations.csv.
file_names = list_files('imgs')
write_to_csv(file_names, 'img_locations.csv')

In [3]:
# remove all previous model output files
# CAUTION: deletes everything inside model_outputs/ recursively and without confirmation;
# the path is relative to the directory the notebook runs in.
!rm -r model_outputs/*

In [None]:
# check every img in the imgs folder for being ai-generated 
detection_model_script_location = 'detection_model/main.py'
detection_model_weights_location = 'detection_model/weights'

!python {detection_model_script_location} --data_dir imgs/ --out_dir model_outputs --weights_dir {detection_model_weights_location} --csv_file img_locations.csv

# creating detection results csv
with open('model_outputs/mixed/mixed.csv', 'r', newline='') as model_outputs, open('detection_results.csv', 'w', newline='') as detection_results:
    reader = csv.reader(model_outputs)
    writer = csv.writer(detection_results)
    writer.writerow(['img', 'probability'])

    for index, row in enumerate(reader):
        if not index == 0:
            writer.writerow([row[0], round(expit(float(row[-1])), 4)])

In [68]:
def calculate_f1_scores(df):
    """Compute F1 scores (and one accuracy) of the detection model for different image subsets.

    Args:
        df: DataFrame with the columns 'img' (file name) and 'probability'. A probability
            of at least 0.5 counts as the prediction "AI-generated". File names containing
            'short' or 'long' are treated as generated images, all others as real images;
            'sdxl' and 'fooocus' identify the generator.

    Returns:
        List of (name, value) tuples in the order f1-total, f1-ai, f1-sp, f1-lp, acc-ai,
        f1-real, f1-sdxl, f1-fooocus. Except for f1-total, every subset has only one true
        class, which is used as the positive class (for real images the predictions are
        inverted for this purpose).

    The passed DataFrame is not modified; the thresholded predictions used to overwrite its
    'probability' column.
    """
    predicted_ai = pd.Series(np.where(df['probability'] >= 0.5, 1, 0), index=df.index)
    img_names = df['img']

    def predictions_where(mask):
        # predictions (0/1) of the rows selected by the boolean mask
        return predicted_ai[mask].tolist()

    def positive_f1(predictions):
        # F1 score of a subset in which every image belongs to the positive class
        return f1_score([1] * len(predictions), predictions)

    is_generated = img_names.str.contains('long|short')

    sp_predictions = predictions_where(img_names.str.contains('short'))
    lp_predictions = predictions_where(img_names.str.contains('long'))
    ai_predictions = predictions_where(is_generated)
    real_predictions = predictions_where(~is_generated)
    sdxl_predictions = predictions_where(img_names.str.contains('sdxl'))
    fooocus_predictions = predictions_where(img_names.str.contains('fooocus'))

    ai_true_values = [1] * len(ai_predictions)

    # all images together: generated = 1, real = 0
    total_predictions = ai_predictions + real_predictions
    total_true_values = ai_true_values + [0] * len(real_predictions)

    # for the real images "real" becomes the positive class, so the predictions are inverted
    inverted_real_predictions = [1 - x for x in real_predictions]

    f1_sp = positive_f1(sp_predictions)
    f1_lp = positive_f1(lp_predictions)
    f1_real = positive_f1(inverted_real_predictions)
    f1_sdxl = positive_f1(sdxl_predictions)
    f1_fooocus = positive_f1(fooocus_predictions)

    f1_total = f1_score(total_true_values, total_predictions)
    f1_ai = positive_f1(ai_predictions)
    acc_ai = accuracy_score(ai_true_values, ai_predictions)
    return [('f1-total', f1_total), ('f1-ai', f1_ai), ('f1-sp', f1_sp), ('f1-lp', f1_lp), ('acc-ai', acc_ai), ('f1-real', f1_real), ('f1-sdxl', f1_sdxl), ('f1-fooocus', f1_fooocus)]

In [None]:
# Detection results for the study images after they were brought to the same size
df = pd.read_csv('detection_results-samesized_imgs.csv')
print(f'Detection model F1-scores for all study imgs (same sized): {calculate_f1_scores(df)}')

# Detection results for the study images in their original form
df = pd.read_csv('detection_results-original_imgs.csv')
print(f'Detection model F1-scores for all study imgs (originals): {calculate_f1_scores(df)}')

## 5. Auswertung der Umfrageergebnisse

In [4]:
def results_evaluation(imgs_path, answers_path):
    """Evaluate the survey answers of one study group against the ground truth of its images.

    Args:
        imgs_path: Folder with the study images of the group. The .jpg/.png files are taken
            in alphabetical order; 'long'/'short' in the file name marks a generated image
            and its prompt type, 'sdxl'/'fooocus' marks the generator.
        answers_path: CSV export of the survey. The first seven columns are treated as
            demographic data, the remaining ones as answers.

    Returns:
        Tuple (metrics, demographics, certainties, area_selections, solutions):
        metrics is a DataFrame with one row per participant (F1 scores, accuracy, recalls
        and mean certainties), demographics the renamed demographic columns, certainties
        and area_selections the matching answer columns, and solutions the list of ground
        truth dicts ('generated', 'prompt', 'generator') in image order.

    The answer at position i of a participant is compared with the i-th image in
    alphabetical order, so the question columns must follow that order.
    """
    df = pd.read_csv(answers_path)
    answers = df.iloc[:, 7:]

    # refactoring demographics data
    # A copy with a freshly assigned column list is used; writing into `columns.values`
    # changes a shared index in place and is rejected by pandas versions with copy-on-write.
    demographics = df.iloc[:, :7].copy()
    column_names = list(demographics.columns)
    column_names[1:4] = ['Alter', 'Geschlecht', 'Bildungsabschluss']
    demographics.columns = column_names
    demographics = demographics.drop(demographics.columns[[0, 6]], axis=1)

    # building solutions for the current study group
    solutions = []
    for img in sorted(os.listdir(imgs_path)):
        if not (os.path.isfile(os.path.join(imgs_path, img)) and ('.jpg' in img or '.png' in img)):
            continue
        if 'long' in img:
            prompt, generated = 'long', 1
        elif 'short' in img:
            prompt, generated = 'short', 1
        else:
            prompt, generated = None, 0
        # reset for every image, so that a file name without generator marker cannot
        # inherit the generator of the previous image
        generator = None
        if 'sdxl' in img:
            generator = 'sdxl'
        elif 'fooocus' in img:
            generator = 'fooocus'

        solutions.append({
            'generated': generated,
            'prompt': prompt,
            'generator': generator
        })

    # splitting the answers into different categories
    def columns_matching(pattern):
        # names of the answer columns whose header matches the regular expression
        compiled = re.compile(pattern)
        return [col for col in answers.columns if compiled.match(col)]

    classifications = df[columns_matching(r'^\[Q\d{1,2}\]$')]
    certainties = df[columns_matching(r'^\[Q\d{1,2}\] Wie sicher bist du dir\?$')]
    area_selections = df[columns_matching(r'(^\[Q\d{1,2}\] (Gibt es einen Bildbereich an dem du deine Entscheidung festmachst\?|Welche Bildbereiche haben deine Entscheidung bestimmt\?)|^\[Q\d{1,2}\]$)')]

    # encode the classification answers: generated = 1, real = 0; other values stay as they are
    label_to_int = {'KI generiert': 1, 'echt': 0}
    classifications = classifications.apply(
        lambda column: column.map(lambda answer: label_to_int.get(answer, answer))
    )

    classifications_lists = [row.tolist() for _, row in classifications.iterrows()]
    certainty_lists = [row.tolist() for _, row in certainties.iterrows()]

    def is_long(solution):
        return solution['prompt'] == 'long'

    def is_short(solution):
        return solution['prompt'] == 'short'

    def is_ai(solution):
        return is_long(solution) or is_short(solution)

    def is_real(solution):
        return solution['prompt'] is None

    def is_fooocus(solution):
        return solution['generator'] == 'fooocus'

    def is_sdxl(solution):
        return solution['generator'] == 'sdxl'

    def is_any(solution):
        return True

    def pick(values, condition):
        # values of one participant for the images whose solution fulfils the condition
        return [values[index] for index, solution in enumerate(solutions) if condition(solution)]

    def true_values_for(condition):
        # ground truth (1 = generated, 0 = real) of the images that fulfil the condition
        return [solution['generated'] for solution in solutions if condition(solution)]

    def calc_f1_scores(classification_list):
        # F1 score per image category for one participant
        def subset_f1(condition):
            return f1_score(true_values_for(condition), pick(classification_list, condition))

        total_true_values = true_values_for(is_ai) + true_values_for(is_real)
        total_predictions = pick(classification_list, is_ai) + pick(classification_list, is_real)

        # inverse lists of classification results and solutions for real imgs to make f1-score calculation possible
        real_true_values = [1 - x for x in true_values_for(is_real)]
        real_predictions = [1 - x for x in pick(classification_list, is_real)]

        return {
            'f1-total': f1_score(total_true_values, total_predictions),
            'f1-real': f1_score(real_true_values, real_predictions),
            'f1-ai': subset_f1(is_ai),
            'f1-sp': subset_f1(is_short),
            'f1-lp': subset_f1(is_long),
            'f1-fooocus': subset_f1(is_fooocus),
            'f1-sdxl': subset_f1(is_sdxl),
        }

    def calc_accuracy_and_recalls(classification_list):
        # accuracy over all images and the recall of each class ([real, ai]) for one participant
        predictions = pick(classification_list, is_any)
        true_values = true_values_for(is_any)
        accuracy = accuracy_score(true_values, predictions)
        # labels=[0, 1] guarantees one recall per class even if only one class occurs
        recalls = recall_score(true_values, predictions, labels=[0, 1], average=None)
        return accuracy, recalls

    def calc_certainties(certainty_list):
        # mean certainty per image category for one participant
        def mean_certainty(condition):
            return pd.Series(pick(certainty_list, condition)).mean()

        return {
            'certainty-real': mean_certainty(is_real),
            'certainty-ai': mean_certainty(is_ai),
            'certainty-sp': mean_certainty(is_short),
            'certainty-lp': mean_certainty(is_long),
        }

    # one list per metric, one entry per participant; the key order defines the column order
    metrics = {
        name: []
        for name in (
            'f1-total', 'f1-real', 'f1-ai', 'f1-sp', 'f1-lp', 'f1-sdxl', 'f1-fooocus',
            'accuracy', 'recall-real', 'recall-ai',
            'certainty-real', 'certainty-ai', 'certainty-sp', 'certainty-lp',
        )
    }
    for classification_list in classifications_lists:
        for name, value in calc_f1_scores(classification_list).items():
            metrics[name].append(value)
        current_accuracy, recalls = calc_accuracy_and_recalls(classification_list)
        metrics['accuracy'].append(current_accuracy)
        metrics['recall-real'].append(recalls[0])
        metrics['recall-ai'].append(recalls[1])

    for certainty_list in certainty_lists:
        for name, value in calc_certainties(certainty_list).items():
            metrics[name].append(value)

    metrics = pd.DataFrame(metrics)
    f1_scores_means = (metrics['f1-total'].mean(), metrics['f1-real'].mean(), metrics['f1-ai'].mean(), metrics['f1-sp'].mean(), metrics['f1-lp'].mean())
    print(f1_scores_means)
    return metrics, demographics, certainties, area_selections, solutions

In [None]:
# Evaluate both study groups separately (each group saw its own set of images) ...
metrics_g1, demographics_g1, certainties_g1, area_selections_g1, solutions_g1 = results_evaluation('img-dataset/study/study-group-1/imgs/', 'study-responses/BA Gruppe 1.csv')
metrics_g2, demographics_g2, certainties_g2, area_selections_g2, solutions_g2 = results_evaluation('img-dataset/study/study-group-2/imgs/', 'study-responses/BA Gruppe 2.csv')
# ... and stack the per-participant results of both groups.
# NOTE: the row index is not reset by pd.concat, so index values occur once per group.
metrics = pd.concat([metrics_g1, metrics_g2])
demographics = pd.concat([demographics_g1, demographics_g2])
certainties = pd.concat([certainties_g1, certainties_g2])
area_selections = pd.concat([area_selections_g1, area_selections_g2])

# only the F1 score columns
f1s = metrics.drop(columns=['accuracy', 'recall-real', 'recall-ai', 'certainty-real', 'certainty-ai', 'certainty-sp', 'certainty-lp'])

### 5.1 Demographiedaten auswerten

In [None]:
# Display the combined demographic answers of both study groups.
demographics

In [None]:
# Frequencies of the age groups: histogram with bins of 5 years, starting at 0 and
# reaching up to the highest age in the data
bins = range(0, max(demographics['Alter']) + 5, 5)
plt.figure(figsize=(10, 5))
plt.hist(demographics['Alter'], bins=bins, edgecolor='black')
plt.title('Wie alt bist du?', fontsize=18)
plt.xlabel('Alter', fontsize=16)
plt.ylabel('Häufigkeit', fontsize=16)
plt.grid(True)
plt.xticks(bins, fontsize=14)
plt.yticks(fontsize=14)
# the figure has to be saved before plt.show()
plt.savefig('statistics/imgs/age_histogram.jpg')
plt.show()

In [None]:
# Frequencies of the genders: bar chart of how often each answer was given
geschlecht_count = demographics['Geschlecht'].value_counts()
plt.figure(figsize=(10, 5))
geschlecht_count.plot(kind='bar')
plt.title('Welchem Geschlecht fühlst du dich zugehörig?', fontsize=18)
plt.xlabel('')
plt.ylabel('Häufigkeit', fontsize=16)
plt.grid(True)
plt.xticks(rotation=0, fontsize=14)
plt.yticks(fontsize=14)
# the figure has to be saved before plt.show()
plt.savefig('statistics/imgs/gender.jpg')
plt.show()

In [None]:
# Frequencies of the educational qualifications
abschluss_count = demographics['Bildungsabschluss'].value_counts()
# fixed display order of the answers; answers that nobody chose are counted as 0
reihenfolge = ['Hauptschulabschluss', 'Realschulabschluss', 'Abitur', 'Bachelor', 'Master', 'Promotion', 'Keiner der oben genannten']
ordered_count = pd.Series(index=reihenfolge, data={name: abschluss_count.get(name, 0) for name in reihenfolge})
plt.figure(figsize=(10, 5))
ordered_count.plot(kind='bar')
plt.title('Was ist dein höchster Bildungsabschluss?', fontsize=18)
#plt.xlabel('Bildungsabschluss', fontsize=14)
plt.ylabel('Häufigkeit', fontsize=16)
plt.grid(True)
# tick labels with manual line breaks, in the same order as `reihenfolge`
labels = ['Hauptschul-\nabschluss', 'Realschul-\nabschluss', 'Abitur', 'Bachelor', 'Master', 'Promotion', 'Keiner der\ngenannten']
plt.xticks(ticks=range(len(reihenfolge)), labels=labels, rotation=0, fontsize=14)
plt.yticks(fontsize=14)
# the figure has to be saved before plt.show()
plt.savefig('statistics/imgs/education.jpg')
plt.show()

In [None]:
# Frequencies of the viewing experience with AI images
# The question is addressed by column position (3) rather than by its column name.
frequency = demographics.iloc[:, 3].value_counts()
# Fixed display order of the answer options; options nobody chose get a count of 0.
reihenfolge = ['Nein. Ich habe noch nie KI Bilder bewusst gesehen.', 'Ich habe schon einmal KI Bilder gesehen.', 'Ich sehe KI Bilder hin und wieder.', 'Ich sehe KI Bilder regelmäßig.']
frequency = pd.Series(index=reihenfolge, data={name: frequency.get(name, 0) for name in reihenfolge})
plt.figure(figsize=(10, 5))
frequency.plot(kind='bar')
plt.title('Hast du schon KI-generierte Bilder gesehen?', fontsize=18)
#plt.xlabel('Antworten', fontsize=14)
plt.ylabel('Häufigkeit', fontsize=16)
# Line-wrapped versions of the answers in `reihenfolge`, used as tick labels.
wrapped_answers = [
    'Nein. Ich habe\n noch nie KI Bilder\n bewusst gesehen.', 
    'Ich habe schon\n einmal KI Bilder\n gesehen.', 
    'Ich sehe KI Bilder\nhin und wieder.', 
    'Ich sehe KI Bilder\nregelmäßig.'
]
plt.xticks(range(len(wrapped_answers)), wrapped_answers, rotation=0, ha='center', fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('statistics/imgs/watch_experience.jpg')
plt.show()

In [None]:
# Frequencies of the generation experience with AI images
# The question is addressed by column position (4) rather than by its column name.
frequency = demographics.iloc[:, 4].value_counts()
# Fixed display order of the answer options; options nobody chose get a count of 0.
reihenfolge = ['Nein, noch nie.', 'Ja, schon einmal.', 'Ja, hin und wieder.', 'Ja, regelmäßig.']
frequency = pd.Series(index=reihenfolge, data={name: frequency.get(name, 0) for name in reihenfolge})
plt.figure(figsize=(10, 5))
frequency.plot(kind='bar')
plt.title('Hast du schon selbst KI Bilder generiert?', fontsize=18)
#plt.xlabel('Antworten')
plt.ylabel('Häufigkeit', fontsize=16)
# Tick labels: the answers from `reihenfolge` without the trailing full stop.
wrapped_answers = [
    'Nein, noch nie', 
    'Ja, schon einmal', 
    'Ja, hin und wieder', 
    'Ja, regelmäßig'
]
plt.xticks(range(len(wrapped_answers)), wrapped_answers, rotation=0, ha='center', fontsize=14)
plt.yticks(fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig('statistics/imgs/generation_experience.jpg')
plt.show()

### 5.2 Einfluss der Erfahrung auf die Erkennungsleistung

In [7]:
def f1_scores_per_experience_type(experience_type):
    """Split the participants' F1 scores by their answer to an experience question.

    Used to answer "does the detection performance depend on experience?".
    Reads the notebook-level frames ``demographics`` and ``f1s``; row *i* of
    ``demographics`` is paired with row *i* of ``f1s`` by position, not by label.

    Parameters
    ----------
    experience_type : int
        Positional index of the column in ``demographics`` that holds the
        experience question: 3 (has seen AI images) or 4 (has generated AI images).

    Returns
    -------
    tuple of four pandas.DataFrame
        One frame per answer option, in the order the options are listed below.
        Each frame has the columns 'f1-lp', 'f1-sp', 'f1-ai', 'f1-real' and
        'f1-total' and a fresh 0..n-1 index. Participants whose answer matches
        none of the options are left out.

    Raises
    ------
    ValueError
        If ``experience_type`` is neither 3 nor 4.
    """
    answers_by_question = {
        3: ['Nein. Ich habe noch nie KI Bilder bewusst gesehen.', 'Ich habe schon einmal KI Bilder gesehen.', 'Ich sehe KI Bilder hin und wieder.', 'Ich sehe KI Bilder regelmäßig.'],
        4: ['Nein, noch nie.', 'Ja, schon einmal.', 'Ja, hin und wieder.', 'Ja, regelmäßig.'],
    }
    if experience_type not in answers_by_question:
        raise ValueError(
            f"experience_type must be one of {sorted(answers_by_question)}, got {experience_type!r}"
        )

    f1_columns = ['f1-lp', 'f1-sp', 'f1-ai', 'f1-real', 'f1-total']
    # .iloc makes the positional column lookup explicit; integer keys on a label-indexed
    # Series (the previous row[experience_type]) are deprecated in recent pandas versions.
    given_answers = demographics.iloc[:, experience_type]
    scores = f1s[f1_columns]

    groups = []
    for answer in answers_by_question[experience_type]:
        # Boolean array -> purely positional selection, safe with the duplicate row
        # labels that pd.concat leaves behind.
        chose_answer = (given_answers == answer).to_numpy()
        groups.append(scores.iloc[chose_answer].reset_index(drop=True))
    return tuple(groups)

# custom plots: jittered scatter per answer group with a mean marker.
# NOTE(review): both public functions below are defined again (as boxplots) in the next cell,
# so after a top-to-bottom run the later definitions are the ones in effect.
def _scatter_f1_by_answer(f1_scores, f1_type, question, answer_labels, file_prefix):
    """Shared drawing routine for the two scatter plots.

    f1_scores     -- sequence of score collections, one per answer option
    f1_type       -- name of the F1 column; used as y-label, in the title and the file name
    question      -- question text appended to the title
    answer_labels -- tick label text per answer option (the group size is appended)
    file_prefix   -- first part of the output file name
    """
    plt.figure(figsize=(10, 6))
    for position, scores in enumerate(f1_scores, start=1):
        # Random horizontal jitter so that equal scores do not hide each other.
        jittered_x = [position + np.random.uniform(-0.15, 0.15) for _ in range(len(scores))]
        plt.scatter(jittered_x, scores, s=20)

        # Short horizontal bar plus text annotation at the group mean.
        group_mean = np.mean(scores)
        plt.plot([position - 0.2, position + 0.2], [group_mean, group_mean], color='black', linewidth=1.5)
        plt.text(position, group_mean + 0.01, f'$\\bar{{x}}$ = {group_mean:.4f}', ha='center', va='bottom', fontsize=10,
                 bbox=dict(facecolor='white', alpha=0.5, edgecolor='none'))

    plt.ylabel(f1_type)
    plt.title(f'{f1_type}: {question}')
    tick_labels = [f'{label}\nn = {len(f1_scores[k])}' for k, label in enumerate(answer_labels)]
    plt.xticks(range(1, 5), tick_labels)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'statistics/imgs/{file_prefix}_vs_{f1_type}.jpg')
    plt.show()


def f1_vs_watch_experience(f1_scores, f1_type):
    """Scatter the F1 scores of the four 'have you seen AI images?' answer groups and save the figure."""
    _scatter_f1_by_answer(
        f1_scores,
        f1_type,
        'Hast du schon KI generierte Bilder gesehen?',
        ['Nein. Ich habe\nnoch nie KI Bilder\nbewusst gesehen.',
         'Ich habe schon\neinmal KI Bilder\ngesehen.',
         'Ich sehe KI Bilder\nhin und wieder.',
         'Ich sehe KI Bilder\nregelmäßig.'],
        'watch_experience',
    )


def f1_vs_generation_experience(f1_scores, f1_type):
    """Scatter the F1 scores of the four 'have you generated AI images?' answer groups and save the figure."""
    _scatter_f1_by_answer(
        f1_scores,
        f1_type,
        'Hast du schon selbst KI Bilder generiert?',
        ['Nein, noch nie.',
         'Ja, schon einmal.',
         'Ja, hin und wieder.',
         'Ja, regelmäßig.'],
        'generation_experience',
    )

In [8]:
# boxplots
# These definitions replace the scatter versions of the same names from the previous cell.
def _boxplot_f1_by_answer(f1_scores, f1_type, question, answer_labels, file_prefix):
    """Shared drawing routine for the two boxplots.

    f1_scores     -- sequence of score collections, one box per answer option
    f1_type       -- name of the F1 column; upper-cased for the y-label, used in the file name
    question      -- question text used as the title
    answer_labels -- tick label text per answer option (the group size is appended)
    file_prefix   -- first part of the output file name
    """
    plt.figure(figsize=(10, 6))
    plt.boxplot(f1_scores, showmeans=True)
    plt.ylabel(f1_type.upper(), fontsize=16)
    plt.title(question, fontsize=20)
    tick_labels = [f'{label}\nn = {len(f1_scores[k])}' for k, label in enumerate(answer_labels)]
    plt.xticks(range(1, 5), tick_labels, fontsize=14)
    plt.yticks(fontsize=14)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(f'statistics/imgs/{file_prefix}_vs_{f1_type}.jpg')
    plt.show()


def f1_vs_watch_experience(f1_scores, f1_type):
    """Boxplot of the F1 scores of the four 'have you seen AI images?' answer groups; saves the figure."""
    _boxplot_f1_by_answer(
        f1_scores,
        f1_type,
        'Hast du schon KI generierte Bilder gesehen?',
        ['Nein. Ich habe\nnoch nie KI Bilder\nbewusst gesehen.',
         'Ich habe schon\neinmal KI Bilder\ngesehen.',
         'Ich sehe KI Bilder\nhin und wieder.',
         'Ich sehe KI Bilder\nregelmäßig.'],
        'watch_experience',
    )


def f1_vs_generation_experience(f1_scores, f1_type):
    """Boxplot of the F1 scores of the four 'have you generated AI images?' answer groups; saves the figure."""
    _boxplot_f1_by_answer(
        f1_scores,
        f1_type,
        'Hast du schon selbst KI Bilder generiert?',
        ['Nein, noch nie.',
         'Ja, schon einmal.',
         'Ja, hin und wieder.',
         'Ja, regelmäßig.'],
        'generation_experience',
    )

In [None]:
# Split the F1 scores by the answer in demographics column 4 (generation experience);
# group_4 holds the participants who answered 'Ja, regelmäßig.'.
group_1, group_2, group_3, group_4 = f1_scores_per_experience_type(4)

In [None]:
# Summary statistics of the total F1 score for `group_4` as assigned by the previous cell.
f1_total_group_4 = group_4['f1-total']
median_f1_total = f1_total_group_4.median()
mean_f1_total = f1_total_group_4.mean()
Q1, Q3 = (f1_total_group_4.quantile(q) for q in (0.25, 0.75))
print(f"Median: {median_f1_total}")
print(f"Mittelwert: {mean_f1_total}")
print(f"Interquartilsintervall [Q1; Q3]: [{Q1}; {Q3}]")

In [None]:
# F1 scores as a function of the viewing experience with AI images
group_1, group_2, group_3, group_4 = f1_scores_per_experience_type(3)
watch_groups = [group_1, group_2, group_3, group_4]
for group in watch_groups:
    # Row-wise average of the AI and the real F1 score as an additional column.
    group['f1-mean'] = group[['f1-ai', 'f1-real']].mean(axis=1)

# One plot per score column (including the 'f1-mean' column added above).
for f1 in group_1.columns:
    f1_scores = [scores[f1] for scores in watch_groups]
    f1_vs_watch_experience(f1_scores, f1)

In [None]:
# F1 scores as a function of the generation experience with AI images
group_1, group_2, group_3, group_4 = f1_scores_per_experience_type(4)
generation_groups = [group_1, group_2, group_3, group_4]
for group in generation_groups:
    # Row-wise average of the AI and the real F1 score as an additional column.
    group['f1-mean'] = group[['f1-ai', 'f1-real']].mean(axis=1)

# One plot per score column (including the 'f1-mean' column added above).
for f1 in group_1.columns:
    f1_scores = [scores[f1] for scores in generation_groups]
    f1_vs_generation_experience(f1_scores, f1)

### 5.3 Einfluss der Promptlänge auf die Erkennungsleistung

In [None]:
# F1 score distribution per prompt length ('sp' / 'lp' columns; presumably short vs. long prompt).
plt.figure(figsize=(8, 6))
# No `labels=` here: the keyword is deprecated in recent Matplotlib releases, and the tick
# labels are set explicitly by plt.xticks() below, which overrode it anyway.
plt.boxplot([f1s['f1-sp'], f1s['f1-lp']], showmeans=True, medianprops=dict(color="orange"))
plt.title('F1-Scores nach Promptlänge', fontsize=16)
plt.ylabel('F1-AI', fontsize=14)
# Fixed y-range, the same limits the other F1 boxplots in this notebook use.
plt.ylim(0.38, 1.02)
plt.grid(True)
plt.xticks(range(1, 3), ['SP-Bilder', 'LP-Bilder'], fontsize=12)
plt.yticks(fontsize=12)
plt.savefig('statistics/imgs/f1-vs-prompt.jpg')
plt.show()

In [None]:
f1_sp = f1s['f1-sp']
f1_lp = f1s['f1-lp']

# Summary statistics for f1-sp
q1_sp = np.percentile(f1_sp, 25)
median_sp = np.median(f1_sp)
q3_sp = np.percentile(f1_sp, 75)
iqr_sp = q3_sp - q1_sp
mean_sp = np.mean(f1_sp)

# Summary statistics for f1-lp
q1_lp = np.percentile(f1_lp, 25)
median_lp = np.median(f1_lp)
q3_lp = np.percentile(f1_lp, 75)
iqr_lp = q3_lp - q1_lp
mean_lp = np.mean(f1_lp)

# Print the summary statistics
print("Kennzahlen für f1-sp:")
# Fixed format spec: ':4f' (minimum width 4, six decimals) was a typo for ':.4f' (four decimals).
print(f"Interquartilsintervall (Q1 - Q3): {q1_sp:.4f} - {q3_sp:.4f}")
print(f"Mittelwert: {mean_sp:.4f}")
print(f"Median: {median_sp:.4f}")
print(f"IQR: {iqr_sp:.4f}")
print("\nKennzahlen für f1-lp:")
print(f"Interquartilsintervall (Q1 - Q3): {q1_lp:.4f} - {q3_lp:.4f}")
print(f"Mittelwert: {mean_lp:.4f}")
print(f"Median: {median_lp:.4f}")
print(f"IQR: {iqr_lp:.4f}")

In [None]:
# Check the paired differences (f1-lp minus f1-sp, per participant) for a Gaussian distribution
# with the Shapiro-Wilk test.
diffs = f1s['f1-lp'] - f1s['f1-sp']
shapiro_statistic, shapiro_p_value = stats.shapiro(diffs)

print("Shapiro-Wilk Test Statistic:", shapiro_statistic)
print("P-Value:", shapiro_p_value)

In [None]:
# One-sided Wilcoxon signed-rank test on the paired samples; alternative='greater' tests
# whether f1-lp tends to be larger than f1-sp.
wilcoxon_result = stats.wilcoxon(f1s['f1-lp'], f1s['f1-sp'], alternative='greater')
print("statistic = {:.2f}, p-value = {:.4e}".format(wilcoxon_result.statistic, wilcoxon_result.pvalue))

In [None]:
# Effect size: mean of the paired differences divided by their sample standard deviation
# (ddof=1), i.e. a standardized mean difference of the pairs.
diffs = f1s['f1-lp'] - f1s['f1-sp']
mean_diff = np.mean(diffs)
std_diff = np.std(diffs, ddof=1)
d = mean_diff / std_diff
print(d)

### 5.4 Einfluss der Entscheidungssicherheit auf die Erkennungsleistung

In [None]:
# One scatter plot per category: each point is one row of `metrics`, with the category's
# certainty value on the x-axis and its F1 score on the y-axis.
for category in ['real', 'ai', 'sp', 'lp']:
    plt.figure(figsize=(10, 6))
    plt.plot(metrics['certainty-' + category], metrics['f1-' + category], 'bo')
    plt.title(f'F1-{category.upper()}-Score über Sicherheit', fontsize=20)
    plt.xlabel('Mittlere Sicherheit', fontsize=16)
    plt.ylabel('F1-' + category.upper(), fontsize=16)
    # Fixed axis ranges so that the four plots are directly comparable.
    plt.ylim(0.38, 1.02) 
    plt.xlim(0.8, 5.2) 
    plt.grid(True)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.savefig(f'statistics/imgs/f1-vs-certainty-mean-{category}.jpg')
    plt.show()

In [None]:
# For the case that the certainty is stored as the median per participant
def f1_score_per_certainty_level(certainty_type):
    """Group the F1 scores of one category by the participants' certainty level.

    Reads the notebook-level frames ``metrics`` (column ``certainty-<certainty_type>``)
    and ``f1s`` (column ``f1-<certainty_type>``); rows are paired by position.

    Returns a tuple of five Series with the F1 scores of the rows whose certainty
    equals 1, 2, 3, 4 and 5 respectively. Rows with any other certainty value
    (e.g. 2.5) do not appear in any group.
    """
    certainty_column = f'certainty-{certainty_type}'
    f1_column = f'f1-{certainty_type}'
    levels = (1, 2, 3, 4, 5)
    scores_by_level = {level: [] for level in levels}

    for position, (_, row) in enumerate(metrics.iterrows()):
        # First level that equals this row's certainty, or None if there is none.
        matching_level = next((level for level in levels if row[certainty_column] == level), None)
        if matching_level is not None:
            scores_by_level[matching_level].append(f1s[f1_column].iloc[position])

    return tuple(pd.Series(scores_by_level[level]) for level in levels)

def f1_vs_certainty(f1_scores, f1_type, sample_sizes):
    """Boxplot of F1 scores per certainty level, saved below statistics/imgs/.

    f1_scores    -- sequence of score collections, one box per certainty level
    f1_type      -- score name such as 'f1-ai'; used as y-label and in the title, and
                    everything after its first three characters goes into the file name
    sample_sizes -- group sizes shown in the tick labels (paired with the levels 1-5)
    """
    plt.figure(figsize=(10, 6))
    plt.boxplot(f1_scores, showmeans=True)
    plt.ylabel(f1_type)
    plt.title(f'{f1_type}: Wie sicher bist du dir?')
    plt.grid(True)

    # zip stops after five labels, one per certainty level.
    tick_labels = [f'{level}\n(n={count})' for level, count in zip("12345", sample_sizes)]
    plt.xticks(ticks=range(1, len(tick_labels) + 1), labels=tick_labels)

    plt.tight_layout()
    score_suffix = f1_type[3:]
    plt.savefig(f'statistics/imgs/f1-vs-certainty-median-{score_suffix}.jpg')
    plt.show()

# One certainty boxplot per category.
for category in ('real', 'ai', 'sp', 'lp'):
    groups = f1_score_per_certainty_level(category)
    f1_scores = list(groups)
    sample_sizes = list(map(len, groups))
    f1_vs_certainty(f1_scores, f'f1-{category}', sample_sizes)

### 5.5 Einfluss des Bildursprungs auf die Erkennungsleistung

In [None]:

# Boxplots of the F1 scores per image origin (columns f1-fooocus, f1-sdxl and f1-real).
plt.figure(figsize=(8, 6))
metrics.boxplot(column=["f1-fooocus", "f1-sdxl", "f1-real"], showmeans=True, medianprops=dict(color="orange"), color="black")
plt.title("F1-Scores nach Bildursprung", fontsize=16)

# Left y-axis label
plt.ylabel("F1-AI", fontsize=14)

plt.xlabel("", fontsize=14)
plt.xticks([1, 2, 3], ["Fooocus", "SDXL", "COCO"], fontsize=12)
plt.ylim(0.38, 1.02)
plt.grid(True)

# Add a vertical line between SDXL and COCO
plt.axvline(x=2.5, color='grey', linestyle='--')

# Additional y-axis label on the right-hand side
ax = plt.gca()
ax2 = ax.twinx()  # creates a second axis that shares the same x-axis
ax2.set_ylabel("F1-REAL", fontsize=14)

# Synchronise the y-axis limits
ax2.set_ylim(ax.get_ylim())

plt.savefig('statistics/imgs/f1-vs-origin.jpg')
plt.show()

In [None]:
# Quartiles, median, IQR and mean of the F1 scores per image origin.
f1_fooocus = metrics['f1-fooocus']
f1_sdxl = metrics['f1-sdxl']
f1_real = metrics['f1-real']

# Summary statistics for f1-fooocus
q1_fooocus = np.percentile(f1_fooocus, 25)
median_fooocus = np.median(f1_fooocus)
q3_fooocus = np.percentile(f1_fooocus, 75)
iqr_fooocus = q3_fooocus - q1_fooocus
mean_fooocus = np.mean(f1_fooocus)

# Summary statistics for f1-sdxl
q1_sdxl = np.percentile(f1_sdxl, 25)
median_sdxl = np.median(f1_sdxl)
q3_sdxl = np.percentile(f1_sdxl, 75)
iqr_sdxl = q3_sdxl - q1_sdxl
mean_sdxl = np.mean(f1_sdxl)

# Summary statistics for f1-real
q1_real = np.percentile(f1_real, 25)
median_real = np.median(f1_real)
q3_real = np.percentile(f1_real, 75)
iqr_real = q3_real - q1_real
mean_real = np.mean(f1_real)

# Print the summary statistics
print("Kennzahlen für f1-fooocus:")
print(f"Interquartilsintervall (Q1 - Q3): {q1_fooocus:.4f} - {q3_fooocus:.4f}")
print(f"Mittelwert: {mean_fooocus:.4f}")
print(f"Median: {median_fooocus:.4f}")
print(f"IQR: {iqr_fooocus:.4f}")

print("\nKennzahlen für f1-sdxl:")
print(f"Interquartilsintervall (Q1 - Q3): {q1_sdxl:.4f} - {q3_sdxl:.4f}")
print(f"Mittelwert: {mean_sdxl:.4f}")
print(f"Median: {median_sdxl:.4f}")
print(f"IQR: {iqr_sdxl:.4f}")

print("\nKennzahlen für f1-real:")
print(f"Interquartilsintervall (Q1 - Q3): {q1_real:.4f} - {q3_real:.4f}")
print(f"Mittelwert: {mean_real:.4f}")
print(f"Median: {median_real:.4f}")
print(f"IQR: {iqr_real:.4f}")

In [None]:
# Check the paired differences (f1-sdxl minus f1-fooocus, per participant) for a Gaussian
# distribution with the Shapiro-Wilk test.
diffs = f1s['f1-sdxl'] - f1s['f1-fooocus']
shapiro_statistic, shapiro_p_value = stats.shapiro(diffs)

print("Shapiro-Wilk Test Statistic:", shapiro_statistic)
print("P-Value:", shapiro_p_value)

In [None]:
# One-sided Wilcoxon signed-rank test on the paired samples; alternative='greater' tests
# whether f1-sdxl tends to be larger than f1-fooocus.
wilcoxon_result = stats.wilcoxon(f1s['f1-sdxl'], f1s['f1-fooocus'], alternative='greater')
print("statistic = {:.2f}, p-value = {:.4e}".format(wilcoxon_result.statistic, wilcoxon_result.pvalue))

In [None]:
# Effect size: mean of the paired differences divided by their sample standard deviation
# (ddof=1), i.e. a standardized mean difference of the pairs.
diffs = f1s['f1-sdxl'] - f1s['f1-fooocus']
mean_diff = np.mean(diffs)
std_diff = np.std(diffs, ddof=1)
d = mean_diff / std_diff
print(d)

### 5.6 Einfluss der Bildbereiche auf die Entscheidung

In [None]:
# Display the combined area-selection answers of both groups.
area_selections

In [5]:
# Split the combined answers into one frame per image and study group.
# The columns are walked in blocks of five; the code further down reads column 0 of such a
# block as the verdict, column 1 as the localisation question and columns 2-4 as the
# selected areas.
COLUMNS_PER_IMAGE = 5
# area_selections is group 1 stacked on top of group 2, so the group boundary is the number
# of group-1 rows (previously hard-coded as 100).
n_rows_group_1 = len(area_selections_g1)

area_selections_by_img_group_1 = []
area_selections_by_img_group_2 = []
for i in range(0, len(area_selections.columns), COLUMNS_PER_IMAGE):
    subset = area_selections.iloc[:, i:i + COLUMNS_PER_IMAGE]
    # Independent copies: later cells recode the verdict column in place, which must not
    # write into (or warn about) a slice of `area_selections`.
    subset1 = subset.iloc[:n_rows_group_1, 0:].copy()
    subset2 = subset.iloc[n_rows_group_1:, 0:].copy()
    area_selections_by_img_group_1.append(subset1)
    area_selections_by_img_group_2.append(subset2)

In [10]:
# Collect the paths of all regular files in the two study image folders, sorted by name.
# NOTE(review): the sort is lexicographic; the code below pairs these paths with the survey
# columns by position - confirm that both orders agree.
directory_1 = "img-dataset/study/study-group-1/imgs/"
directory_2 = "img-dataset/study/study-group-2/imgs/"

file_paths_1 = sorted(
    candidate
    for candidate in (os.path.join(directory_1, entry) for entry in os.listdir(directory_1))
    if os.path.isfile(candidate)
)

file_paths_2 = sorted(
    candidate
    for candidate in (os.path.join(directory_2, entry) for entry in os.listdir(directory_2))
    if os.path.isfile(candidate)
)

# Separate list objects with the same content.
img_paths_1 = list(file_paths_1)
img_paths_2 = list(file_paths_2)

### 5.6.1 Gesamtheatmaps erstellen

In [None]:
# Display the answers of group 1 for the first image.
area_selections_by_img_group_1[0]

In [None]:
# Recode the verdict column (column 0) of every per-image frame to booleans, in place.
# NOTE(review): classifying_categories further down compares this column against the raw
# strings 'KI generiert' / 'echt', so it depends on whether this cell has been run.
verdict_as_bool = {'KI generiert': True, 'echt': False}
for df in area_selections_by_img_group_1 + area_selections_by_img_group_2:
    df.iloc[:, 0] = df.iloc[:, 0].replace(verdict_as_bool)

In [78]:
def extract_from_end_until_underscore(input_string):
    """Return the part of ``input_string`` after its last underscore.

    If the string contains no underscore, the whole string is returned unchanged.
    """
    # rpartition yields ('', '', input_string) when there is no underscore, so the last
    # element is the right answer in both cases.
    _before, _underscore, after = input_string.rpartition('_')
    return after

def create_heatmap(df, img_number, image_path, group):
    """Draw and save the two-panel area-selection heatmap of one study image.

    The left panel counts the selections of participants who judged the image
    "KI generiert", the right panel those of all other participants. Both panels
    show a 3x3 grid (columns A-C, rows 1-3) on top of the image.

    NOTE(review): a function of the same name with an additional ``selection_type``
    parameter is defined further down in this notebook and replaces this one once
    that cell has been run.

    Parameters
    ----------
    df : pandas.DataFrame
        Answers for one image. Column 0 holds the verdict (either already recoded
        to True/False or the raw labels 'KI generiert'/'echt'); columns 2-4 hold,
        per grid row, the selected column letters as a comma-separated string
        (e.g. 'A, C') or NaN.
    img_number : int
        Zero-based image number; the saved file is numbered ``img_number + 1``.
    image_path : str
        Path of the image drawn underneath the heatmap.
    group : str
        Study group tag used in the output folder ``statistics/imgs/heatmaps-<group>``.
    """
    def voted_ai(verdict):
        # Accept the recoded booleans as well as the raw questionnaire label.
        if isinstance(verdict, str):
            return verdict == 'KI generiert'
        return isinstance(verdict, (bool, np.bool_)) and bool(verdict)

    img = mpimg.imread(image_path)
    extent = [0, 640, 480, 0]

    # Keyed by "voted AI?": True -> left panel, False -> right panel.
    counts = {True: np.zeros((3, 3)), False: np.zeros((3, 3))}
    valid_decisions = {True: 0, False: 0}

    verdicts = df.iloc[:, 0]
    selections = df.iloc[:, 2:5]
    # Positional access throughout: integer keys on label-indexed rows are deprecated in
    # recent pandas versions.
    for position in range(len(df)):
        answers = selections.iloc[position]
        if answers.isna().all():
            continue  # this participant did not mark any area
        is_ai = voted_ai(verdicts.iloc[position])
        valid_decisions[is_ai] += 1
        for row_idx in range(3):
            cell = answers.iloc[row_idx]
            if pd.isna(cell):
                continue
            # Tolerate 'A, B' as well as 'A,B'.
            for selection in cell.split(','):
                selection = selection.strip()
                if selection:
                    counts[is_ai][row_idx, ord(selection) - ord('A')] += 1

    fig, axes = plt.subplots(1, 2, figsize=(12.8, 4.8))
    panels = [
        (axes[0], True, 'ai', 'red', '"KI-generiert"'),
        (axes[1], False, 'real', 'green', '"echt"'),
    ]
    for ax, key, suffix, colour, title in panels:
        ax.imshow(img, extent=extent)
        cmap = LinearSegmentedColormap.from_list(f"mycmap_{suffix}", ["white", colour])
        if valid_decisions[key] > 0:
            overlay = ax.imshow(counts[key], interpolation='nearest', cmap=cmap, extent=extent, alpha=0.5)
        else:
            # Nothing to show: fully transparent overlay so that only the image is visible.
            overlay = ax.imshow(np.zeros_like(counts[key]), interpolation='nearest', cmap=cmap, extent=extent, alpha=0)

        cbar = fig.colorbar(overlay, ax=ax, fraction=0.046, pad=0.04)
        cbar.set_label('Anzahl der Entscheidungen', fontsize=14)

        ax.set_title(title, fontsize=16)
        # Ticks at the centres of the three grid columns / rows.
        ax.set_xticks(np.linspace(640 // 6, 640 - 640 // 6, 3))
        ax.set_yticks(np.linspace(480 // 6, 480 - 480 // 6, 3))
        ax.set_xticklabels(['A', 'B', 'C'])
        ax.set_yticklabels(['1', '2', '3'])
        ax.set_xlim(0, 640)
        ax.set_ylim(480, 0)
        ax.text(0.5, -0.1, f'n = {valid_decisions[key]}', ha='center', transform=ax.transAxes, fontsize=12)

    plt.suptitle(f'Auswahl der Bildbereiche für {extract_from_end_until_underscore(image_path)}', fontsize=20)

    plt.subplots_adjust(bottom=0.2)

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    out_dir = f'statistics/imgs/heatmaps-{group}'
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(f'{out_dir}/heatmap_{img_number + 1}.jpg')
    plt.show()
    # Release the figure; this function is called once per image in a loop.
    plt.close(fig)

In [None]:
# Heatmaps for study group 2. The selections of group 2 belong to the images of group 2,
# so they are paired with img_paths_2 (img_paths_1 was used here by mistake).
for index, img in enumerate(area_selections_by_img_group_2):
    create_heatmap(img, index, img_paths_2[index], 'g2')

### 5.6.2 Overlap Probandenheatmaps und Detection-Model-Heatmaps

In [28]:
def get_heatmaps(area_selections, img_paths):
    """Count the selected image areas per image, separately for "AI" and "real" verdicts.

    The input frames are not modified.

    Parameters
    ----------
    area_selections : sequence of pandas.DataFrame
        One frame per image. Column 0 holds the verdict (raw labels
        'KI generiert'/'echt' or already recoded True/False); columns 2-4 hold,
        per grid row, the selected column letters as a comma-separated string
        (e.g. 'A, C') or NaN.
    img_paths : sequence of str
        Image path for each frame, in the same order.

    Returns
    -------
    list of (str, numpy.ndarray, numpy.ndarray)
        Per image: the image path, the 3x3 selection counts of participants who
        voted "AI", and the 3x3 counts of all other participants. Array rows are
        the grid rows, array columns the letters A-C.
    """
    def voted_ai(verdict):
        # Accept the raw questionnaire label as well as the recoded booleans, so the
        # caller's frames do not have to be recoded (and mutated) first.
        if isinstance(verdict, str):
            return verdict == 'KI generiert'
        return isinstance(verdict, (bool, np.bool_)) and bool(verdict)

    heatmaps = []
    for img_number, df in enumerate(area_selections):
        image_path = img_paths[img_number]

        counts_ai = np.zeros((3, 3))
        counts_real = np.zeros((3, 3))

        verdicts = df.iloc[:, 0]
        selections = df.iloc[:, 2:5]
        # Positional access throughout: integer keys on label-indexed rows are deprecated
        # in recent pandas versions.
        for position in range(len(df)):
            answers = selections.iloc[position]
            if answers.isna().all():
                continue  # this participant did not mark any area
            target = counts_ai if voted_ai(verdicts.iloc[position]) else counts_real
            for row_idx in range(3):
                cell = answers.iloc[row_idx]
                if pd.isna(cell):
                    continue
                # Tolerate 'A, B' as well as 'A,B'.
                for selection in cell.split(','):
                    selection = selection.strip()
                    if selection:
                        target[row_idx, ord(selection) - ord('A')] += 1

        heatmaps.append((image_path, counts_ai, counts_real))
    return heatmaps

In [None]:
# Selection counts of the participants, per image and group.
heatmaps_g1 = get_heatmaps(area_selections_by_img_group_1, img_paths_1)
heatmaps_g2 = get_heatmaps(area_selections_by_img_group_2, img_paths_2)
# Heatmaps of the detection model, produced elsewhere and stored as pickles.
# NOTE(review): pickle.load executes arbitrary code from the file - only load trusted files.
with open('./detection_model/model_heatmaps_g1.pkl', 'rb') as f:
    model_heatmaps_g1 = pickle.load(f)
with open('./detection_model/model_heatmaps_g2.pkl', 'rb') as f:
    model_heatmaps_g2 = pickle.load(f)

# Group 1 followed by group 2 in both lists; the next cell pairs the entries by position.
study_heatmaps = heatmaps_g1 + heatmaps_g2
model_heatmaps = model_heatmaps_g1 + model_heatmaps_g2

In [None]:
# Spearman rank correlation between the participants' and the model's heatmap, per image.
# Element 1 of each entry is compared for the "AI" side and element 2 for the "real" side;
# the model values enter as absolute values.
correlations_ai, p_values_ai = [], []
correlations_real, p_values_real = [], []
for study_heatmap, model_heatmap in zip(study_heatmaps, model_heatmaps):
    correlation_ai, p_value_ai = spearmanr(study_heatmap[1].flatten(), abs(model_heatmap[1].flatten()))
    correlation_real, p_value_real = spearmanr(study_heatmap[2].flatten(), abs(model_heatmap[2].flatten()))
    correlations_ai.append(correlation_ai)
    p_values_ai.append(p_value_ai)
    correlations_real.append(correlation_real)
    p_values_real.append(p_value_real)

correlations_ai = np.array(correlations_ai)
p_values_ai = np.array(p_values_ai)
correlations_real = np.array(correlations_real)
p_values_real = np.array(p_values_real)

# Averages over all images, ignoring NaN results.
mean_correlation_ai = np.nanmean(correlations_ai)
mean_correlation_real = np.nanmean(correlations_real)
mean_p_value_ai = np.nanmean(p_values_ai)
mean_p_value_real = np.nanmean(p_values_real)

print(mean_correlation_ai, mean_p_value_ai)
print(mean_correlation_real, mean_p_value_real)

### 5.6.3 Heatmaps nach Confusion-Matrix-Kategorie

In [None]:
def classifying_categories(solutions, area_selections):
    """Sort the per-image answers into confusion-matrix categories.

    Parameters
    ----------
    solutions : sequence of mappings
        One entry per image; ``entry['generated'] == 1`` marks an AI-generated image.
    area_selections : sequence of pandas.DataFrame
        Answers per image, in the same order as ``solutions``. Column 0 holds the
        participant's verdict, either as the raw labels 'KI generiert'/'echt' or
        recoded to True/False (other cells of this notebook recode it in place).

    Returns
    -------
    tuple of six lists of DataFrames
        ``(positives_ai, positives_real, true_positives_ai, false_negatives_ai,
        true_positives_real, false_negatives_real)``. The first two hold the
        unfiltered frames of the AI / real images; the other four hold, per image,
        only the rows with the matching verdict. Rows whose verdict is neither
        "AI" nor "real" appear in none of the filtered frames.
    """
    def verdict_of(value):
        # Normalise both encodings of the verdict column to 'ai' / 'real' / None.
        if isinstance(value, str):
            return {'KI generiert': 'ai', 'echt': 'real'}.get(value)
        if isinstance(value, (bool, np.bool_)):
            return 'ai' if value else 'real'
        return None

    def rows_with_verdict(df, wanted):
        verdicts = [verdict_of(value) for value in df.iloc[:, 0]]
        # Boolean array -> positional row selection.
        return df[np.array([verdict == wanted for verdict in verdicts], dtype=bool)]

    positives_ai = []
    positives_real = []
    for index, img in enumerate(solutions):
        if img['generated'] == 1:
            positives_ai.append(area_selections[index])
        else:
            positives_real.append(area_selections[index])

    true_positives_ai = [rows_with_verdict(df, 'ai') for df in positives_ai]
    false_negatives_ai = [rows_with_verdict(df, 'real') for df in positives_ai]
    true_positives_real = [rows_with_verdict(df, 'real') for df in positives_real]
    false_negatives_real = [rows_with_verdict(df, 'ai') for df in positives_real]
    return positives_ai, positives_real, true_positives_ai, false_negatives_ai, true_positives_real, false_negatives_real

positives_ai_g1, positives_real_g1, true_positives_ai_g1, false_negatives_ai_g1, true_positives_real_g1, false_negatives_real_g1 = classifying_categories(solutions_g1, area_selections_by_img_group_1)
positives_ai_g2, positives_real_g2, true_positives_ai_g2, false_negatives_ai_g2, true_positives_real_g2, false_negatives_real_g2 = classifying_categories(solutions_g2, area_selections_by_img_group_2)
# How many decisions were localised? (counted over the true positives of the AI images)
decisions = 0
spatial_decisions = 0
general_decision = 0
unsure_decisions = 0
wrong_labelled_decisions = 0
for group in [true_positives_ai_g1, true_positives_ai_g2]:
    for img in group:
        for _, answers in img.iterrows():
            decisions += 1
            # Positional access via .iloc: column 1 is the localisation question and
            # columns 2-4 are the selected areas. Integer keys on a label-indexed row
            # are deprecated in recent pandas versions.
            localisation_answer = answers.iloc[1]
            marked_an_area = answers.iloc[2:5].notna().any()
            if localisation_answer == 'Nein, es war ein Gesamteindruck.':
                general_decision += 1
            if localisation_answer == 'Ich bin mir nicht sicher.':
                unsure_decisions += 1
            if marked_an_area:
                spatial_decisions += 1
                # An area was marked although the answer says no area could be named.
                if localisation_answer == 'Nein, es war ein Gesamteindruck.' or localisation_answer == 'Ich bin mir nicht sicher.':
                    wrong_labelled_decisions += 1
            else:
                # No area was marked although the answer announces one.
                if localisation_answer == 'Ja, ich gebe ihn unten an.':
                    wrong_labelled_decisions += 1

# Share of localised decisions; NaN instead of a ZeroDivisionError when nothing was counted.
spatial_share = spatial_decisions / decisions if decisions else float('nan')
decisions, spatial_decisions, spatial_share, general_decision, unsure_decisions, wrong_labelled_decisions

In [None]:
from matplotlib.colors import LinearSegmentedColormap
import matplotlib.image as mpimg

def create_heatmap(df, img_number, image_path, group, selection_type):
    """Draw and save a single-panel area-selection heatmap for one image.

    NOTE(review): this definition replaces the earlier four-parameter
    ``create_heatmap`` of this notebook; calls written for that version fail once
    this cell has been run.

    Parameters
    ----------
    df : pandas.DataFrame
        Only the three area-selection columns of one image (one per grid row);
        each cell is a comma-separated string of column letters such as 'A, C', or NaN.
    img_number : int
        Zero-based image number; titles and file names use ``img_number + 1``.
    image_path : str
        Path of the image drawn underneath the heatmap.
    group : str
        Study group tag used in the output folder ``statistics/imgs/heatmaps-<group>``.
    selection_type : str
        Prefix inserted directly before the file name, e.g. 'false-negatives-ai/'.
    """
    img = mpimg.imread(image_path)
    counts = np.zeros((3, 3))
    valid_decisions = 0

    # Positional access throughout: integer keys on label-indexed rows are deprecated in
    # recent pandas versions.
    for position in range(len(df)):
        answers = df.iloc[position]
        if answers.isna().all():
            continue  # this participant did not mark any area
        valid_decisions += 1
        for row_idx in range(3):
            cell = answers.iloc[row_idx]
            if pd.isna(cell):
                continue
            # Tolerate 'A, B' as well as 'A,B'.
            for selection in cell.split(','):
                selection = selection.strip()
                if selection:
                    counts[row_idx, ord(selection) - ord('A')] += 1

    fig, ax = plt.subplots(figsize=(6.4, 4.8))
    ax.imshow(img, extent=[0, 640, 480, 0])
    cmap = LinearSegmentedColormap.from_list("mycmap", ["white", "red"])
    cax = ax.imshow(counts, interpolation='nearest', cmap=cmap, extent=[0, 640, 480, 0], alpha=0.5)
    fig.colorbar(cax)

    ax.set_title(f'Heatmap für Bild {img_number + 1}')
    # Ticks at the centres of the three grid columns / rows.
    ax.set_xticks(np.linspace(640 // 6, 640 - 640 // 6, 3))
    ax.set_yticks(np.linspace(480 // 6, 480 - 480 // 6, 3))
    ax.set_xticklabels(['A', 'B', 'C'])
    ax.set_yticklabels(['1', '2', '3'])
    ax.set_xlim(0, 640)
    ax.set_ylim(480, 0)
    plt.figtext(0.5, 0.01, f'Anzahl der Entscheidungen: {valid_decisions}', ha='center', fontsize=12)

    out_path = f'statistics/imgs/heatmaps-{group}/{selection_type}heatmap_{img_number + 1}.jpg'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    plt.savefig(out_path)
    plt.show()
    # Release the figure; this function is called once per image in a loop.
    plt.close(fig)

# Heatmaps of the false negatives on AI images for study group 1.
# `false_negatives_ai` is not defined anywhere above; the list for group 1 is
# `false_negatives_ai_g1`, which matches the img_paths_1 / 'g1' arguments used below.
for i, img in enumerate(false_negatives_ai_g1):
    # The image number is taken from the digits in the name of the first column
    # (presumably the questionnaire numbers its questions per image - TODO confirm).
    img_index = int(''.join(filter(str.isdigit, img.columns[0]))) - 1
    # Keep only the three area-selection columns.
    img = img.iloc[:, 2:5]
    create_heatmap(img, img_index, img_paths_1[img_index], 'g1', 'false-negatives-ai/')