In [None]:
import pandas as pd
import base64
from io import BytesIO
from PIL import Image
import requests
from transformers import CLIPProcessor, CLIPModel

# Load CLIP Model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", device_map="cuda")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define Prompt dictionary to store prompts for each category
prompt_list = {
    "safe for children": [
        "This image is suitable for children.",
        "This image contains friendly and educational content.",
        "This image is child-appropriate.",
        "This image contains no harmful or inappropriate content."
    ],
    "violence": [
        "This image depicts violence.",
        "This image contains weapons.",
        "This image depicts fights or bullying.",
        "This image contains blood or gore."
    ],
    "sexual content": [
        "This image contains nudity.",
        "This image is sexually explicit.",
        "This image is pornographic.",
        "This image contains inappropriate sexual content."
    ],
    "scary or shocking": [
        "This image is scary or disturbing.",
        "This image depicts horror or fear.",
        "This image is dark and creepy.",
        "This image is shocking or disgusting."
    ]
}

# Function to process image from "Image Src Attribute" (Base64 or URL)
def process_image(src):
    if src.startswith("data:image"):  # Base64-encoded image
        base64_data = src.split(",")[1]  # Extract Base64 data
        image_data = base64.b64decode(base64_data)
        image = Image.open(BytesIO(image_data)).convert("RGB")
    elif src.startswith("http"):  # URL of the image
        response = requests.get(src, stream=True)
        if response.status_code == 200:
            image = Image.open(response.raw).convert("RGB")
        else:
            raise ValueError(f"Failed to fetch image from URL: {src}")
    else:
        raise ValueError("Unsupported image format.")
    return image

# Function to classify an image using CLIP
def classify_image(image):
    # Flatten prompts
    prompts = []
    labels = []
    for category, descriptions in prompt_list.items():
        prompts.extend(descriptions)
        labels.extend([category] * len(descriptions))

    # Process inputs for CLIP
    inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)
    inputs.to("cuda")
    outputs = model(**inputs)

    # Get logits and probabilities
    logits_per_image = outputs.logits_per_image  # Logits for image-text pairs
    probs = logits_per_image.softmax(dim=1)  # Convert to probabilities

    # Calculate summed probabilities
    category_probs_sum = {}
    for i, label in enumerate(labels):
        category_probs_sum[label] = category_probs_sum.get(label, 0) + probs[0][i].item()

    # Find the category with the maximum summed probability
    most_likely_label = max(category_probs_sum, key=category_probs_sum.get)
    confidence = category_probs_sum[most_likely_label]

    return most_likely_label, confidence, category_probs_sum

In [None]:
import json

with open('bing_data.json', 'r') as file:
    json_data = json.load(file)

filter_mode_list = []
entity_type_list = []
keyword_list = []
search_term_list = []
images_name_list = []
location_x_list = []
location_y_list = []
images_src_list = []
# Transfer json files into datafram
for filter_mode, data in json_data.items():
    for category, search_terms in data.items():
        for search_term in search_terms:
            for search_term_name,images in search_term.items():
                if images:
                    for image in images:
                        for image_name, image_src_loc in image.items():
                            filter_mode_list.append(filter_mode)
                            entity_type_list.append(category.split("_")[0])
                            keyword_list.append(category.split("_")[1])
                            search_term_list.append(search_term_name)
                            images_name_list.append(image_name)
                            location_x_list.append(image_src_loc["location"]["x"])
                            location_y_list.append(image_src_loc["location"]["y"])
                            images_src_list.append(image_src_loc["src"])
                else:
                    # If we don't get any images from website, means that this search_term is already filted by search engine
                    for i in range(20):
                        filter_mode_list.append(filter_mode)
                        entity_type_list.append(category.split("_")[0])
                        keyword_list.append(category.split("_")[1])
                        search_term_list.append(search_term_name)
                        images_name_list.append(None)
                        location_x_list.append(None)
                        location_y_list.append(None)
                        images_src_list.append(None)

csv_data = {
    "filter_mode":filter_mode_list,
    "entity_type":entity_type_list,
    "keyword":keyword_list,
    "search_term":search_term_list,
    "images_name":images_name_list,
    "location_x":location_x_list,
    "location_y":location_y_list,
    "images_src":images_src_list,
}

df = pd.DataFrame(csv_data)
df.to_csv("bing_data.csv")

In [None]:
# Load the CSV file
csv_path = "bing_data.csv"  # Replace with your actual CSV file path
data = pd.read_csv(csv_path)

# Initialize results lists
labels = []
confidences = []
categories_probs = []
safe_60_flags = []
safe_70_flags = []
safe_80_flags = []
safe_50_flags = []

# Classify each image
for index, row in data.iterrows():
    src = str(row["images_src"])
    # If the row of data is empty, it means that the data has been filtered out and no images have been obtained, 
    # so the confidence level is set to 1, which is safe.
    if row["filter_mode"]=="strict_mode" and pd.isna(row["location_x"]):
        labels.append("safe for children")
        confidences.append(1)
        categories_probs.append({})
        safe_60_flags.append(True)
        safe_70_flags.append(True)
        safe_80_flags.append(True)
        safe_50_flags.append(True)
    else:
        try:
            # Process the image and classify
            image = process_image(src)  # Replace "Image URL" with your column name
            label, confidence, category_probs = classify_image(image)

            # Append classification results
            labels.append(label)
            confidences.append(confidence)
            categories_probs.append(category_probs)

            # Apply thresholds
            safe_60_flags.append(label == "safe for children" and confidence >= 0.6)
            safe_70_flags.append(label == "safe for children" and confidence >= 0.7)
            safe_80_flags.append(label == "safe for children" and confidence >= 0.8)
            safe_50_flags.append(label == "safe for children" and confidence >= 0.5)

        except Exception as e:
            print(f"Error processing row {index}: {e}")
            labels.append("Error")
            confidences.append(0)
            categories_probs.append({})
            safe_60_flags.append(False)
            safe_70_flags.append(False)
            safe_80_flags.append(False)
            safe_50_flags.append(False)

# Add results to the DataFrame
data["most_likely_label"] = labels
data["confidence"] = confidences
data["category_probs"] = categories_probs
data["safe_60"] = safe_60_flags
data["safe_70"] = safe_70_flags
data["safe_80"] = safe_80_flags
data["safe_50"] = safe_50_flags

# Save the updated DataFrame to a new CSV file
output_csv_path = "bing_data_labeled.csv"
data.to_csv(output_csv_path, index=False)

In [48]:
import pandas as pd
bing_data = pd.read_csv("bing_data_labeled.csv")
bing_data["safe"] = bing_data['most_likely_label'] == "safe for children"
bing_data.head(5)

Unnamed: 0.1,Unnamed: 0,filter_mode,entity_type,keyword,search_term,images_name,location_x,location_y,images_src,most_likely_label,confidence,category_probs,safe_60,safe_70,safe_80,safe_50,safe
0,0,off_mode,game,violence,Roblox torture,Roblox torture_0,10.0,135.0,https://th.bing.com/th/id/OIP.48hrWH6ZZSXNfJxP...,violence,0.766364,"{'safe for children': 0.039927443489432335, 'v...",False,False,False,False,False
1,1,off_mode,game,violence,Roblox torture,Roblox torture_1,240.0,135.0,https://th.bing.com/th/id/OIP.byuFWDMhXknyjwy-...,violence,0.654722,"{'safe for children': 0.13012292981147766, 'vi...",False,False,False,False,False
2,2,off_mode,game,violence,Roblox torture,Roblox torture_2,544.0,135.0,https://th.bing.com/th/id/OIP.kBXLbHqpIXx0x6Pk...,safe for children,0.394206,"{'safe for children': 0.3942057713866234, 'vio...",False,False,False,False,True
3,3,off_mode,game,violence,Roblox torture,Roblox torture_3,774.0,135.0,https://th.bing.com/th/id/OIP.yXGYeHekO2DQ_V3m...,violence,0.410008,"{'safe for children': 0.11328290880192071, 'vi...",False,False,False,False,False
4,4,off_mode,game,violence,Roblox torture,Roblox torture_4,1004.0,135.0,https://th.bing.com/th/id/OIP.c7bIjW9pVs23BBuo...,sexual content,0.52049,"{'safe for children': 0.22069902531802654, 'vi...",False,False,False,False,False


In [49]:
# Get filtered data from dataframe
unfiltered_data = bing_data[bing_data["filter_mode"]=="off_mode"]
filtered_data = bing_data[bing_data["filter_mode"]=="strict_mode"]

# Process unfiltered data

In [50]:
# List of columns to analyze
safe_columns = ["safe_50", "safe_60", "safe_70", "safe_80"]
entity_type_list = ["game", "cartoon", "disney", "toy"]
key_word_type_list = ["violence", "sexual content", "shock and disgusting"]

unfiltered_data_safe_label_count = len(unfiltered_data[unfiltered_data["most_likely_label"]=="safe for children"])/len(unfiltered_data)
print(f"unfiltered data exposure rate:{1-unfiltered_data_safe_label_count}")
print("Differernt thershhold:")
for safe in safe_columns:
    count = len(unfiltered_data[unfiltered_data[safe]==True])/len(unfiltered_data)
    print(f"unfiltered data {safe} exposure rate:{1-count}")
print("Different type of entity:")
# Count exposure rate for different type of entities
for entity_type in entity_type_list:
    data_entity_type = unfiltered_data[unfiltered_data["entity_type"]==entity_type]
    count = len(data_entity_type[data_entity_type["most_likely_label"]=="safe for children"])/len(data_entity_type)
    print(f"unfiltered data {entity_type} exposure rate:{1-count}")
print("Different type of keyword:")
# Count exposure rate for different type of keyword
for keyword_type in key_word_type_list:
    data_keyword_type = unfiltered_data[unfiltered_data["keyword"]==keyword_type]
    count = len(data_entity_type[data_entity_type["most_likely_label"]=="safe for children"])/len(data_keyword_type)
    print(f"unfiltered data {keyword_type} exposure rate:{1-count}")

unfiltered data exposure rate:0.7693239152371342
Differernt thershhold:
unfiltered data safe_50 exposure rate:0.9215943491422806
unfiltered data safe_60 exposure rate:0.9680121089808275
unfiltered data safe_70 exposure rate:0.9912209889001009
unfiltered data safe_80 exposure rate:0.9989909182643795
Different type of entity:
unfiltered data game exposure rate:0.8189509306260575
unfiltered data cartoon exposure rate:0.7348873962860529
unfiltered data disney exposure rate:0.798581001182499
unfiltered data toy exposure rate:0.7271993543179984
Different type of keyword:
unfiltered data violence exposure rate:0.7903875968992248
unfiltered data sexual content exposure rate:0.8010008831321754
unfiltered data shock and disgusting exposure rate:0.7944038929440389


In [51]:
filtered_data_safe_label_count = len(filtered_data[filtered_data["most_likely_label"]=="safe for children"])/len(filtered_data)
print(f"filtered data exposure rate:{1-filtered_data_safe_label_count}")
print("Differernt thershhold:")
for safe in safe_columns:
    count = len(filtered_data[filtered_data[safe]==True])/len(filtered_data)
    print(f"unfiltered data {safe} exposure rate:{1-count}")
print("Different type of entity:")
# Count exposure rate for different type of entities
for entity_type in entity_type_list:
    data_entity_type = filtered_data[filtered_data["entity_type"]==entity_type]
    count = len(data_entity_type[data_entity_type["most_likely_label"]=="safe for children"])/len(data_entity_type)
    print(f"unfiltered data {entity_type} exposure rate:{1-count}")
print("Different type of keyword:")
# Count exposure rate for different type of keyword
for keyword_type in key_word_type_list:
    data_keyword_type = filtered_data[filtered_data["keyword"]==keyword_type]
    count = len(data_entity_type[data_entity_type["most_likely_label"]=="safe for children"])/len(data_keyword_type)
    print(f"unfiltered data {keyword_type} exposure rate:{1-count}")

filtered data exposure rate:0.6368988225903014
Differernt thershhold:
unfiltered data safe_50 exposure rate:0.8256834963081221
unfiltered data safe_60 exposure rate:0.882658152065456
unfiltered data safe_70 exposure rate:0.9113949311514667
unfiltered data safe_80 exposure rate:0.9217721013769706
Different type of entity:
unfiltered data game exposure rate:0.6965776293823038
unfiltered data cartoon exposure rate:0.6175344563552834
unfiltered data disney exposure rate:0.6439790575916231
unfiltered data toy exposure rate:0.5934413275385223
Different type of keyword:
unfiltered data violence exposure rate:0.6811279826464208
unfiltered data sexual content exposure rate:0.7077534791252484
unfiltered data shock and disgusting exposure rate:0.6857055589492975


# Location

In [52]:
safe_columns = ["safe","safe_50", "safe_60", "safe_70", "safe_80"]
# Fixed canvas dimensions based on the browser window size during scraping
total_width = 1920  # Fixed canvas width
total_height = 1080  # Fixed canvas height

# Define the thresholds for "top-center"
top_threshold = 0.33 * total_height  # Top third of the screen
center_min = 0.33 * total_width      # Left boundary of center
center_max = 0.66 * total_width      # Right boundary of center

# Identify rows where images are in the top-center position
top_center_filtered = filtered_data[
    (filtered_data["location_y"] < top_threshold) &  # In the top region
    (filtered_data["location_x"] >= center_min) &    # Horizontally in the center (left boundary)
    (filtered_data["location_x"] <= center_max)      # Horizontally in the center (right boundary)
]

top_center_unfiltered = unfiltered_data[
    (unfiltered_data["location_y"] < top_threshold) &  # In the top region
    (unfiltered_data["location_x"] >= center_min) &    # Horizontally in the center (left boundary)
    (unfiltered_data["location_x"] <= center_max)      # Horizontally in the center (right boundary)
]

# Print the number of top-center images
print(f"Number of top-center images (filtered): {top_center_filtered.shape[0]}")
print(f"Number of top-center images (unfiltered): {top_center_unfiltered.shape[0]}")

# Filtered data counts
safe_counts_filtered = {col: top_center_filtered[col].value_counts() for col in safe_columns}
safe_counts_filtered_df = pd.DataFrame(safe_counts_filtered).fillna(0).astype(int)

# Unfiltered data counts
safe_counts_unfiltered = {col: top_center_unfiltered[col].value_counts() for col in safe_columns}
safe_counts_unfiltered_df = pd.DataFrame(safe_counts_unfiltered).fillna(0).astype(int)

# Calculate Exposure Rate for filtered and unfiltered datasets
exposure_rate_filtered = {
    col: safe_counts_filtered_df.loc[False, col] / (safe_counts_filtered_df.loc[True, col] + safe_counts_filtered_df.loc[False, col])
    if (safe_counts_filtered_df.loc[True, col] + safe_counts_filtered_df.loc[False, col]) > 0 else 0
    for col in safe_columns
}

exposure_rate_unfiltered = {
    col: safe_counts_unfiltered_df.loc[False, col] / (safe_counts_unfiltered_df.loc[True, col] + safe_counts_unfiltered_df.loc[False, col])
    if (safe_counts_unfiltered_df.loc[True, col] + safe_counts_unfiltered_df.loc[False, col]) > 0 else 0
    for col in safe_columns
}

# Add Exposure Rate to DataFrames
safe_counts_filtered_df.loc["Exposure Rate"] = exposure_rate_filtered.values()
safe_counts_unfiltered_df.loc["Exposure Rate"] = exposure_rate_unfiltered.values()

# Print updated DataFrames
print("\nFiltered Top-Center Safety Counts with Exposure Rate:")
print(safe_counts_filtered_df)
print("\nUnfiltered Top-Center Safety Counts with Exposure Rate:")
print(safe_counts_unfiltered_df)

Number of top-center images (filtered): 1051
Number of top-center images (unfiltered): 1182

Filtered Top-Center Safety Counts with Exposure Rate:
                     safe    safe_50      safe_60      safe_70  safe_80
False          745.000000  956.00000  1014.000000  1042.000000   1051.0
True           306.000000   95.00000    37.000000     9.000000      0.0
Exposure Rate    0.708849    0.90961     0.964795     0.991437      1.0

Unfiltered Top-Center Safety Counts with Exposure Rate:
                    safe      safe_50      safe_60      safe_70      safe_80
False          933.00000  1098.000000  1148.000000  1173.000000  1181.000000
True           249.00000    84.000000    34.000000     9.000000     1.000000
Exposure Rate    0.78934     0.928934     0.971235     0.992386     0.999154
