<a href="https://colab.research.google.com/github/michellecxy015/search-engines/blob/main/code/google_clip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pandas torch torchvision transformers Pillow



In [1]:
import base64
from io import BytesIO

import pandas as pd
import requests
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [14]:

# One checkpoint id feeds both the CLIP weights and the matching processor.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Category label -> text prompts describing that category.
# classify_image() flattens these prompts in this order and sums the
# per-prompt probabilities back into one score per category.
categories = {
    "safe for children": [
        "This image is suitable for children.",
        "This image contains friendly and educational content.",
        "This image is child-appropriate.",
        "This image contains no harmful or inappropriate content."
    ],
    "violence": [
        "This image depicts violence.",
        "This image contains weapons.",
        "This image depicts fights or bullying.",
        "This image contains blood or gore."
    ],
    "sexual content": [
        "This image contains nudity.",
        "This image is sexually explicit.",
        "This image is pornographic.",
        "This image contains inappropriate sexual content."
    ],
    "scary or shocking": [
        "This image is scary or disturbing.",
        "This image depicts horror or fear.",
        "This image is dark and creepy.",
        "This image is shocking or disgusting."
    ]
}

# Function to process image from "Image URL" (Base64 or HTTP URL)
def process_image(src, timeout=30):
    """Load an image from a Base64 data URI or an HTTP(S) URL as an RGB PIL image.

    Parameters
    ----------
    src : str
        Either a ``data:image...`` URI whose payload follows the first comma,
        or an ``http://`` / ``https://`` URL.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a single
        unresponsive host cannot stall a long classification run.

    Returns
    -------
    PIL.Image.Image
        The decoded image converted to RGB.

    Raises
    ------
    ValueError
        If ``src`` is not a string, is a data URI without a payload, is an
        unsupported scheme, or the URL does not answer with HTTP 200.
    """
    # Missing cells read by pandas arrive as float NaN, not str.
    if not isinstance(src, str):
        raise ValueError(f"Unsupported image source type: {type(src).__name__}")

    if src.startswith("data:image"):  # Base64-encoded image
        _header, separator, base64_data = src.partition(",")
        if not separator:
            raise ValueError("Malformed data URI: missing Base64 payload.")
        image_data = base64.b64decode(base64_data)
        return Image.open(BytesIO(image_data)).convert("RGB")

    if src.startswith(("http://", "https://")):  # URL of the image
        response = requests.get(src, timeout=timeout)
        if response.status_code != 200:
            raise ValueError(f"Failed to fetch image from URL: {src}")
        # response.content is fully read and transfer-decoded, unlike the raw
        # socket stream, and leaves no open connection behind.
        return Image.open(BytesIO(response.content)).convert("RGB")

    raise ValueError("Unsupported image format.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [35]:

# Function to classify an image using CLIP
def classify_image(image):
    """Classify one image against the prompts in the global ``categories`` dict.

    Every prompt of every category is scored against the image with CLIP; the
    softmax probabilities of the prompts belonging to the same category are
    summed, and the category with the largest sum wins.

    Parameters
    ----------
    image
        Image accepted by the CLIP processor (e.g. the RGB PIL image returned
        by ``process_image``).

    Returns
    -------
    tuple
        ``(most_likely_label, confidence, category_probs_sum)`` where
        ``confidence`` is the summed probability of the winning category and
        ``category_probs_sum`` maps each category to its summed probability.
    """
    # Flatten prompts, remembering which category each one belongs to.
    prompts = []
    labels = []
    for category, descriptions in categories.items():
        prompts.extend(descriptions)
        labels.extend([category] * len(descriptions))

    inputs = processor(text=prompts, images=image, return_tensors="pt", padding=True)

    # Inference only: skip autograd bookkeeping so no graph is built per image.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over the prompt axis, then take the single image's row as floats.
    probs = outputs.logits_per_image.softmax(dim=1)[0].tolist()

    # Sum prompt probabilities per category.
    category_probs_sum = {}
    for label, prob in zip(labels, probs):
        category_probs_sum[label] = category_probs_sum.get(label, 0) + prob

    most_likely_label = max(category_probs_sum, key=category_probs_sum.get)
    confidence = category_probs_sum[most_likely_label]

    return most_likely_label, confidence, category_probs_sum

In [7]:
# Source metadata CSV, destination for the cleaned copy, and the columns
# that must all be non-zero for a row to be kept.
csv_path = "/content/drive/MyDrive/filter_image_metadata.csv"  # Replace with your actual CSV file path
output_csv_path = "/content/drive/MyDrive/filtered_data_cleaned.csv"
columns_to_check = ["X", "Y", "Width", "Height"]

data = pd.read_csv(csv_path)

# A row survives only if none of the checked columns equals 0.
filtered_data = data.loc[data[columns_to_check].ne(0).all(axis=1)]

# Report how many rows the filter dropped.
rows_removed = len(data) - len(filtered_data)
print(f"Number of rows removed: {rows_removed}")

# Persist the cleaned rows without the pandas index.
filtered_data.to_csv(output_csv_path, index=False)
print(f"Filtered data saved to {output_csv_path}.")


Number of rows removed: 185
Filtered data saved to /content/drive/MyDrive/filtered_data_cleaned.csv.


In [None]:

# Load the cleaned metadata CSV.
csv_path = "/content/drive/MyDrive/filtered_data_cleaned.csv"  # Replace with your actual CSV file path
data = pd.read_csv(csv_path)

SAFE_LABEL = "safe for children"
# Flag column -> minimum confidence required for the "safe" label.
# The key order is the column order in the saved CSV.
SAFE_THRESHOLDS = {"safe_60": 0.6, "safe_70": 0.7, "safe_80": 0.8, "safe_50": 0.5}

# One list per output column; every row appends to all of them exactly once,
# so the columns cannot drift out of step between the success and error paths.
results = {"most_likely_label": [], "confidence": [], "category_probs": []}
results.update({column: [] for column in SAFE_THRESHOLDS})
failed_rows = 0

# Classify each image
for i, row in data.iterrows():
    try:
        image = process_image(row["Image URL"])  # Replace "Image URL" with your column name
        label, confidence, category_probs = classify_image(image)
    except Exception as e:
        # Best effort: a broken image must not abort the whole run. The row is
        # kept with an "Error" label, which also makes every safe_* flag False.
        print(f"Error processing row {i}: {e}")
        label, confidence, category_probs = "Error", 0, {}
        failed_rows += 1

    results["most_likely_label"].append(label)
    results["confidence"].append(confidence)
    results["category_probs"].append(category_probs)
    for column, threshold in SAFE_THRESHOLDS.items():
        results[column].append(label == SAFE_LABEL and confidence >= threshold)

# Make systematic failures visible instead of leaving them in per-row messages.
print(f"Rows that could not be classified: {failed_rows} of {len(data)}")

# Add results to the DataFrame
for column, values in results.items():
    data[column] = values

# Save the updated DataFrame to a new CSV file
output_csv_path = "/content/drive/MyDrive/filtered_classified_image_data_label.csv"
data.to_csv(output_csv_path, index=False)

In [11]:
# Source metadata CSV, destination for the cleaned copy, and the columns
# that must all be non-zero for a row to be kept.
csv_path = "/content/drive/MyDrive/unfilter_image_metadata.csv"  # Replace with your actual CSV file path
output_csv_path = "/content/drive/MyDrive/unfiltered_data_cleaned.csv"
columns_to_check = ["X", "Y", "Width", "Height"]

data = pd.read_csv(csv_path)

# A row survives only if none of the checked columns equals 0.
filtered_data = data.loc[data[columns_to_check].ne(0).all(axis=1)]

# Report how many rows the filter dropped.
rows_removed = len(data) - len(filtered_data)
print(f"Number of rows removed: {rows_removed}")

# Persist the cleaned rows without the pandas index.
filtered_data.to_csv(output_csv_path, index=False)
print(f"Filtered data saved to {output_csv_path}.")

Number of rows removed: 145
Filtered data saved to /content/drive/MyDrive/unfiltered_data_cleaned.csv.


In [None]:

# Load the cleaned metadata CSV.
csv_path = "/content/drive/MyDrive/unfiltered_data_cleaned.csv"  # Replace with your actual CSV file path
data = pd.read_csv(csv_path)

SAFE_LABEL = "safe for children"
# Flag column -> minimum confidence required for the "safe" label.
# The key order is the column order in the saved CSV.
SAFE_THRESHOLDS = {"safe_60": 0.6, "safe_70": 0.7, "safe_80": 0.8, "safe_50": 0.5}

# One list per output column; every row appends to all of them exactly once,
# so the columns cannot drift out of step between the success and error paths.
results = {"most_likely_label": [], "confidence": [], "category_probs": []}
results.update({column: [] for column in SAFE_THRESHOLDS})
failed_rows = 0

# Classify each image
for i, row in data.iterrows():
    try:
        image = process_image(row["Image URL"])  # Replace "Image URL" with your column name
        label, confidence, category_probs = classify_image(image)
    except Exception as e:
        # Best effort: a broken image must not abort the whole run. The row is
        # kept with an "Error" label, which also makes every safe_* flag False.
        print(f"Error processing row {i}: {e}")
        label, confidence, category_probs = "Error", 0, {}
        failed_rows += 1

    results["most_likely_label"].append(label)
    results["confidence"].append(confidence)
    results["category_probs"].append(category_probs)
    for column, threshold in SAFE_THRESHOLDS.items():
        results[column].append(label == SAFE_LABEL and confidence >= threshold)

# Make systematic failures visible instead of leaving them in per-row messages.
print(f"Rows that could not be classified: {failed_rows} of {len(data)}")

# Add results to the DataFrame
for column, values in results.items():
    data[column] = values

# Save the updated DataFrame to a new CSV file
output_csv_path = "/content/drive/MyDrive/unfiltered_classified_image_data_with_label.csv"
data.to_csv(output_csv_path, index=False)


# Search terms filtered when searching

In [15]:
# Reload the classified results saved for the filtered run.
csv_path = "/content/drive/MyDrive/filtered_classified_image_data_label.csv"  # Replace with your actual CSV file path
filter_data = pd.read_csv(csv_path)

# Number of rows recorded for each distinct value of "Search Term".
filter_search_term_counts = filter_data["Search Term"].value_counts()

# Size of the table, per-term counts, then a preview of the first five rows.
print(f"Total rows: {len(filter_data)}")
print(f"DataFrame shape: {filter_data.shape}")
print(filter_search_term_counts)
print(filter_data.head())

Total rows: 8314
DataFrame shape: (8314, 19)
Search Term
Barbie bloody                        29
Elemental violence                   28
Pokémon Go fuck                      28
Barbie torture                       28
Gravity Falls dismemberment          28
                                     ..
Elemental explicit                   12
Turning Red assault                  12
Pirates of the Caribbean violence    12
Roblox dark                          12
Roblox horror                        12
Name: count, Length: 475, dtype: int64
  Entity Type  Entity Harmful Type Harmful Term     Search Term  \
0        game  Roblox     violence      torture  Roblox torture   
1        game  Roblox     violence      torture  Roblox torture   
2        game  Roblox     violence      torture  Roblox torture   
3        game  Roblox     violence      torture  Roblox torture   
4        game  Roblox     violence      torture  Roblox torture   

  SafeSearch Mode               File Name  \
0          filte

In [16]:
# Reload the classified results saved for the unfiltered run.
csv_path = "/content/drive/MyDrive/unfiltered_classified_image_data_with_label.csv"  # Replace with your actual CSV file path
unfilter_data = pd.read_csv(csv_path)

# Number of rows recorded for each distinct value of "Search Term".
unfilter_search_term_counts = unfilter_data["Search Term"].value_counts()

# Size of the table, per-term counts, then a preview of the first five rows.
print(f"Total rows: {len(unfilter_data)}")
print(f"DataFrame shape: {unfilter_data.shape}")
print(unfilter_search_term_counts)
print(unfilter_data.head())

Total rows: 12215
DataFrame shape: (12215, 19)
Search Term
Barbie torture             37
Rubik’s Cube nude          37
Lego dick                  37
Elemental violence         36
Hot Wheels decapitation    36
                           ..
Minecraft horrible         18
Pokémon Go horrible        18
Among Us fights            18
Hot Wheels fights          18
Roblox horror              18
Name: count, Length: 480, dtype: int64
  Entity Type  Entity Harmful Type Harmful Term     Search Term  \
0        game  Roblox     violence      torture  Roblox torture   
1        game  Roblox     violence      torture  Roblox torture   
2        game  Roblox     violence      torture  Roblox torture   
3        game  Roblox     violence      torture  Roblox torture   
4        game  Roblox     violence      torture  Roblox torture   

  SafeSearch Mode               File Name  \
0             aus  Roblox torture_image_1   
1             aus  Roblox torture_image_2   
2             aus  Roblox torture_

The following search terms are blocked in the filtered search, so they return no results:

In [17]:
# Distinct search terms that have at least one row in each dataset.
filter_search_terms = set(filter_data["Search Term"].unique())
unfilter_search_terms = set(unfilter_data["Search Term"].unique())

# Terms with rows in the unfiltered data but none in the filtered data.
terms_only_in_unfiltered = unfilter_search_terms.difference(filter_search_terms)

print(f"Search terms in unfiltered but not in filtered: {terms_only_in_unfiltered}")

Search terms in unfiltered but not in filtered: {'Powerpuff Girls porn', 'Turning Red nude', 'Gravity Falls fuck', 'Turning Red fuck', 'Elemental fuck'}


# images filtered

In [21]:
# Threshold flag columns to summarise.
safe_columns = ["safe_50", "safe_60", "safe_70", "safe_80"]

# True/False counts per flag column.
counts = {col: filter_data[col].value_counts() for col in safe_columns}

# Rows = flag value, columns = threshold. Reindexing guarantees that both the
# False and the True row exist even when one value never occurs in any column,
# so the lookups below cannot raise KeyError; gaps become 0.
counts_df = pd.DataFrame(counts).reindex([False, True]).fillna(0).astype(int)

# Exposure rate = images NOT flagged safe / all images, per threshold.
# NOTE(review): rows labelled "Error" by the classification cell carry False in
# every safe_* column, so failed downloads are counted as exposure here.
exposure_rate = {}
for col in safe_columns:
    total = counts_df.loc[True, col] + counts_df.loc[False, col]
    exposure_rate[col] = counts_df.loc[False, col] / total if total > 0 else 0

# Assign by column label rather than by position. Adding a float row upcasts
# the count rows to float, which is why counts print with decimals.
counts_df.loc["Exposure Rate"] = pd.Series(exposure_rate)

# Print the updated counts DataFrame
print(counts_df)


                   safe_50      safe_60      safe_70      safe_80
False          7457.000000  7966.000000  8194.000000  8280.000000
True            857.000000   348.000000   120.000000    34.000000
Exposure Rate     0.896921     0.958143     0.985567     0.995911


In [22]:
# Threshold flag columns to summarise.
safe_columns = ["safe_50", "safe_60", "safe_70", "safe_80"]

# True/False counts per flag column.
counts = {col: unfilter_data[col].value_counts() for col in safe_columns}

# Rows = flag value, columns = threshold. Reindexing guarantees that both the
# False and the True row exist even when one value never occurs in any column,
# so the lookups below cannot raise KeyError; gaps become 0.
counts_df = pd.DataFrame(counts).reindex([False, True]).fillna(0).astype(int)

# Exposure rate = images NOT flagged safe / all images, per threshold.
# NOTE(review): rows labelled "Error" by the classification cell carry False in
# every safe_* column, so failed downloads are counted as exposure here.
exposure_rate = {}
for col in safe_columns:
    total = counts_df.loc[True, col] + counts_df.loc[False, col]
    exposure_rate[col] = counts_df.loc[False, col] / total if total > 0 else 0

# Assign by column label rather than by position. Adding a float row upcasts
# the count rows to float, which is why counts print with decimals.
counts_df.loc["Exposure Rate"] = pd.Series(exposure_rate)

# Print the updated counts DataFrame
print(counts_df)

                    safe_50       safe_60       safe_70       safe_80
False          11158.000000  11803.000000  12084.000000  12180.000000
True            1057.000000    412.000000    131.000000     35.000000
Exposure Rate      0.913467      0.966271      0.989275      0.997135
