## Since we now have the identification accuracy results for every image in the test group of the celebrities_all dataset, we'll want to create a new dataframe to store the accuracy rates along with other info about each test image. 

### To achieve this, we will use the same code from the first EDA cycle.

## Before running the code, install the following libraries:

pip install pandas

pip install pillow

pip install numpy

pip install imutils

pip install opencv-python

In [1]:
import pandas as pd
from PIL import Image, ImageStat
import os
import numpy as np

# Root folder that is scanned below: each sub-folder name is used as a celebrity name
source_dir = "/workspace/DS4002Project3/DATA/celebrities_all" # set source directory

# Calculate the brightness of an image

# make a function that will calculate the average brightness of an image
def brightness(im_file):
    """Return the mean grayscale brightness of an image.

    Args:
        im_file: Path (or file object) of the image to measure.

    Returns:
        The mean pixel value of the image after conversion to Pillow's
        'L' (grayscale) mode.
    """
    # Open inside a context manager so the file handle is released right away;
    # this function is called once per image, so an unclosed handle per call
    # adds up quickly.
    with Image.open(im_file) as im:
        gray = im.convert('L')  # Convert to grayscale (a new in-memory image)
    stat = ImageStat.Stat(gray)
    return stat.mean[0]  # Mean brightness of the single grayscale band

# make a function that will take the filename of an image (01-100) and convert to integer
def extract_number_from_filename(filename):
    """Return the integer encoded before the first dot of ``filename``.

    Example: ``"07.jpg"`` -> ``7``.  Raises ``ValueError`` when that leading
    part is not a valid integer.
    """
    # Everything up to (not including) the first '.' is the picture number
    leading_part, _, _ = filename.partition('.')
    return int(leading_part)

# make a function to list the race of the celebrity photographed
# Race label recorded for each celebrity folder name in the dataset
_RACE_BY_CELEBRITY = {
    "Angelina-Jolie": "White",
    "America-Ferrera": "White-Latina",
    "Ayo-Edebiri": "Black",
    "Conan-OBrien": "White",
    "Danny-Pudi": "Asian",
    "David-Bowie": "White",
    "Donald-Glover": "Black",
    "Elizabeth-Olsen": "White",
    "Jackie-Chan": "Asian",
    "Jim-Carrey": "White",
    "John-Lennon": "White",
    "John-Mulaney": "White",
    "Lucy-Liu": "Asian",
    "Margot-Robbie": "White",
    "Mariah-Carey": "Black-White",
    "Matt-Damon": "White",
    "Maya-Rudolph": "Black-White",
    "Morgan-Freeman": "Black",
    "Olivia-Rodrigo": "Asian-White",
    "Pedro-Pascal": "White",
    "Priyanka-Chopra": "Asian",
    "Ryan-Gosling": "White",
    "SZA": "Black",
    "Salma-Hayek": "White-Latina",
    "Will-Smith": "Black",
}


def get_race(celebrity_name):
    """Look up the race label for a celebrity folder name.

    The match is exact and case-sensitive; any name that is not in the
    table yields ``"Unknown"``.
    """
    return _RACE_BY_CELEBRITY.get(celebrity_name, "Unknown")

from PIL import Image

# make a function that states the resolution of a photo
def get_resolution(img_path):
    """Return the image size as a ``"<width>x<height>"`` string.

    This is best-effort: if the image cannot be opened or read for any
    reason, ``None`` is returned instead of raising.
    """
    try:
        with Image.open(img_path) as picture:
            width, height = picture.size
    except Exception:
        # Any failure while opening/reading the image means "resolution unknown"
        return None
    return f"{width}x{height}"

# function to calculate saturation (colorfulness) --> adapted from https://pyimagesearch.com/2017/06/05/computing-image-colorfulness-with-opencv-and-python/
from imutils import build_montages
from imutils import paths
import argparse
import imutils
import cv2

# Create a DataFrame including brightness information
import os

def create_image_dataframe_from_source():
    """Build a DataFrame with one row per image found under ``source_dir``.

    Every sub-folder of ``source_dir`` is treated as one celebrity and every
    file inside it as one picture of that celebrity.

    Returns:
        A DataFrame with the columns ``celebrity_name`` (folder name),
        ``picture_number`` (integer parsed from the file name),
        ``file_path`` (absolute path), ``brightness``, ``resolution`` and
        ``race``.
    """
    data = []

    # Process each celebrity folder in the source directory
    for celeb_folder in os.listdir(source_dir):
        celeb_path = os.path.join(source_dir, celeb_folder)

        # Stray files sitting next to the celebrity folders are ignored
        if not os.path.isdir(celeb_path):
            continue

        for img_name in os.listdir(celeb_path):
            # Construct the complete file path
            img_path = os.path.abspath(os.path.join(celeb_path, img_name))

            # A nested folder is not an image; trying to open it would abort
            # the whole scan, so skip anything that is not a regular file
            if not os.path.isfile(img_path):
                continue

            data.append({
                'celebrity_name': celeb_folder,
                'picture_number': extract_number_from_filename(img_name),
                'file_path': img_path,  # Save the complete file path
                'brightness': brightness(img_path),
                'resolution': get_resolution(img_path),
            })

    # Create DataFrame -- first add original columns (explicit column list so
    # the frame has the expected columns even when no images were found)
    df = pd.DataFrame(data, columns=['celebrity_name', 'picture_number', 'file_path', 'brightness', 'resolution'])
    # add race column
    df['race'] = df['celebrity_name'].apply(get_race)
    return df

# Build the table, order rows by celebrity and then by picture number,
# and renumber the index so it follows the sorted order
df = (
    create_image_dataframe_from_source()
    .sort_values(by=['celebrity_name', 'picture_number'], ascending=[True, True])
    .reset_index(drop=True)
)

# add gender column:
# Gender label for every celebrity folder name used in the dataset
_GENDER_BY_CELEBRITY = {
    **dict.fromkeys(
        [
            'Angelina-Jolie', 'America-Ferrera', 'Ayo-Edebiri', 'Elizabeth-Olsen',
            'Lucy-Liu', 'Margot-Robbie', 'Mariah-Carey', 'Maya-Rudolph',
            'Olivia-Rodrigo', 'Priyanka-Chopra', 'Salma-Hayek', 'SZA',
        ],
        'female',
    ),
    **dict.fromkeys(
        [
            'Conan-OBrien', 'Danny-Pudi', 'David-Bowie', 'Donald-Glover',
            'Jackie-Chan', 'Jim-Carrey', 'John-Lennon', 'John-Mulaney',
            'Matt-Damon', 'Morgan-Freeman', 'Pedro-Pascal', 'Ryan-Gosling',
            'Will-Smith',
        ],
        'male',
    ),
}


def gender_specification(dataframe):
    """Add a ``gender`` column derived from ``celebrity_name``.

    Args:
        dataframe: DataFrame with a ``celebrity_name`` column. It is
            modified in place.

    Returns:
        The same DataFrame, now with a ``gender`` column holding
        ``'female'``, ``'male'``, or a missing value for names that are
        not in the lookup table.
    """
    # Operate on the frame that was passed in rather than on a global, and
    # use the same spelling of each name as the celebrity folder names
    # (e.g. 'Mariah-Carey') so every celebrity in the dataset gets a label.
    dataframe['gender'] = dataframe['celebrity_name'].apply(_GENDER_BY_CELEBRITY.get)
    return dataframe

# Attach the gender column; the function hands back the frame it labelled
df = gender_specification(df)

# add saturation column
import cv2

def image_colorfulness(image):
    """Compute a colorfulness score for a 3-channel image.

    The score combines the spread and the mean magnitude of two opponent
    color signals, ``|R - G|`` and ``|0.5 * (R + G) - B|``.

    Args:
        image: Array of shape (height, width, 3) whose last axis is ordered
            blue, green, red.

    Returns:
        The colorfulness score as a float; 0.0 for an image whose three
        channels are identical everywhere.

    Raises:
        ValueError: If ``image`` does not have exactly three channels.
    """
    pixels = np.asarray(image)
    if pixels.ndim != 3 or pixels.shape[2] != 3:
        raise ValueError(
            f"expected an image of shape (height, width, 3), got {pixels.shape}"
        )
    pixels = pixels.astype("float")

    # Channel order on the last axis is blue, green, red (not RGB)
    B, G, R = pixels[..., 0], pixels[..., 1], pixels[..., 2]

    # Compute rg = |R - G|
    rg = np.absolute(R - G)

    # Compute yb = |0.5 * (R + G) - B|
    yb = np.absolute(0.5 * (R + G) - B)

    # Compute the mean and standard deviation of both `rg` and `yb`
    (rgMean, rgStd) = (np.mean(rg), np.std(rg))
    (ybMean, ybStd) = (np.mean(yb), np.std(yb))

    # Combine the mean and standard deviations
    stdRoot = np.sqrt((rgStd ** 2) + (ybStd ** 2))
    meanRoot = np.sqrt((rgMean ** 2) + (ybMean ** 2))

    # Derive the "colorfulness" metric and return it
    return stdRoot + (0.3 * meanRoot)

# One colorfulness score per row of the DataFrame, in row order
colorfulness_values = []

for filepath in df['file_path']:
    image = cv2.imread(filepath)

    # cv2.imread hands back None when the file could not be read as an image
    if image is None:
        print(f"Warning: Could not load image at {filepath}")
        colorfulness_values.append(None)  # Use None for missing images
        continue

    colorfulness_values.append(image_colorfulness(image))

# Store the scores as a new column and write the full table to disk
df['colorfulness'] = colorfulness_values

df.to_csv('/workspace/DS4002Project3/DATA/all_celebs_image_data.csv', index=False)

### Let's add a new column, "augmented", to the dataframe. It is equal to 0 if the image also appears in our original image CSV (created before the augmented images were added) and 1 otherwise.

In [10]:

# Load the datasets
all_celebs = pd.read_csv("/workspace/DS4002Project3/DATA/all_celebs_image_data.csv")
celeb_data = pd.read_csv("/workspace/DS4002Project3/DATA/celeb_image_data.csv")


def _image_key(relative_path):
    """Return "<celebrity>/<NNN>" for a "<celebrity>/<number>.<ext>" path.

    The picture number is zero-padded to three digits so that, for example,
    '06.jpg' and '006.jpg' in the same celebrity folder produce the same key.
    """
    celebrity, picture = relative_path.split('/')[:2]
    return f"{celebrity}/{int(picture.split('.')[0]):03d}"


# Remove the leading directory paths, keeping "<celebrity>/<file>"
all_celebs['file_name'] = all_celebs['file_path'].str.extract(r'celebrities_all/(.+)')
celeb_data['file_name'] = celeb_data['file_path'].str.extract(r'celebrities/(.+)')

# Build the comparison keys from celebrity AND picture number. Comparing the
# picture number alone would treat an image as "original" whenever ANY
# celebrity has a picture with that number in the original CSV.
all_keys = all_celebs['file_name'].apply(_image_key)
original_keys = celeb_data['file_name'].apply(_image_key)

# Ensure the file names have consistent zero-padding (e.g., '06.jpg' and '006.jpg' should match)
# NOTE: the stored file_name column keeps its previous "<NNN>.jpg" format
all_celebs['file_name'] = all_celebs['file_name'].apply(lambda x: f"{int(x.split('/')[1].split('.')[0]):03d}.jpg")
celeb_data['file_name'] = celeb_data['file_name'].apply(lambda x: f"{int(x.split('/')[1].split('.')[0]):03d}.jpg")

# augmented = 0 when the image exists in the original CSV, 1 when it does not
all_celebs['augmented'] = (~all_keys.isin(original_keys)).astype(int)

# Save the updated dataset (overwrites the CSV that was loaded above)
all_celebs.to_csv("/workspace/DS4002Project3/DATA/all_celebs_image_data.csv", index=False)



### Now, we want to merge the "all_celebs_image_data.csv" file (which contains complete data for all of the 400 images used for each celebrity) with "incorrect_guesses_epoch_100.csv", which contains a list of images that were incorrectly identified in the final epoch (epoch 100) of our testing phase.

In [12]:
# Read the per-image table and the list of images misidentified in epoch 100
all_celebs = pd.read_csv("/workspace/DS4002Project3/DATA/all_celebs_image_data.csv")
epoch_results = pd.read_csv("/workspace/DS4002Project3/OUTPUT/incorrect_guesses_epoch_100.csv")

# Join key: the part of file_path that follows "celebrities_all/"
all_celebs['image_id'] = all_celebs['file_path'].str.extract(r'celebrities_all/(.+)')

# Keep only rows whose `filename` in the epoch results matches an `image_id`
merged_data = epoch_results.merge(all_celebs, left_on='filename', right_on='image_id')

# Write the merged table to disk
merged_data.to_csv("/workspace/DS4002Project3/DATA/incorrect_guesses_data.csv", index=False)


### This final dataset "/workspace/DS4002Project3/DATA/incorrect_guesses_data.csv" contains all of the data for images that were incorrectly identified in the last testing epoch (epoch 100).