In [None]:
# Mount Google Drive at /content/drive so the dataset paths used by the
# cells below resolve inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path

In [None]:
# Single source of truth for the dataset location on the mounted Drive, so a
# move/rename of the dataset folder only has to be edited in one place.
DATASET_DIR = "/content/drive/MyDrive/BTT Skinterest 2A/Dataset"

# Folder holding the image files referenced by the spreadsheet.
img_filepath = f"{DATASET_DIR}/images"
# Merged SCIN metadata spreadsheet (one row per case).
csv_filepath = f"{DATASET_DIR}/SCIN Working Merged Spreadsheet.csv"

In [None]:
# case_id is an identifier, not a quantity. With default type inference the
# preview shows it as a float (e.g. -1.26295e+18), which cannot represent
# 64-bit IDs exactly, so read the column as text to keep whatever the file
# stores verbatim.
# NOTE(review): if the CSV itself already stores case_id in scientific
# notation, the precision was lost upstream and the spreadsheet must be
# re-exported from the original SCIN data — confirm.
csv_df = pd.read_csv(csv_filepath, dtype={"case_id": str})
csv_df.head()

Unnamed: 0,case_id,source,release,year,age_group,sex_at_birth,fitzpatrick_skin_type,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,...,related_category,condition_duration,image_1_path,image_2_path,image_3_path,image_1_shot_type,image_2_shot_type,image_3_shot_type,combined_race,race_ethnicity_two_or_more_after_mitigation
0,-1.26295e+18,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,FST3,0,0,...,,,dataset/images/-5949315841433628424.png,dataset/images/-8183947049312687778.png,dataset/images/4923188439600899486.png,AT_DISTANCE,CLOSE_UP,AT_AN_ANGLE,,
1,-1.48955e+18,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,FST2,0,0,...,,,dataset/images/325464533153467313.png,,,CLOSE_UP,,,,
2,-1.49232e+18,SCIN,1.0.0,2023,AGE_18_TO_29,FEMALE,NONE_IDENTIFIED,FST2,FST2,FST2,...,LOOKS_HEALTHY,ONE_TO_FOUR_WEEKS,dataset/images/-6837240536182868524.png,dataset/images/6395257111195214043.png,dataset/images/7877276387406078156.png,AT_DISTANCE,CLOSE_UP,AT_AN_ANGLE,HISPANIC_LATINO_OR_SPANISH_ORIGIN,
3,-1.5809e+17,SCIN,1.0.0,2023,AGE_18_TO_29,FEMALE,FST3,FST3,0,0,...,RASH,ONE_TO_FOUR_WEEKS,dataset/images/2983323875335943836.png,,,CLOSE_UP,,,WHITE,
4,-1.88763e+18,SCIN,1.0.0,2023,AGE_30_TO_39,MALE,FST3,FST2,0,0,...,OTHER_ISSUE_DESCRIPTION,LESS_THAN_ONE_WEEK,dataset/images/3104801012387799539.png,dataset/images/8660513260658813359.png,,CLOSE_UP,AT_AN_ANGLE,,WHITE,


# Data Analysis and Data Cleaning

In [None]:
# Rows that repeat an earlier row in every column (the first occurrence of
# each group is not counted).
repeat_mask = csv_df.duplicated()
dup_count = repeat_mask.sum()
print(f"Number of fully duplicate rows: {dup_count}")

# For inspection, list every member of each duplicate group, including the
# first occurrence (keep=False marks all of them).
every_copy_mask = csv_df.duplicated(keep=False)
dup_rows = csv_df[every_copy_mask]
print(dup_rows)

Number of fully duplicate rows: 0
Empty DataFrame
Columns: [case_id, source, release, year, age_group, sex_at_birth, fitzpatrick_skin_type, dermatologist_fitzpatrick_skin_type_label_1, dermatologist_fitzpatrick_skin_type_label_2, dermatologist_fitzpatrick_skin_type_label_3, monk_skin_tone_label_india, monk_skin_tone_label_us, dermatologist_skin_condition_on_label_name, dermatologist_skin_condition_confidence, race_ethnicity_american_indian_or_alaska_native, race_ethnicity_asian, race_ethnicity_black_or_african_american, race_ethnicity_hispanic_latino_or_spanish_origin, race_ethnicity_middle_eastern_or_north_african, race_ethnicity_native_hawaiian_or_pacific_islander, race_ethnicity_white, race_ethnicity_other_race, race_ethnicity_prefer_not_to_answer, textures_raised_or_bumpy, textures_flat, textures_rough_or_flaky, textures_fluid_filled, body_parts_head_or_neck, body_parts_arm, body_parts_palm, body_parts_back_of_hand, body_parts_torso_front, body_parts_torso_back, body_parts_genitali

In [None]:
image_cols = ["image_1_path", "image_2_path", "image_3_path"]

all_files = []

for col in image_cols:
    # Drop NaN cells, then drop whitespace-only cells as well: a blank cell
    # would otherwise become the filename '' and every blank would be counted
    # as a "duplicate" of the others.
    col_values = csv_df[col].dropna().astype(str)
    col_values = col_values[col_values.str.strip() != ""]
    # Reduce each path to just its filename (e.g. "img1.jpg").
    all_files.extend(Path(value).name for value in col_values)

# Count how many times each filename appears across ALL columns/rows.
counts = Counter(all_files)

# Keep only the filenames that appear more than once (duplicates).
dups = {name: times for name, times in counts.items() if times > 1}
print(f"Duplicate filenames: {len(dups)}")

# Show up to the first 15 duplicate filenames and how often each appears.
for name, times in list(dups.items())[:15]:
    print(f"{name} → {times} times")


Duplicate filenames: 0


In [None]:
# Build a long table with one record per (row, image column) that actually
# holds a path: filename -> row_index, case_id, column.
rows = []
for i, row in csv_df.iterrows():
    for col in image_cols:
        val = row[col]
        # Skip NaN and whitespace-only cells.
        if pd.isna(val) or not str(val).strip():
            continue
        rows.append({
            "filename": Path(str(val)).name,
            "row_index": i,
            "case_id": row["case_id"],
            "column": col,
        })

# Pin the columns explicitly: if no row had an image path, a DataFrame built
# from an empty list would have no "filename" column and the groupby below
# would raise KeyError instead of reporting zero duplicates.
img_map = pd.DataFrame(rows, columns=["filename", "row_index", "case_id", "column"])

# Count in how many different rows each filename is used and keep only the
# filenames shared by more than one row.
dup_across_rows = (
    img_map.groupby("filename")["row_index"]
    .nunique()
    .reset_index(name="unique_rows")
    .query("unique_rows > 1")
)

print("Filenames used in more than one row:", len(dup_across_rows))
display(dup_across_rows.head(20))


Filenames used in more than one row: 0


Unnamed: 0,filename,unique_rows


In [None]:
# Collect the bare filename of every regular file found anywhere under the
# image folder (searched recursively).
disk_files = set()
for entry in Path(img_filepath).rglob("*"):
    if entry.is_file():
        disk_files.add(entry.name)

# For each image column, report the filenames the spreadsheet references
# that are not present on disk.
for col in image_cols:
    referenced = {Path(str(raw)).name for raw in csv_df[col].dropna()}
    missing = sorted(referenced - disk_files)
    print(f"{col}: {len(missing)} missing")
    if len(missing) > 0:
        print(missing[:10])  # preview first 10 missing filenames


image_1_path: 235 missing
['-8555840247489065545.png', '-8579367048811032343.png', '-8594260406533869498.png', '-8622301865890936410.png', '-862558295135857219.png', '-8653997701010841176.png', '-8657936158769470647.png', '-8668488196925068035.png', '-8687142521108777184.png', '-870319501963618397.png']
image_2_path: 323 missing
['-8539116180751237612.png', '-8541325890788453938.png', '-8604333316388588792.png', '-8678220100544476540.png', '-8682618672042721592.png', '-8687750677002784717.png', '-8696744820413574401.png', '-8745726265111908053.png', '-8755999729037851117.png', '-8769950017377673948.png']
image_3_path: 362 missing
['-8591174633182809752.png', '-8715046487528210051.png', '-8723427709388330837.png', '-8753984834834986680.png', '-8811309900087453061.png', '-8857891249412069725.png', '-8970062608215251005.png', '-9005628762171642786.png', '-9041288694268048673.png', '-9064705102210026712.png']


### **Method 1: Pillow Library**
https://pillow.readthedocs.io/en/stable/handbook/overview.html

This script scans a folder of images and reports which files are corrupted based on whether Pillow can successfully open and fully decode them.

- Uses Pillow (PIL) to open each file.
- `img.verify()` checks the image header for validity.
- `img.load()` forces the program to fully read pixel data,
  catching cases where an image looks valid but is actually truncated or broken.
- Corrupted files are collected into a list and printed at the end.
- The original dataset is not modified.


In [None]:
!pip install pillow

**Method 1: Pillow Library**
https://pillow.readthedocs.io/en/stable/handbook/overview.html

This script scans a folder of images and reports which files are corrupted based on whether Pillow can successfully open and fully decode them.

- Uses Pillow (PIL) to open each file.
- `img.verify()` checks the image header for validity.
- `img.load()` forces the program to fully read pixel data,
  catching cases where an image looks valid but is actually truncated or broken.
- Corrupted files are collected into a list and printed at the end.
- The original dataset is not modified.


In [None]:
import os
from PIL import Image

def check_corrupted_image(file_path):
    """Return True if the file at ``file_path`` cannot be opened and decoded.

    The file is opened twice, once per check, each time in a fresh handle:
    first ``verify()`` (header check), then ``load()`` (forces the pixel data
    to be read). Any exception raised along the way is printed together with
    the path and the file is reported as corrupted.

    Returns:
        False when both checks complete without raising, True otherwise.
    """
    try:
        # "verify" = header check, "load" = force pixel data load.
        for stage in ("verify", "load"):
            with Image.open(file_path) as img:
                getattr(img, stage)()
    except Exception as e:
        print(f"Corrupted image: {file_path} - {e}")
        return True
    else:
        return False

def list_corrupted_images(folder_path):
    """Return the paths of the files directly inside ``folder_path`` that fail the check.

    Only regular files at the top level of the folder are examined;
    sub-directories are skipped and not descended into. A file is included in
    the result when ``check_corrupted_image`` reports it as corrupted, or when
    the check itself raises unexpectedly (the error is printed).
    """
    corrupted_files = []
    for entry in os.listdir(folder_path):
        candidate = os.path.join(folder_path, entry)
        if not os.path.isfile(candidate):
            continue
        try:
            is_bad = check_corrupted_image(candidate)
        except Exception as err:
            # Anything unexpected is treated as a failed check.
            print(f"Error checking {candidate}: {err}")
            is_bad = True
        if is_bad:
            corrupted_files.append(candidate)
    return corrupted_files

# Scan the dataset image folder and report every file flagged as corrupted.
corrupted = list_corrupted_images(img_filepath)

# Number of directory entries in the image folder (counted after the scan).
entry_count = len(os.listdir(img_filepath))
print(f"Checked {entry_count} files")

print("Corrupted images:")
for bad_path in corrupted:
    print(bad_path)


No file paths are printed under "Corrupted images:", which means Pillow could open and fully decode every file, so no corrupted images were found in the dataset.

### Method 2: Cleanvision library

https://github.com/cleanlab/cleanvision

This script uses the CleanVision library to automatically audit an image dataset. It scans all images in the given folder for common issues (e.g., blur, duplicates, brightness problems, resolution outliers) and produces a report.

In [None]:
!pip install cleanvision

In [None]:

from cleanvision import Imagelab

# Specify path to folder containing the image files in your dataset
# (the same Drive folder that the earlier cells scanned).
imagelab = Imagelab(data_path=img_filepath)

# Automatically check for a predefined list of issues within your dataset.
# Called without arguments, so no issue types are selected explicitly here.
imagelab.find_issues()

# Produce a neat report of the issues found in your dataset
imagelab.report()


In [None]:
# Keep only the images flagged as blurry, ordered by ascending blurry_score.
issue_table = imagelab.issues
flagged_as_blurry = issue_table["is_blurry_issue"] == True
blurry_images = issue_table[flagged_as_blurry].sort_values(by="blurry_score")

# Print each flagged file (the index of the issues table) with its blur score.
for file, score in blurry_images["blurry_score"].items():
    print(f"{file}: {score}")


In [None]:
# Get the 10 most blurry image file paths. blurry_images is sorted by
# ascending blurry_score and this notebook treats the lowest score as the
# most blurry, so the top 10 are the FIRST ten rows (index[:10]); the previous
# index[10:] slice selected everything except them.
top_blurry_files = blurry_images.index[:10].tolist()

# Visualize those images
imagelab.visualize(image_files=top_blurry_files)


In [None]:
# blurry_images is sorted by ascending blurry_score, so the first row is the
# most blurry flagged image and the last row is the least blurry one.
if blurry_images.empty:
    # Nothing was flagged: index[0] / iloc[0] would raise IndexError.
    print("No images were flagged as blurry.")
else:
    # Find most blurry image (lowest score)
    most_blurry_file = blurry_images.index[0]  # filename
    most_blurry_score = blurry_images.iloc[0]["blurry_score"]
    print("Most blurry image:", most_blurry_file, "with score:", most_blurry_score)

    imagelab.visualize(image_files=[most_blurry_file])  # pass as list

    # Find least blurry image (highest score among flagged)
    least_blurry_file = blurry_images.index[-1]  # filename
    least_blurry_score = blurry_images.iloc[-1]["blurry_score"]
    print("Least blurry image:", least_blurry_file, "with score:", least_blurry_score)

    imagelab.visualize(image_files=[least_blurry_file])  # pass as list


According to the analysis of our dataset, we currently have 57 blurry images, 4 odd-size images, 1 low-information image, and 1 odd-aspect-ratio image.

The most problematic images, and the ones we may have to remove, are the blurry images, but a threshold needs to be established for them first.


### OpenCV

The Laplacian blur detection works by:

- Converting the image to grayscale.
- Applying the Laplacian operator to detect edges.
- Measuring the variance of those edge values.
- High variance → many strong edges → image is sharp.
- Low variance → weak edges → image is blurry.
- Thresholding the variance decides if the image is blurry or not.

In [None]:
import os
import cv2
from pathlib import Path

def blur_score(image):
    """
    Focus measure: variance of the Laplacian of the grayscale image.

    A higher value means a sharper image, a lower value a blurrier one.
    Returns the sentinel -1.0 when ``image`` is None; otherwise the image
    is converted with COLOR_BGR2GRAY before the Laplacian is applied.
    """
    if image is not None:
        grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # CV_64F keeps the Laplacian response as 64-bit floats.
        return cv2.Laplacian(grayscale, cv2.CV_64F).var()
    return -1.0


def scan_all_images(root_dir,
                    exts={".jpg",".jpeg",".png",".bmp",".tif",".tiff",".webp"}):
    """Score every image file under ``root_dir`` (searched recursively) for blur.

    A file is considered when its lower-cased suffix is in ``exts``. Each one
    is read with ``cv2.imread`` and scored by ``blur_score``.

    Returns:
        A list of ``{"path": str, "variance": score}`` dicts sorted by
        ascending variance, so the lowest scores come first.
    """
    image_paths = (
        candidate
        for candidate in Path(root_dir).rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in exts
    )
    scored = [
        {"path": str(candidate), "variance": blur_score(cv2.imread(str(candidate)))}
        for candidate in image_paths
    ]
    return sorted(scored, key=lambda item: item["variance"])

In [None]:
# Score every image in the dataset folder with the Laplacian-variance measure.
results = scan_all_images(img_filepath)
print(f"Scanned {len(results)} images")

# Order by ascending variance (lowest score first) and list every file.
results = sorted(results, key=lambda entry: entry["variance"])
for entry in results:
    print(f"{entry['path']}: {entry['variance']}")

In [None]:
# Display the ten blurriest images that can actually be read.
# results is sorted by ascending variance, and files that could not be read
# were scored -1.0, so they sit at the very front of the list; cv2.imread
# returns None for them and cv2.cvtColor would raise. Skip those entries
# instead of crashing on the first one.
shown = 0
for r in results:
    if shown == 10:
        break

    img = cv2.imread(r["path"])
    if img is None:
        print(f"Skipping unreadable image: {r['path']}")
        continue
    shown += 1

    # OpenCV loads images as BGR; matplotlib expects RGB.
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(5,5))
    plt.imshow(img_rgb)
    plt.title(f"{shown}. {r['path']} | variance={r['variance']:.2f}")
    plt.axis("off")
    plt.show()


### Result Comparisons

Imagelab vs. OpenCV

- *Imagelab blur detection* gives a comprehensive report of the image dataset in terms of blur, duplicates, brightness, and outliers. Its results are more nuanced and use additional built-in signals beyond edge variance, so it can flag blur that comes from poor focus, motion, or even lighting conditions that reduce detail visibility.

- *OpenCV blur detection* uses Laplacian variance and only looks at sharpness and edges to determine the blur score. If edges are weak (low variance), it calls the image blurry. It does not consider lighting, exposure, or contrast. (We need to pick a threshold to determine what is blurry and what is not. One option is to use the mean variance across all images.)


Pros vs. Cons
| Method                | Pros                                                                 | Cons                                                                 |
|------------------------|----------------------------------------------------------------------|----------------------------------------------------------------------|
| **OpenCV (Laplacian Variance)** | - Simple & very fast  <br> - Easy to interpret (variance = sharpness) <br> - Good at catching obvious motion/defocus blur | - Sensitive to lighting/contrast <br> - May flag smooth skin as blurry <br> - No awareness of lesion region importance |
| **Cleanvision / Imagelab**      | - Considers sharpness *and* lighting/exposure <br> - Handles subtle dermatology images better <br> - Integrates with other quality checks (odd size, brightness, etc.) | - Slower on large datasets <br> - Blur score less transparent <br> - Needs threshold tuning + manual spot checks |


According to **Quantifying acceptable artefact ranges for dermatologic classification algorithms**, general blur simulated to look like blurring caused by an object being outside the depth of field or by motion of the camera along its optical axis is the most dangerous for dermatology classification tasks. General blur makes the image appear less sharp and detailed.

To help combat this and determine which images to remove, we suggest:
1. Start with Imagelab results and remove them as seen fit
2. Use OpenCV Laplacian variance as a secondary check for extreme motion blur or focus issues
3. Manual Validation

## Note:

1. Need to detect extreme lighting with OpenCV and compare those results
2. Need to determine which images should be removed from the data and how to clean up the data file

So far:
- Determined whether images are repeated exactly within the file's rows/columns (none are)
- Not sure what the 15 known duplicates and the missing images are
- Only analyzed the dataset for blur and sharpness
