In [1]:
import os
import shutil
import pandas as pd
from PIL import Image


In [2]:
# Output layout: 'cleandata/' receives the cleaned CSV files and
# 'cleandata/images/' receives the images that are kept.
clean_data_folder = 'cleandata'
images_folder = os.path.join(clean_data_folder, 'images')

# Create the parent first, then the nested images folder; exist_ok makes
# re-running this cell harmless when the folders are already there.
for folder in (clean_data_folder, images_folder):
    os.makedirs(folder, exist_ok=True)


In [3]:
# Load the metadata CSV. The loop below matches its 'image' column against
# file names with the trailing '.jpg' stripped.
csv_file = 'images.csv'
data = pd.read_csv(csv_file)

# Image names listed in the CSV (kept as a list because later cells use it)
valid_image_names = data['image'].tolist()
# Set copy so the per-file membership test is O(1) instead of scanning the list
valid_image_name_set = set(valid_image_names)

# Folder holding the source images
source_folder = 'images_compressed'

# (filename, error message) for every matching file that could not be
# opened or saved, so dropped images are visible instead of silently ignored
skipped_images = []

for filename in os.listdir(source_folder):
    # Only consider '.jpg' files whose base name is listed in the CSV
    if not (filename.endswith('.jpg') and filename[:-4] in valid_image_name_set):
        continue
    try:
        # The context manager releases the source file handle even when
        # saving fails part-way through.
        with Image.open(os.path.join(source_folder, filename)) as img:
            # NOTE(review): save() writes the image through PIL's encoder rather
            # than copying the file bytes - confirm re-encoding is intended.
            img.save(os.path.join(images_folder, filename))
    except (OSError, Image.DecompressionBombError) as exc:
        # Corrupted / unreadable images are skipped (best effort), but recorded.
        # FileNotFoundError is a subclass of OSError, so it is still handled here.
        skipped_images.append((filename, str(exc)))

if skipped_images:
    print(f"Skipped {len(skipped_images)} image(s) that could not be processed: {skipped_images}")


In [4]:
# Destination of the cleaned table inside the clean-data folder
clean_csv_file = os.path.join(clean_data_folder, 'data.csv')

# Keep only the rows whose 'image' value appears in valid_image_names.
# NOTE(review): valid_image_names was built from this same column, so this
# mask selects every row - confirm whether it should instead reflect the
# images that were actually written to the images folder.
keep_mask = data['image'].isin(valid_image_names)
clean_data = data[keep_mask]

# Write the table without the DataFrame index column
clean_data.to_csv(clean_csv_file, index=False)


In [7]:
# Cross-check the images on disk against the names in the cleaned CSV.

# All entries currently in the 'images' folder
images_in_folder = os.listdir(images_folder)

# Names of the '.jpg' files with the 4-character extension stripped
image_filenames_in_folder = [filename[:-4] for filename in images_in_folder if filename.endswith('.jpg')]

# Image names recorded in the cleaned CSV data
image_filenames_in_csv = clean_data['image'].tolist()

# Set copies for O(1) lookups; the lists above are still iterated so the
# results keep their original order (and any duplicates).
names_in_folder = set(image_filenames_in_folder)
names_in_csv = set(image_filenames_in_csv)

# Names present on one side but absent from the other
missing_in_folder = [filename for filename in image_filenames_in_csv if filename not in names_in_folder]
missing_in_csv = [filename for filename in image_filenames_in_folder if filename not in names_in_csv]

# Report the inconsistencies
print(f"Images missing in 'images' folder but present in CSV: {missing_in_folder}")
print(f"Images missing in CSV but present in 'images' folder: {missing_in_csv}")


Images missing in 'images' folder but present in CSV: []
Images missing in CSV but present in 'images' folder: []


In [8]:
# Where the final table (restricted to images that exist on disk) is written
clean_csv_file_updated = os.path.join(clean_data_folder, 'data_updated.csv')

# Flag the rows whose image was reported missing from the images folder,
# then keep everything that is not flagged.
is_missing = clean_data['image'].isin(missing_in_folder)
clean_data = clean_data[~is_missing]

# Write the result without the DataFrame index column
clean_data.to_csv(clean_csv_file_updated, index=False)
