In [None]:
import os
from tensorflow.keras.preprocessing.image import load_img, img_to_array, save_img
from tqdm import tqdm  # For progress bar

# Cleaning + resizing pass over the raw image folder.
#
# For every file in `input_dir` with an image extension:
#   1. decode it and resize it in memory (files that fail here are "corrupted"
#      and are left untouched in `input_dir`);
#   2. move the original into `clean_dir`;
#   3. write the resized copy into `resized_dir` under the same file name.
input_dir = '../datathon-fme-mango/archive/images/images'
clean_dir = './cleaned_images'
resized_dir = './resized_images'
target_size = (224, 224)  # (width, height) passed to PIL's resize
valid_extensions = ('.jpg', '.jpeg', '.png')

# Create directories for cleaned and resized images
os.makedirs(clean_dir, exist_ok=True)
os.makedirs(resized_dir, exist_ok=True)

# Initialize counters
processed_count = 0   # files that looked like images and were attempted
corrupted_count = 0   # files that could not be decoded/resized
cleaned_count = 0     # originals moved into clean_dir
resized_count = 0     # resized copies written into resized_dir
failed_count = 0      # readable images whose move/save hit an I/O error

# Clean and resize the dataset
print("Cleaning and resizing images...")

# sorted() makes the processing order (and the log) reproducible between runs
for item in tqdm(sorted(os.listdir(input_dir)), desc="Processing images"):
    item_path = os.path.join(input_dir, item)

    # Only regular files with a known image extension are candidates
    if not (os.path.isfile(item_path) and item_path.lower().endswith(valid_extensions)):
        print(f"Skipping non-image file: {item_path}")
        continue

    processed_count += 1

    # Stage 1: decode + resize entirely in memory. PIL may defer decoding until
    # pixel data is needed, so the resize is done here, *before* the file is
    # moved, to make sure a truncated image is never promoted to clean_dir.
    try:
        img = load_img(item_path)
        img_resized = img.resize(target_size)
    except Exception as e:
        corrupted_count += 1
        print(f"Corrupted image detected and skipped: {item_path} - Error: {e}")
        continue

    # Stage 2: filesystem work. Failures here are I/O problems, not corrupted
    # images, so they are reported and counted separately.
    try:
        os.rename(item_path, os.path.join(clean_dir, item))
        cleaned_count += 1

        # scale=False: write the pixel values as-is. The default (scale=True)
        # min-max rescales every image to span 0..255, which silently changes
        # the contrast of each saved picture.
        save_img(os.path.join(resized_dir, item), img_to_array(img_resized), scale=False)
        resized_count += 1
    except OSError as e:
        failed_count += 1
        print(f"I/O error while storing {item_path}: {e}")

# Summary
print("\nCleaning and resizing complete.")
print(f"Total images processed: {processed_count}")
print(f"Valid images cleaned: {cleaned_count}")
print(f"Corrupted images removed: {corrupted_count}")
print(f"Images resized: {resized_count}")
print(f"I/O failures: {failed_count}")

^ Clean the dataset (skipping unreadable images) and resize the valid images to 224x224.

In [None]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tqdm import tqdm

# Embedding extraction: run every image in RESIZED_DIR through a headless,
# average-pooled ResNet50 and persist the resulting vectors together with the
# file names (as .npy arrays and as one wide CSV).

# Paths
RESIZED_DIR = './resized_images'
OUTPUT_EMBEDDINGS_PATH = './image_embeddings.npy'
OUTPUT_FILENAMES_PATH = './image_filenames.npy'
OUTPUT_CSV_PATH = './image_embeddings.csv'

# Parameters
TARGET_SIZE = (224, 224)  # ResNet50 requires 224x224 images
BATCH_SIZE = 1000  # Process images in batches to save memory

# Load the ResNet50 model
print("Loading pre-trained ResNet50 model...")
model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# List all valid image files in the resized directory.
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the saved embeddings reproducible across runs and machines.
print(f"Scanning directory: {RESIZED_DIR}")
image_files = sorted(
    f for f in os.listdir(RESIZED_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png'))
)
total_images = len(image_files)
print(f"Total valid images found: {total_images}")

# Per-batch results; row k of the stacked embeddings belongs to all_filenames[k]
embedding_batches = []
all_filenames = []

# Batch processing
print("Starting embedding extraction...")
for i in tqdm(range(0, total_images, BATCH_SIZE), desc="Processing batches"):
    batch_files = image_files[i:i + BATCH_SIZE]
    batch_images = []
    valid_files = []

    # Load and preprocess images in the batch; unreadable files are reported
    # and left out of both the embeddings and the filename list.
    for filename in batch_files:
        image_path = os.path.join(RESIZED_DIR, filename)
        try:
            img = load_img(image_path, target_size=TARGET_SIZE)
            img_array = img_to_array(img)
            img_array = preprocess_input(img_array)
            batch_images.append(img_array)
            valid_files.append(filename)
        except Exception as e:
            print(f"Error processing {image_path}: {e}")

    if batch_images:
        # Convert batch to NumPy array and extract embeddings.
        # verbose=0 keeps Keras' own progress output from flooding the tqdm bar.
        batch_images_np = np.array(batch_images)
        batch_embeddings = model.predict(batch_images_np, verbose=0)
        embedding_batches.append(batch_embeddings)
        all_filenames.extend(valid_files)

# np.vstack([]) fails with an opaque message, so report the real cause instead
if not embedding_batches:
    raise ValueError(
        f"No embeddings were extracted: no readable images found in {RESIZED_DIR}"
    )

# Concatenate all batches into a single array
all_embeddings = np.vstack(embedding_batches)
assert all_embeddings.shape[0] == len(all_filenames), "embeddings/filenames out of sync"
print(f"Embeddings shape: {all_embeddings.shape}")
print(f"Total filenames recorded: {len(all_filenames)}")

# Save embeddings and filenames to .npy files
np.save(OUTPUT_EMBEDDINGS_PATH, all_embeddings)
np.save(OUTPUT_FILENAMES_PATH, np.array(all_filenames))
print(f"Saved embeddings to {OUTPUT_EMBEDDINGS_PATH}")
print(f"Saved filenames to {OUTPUT_FILENAMES_PATH}")

# Optionally, save embeddings and filenames as a CSV (one column per dimension)
print("Saving embeddings and filenames to CSV...")
embeddings_df = pd.DataFrame(all_embeddings, columns=[f'embedding_{i}' for i in range(all_embeddings.shape[1])])
embeddings_df['filename'] = all_filenames
embeddings_df.to_csv(OUTPUT_CSV_PATH, index=False)
print(f"Saved CSV to {OUTPUT_CSV_PATH}")

print("Embedding extraction complete!")

^ Create an embedding for each resized image (2048 dimensions each, produced by ResNet50). Each dimension is stored in its own column.

In [None]:
import pandas as pd

# Read the wide table (one column per embedding dimension) back from disk
embeddings_df = pd.read_csv(OUTPUT_CSV_PATH)


def _row_as_list(row):
    """Return the values of one DataFrame row as a plain Python list."""
    return row.tolist()


# Start from the filename column, then attach a single 'embeddings' column
# holding, for each row, the list of all remaining (non-filename) values.
one_dimensional_df = embeddings_df[['filename']].copy()
one_dimensional_df['embeddings'] = (
    embeddings_df
    .drop(columns=['filename'])
    .apply(_row_as_list, axis=1)
)

# Persist the two-column layout
one_dimensional_df.to_csv('oneDimensionalEmbeddings.csv', index=False)

^ Join all dimensions into a single array per image, leaving 2 columns: one for the filename and one for the embedding (2048 dimensions).

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Load the 2048D embeddings (columns: 'filename', 'embeddings') from CSV.
# The packing step of this notebook writes 'oneDimensionalEmbeddings.csv';
# '2048DimensionalEmbeddings.csv' is tried first so that an existing,
# manually renamed copy keeps working, and the generated file is the fallback
# so the notebook also runs top-to-bottom on a fresh checkout.
try:
    df = pd.read_csv('2048DimensionalEmbeddings.csv')
except FileNotFoundError:
    df = pd.read_csv('oneDimensionalEmbeddings.csv')

# Function to clean up the embedding (from string to numpy array)
def clean_embeddings(embedding_str):
    """Parse a stringified list of numbers into a NumPy array.

    Args:
        embedding_str: Text of a Python list literal, e.g. "[0.1, 0.2, 0.3]".

    Returns:
        np.ndarray built from the parsed list.

    Raises:
        ValueError / SyntaxError: if the text is not a plain Python literal.
    """
    # literal_eval only accepts literals, so text read from a CSV file can
    # never execute code the way eval() would.
    import ast  # local import: this cell's import block does not provide it
    return np.array(ast.literal_eval(embedding_str))

# Apply PCA and save the reduced embeddings to different files
def reduce_and_save(df, n_components, output_file, random_state=42):
    """Reduce the embeddings in ``df`` with PCA and write them to a CSV file.

    Args:
        df: DataFrame with an 'embeddings' column (stringified lists, parsed
            with ``clean_embeddings``) and a 'filename' column.
        n_components: Integer number of PCA components to keep.
        output_file: Path of the CSV file to write.
        random_state: Seed forwarded to PCA so that runs are reproducible when
            scikit-learn selects a randomized solver.

    Raises:
        ValueError: if ``n_components`` is larger than what the data allows.

    The CSV has columns 'embeddings_1' .. 'embeddings_<n_components>' followed
    by 'filename'.
    """
    # Convert string embeddings to numpy arrays
    embeddings = np.array([clean_embeddings(e) for e in df['embeddings']])

    # PCA cannot produce more components than min(n_samples, n_features);
    # fail early with a message that names the actual limit.
    max_components = min(embeddings.shape)
    if n_components > max_components:
        raise ValueError(
            f"n_components={n_components} exceeds the maximum of {max_components} "
            f"allowed by embeddings of shape {embeddings.shape}"
        )

    # Apply PCA for dimensionality reduction
    pca = PCA(n_components=n_components, random_state=random_state)
    reduced_embeddings = pca.fit_transform(embeddings)

    # Create a DataFrame with reduced embeddings
    reduced_df = pd.DataFrame(reduced_embeddings, columns=[f'embeddings_{i+1}' for i in range(n_components)])

    # Add the filename column positionally. Assigning the Series itself would
    # align on index labels and yield NaN if df does not use a 0..n-1 index.
    reduced_df['filename'] = df['filename'].to_numpy()

    # Save the reduced DataFrame to a CSV file
    reduced_df.to_csv(output_file, index=False)
    print(f"Saved {n_components} -dimensional embeddings to {output_file}")

# Write one reduced CSV per target dimensionality, largest first
for n_dims in (512, 256, 128, 64):
    reduce_and_save(df, n_dims, f'{n_dims}DimensionalEmbedding.csv')


^ Create 4 reduced versions with fewer dimensions (512, 256, 128 and 64) for faster processing.