In [1]:
# Define the image directory path
image_directory = '/Users/…'

# Define number of clusters (i.e. folders or buckets)
k = 20

In [2]:
import os
import subprocess
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import shutil
import re


image_data = {}

if os.path.isdir(image_directory):
    for image_file in os.listdir(image_directory):
        if image_file.endswith(('.jpg', '.jpeg', '.png')):
            image_path = os.path.join(image_directory, image_file)

            # Extract keywords using exiftool
            exiftool_command = f'exiftool -XMP:Subject -S -s "{image_path}"'

            # Execute exiftool command and capture the output
            exiftool_process = subprocess.Popen(exiftool_command, stdout=subprocess.PIPE, shell=True)
            exiftool_output = exiftool_process.stdout.read().decode('utf-8').strip()

            if exiftool_output:
                image_data[image_path] = exiftool_output
            else:
                print(f"Error: File is empty - {image_path}")

    if image_data:
         # Create TF-IDF matrix from keywords
        keywords = list(image_data.values())

        # Custom tokenizer that treats multi-word tags as single entities without splitting them further
        def custom_tokenizer(text):
            split_by_semicolon = [phrase.strip() for phrase in text.split(';')]
            return split_by_semicolon

        # Create a custom TfidfVectorizer with the custom tokenizer
        vectorizer = TfidfVectorizer(tokenizer=custom_tokenizer)
        tfidf_matrix = vectorizer.fit_transform(keywords)

        # Apply k-means clustering
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(tfidf_matrix)

        # Create directories for each cluster
        for cluster_num in range(k):
            cluster_dir = os.path.join(image_directory, f'Cluster_{cluster_num}')
            os.makedirs(cluster_dir, exist_ok=True)

       # Copy images to their respective cluster folders
        for i, (image_path, keyword) in enumerate(image_data.items()):
            cluster = kmeans.labels_[i]
            cluster_dir = os.path.join(image_directory, f'Cluster_{cluster}')
            image_name = os.path.basename(image_path)
            new_image_path = os.path.join(cluster_dir, image_name)
            shutil.copy(image_path, new_image_path)
        
    else:
        print("No valid images with keywords found.")
else:
    print(f"Error: Directory {image_directory} not found.")

Error: File is empty - /Users/benmunson/Downloads/90s Art School Copy/3275231818248093226.jpg


Error: File is empty - /Users/benmunson/Downloads/90s Art School Copy/3275231818248093226.jpg


Error: File is empty - /Users/benmunson/Downloads/90s Art School Copy/3275231818248093226 2.jpg


Error: File is empty - /Users/benmunson/Downloads/90s Art School Copy/3275231818248093226 2.jpg




In [3]:
# Get cluster centroids
centroids = kmeans.cluster_centers_

# Print cluster centroids
print("Cluster Centroids:")
for i, centroid in enumerate(centroids):

    # Find the top keywords for each cluster based on the centroid
    feature_names = vectorizer.get_feature_names_out()
    num_top_keywords = 10  # Number of top keywords to display

for i, centroid in enumerate(centroids):
    top_keyword_indices = centroid.argsort()[-num_top_keywords:][::-1]
    top_keywords = [feature_names[ind] for ind in top_keyword_indices]
    
# Define the path for the output text file within the image_directory
output_file_path = os.path.join(image_directory, "cluster_top_keywords.txt")

# Write the top keywords information to the text file
with open(output_file_path, 'w') as file:
    for i, centroid in enumerate(centroids):
        top_keyword_indices = centroid.argsort()[-num_top_keywords:][::-1]  # Select the top three keywords
        top_keywords = [feature_names[ind] for ind in top_keyword_indices]
        
        file.write(f"Top keywords for Cluster {i + 1}:\n")
        for j, keyword in enumerate(top_keywords):
            file.write(f"{j+1}. {keyword}\n")
        file.write("\n")

print(f"Top keywords information saved to {output_file_path} in the main folder.")

Cluster Centroids:
Top keywords information saved to /Users/benmunson/Downloads/90s Art School Copy/cluster_top_keywords.txt in the main folder.
