# In [None]:  (Jupyter cell prompt left over from the notebook export)
#!/bin/python3
import os
import numpy as np
import pandas as pd
import shutil

from tqdm import tqdm
from PIL import UnidentifiedImageError

from sklearn.cluster import KMeans

# TensorFlow / Keras imports
import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# --------------------------------------------------------------------------------
# GLOBAL SETTINGS
# --------------------------------------------------------------------------------
presentation = "12-19-2024"

# Modify these paths to match your environment
DATA_DIR = f'/project/momen/milad/Presentation/{presentation}/PPI_pics_not_demeaned2/All_pics'
OUTPUT_DIR = f'/project/momen/milad/Presentation/{presentation}/PPI_pics_not_demeaned2'


# --------------------------------------------------------------------------------
# 1. BUILD & PARTIALLY UNFREEZE INCEPTIONV3 FOR 3-CLASS CLASSIFICATION
# --------------------------------------------------------------------------------
def build_inception_for_classification(num_unfreeze=20, learning_rate=1e-5):
    """
    Build an InceptionV3-based model for 3-class classification (neutral, stable, unstable).

    The ImageNet-pretrained base (loaded without its top) is followed by global
    average pooling and a 3-way softmax layer. Only the last `num_unfreeze`
    layers of the base stay trainable for fine-tuning; every earlier base layer
    is frozen.

    Args:
        num_unfreeze: Number of trailing base-model layers to leave trainable.
            0 freezes the whole base; a value larger than the number of base
            layers leaves all of them trainable.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled Keras model (categorical cross-entropy loss, accuracy metric).

    Raises:
        ValueError: If `num_unfreeze` is negative.
    """
    if num_unfreeze < 0:
        raise ValueError(f"num_unfreeze must be >= 0, got {num_unfreeze}")

    # Load the base model (no top) with ImageNet weights
    base_model = InceptionV3(weights='imagenet', include_top=False)

    # Use an explicit split index rather than negative slicing: with
    # num_unfreeze == 0, `layers[:-0]` is empty and `layers[-0:]` is every
    # layer, which would unfreeze the whole base instead of freezing it.
    first_trainable = max(len(base_model.layers) - num_unfreeze, 0)
    for index, layer in enumerate(base_model.layers):
        layer.trainable = index >= first_trainable

    # Add pooling and a final Dense layer for 3 classes
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(3, activation='softmax')(x)  # 3 classes

    model = Model(inputs=base_model.input, outputs=x)
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model
# --------------------------------------------------------------------------------
# 2. TRAIN THE CLASSIFIER
# --------------------------------------------------------------------------------
def train_classifier(model, data_dir, target_size=(299, 299), batch_size=16, epochs=15):
    """
    Fit `model` on the images under `data_dir`, which holds one subfolder per class.

    One generator configuration (InceptionV3 preprocessing, 20% validation
    split) serves both subsets: the 'training' subset feeds the fit and the
    'validation' subset is used as validation data.

    Returns the same `model` object after fitting.
    """
    image_gen = ImageDataGenerator(
        validation_split=0.2,  # hold out 20% of the images for validation
        preprocessing_function=preprocess_input,
    )

    # Options common to the training and validation flows
    flow_options = {
        'directory': data_dir,
        'target_size': target_size,
        'batch_size': batch_size,
        'class_mode': 'categorical',
    }

    # Training flow first, then validation flow
    train_flow = image_gen.flow_from_directory(subset='training', **flow_options)
    val_flow = image_gen.flow_from_directory(subset='validation', **flow_options)

    model.fit(train_flow, epochs=epochs, validation_data=val_flow)
    return model


# --------------------------------------------------------------------------------
# 3. FEATURE EXTRACTION (AFTER TRAINING) & CLUSTERING
# --------------------------------------------------------------------------------

def build_feature_extractor(trained_model):
    """
    Return a model mapping `trained_model`'s input to its second-to-last layer's output.

    For a classifier built by `build_inception_for_classification`, the last
    layer is the Dense(3) softmax head and the one before it is the
    GlobalAveragePooling2D layer, so the result emits pooled embeddings
    suitable for clustering.
    """
    # layers[-1] is the classification head; layers[-2] sits just before it
    penultimate_layer = trained_model.layers[-2]
    return Model(inputs=trained_model.input, outputs=penultimate_layer.output)


def extract_features_and_names(model, img_folder, target_size=(299, 299)):
    """
    Extract one feature vector per image found in the class subfolders of `img_folder`.

    Args:
        model: Keras model (e.g. the feature extractor) whose `predict` output,
            flattened, is used as the feature vector of an image.
        img_folder: Directory containing one subfolder per class. Entries that
            are not directories are ignored.
        target_size: Size passed to `load_img`; every image is resized to it
            before prediction. Defaults to (299, 299).

    Returns:
        A `(features, img_names)` tuple. `features` is a NumPy array with one
        flattened prediction per successfully loaded image, and `img_names` is
        the list of matching `(class_folder, image_file)` tuples in the same
        order. Files that cannot be loaded as images are reported and skipped.
    """
    features = []
    img_names = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the result reproducible across runs and machines.
    for class_folder in sorted(os.listdir(img_folder)):
        class_dir = os.path.join(img_folder, class_folder)
        if not os.path.isdir(class_dir):
            continue

        # For each image in that class folder
        for img_file in tqdm(sorted(os.listdir(class_dir)), desc=f'Loading {class_folder}'):
            fname = os.path.join(class_dir, img_file)

            # Guard only loading/decoding, so a failure during inference is
            # not misreported as a corrupted image and silently skipped.
            try:
                img = image.load_img(fname, target_size=target_size)
                x = img_to_array(img)
            except (UnidentifiedImageError, OSError) as e:
                print(f"Skipping corrupted image: {fname} - Error: {str(e)}")
                continue

            x = np.expand_dims(x, axis=0)
            x = preprocess_input(x)

            # verbose=0 keeps predict's per-call progress output from
            # interleaving with the tqdm bar.
            feat = model.predict(x, verbose=0).flatten()

            features.append(feat)
            img_names.append((class_folder, img_file))

    return np.array(features), img_names


# --------------------------------------------------------------------------------
# 4. MAIN SCRIPT
# --------------------------------------------------------------------------------
if __name__ == "__main__":
    # Step 1: Build the classification model
    clf_model = build_inception_for_classification(
        num_unfreeze=10,        # how many layers to unfreeze
        learning_rate=1e-4      # fine-tuning LR
    )

    # Step 2: Train the classification model
    #   This expects your /All_pics folder to have subfolders: 'neutral', 'stable', 'unstable'.
    #   Each subfolder should have the respective images.
    clf_model = train_classifier(
        model=clf_model,
        data_dir=DATA_DIR,
        target_size=(299, 299),
        batch_size=16,
        epochs=5
    )

    # Step 3: Build a feature extractor from the trained model
    #   This "chops off" the final softmax layer, returning feature vectors from the pooling layer.
    feature_extractor = build_feature_extractor(clf_model)

    # Step 4: Extract features for ALL images in the dataset
    #   We'll do it again from the same folder, but you could do a separate folder if you want.
    features, img_names = extract_features_and_names(feature_extractor, DATA_DIR)

    # KMeans cannot be fitted on an empty feature matrix; stop with a clear
    # message instead of an opaque error from inside scikit-learn.
    if len(img_names) == 0:
        raise SystemExit(f"No readable images found under {DATA_DIR}; nothing to cluster.")

    # Step 5: Perform KMeans clustering on those features
    for k in [3, 4, 5]:
        # KMeans needs at least as many samples as clusters
        if k > len(img_names):
            print(f"\nSkipping k={k}: only {len(img_names)} images available.")
            continue

        print(f"\nPerforming KMeans clustering with k={k}")
        kmeans = KMeans(n_clusters=k, random_state=40)
        kmeans.fit(features)
        labels = kmeans.labels_

        # Create a DataFrame: (class_folder, image_file, cluster)
        df = pd.DataFrame(img_names, columns=['class_folder', 'image_file'])
        df['clusterid'] = labels

        # For convenience, map cluster IDs to letters
        letters = [chr(ord('A') + i) for i in range(k)]
        cluster_dirs = {}

        # Create the base directory for k clusters
        k_clusters_dir = os.path.join(OUTPUT_DIR, f"{k}_clusters")
        os.makedirs(k_clusters_dir, exist_ok=True)

        for i, letter in enumerate(letters):
            dir_path = os.path.join(k_clusters_dir, letter)
            os.makedirs(dir_path, exist_ok=True)
            cluster_dirs[i] = dir_path

        # Copy images according to cluster.
        # Different class folders may contain files with the same name; if two
        # of them land in the same cluster, the later copy would silently
        # overwrite the earlier one. Track the names written during this pass
        # and prefix colliding files with their class folder.
        written = set()
        for class_folder, image_file, cluster_id in zip(
                df['class_folder'], df['image_file'], df['clusterid']):
            cluster_id = int(cluster_id)
            source_path = os.path.join(DATA_DIR, class_folder, image_file)

            dest_name = image_file
            if (cluster_id, dest_name) in written:
                dest_name = f"{class_folder}__{image_file}"
            written.add((cluster_id, dest_name))

            shutil.copy2(source_path, os.path.join(cluster_dirs[cluster_id], dest_name))

        print(f"Finished clustering for k={k}. Check {k_clusters_dir} for results.")
