In [24]:
import os
import cv2
import numpy as np
import pandas as pd
import json
from PIL import Image
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models, callbacks
import tensorflow.keras.backend as K

In [25]:
gpu_ids = [4,5,6,7]  # Example: using 4 out of 7 GPUs (can customize which ones)
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in gpu_ids])

In [26]:
gpus = tf.config.list_physical_devices('GPU')
print(f"Available GPUs: {len(gpus)}")
print(f"Using GPUs with IDs: {gpu_ids}")

Available GPUs: 3
Using GPUs with IDs: [4, 5, 6, 7]


In [27]:
# Configure multi-GPU strategy
strategy = tf.distribute.MirroredStrategy()
print(f'Number of devices: {strategy.num_replicas_in_sync}')

# Set mixed precision policy for A100s
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2')
Number of devices: 3


In [28]:
BATCH_SIZE = 2 * strategy.num_replicas_in_sync
IMG_SIZE = (256, 256)

In [29]:
def load_and_preprocess_images(root_dir, target_size=(512, 512), test_size=0.2, val_size=0.1, batch_size=8):
    """
    Load and preprocess images from the specified directory structure.
    
    Args:
        root_dir (str): Path to the root directory containing fat percentage folders
        target_size (tuple): Target size for resizing images (height, width)
        test_size (float): Proportion of data for testing
        val_size (float): Proportion of training data for validation
        batch_size (int): Batch size for data generators
        
    Returns:
        Tuple of (train_gen, val_gen, test_gen, class_indices)
    """
    # Initialize lists to store images and labels
    images = []
    labels = []
    class_names = []
    class_indices = {}
    
    # Walk through directory structure
    for fat_dir in sorted(os.listdir(root_dir)):
        fat_path = os.path.join(root_dir, fat_dir)
        if not os.path.isdir(fat_path):
            continue
            
        for conc_dir in sorted(os.listdir(fat_path)):
            conc_path = os.path.join(fat_path, conc_dir)
            if not os.path.isdir(conc_path):
                continue
                
            # Handle no-adulteration case (0%)
            if conc_dir == '0':
                for img_file in os.listdir(conc_path):
                    if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                        img_path = os.path.join(conc_path, img_file)
                        label = 'no_adulteration'
                        
                        # Add to class indices if not present
                        if label not in class_indices:
                            class_indices[label] = len(class_indices)
                            class_names.append(label)
                            
                        images.append(img_path)
                        labels.append(class_indices[label])
            else:
                # Handle adulteration cases (5%, 10%, 15%)
                for adulterant_dir in os.listdir(conc_path):
                    adulterant_path = os.path.join(conc_path, adulterant_dir)
                    if not os.path.isdir(adulterant_path):
                        continue
                        
                    label = f"{adulterant_dir}_{conc_dir}%"
                    
                    # Add to class indices if not present
                    if label not in class_indices:
                        class_indices[label] = len(class_indices)
                        class_names.append(label)
                        
                    for img_file in os.listdir(adulterant_path):
                        if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
                            img_path = os.path.join(adulterant_path, img_file)
                            images.append(img_path)
                            labels.append(class_indices[label])
    
    # Split into train, validation, and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        images, labels, test_size=test_size, stratify=labels, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=val_size, stratify=y_train, random_state=42
    )
    
    # Create data generators with preprocessing
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=20,
        width_shift_range=0.1,
        height_shift_range=0.1,
        shear_range=0.1,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        fill_mode='nearest'
    )
    
    val_test_datagen = ImageDataGenerator(rescale=1./255)
    
    def create_generator(data_gen, X, y, shuffle=False):
        df = pd.DataFrame({'filename': X, 'class': y})
        return data_gen.flow_from_dataframe(
            dataframe=df,
            x_col='filename',
            y_col='class',
            target_size=target_size,
            batch_size=batch_size,
            class_mode='raw',
            shuffle=shuffle,
            color_mode='rgb'
        )
    
    train_gen = create_generator(train_datagen, X_train, y_train, shuffle=True)
    val_gen = create_generator(val_test_datagen, X_val, y_val)
    test_gen = create_generator(val_test_datagen, X_test, y_test)
    
    # Save class indices for reference
    with open('class_indices.json', 'w') as f:
        json.dump(class_indices, f)
    
    return train_gen, val_gen, test_gen, class_indices


def preprocess_single_image(image_path, target_size=(512, 512)):
    """
    Preprocess a single image for prediction.
    
    Args:
        image_path (str): Path to the image file
        target_size (tuple): Target size for resizing
        
    Returns:
        Preprocessed image as numpy array
    """
    # Load image
    img = Image.open(image_path)
    
    # Convert to RGB if not already
    if img.mode != 'RGB':
        img = img.convert('RGB')
    
    # Resize
    img = img.resize(target_size)
    
    # Convert to array and normalize
    img_array = np.array(img) / 255.0
    
    # Add batch dimension
    img_array = np.expand_dims(img_array, axis=0)
    
    return img_array


def patch_based_processing(image_path, patch_size=256, overlap=64):
    """
    Process large images by dividing into patches.
    
    Args:
        image_path (str): Path to the image file
        patch_size (int): Size of square patches
        overlap (int): Overlap between patches
        
    Returns:
        List of patches as numpy arrays
    """
    img = Image.open(image_path)
    if img.mode != 'RGB':
        img = img.convert('RGB')
    
    width, height = img.size
    patches = []
    
    # Calculate step size
    step = patch_size - overlap
    
    # Extract patches
    for y in range(0, height - overlap, step):
        for x in range(0, width - overlap, step):
            box = (x, y, x + patch_size, y + patch_size)
            patch = img.crop(box)
            patch_array = np.array(patch) / 255.0
            patches.append(patch_array)
    
    return patches


# if __name__ == "__main__":
#     # Example usage
#     root_directory = "/Users/sohail/Documents/Salmaan/ACPS_Lab/Project/dataset/RGB"
#     train_gen, val_gen, test_gen, class_indices = load_and_preprocess_images(root_directory)
    
#     print(f"Found {len(class_indices)} classes: {class_indices}")
#     print(f"Train batches: {len(train_gen)}")
#     print(f"Validation batches: {len(val_gen)}")
#     print(f"Test batches: {len(test_gen)}")

In [30]:
from tensorflow.keras import layers, models, callbacks
import matplotlib.pyplot as plt

In [31]:
root_directory = "/home/nitin/salmaan/RGB"
train_gen, val_gen, test_gen, class_indices = load_and_preprocess_images(
    root_directory,
    target_size=(256, 256),  # Reduced from 512x512
    batch_size=BATCH_SIZE
    )

print(f"Found {len(class_indices)} classes: {class_indices}")
print(f"Train batches: {len(train_gen)}")
print(f"Validation batches: {len(val_gen)}")
print(f"Test batches: {len(test_gen)}")

Found 775 validated image filenames.
Found 87 validated image filenames.
Found 216 validated image filenames.
Found 13 classes: {'no_adulteration': 0, 'Detergent_10%': 1, 'Shampoo_10%': 2, 'Water_10%': 3, 'StarchPowder_10%': 4, 'Detergent_15%': 5, 'Shampoo_15%': 6, 'Water_15%': 7, 'StarchPowder_15%': 8, 'Detergent_5%': 9, 'Shampoo_5%': 10, 'Water_5%': 11, 'StarchPowder_5%': 12}
Train batches: 130
Validation batches: 15
Test batches: 36


In [32]:
def create_model(num_classes=13):
    """Create model inside the strategy scope"""
    with strategy.scope():
        # Use EfficientNetB2 as compromise between size and performance
        model = models.Sequential([
        # Input layer
        layers.Input(shape=(256,256,3)),
        
        # First convolution block
        layers.Conv2D(32, (5, 5), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.2),
        
        # Second convolution block
        layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.3),
        
        # Third convolution block
        layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.3),
        
        # Fourth convolution block
        layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.MaxPooling2D((2, 2)),
        layers.Dropout(0.4),
        
        # Classifier head
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation='softmax')
    ])
    
    # Custom learning rate
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model

In [33]:
callbacks = [
    callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True),
    callbacks.ModelCheckpoint('best_model_multi_gpu.h5', save_best_only=True),
    callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5),
    callbacks.TerminateOnNaN(),
    callbacks.BackupAndRestore('backup')  # Important for multi-GPU training
]

In [34]:
with strategy.scope():
    model = create_model(num_classes=len(class_indices))
    model.summary()

    # Training
    history = model.fit(
        train_gen,
        epochs=5,
        validation_data=val_gen,
        callbacks=callbacks,
        verbose=1
    )

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

2025-04-04 08:14:37.832413: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:786] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorDataset/_1"
op: "TensorDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_INT32
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 1
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\017TensorDataset:0"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_INT32
        }
      }
    }
  }
}



Epoch 1/5
INFO:tensorflow:Collective all_reduce tensors: 22 all_reduces, num_devices = 3, group_size = 3, implementation = CommunicationImplementation.NCCL, num_packs = 1
INFO:tensorflow:Collective all_reduce tensors: 22 all_reduces, num_devices = 3, group_size = 3, implementation = CommunicationImplementation.NCCL, num_packs = 1


2025-04-04 08:14:51.703093: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8907
2025-04-04 08:14:51.759369: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8907
2025-04-04 08:14:51.777789: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8907
2025-04-04 08:14:52.719807: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_2_bfc) ran out of memory trying to allocate 1.06GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-04-04 08:14:52.762181: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_1_bfc) ran out of memory trying to allocate 1.06GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2025-04-04 08:14:52.799803: W tensorflow

  6/130 [>.............................] - ETA: 4:10 - loss: 3.6437 - accuracy: 0.0556

2025-04-04 08:15:10.718919: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fcbd0005550 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-04 08:15:10.722764: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-04-04 08:15:10.722783: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (1): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-04-04 08:15:10.722793: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (2): NVIDIA A100-SXM4-40GB, Compute Capability 8.0
2025-04-04 08:15:10.970976: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-04-04 08:15:12.157637: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of th



2025-04-04 08:17:24.970439: W tensorflow/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.04GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.




2025-04-04 08:21:08.239643: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:786] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorDataset/_1"
op: "TensorDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_INT32
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 1
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\020TensorDataset:35"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_INT32
        }
      }
    }
  }
}

  saving_api.save_model(


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
# Evaluation
print("\nEvaluating on test set...")
test_loss, test_acc = model.evaluate(test_gen)
print(f"Test Accuracy: {test_acc:.4f}")

# Save class indices and results
with open('class_indices.json', 'w') as f:
    json.dump(class_indices, f)

with open('training_results.txt', 'w') as f:
    f.write(f"Test Accuracy: {test_acc:.4f}\n")
    f.write(f"Final Validation Accuracy: {history.history['val_accuracy'][-1]:.4f}\n")



Evaluating on test set...


2025-04-04 08:47:41.799326: W tensorflow/core/grappler/optimizers/data/auto_shard.cc:786] AUTO sharding policy will apply DATA sharding policy as it failed to apply FILE sharding policy because of the following reason: Found an unshardable source dataset: name: "TensorDataset/_1"
op: "TensorDataset"
input: "Placeholder/_0"
attr {
  key: "Toutput_types"
  value {
    list {
      type: DT_INT32
    }
  }
}
attr {
  key: "_cardinality"
  value {
    i: 1
  }
}
attr {
  key: "metadata"
  value {
    s: "\n\021TensorDataset:222"
  }
}
attr {
  key: "output_shapes"
  value {
    list {
      shape {
      }
    }
  }
}
experimental_type {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_DATASET
    args {
      type_id: TFT_PRODUCT
      args {
        type_id: TFT_TENSOR
        args {
          type_id: TFT_INT32
        }
      }
    }
  }
}



Test Accuracy: 0.0926
