In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def load_labels(file_path):
    """Read the label table from an Excel workbook.

    Args:
        file_path: Path of the Excel file to read with ``pd.read_excel``.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist
        (an error message is printed in that case).
    """
    try:
        labels = pd.read_excel(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    else:
        # Only reached when the read succeeded: show a short preview.
        print("Dataset Loaded Successfully!")
        print(labels.head())
        return labels

# Load the label table; load_labels prints an error and returns None when the file is missing.
# NOTE(review): the path contains a double dot ("labels..xlsx") — confirm it matches the real file name.
labels_df = load_labels("/mnt/data/labels..xlsx")
# Fail fast here: every later cell reads labels_df.
if labels_df is None:
    raise FileNotFoundError("Labels file is missing! Check file path.")


Dataset Loaded Successfully!
   Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0      family          genus  \
0             0         68711       70465   Veneridae  Hysteroconcha   
1             1          6018        6162   Cardiidae   Acrosterigma   
2             2         42500       44103  Pectinidae   Decatopecten   
3             3         32647       34008   Mytilidae   Brachidontes   
4             4         11386       11538  Carditidae        Cardita   

                  species                                           filename  \
0  Hysteroconchalupanaria  Veneridae_Hysteroconcha_lupanaria_allspira_26-...   
1  Acrosterigmaattenuatum    Cardiidae_Acrosterigma_attenuatum_bigai_010.jpg   
2    Decatopectenamiculum  Pectinidae_Decatopecten_amiculum_Poppe_263539-...   
3  Brachidontessemilaevis  Mytilidae_Brachidontes_semilaevis_Poppe_284926...   
4     Carditaplanicostata  Carditidae_Cardita_planicostata_PRI_132-view2-...   

  ansicht      order          subclass  ... family_idx  g

In [4]:
def find_rare_classes(df, level, threshold=10):
    """Return the class names of column ``level`` that occur fewer than ``threshold`` times.

    Args:
        df: DataFrame holding one row per sample.
        level: Name of the column whose values are counted.
        threshold: Classes with a count strictly below this value are reported.

    Returns:
        A list of class names in ``value_counts`` order (most frequent first).
    """
    counts = df[level].value_counts()
    return [name for name, occurrences in counts.items() if occurrences < threshold]

# Collect, per taxonomic level, the classes with fewer than 10 samples
# (10 is the default threshold of find_rare_classes).
rare_classes = {level: find_rare_classes(labels_df, level) for level in ["family", "order", "subclass"]}
print("Rare classes:", rare_classes)




Rare classes: {'family': ['Cyamiidae', 'Cleidothaeridae'], 'order': ['Cyamioidea'], 'subclass': []}


In [5]:
# Print the number of samples for each family (value_counts sorts by descending count)
family_counts = labels_df['family'].value_counts()
print("\nNumber of samples for each family:")
for family, count in family_counts.items():
    print(f"Family: {family}, Count: {count}")


Number of samples for each family:
Family: Pectinidae, Count: 9293
Family: Cardiidae, Count: 5390
Family: Tellinidae, Count: 5330
Family: Veneridae, Count: 4725
Family: Mytilidae, Count: 4141
Family: Arcidae, Count: 4038
Family: Lucinidae, Count: 3140
Family: Mactridae, Count: 2408
Family: Spondylidae, Count: 2033
Family: Glycymerididae, Count: 1999
Family: Carditidae, Count: 1707
Family: Limidae, Count: 1552
Family: Donacidae, Count: 1518
Family: Psammobiidae, Count: 1459
Family: Nuculanidae, Count: 1164
Family: Semelidae, Count: 1136
Family: Ostreidae, Count: 1117
Family: Nuculidae, Count: 1055
Family: Cuspidariidae, Count: 978
Family: Corbulidae, Count: 946
Family: Lasaeidae, Count: 944
Family: Propeamussiidae, Count: 931
Family: Chamidae, Count: 834
Family: Pinnidae, Count: 824
Family: Pharidae, Count: 799
Family: Anomiidae, Count: 761
Family: Pholadidae, Count: 757
Family: Astartidae, Count: 665
Family: Yoldiidae, Count: 663
Family: Crassatellidae, Count: 573
Family: Glossidae, C

In [6]:
# Print the number of samples for each order (value_counts sorts by descending count)
order_counts = labels_df['order'].value_counts()
print("\nNumber of samples for each order:")
for order, count in order_counts.items():
    print(f"Order: {order}, Count: {count}")


Number of samples for each order:
Order: Cardiida, Count: 15237
Order: Pectinida, Count: 13344
Order: Venerida, Count: 9826
Order: Arcida, Count: 6826
Order: Mytilida, Count: 4141
Order: Lucinida, Count: 3605
Order: Ostreida, Count: 3282
Order: Carditida, Count: 2945
Order: Myida, Count: 2275
Order: Nuculanida, Count: 1996
Order: Adapedonta, Count: 1577
Order: Limida, Count: 1552
Order: Galeommatida, Count: 1144
Order: Nuculida, Count: 1055
Order: Cuspidarioidea, Count: 978
Order: Thracioidea, Count: 536
Order: Pandoroidea, Count: 526
Order: Solemyida, Count: 222
Order: Laternuloidea, Count: 193
Order: Poromyoidea, Count: 163
Order: Myochamoidea, Count: 123
Order: Gaimardioidea, Count: 110
Order: Verticordioidea, Count: 87
Order: Gastrochaenida, Count: 71
Order: Trigoniida, Count: 67
Order: Cyamioidea, Count: 7


In [7]:
# Print the number of samples for each subclass (value_counts sorts by descending count)
subclass_counts = labels_df['subclass'].value_counts()
print("\nNumber of samples for each subclass:")
for subclass, count in subclass_counts.items():
    print(f"Subclass: {subclass}, Count: {count}")


Number of samples for each subclass:
Subclass: Imparidentia, Count: 33852
Subclass: Pteriomorphia, Count: 29145
Subclass: Protobranchia, Count: 3273
Subclass: Archiheterodonta, Count: 2945
Subclass: Anomalodesmata, Count: 2606
Subclass: Paleoheterodonta, Count: 67


In [6]:
def build_augmentation_model(shape=(224, 224, 3)):
    """Assemble the image augmentation pipeline as a Keras ``Sequential`` model.

    Args:
        shape: Input image shape as ``(height, width, channels)``.

    Returns:
        A ``tf.keras.Sequential`` that chains flip, rotation, zoom, brightness,
        contrast, Gaussian noise, translation and a crop to ``shape[0] x shape[1]``,
        followed by an identity ``Lambda`` that declares the output shape.
    """
    layers = tf.keras.layers
    target_height, target_width = shape[0], shape[1]

    pipeline = [
        layers.InputLayer(shape=shape),
        # Geometric transforms
        layers.RandomFlip("horizontal_and_vertical"),
        layers.RandomRotation(0.2),
        layers.RandomZoom(0.2),
        # Photometric transforms; brightness is told the pixel range is [0, 255]
        layers.RandomBrightness(0.1, value_range=[0, 255]),
        layers.RandomContrast(0.2),
        layers.GaussianNoise(0.02),
        layers.RandomTranslation(0.1, 0.1),
        # Crop to the same height/width as the declared input shape
        layers.RandomCrop(height=target_height, width=target_width),
        # Identity layer that only pins the static output shape
        layers.Lambda(lambda x: x, output_shape=shape),
    ]
    return tf.keras.Sequential(pipeline)


In [7]:
# Shared augmentation pipeline; later cells reference this module-level name
# (augment_dataset call and build_feature_extraction_model).
augmentation_model = build_augmentation_model(shape=(224, 224, 3))

In [8]:
IMG_SIZE = (224, 224)

def augment_image(image_path, augmentation_model):
    """Load one image from disk and return a randomly augmented copy.

    Args:
        image_path: Path of the image file to read with OpenCV.
        augmentation_model: Keras model applying the random augmentation layers.

    Returns:
        A float32 NumPy array of shape ``IMG_SIZE + (3,)`` in RGB channel order,
        or ``None`` when the image could not be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports missing/unreadable files by returning None rather
        # than raising; surface that so the caller's `is not None` check works.
        print(f"Warning: could not read image at {image_path}")
        return None

    # OpenCV decodes to BGR; convert to RGB so the caller's RGB->BGR conversion
    # before cv2.imwrite restores the original channel order.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, IMG_SIZE)
    img = tf.convert_to_tensor(img, dtype=tf.float32)

    # Keras random preprocessing layers are inactive unless called in training
    # mode, so training=True is required for any augmentation to happen.
    augmented = augmentation_model(tf.expand_dims(img, axis=0), training=True)

    # Drop only the batch axis added above.
    return tf.squeeze(augmented, axis=0).numpy()

def augment_dataset(df, image_dir, augmentation_model, target_samples=5):
    """Top up rare subclasses with augmented images written to ``image_dir``.

    For every subclass with fewer than ``target_samples`` rows, random rows of
    that class are augmented until the class reaches ``target_samples``. Each
    augmented image is saved as ``aug_<i>_<original filename>`` and a copy of
    the source row with the new filename is appended.

    Args:
        df: Label table with at least the ``subclass`` and ``filename`` columns.
        image_dir: Directory holding the images; augmented files are written here.
        augmentation_model: Keras model passed through to ``augment_image``.
        target_samples: Minimum number of rows each class should end up with.

    Returns:
        A new DataFrame (index reset) containing the original rows followed by
        the rows describing the augmented images.
    """
    augmented_samples = []

    for level in ["subclass"]:
        # Use target_samples as the rarity threshold so that every class below
        # the target is topped up, whatever value the caller passes.
        for cls in find_rare_classes(df, level, threshold=target_samples):
            class_samples = df[df[level] == cls]
            num_missing = target_samples - len(class_samples)

            for i in range(num_missing):
                sample = class_samples.sample(1).iloc[0]
                img_path = os.path.join(image_dir, sample["filename"])

                aug_img = augment_image(img_path, augmentation_model)
                if aug_img is None:
                    continue

                aug_filename = f"aug_{i}_{sample['filename']}"
                aug_path = os.path.join(image_dir, aug_filename)
                # assumes augment_image returns RGB data — TODO confirm; OpenCV writes BGR.
                cv2.imwrite(aug_path, cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR))

                new_sample = sample.copy()
                new_sample["filename"] = aug_filename
                augmented_samples.append(new_sample)

    if not augmented_samples:
        # Nothing to add: avoid concatenating with an empty frame.
        return df.reset_index(drop=True)

    return pd.concat([df, pd.DataFrame(augmented_samples)], ignore_index=True)


# Rebinds labels_df to the augmented table; re-running this cell feeds the already
# augmented table back in and writes image files into the Drive folder.
labels_df = augment_dataset(labels_df, "/content/drive/MyDrive/IMAGES_final", augmentation_model)




In [9]:
def split_dataset(df):
    """Create an 80/20 stratified train/test split per taxonomic level.

    Args:
        df: Label table containing the ``subclass`` column.

    Returns:
        A dict mapping each level name (currently only ``"subclass"``) to a
        ``(train, test)`` tuple of DataFrames. The split is stratified on that
        level and uses ``random_state=42``.
    """
    return {
        level: tuple(
            train_test_split(df, test_size=0.2, stratify=df[level], random_state=42)
        )
        for level in ["subclass"]
    }

# data_splits["subclass"] is a (train_df, test_df) tuple.
data_splits = split_dataset(labels_df)



In [10]:
def encode_labels(df, level):
    """Replace the values of column ``level`` with integer codes.

    Args:
        df: Input DataFrame; it is copied and left unmodified.
        level: Name of the categorical column to encode.

    Returns:
        ``(encoded_df, encoder)``: the copy with ``level`` integer-encoded and
        the ``LabelEncoder`` that was fitted on this DataFrame's values.
    """
    label_encoder = LabelEncoder()
    encoded_df = df.copy()  # keep the caller's frame untouched
    encoded_df[level] = label_encoder.fit_transform(encoded_df[level])
    return encoded_df, label_encoder




In [11]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd

def add_gaussian_noise(image, stddev=10.0):
    """Return ``image`` with zero-mean Gaussian noise added, clipped to [0, 255].

    Args:
        image: Image tensor; the noise is drawn with the image's dtype.
        stddev: Standard deviation of the noise, expressed on the [0, 255] scale.

    Returns:
        The noisy image tensor with values clipped to the range [0, 255].
    """
    sigma = tf.cast(stddev, tf.float32)  # keep the dtype compatible with the noise op
    noisy = image + tf.random.normal(
        shape=tf.shape(image), mean=0.0, stddev=sigma, dtype=image.dtype
    )
    return tf.clip_by_value(noisy, 0.0, 255.0)


def augment_rare_samples(df, level, target_samples=1000, stddev=10, random_state=None):
    """Oversample rare classes by duplicating rows until each has ``target_samples``.

    The labels in ``level`` are integer-encoded first. For every class with
    fewer than ``target_samples`` rows, rows of that class are drawn with
    replacement and appended after the original rows. This function only
    duplicates table rows; it does not touch image data.

    Args:
        df: Label table with at least the ``filename`` and ``level`` columns.
        level: Name of the label column to balance.
        target_samples: Minimum number of rows per class after oversampling.
        stddev: Unused here; kept so existing callers that pass it keep working.
        random_state: Optional seed forwarded to ``DataFrame.sample`` for
            reproducible duplication.

    Returns:
        A DataFrame (index reset) with the encoded original rows first and the
        duplicated rows after them. Duplicated rows keep all columns of their
        source row.
    """
    df_encoded, _ = encode_labels(df, level)

    class_counts = df_encoded[level].value_counts()
    # Number of extra rows each under-represented class still needs.
    deficits = target_samples - class_counts[class_counts < target_samples]

    duplicated_rows = [
        df_encoded[df_encoded[level] == rare_class].sample(
            int(num_needed), replace=True, random_state=random_state
        )
        for rare_class, num_needed in deficits.items()
    ]

    # Original rows stay first so callers can tell duplicates apart by position.
    return pd.concat([df_encoded, *duplicated_rows], ignore_index=True)

def create_stratified_dataset(df, image_dir, level, batch_size=128, stddev=10.0):
    """Build a class-balanced ``tf.data.Dataset`` of ``(image, label)`` batches.

    Rare classes are oversampled with ``augment_rare_samples``. One dataset per
    class is created and the classes are interleaved with equal sampling
    weights. Equal weights make batches roughly balanced while every class
    still has samples left; they do not guarantee that every batch contains
    every class. Gaussian noise is added only to the duplicated (oversampled)
    rows.

    Args:
        df: Label table with at least the ``filename`` and ``level`` columns.
        image_dir: Directory containing the image files.
        level: Name of the label column.
        batch_size: Number of samples per batch; incomplete final batches are dropped.
        stddev: Standard deviation of the noise applied to duplicated samples.

    Returns:
        A batched, prefetched dataset yielding float32 images resized to
        224x224 and int64 labels.

    Raises:
        ValueError: If ``df`` contains no rows.
    """
    df_balanced = augment_rare_samples(df, level, stddev=stddev)
    if len(df_balanced) == 0:
        raise ValueError("Cannot build a dataset from an empty DataFrame.")

    filepaths = np.array([os.path.join(image_dir, name) for name in df_balanced["filename"]])
    labels = df_balanced[level].values.astype(np.int64)
    # augment_rare_samples concatenates the duplicated rows after the original
    # rows, so every row at position >= len(df) is a duplicate.
    is_duplicate = np.arange(len(df_balanced)) >= len(df)

    def load_and_preprocess(filepath, label, duplicated):
        """Decode and resize one image; add noise only when it is a duplicate."""
        image = tf.io.read_file(filepath)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, (224, 224))
        image = tf.cast(image, tf.float32)
        image = tf.cond(
            duplicated,
            lambda: add_gaussian_noise(image, stddev),
            lambda: image,
        )
        return image, label

    # Split by class on the cheap (path, label, flag) records so that each image
    # is decoded once, instead of filtering an already decoded dataset per class.
    class_datasets = []
    for class_label in np.unique(labels):
        mask = labels == class_label
        class_datasets.append(
            tf.data.Dataset.from_tensor_slices(
                (filepaths[mask], labels[mask], is_duplicate[mask])
            )
        )

    num_classes = len(class_datasets)
    stratified_dataset = tf.data.Dataset.sample_from_datasets(
        class_datasets, weights=[1 / num_classes] * num_classes
    )

    stratified_dataset = stratified_dataset.map(
        load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE
    )

    # Batch after the per-sample preprocessing/noise step.
    return stratified_dataset.batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)




In [12]:
def preprocess_image(image_path):
    """Read a JPEG file and resize it to ``IMG_SIZE``.

    Args:
        image_path: Path (string tensor) of the image file.

    Returns:
        The decoded 3-channel image resized to ``IMG_SIZE``.
    """
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    return tf.image.resize(decoded, IMG_SIZE)

def create_dataset(df, image_dir, level, batch_size=128, encoder=None):
    """Create a batched ``tf.data.Dataset`` of ``(image, label)`` pairs.

    Args:
        df: Label table with at least the ``filename`` and ``level`` columns.
        image_dir: Directory containing the image files.
        level: Name of the label column.
        batch_size: Number of samples per batch (the final batch may be smaller).
        encoder: Optional already-fitted label encoder exposing ``transform``.
            When given, it is used so the integer codes match the ones used
            elsewhere (e.g. for training). When omitted, a new encoder is
            fitted on ``df`` via ``encode_labels``.

    Returns:
        A prefetched dataset yielding resized images and encoded labels, in the
        row order of ``df``.
    """
    filepaths = [os.path.join(image_dir, fname) for fname in df["filename"]]

    if encoder is None:
        encoded_df, _ = encode_labels(df, level)
        labels = encoded_df[level].values
    else:
        # Reuse the caller's mapping instead of fitting a new one on this split.
        labels = encoder.transform(df[level])

    dataset = tf.data.Dataset.from_tensor_slices((filepaths, labels))
    dataset = dataset.map(
        lambda path, label: (preprocess_image(path), label),
        num_parallel_calls=tf.data.AUTOTUNE,
    )
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)





In [13]:
# Train: class-balanced pipeline built from the train split (index 0 of the tuple).
train_dataset = create_stratified_dataset(data_splits["subclass"][0], "/content/drive/MyDrive/IMAGES_final", "subclass")
# Test: plain pipeline built from the test split (index 1), without oversampling.
test_dataset = create_dataset(data_splits["subclass"][1], "/content/drive/MyDrive/IMAGES_final", "subclass")

In [14]:
# Cast images to uint8 and labels to int64 in both pipelines.
# Rebinds train_dataset/test_dataset, so re-running this cell stacks another cast on top.
train_dataset = train_dataset.map(
    lambda x, y: (tf.cast(x, tf.uint8), tf.cast(y, tf.int64))
)

test_dataset = test_dataset.map(
    lambda x, y: (tf.cast(x, tf.uint8), tf.cast(y, tf.int64))
)


In [None]:
import tensorflow as tf
import collections

# Extract labels efficiently without converting dataset to NumPy
def extract_labels(dataset):
    """Collect every label of a batched ``(images, labels)`` dataset.

    Args:
        dataset: Iterable yielding ``(images, labels)`` batches.

    Returns:
        A NumPy array with all label batches concatenated along axis 0.
    """
    label_batches = [batch_labels for _, batch_labels in dataset]
    # Concatenate as tensors and convert to NumPy once at the end.
    return tf.concat(label_batches, axis=0).numpy()

# Collect all training labels; extract_labels iterates over the complete train_dataset.
all_labels = extract_labels(train_dataset)

# Count occurrences of each encoded label
label_counts_after = collections.Counter(all_labels)

# Print label distribution, sorted by label value
print("\nNumber of samples per label AFTER adding noise:\n")
for label, count in sorted(label_counts_after.items()):
    print(f"Label {label}: {count} samples")





In [None]:
# Print the shape of the first batch of each dataset
for batch_images, batch_labels in train_dataset.take(1):
    print(f"Train Dataset - Batch Images Shape: {batch_images.shape}")
    print(f"Train Dataset - Batch Labels Shape: {batch_labels.shape}")

for batch_images, batch_labels in test_dataset.take(1):
    print(f"Test Dataset - Batch Images Shape: {batch_images.shape}")
    print(f"Test Dataset - Batch Labels Shape: {batch_labels.shape}")


# Print the dtype of the first batch of each dataset
# (each take(1) loop starts a fresh iteration of the dataset)
for batch_images, batch_labels in train_dataset.take(1):
    print(f"Train Dataset - Batch Images dtype: {batch_images.dtype}")
    print(f"Train Dataset - Batch Labels dtype: {batch_labels.dtype}")

for batch_images, batch_labels in test_dataset.take(1):
    print(f"Test Dataset - Batch Images dtype: {batch_images.dtype}")
    print(f"Test Dataset - Batch Labels dtype: {batch_labels.dtype}")



In [16]:
def build_feature_extraction_model(shape=(224, 224, 3), summary=True):
    """Build a frozen ConvNeXtTiny feature extractor with a fixed input shape.

    The model chains the module-level ``augmentation_model``, an
    ImageNet-pretrained ConvNeXtTiny without its classification head, and a
    global max pooling layer that yields one feature vector per image.

    Args:
        shape: Input image shape as ``(height, width, channels)``.
        summary: When True, print the Keras model summary.

    Returns:
        A ``tf.keras.Model`` mapping image batches to pooled feature vectors.
    """
    inputs = tf.keras.layers.Input(shape=shape)
    x = augmentation_model(inputs)

    pre_trained_model = tf.keras.applications.ConvNeXtTiny(
        include_top=False,
        weights='imagenet',
        input_shape=shape,
        pooling=None
    )
    # Freeze the backbone: it is used for feature extraction only.
    pre_trained_model.trainable = False

    x = pre_trained_model(x)
    x = tf.keras.layers.GlobalMaxPooling2D()(x)

    feature_extraction_model = tf.keras.Model(inputs=inputs, outputs=x)

    if summary:
        # summary() prints by itself and returns None, so it must not be
        # wrapped in print() (that emitted a stray "None").
        feature_extraction_model.summary()

    return feature_extraction_model



In [17]:
# (height, width) from IMG_SIZE plus 3 colour channels.
INPUT_SHAPE = IMG_SIZE + (3,)
# Instantiating the model fetches the ImageNet weights when they are not cached yet.
feature_extraction_model = build_feature_extraction_model(INPUT_SHAPE)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/convnext/convnext_tiny_notop.h5
[1m111650432/111650432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


None


In [18]:
@tf.function
def batched_feature_extraction(model, image_batch):
    """Run one forward pass of ``model`` on ``image_batch`` in training mode.

    Args:
        model: Callable Keras model.
        image_batch: Batch of images to feed to the model.

    Returns:
        The model output for the batch.
    """
    # training=True is passed through to the model call.
    features = model(image_batch, training=True)
    return features

def extract_features_in_batches(model, dataset, repetitions=1):
    """Extract features for every batch of ``dataset``, optionally several times.

    Args:
        model: Feature extraction model passed to ``batched_feature_extraction``.
        dataset: Iterable yielding ``(image_batch, label_batch)`` pairs.
        repetitions: Number of full passes over ``dataset``; the results of all
            passes are concatenated.

    Returns:
        ``(X, Y)`` NumPy arrays: features stacked along axis 0 and the matching
        labels. Both are empty arrays when no batch was produced.
    """
    feature_batches, label_batches = [], []
    for i in range(repetitions):
        if i:
            print("\nRepetition", i)
        for image_batch, label_batch in dataset:
            # Store one array per batch; collecting one tensor per row and
            # converting the whole list with np.array is far slower.
            label_batches.append(np.asarray(label_batch))
            feature_batches.append(
                np.asarray(batched_feature_extraction(model, image_batch))
            )

    if not feature_batches:
        return np.array([]), np.array([])

    return np.concatenate(feature_batches, axis=0), np.concatenate(label_batches, axis=0)


In [19]:
def extract_features_with_predict(model, dataset):
    """Extract features with ``model.predict`` and gather the matching labels.

    Args:
        model: Keras model used for prediction.
        dataset: Batched dataset yielding ``(images, labels)`` pairs.

    Returns:
        ``(X, Y)``: the prediction output for the whole dataset and a NumPy
        array of all labels. The labels come from a second iteration over
        ``dataset``, so both passes must yield samples in the same order.
    """
    features = model.predict(dataset, verbose=1)
    # Second pass over the dataset, this time keeping only the labels.
    label_batches = [batch_labels for _, batch_labels in dataset]
    labels = tf.concat(label_batches, axis=0).numpy()
    return features, labels


In [20]:
### START YOUR CODE HERE ###  (≈2 LOC)

# Training features and labels: manual batch loop, model called with training=True
# (see batched_feature_extraction).
train_features, train_labels = extract_features_in_batches(feature_extraction_model, train_dataset)

# Test features and labels: obtained through model.predict
test_features, test_labels = extract_features_with_predict(feature_extraction_model, test_dataset)



[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 2s/step


In [21]:
# Report the size of the extracted feature matrices for both splits.
print("Shape of train_features:", train_features.shape)
print("Number of samples:", len(train_features))

# The second pair describes the test split, so label it as such.
print("Shape of test_features:", test_features.shape)
print("Number of samples:", len(test_features))


Shape of train_features: (58368, 768)
Number of samples: 58368
Shape of train_features: (14378, 768)
Number of samples: 14378


In [22]:
# Baseline: unweighted random forest trained on the extracted features.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(train_features, train_labels)

# Evaluate on the held-out test features.
predictions = rf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)

print(f"Test Accuracy: {accuracy}")
print(classification_report(test_labels, predictions))


Test Accuracy: 0.6859785783836416
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       521
           1       0.50      0.01      0.02       589
           2       0.64      0.87      0.74      6771
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00       655
           5       0.76      0.68      0.72      5829

    accuracy                           0.69     14378
   macro avg       0.32      0.26      0.25     14378
weighted avg       0.63      0.69      0.64     14378



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# -- A) Weighted Random Forest -----------------------------------------------
# Same forest as the baseline, but with class_weight='balanced'.
print("\n=== Weighted Random Forest ===")
rf_weighted = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced'  # <-- Important for imbalance
)
rf_weighted.fit(train_features, train_labels)

# Evaluate on the same test features as the baseline.
predictions_weighted = rf_weighted.predict(test_features)
accuracy_weighted = accuracy_score(test_labels, predictions_weighted)

print(f"Weighted RF Test Accuracy: {accuracy_weighted}")
print(classification_report(test_labels, predictions_weighted))




=== Weighted Random Forest ===
Weighted RF Test Accuracy: 0.6601057170677423
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       521
           1       1.00      0.00      0.01       589
           2       0.61      0.88      0.72      6771
           3       0.00      0.00      0.00        13
           4       0.00      0.00      0.00       655
           5       0.76      0.61      0.68      5829

    accuracy                           0.66     14378
   macro avg       0.40      0.25      0.23     14378
weighted avg       0.64      0.66      0.61     14378



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [25]:
# -- B) Balanced Bagging Classifier ------------------------------------------
# Bagging ensemble of 10 random forests (100 trees each) from imbalanced-learn.
print("\n=== BalancedBaggingClassifier (with RF base) ===")
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    n_estimators=10,
    random_state=42
)
bbc.fit(train_features, train_labels)

# Evaluate on the same test features as the other classifiers.
predictions_bbc = bbc.predict(test_features)
accuracy_bbc = accuracy_score(test_labels, predictions_bbc)

print(f"BalancedBaggingClassifier Test Accuracy: {accuracy_bbc}")
print(classification_report(test_labels, predictions_bbc))


=== BalancedBaggingClassifier (with RF base) ===
BalancedBaggingClassifier Test Accuracy: 0.3809987480873557
              precision    recall  f1-score   support

           0       0.24      0.46      0.32       521
           1       0.07      0.60      0.13       589
           2       0.74      0.41      0.53      6771
           3       0.01      0.77      0.02        13
           4       0.17      0.24      0.20       655
           5       0.75      0.33      0.46      5829

    accuracy                           0.38     14378
   macro avg       0.33      0.47      0.28     14378
weighted avg       0.67      0.38      0.46     14378

