In [30]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE


In [2]:
# Mount Google Drive (Colab only) so files under /content/drive/MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def load_labels(file_path):
    """Read the label spreadsheet at ``file_path`` into a DataFrame.

    On success a confirmation message and the first rows are printed and the
    DataFrame is returned. If the file does not exist, an error message is
    printed and None is returned instead of raising.
    """
    try:
        labels = pd.read_excel(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    else:
        print("Dataset Loaded Successfully!")
        print(labels.head())
        return labels

# Load the label table; abort the notebook early if the spreadsheet is missing,
# because every later cell depends on labels_df.
# NOTE(review): the file name contains a double dot ("labels..xlsx") — confirm it
# matches the actual file on disk.
labels_df = load_labels("/mnt/data/labels..xlsx")
if labels_df is None:
    raise FileNotFoundError("Labels file is missing! Check file path.")


Dataset Loaded Successfully!
   Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0      family          genus  \
0             0         68711       70465   Veneridae  Hysteroconcha   
1             1          6018        6162   Cardiidae   Acrosterigma   
2             2         42500       44103  Pectinidae   Decatopecten   
3             3         32647       34008   Mytilidae   Brachidontes   
4             4         11386       11538  Carditidae        Cardita   

                  species                                           filename  \
0  Hysteroconchalupanaria  Veneridae_Hysteroconcha_lupanaria_allspira_26-...   
1  Acrosterigmaattenuatum    Cardiidae_Acrosterigma_attenuatum_bigai_010.jpg   
2    Decatopectenamiculum  Pectinidae_Decatopecten_amiculum_Poppe_263539-...   
3  Brachidontessemilaevis  Mytilidae_Brachidontes_semilaevis_Poppe_284926...   
4     Carditaplanicostata  Carditidae_Cardita_planicostata_PRI_132-view2-...   

  ansicht      order          subclass  ... family_idx  g

In [4]:
def find_rare_classes(df, level, threshold=10):
    """Return the values of column ``level`` that occur fewer than ``threshold`` times.

    The result is a list ordered the way ``value_counts`` orders it
    (most frequent first).
    """
    counts = df[level].value_counts()
    return [name for name, occurrences in counts.items() if occurrences < threshold]

# Identify rare classes (fewer than the default threshold of 10 samples)
# separately for each taxonomic level.
rare_classes = {level: find_rare_classes(labels_df, level) for level in ["family", "order", "subclass"]}
print("Rare classes:", rare_classes)




Rare classes: {'family': ['Cyamiidae', 'Cleidothaeridae'], 'order': ['Cyamioidea'], 'subclass': []}


In [5]:
# Print the number of samples for each family (most frequent first).
family_counts = labels_df['family'].value_counts()
print("\nNumber of samples for each family:")
for family, count in family_counts.items():
    print(f"Family: {family}, Count: {count}")


Number of samples for each family:
Family: Pectinidae, Count: 9293
Family: Cardiidae, Count: 5390
Family: Tellinidae, Count: 5330
Family: Veneridae, Count: 4725
Family: Mytilidae, Count: 4141
Family: Arcidae, Count: 4038
Family: Lucinidae, Count: 3140
Family: Mactridae, Count: 2408
Family: Spondylidae, Count: 2033
Family: Glycymerididae, Count: 1999
Family: Carditidae, Count: 1707
Family: Limidae, Count: 1552
Family: Donacidae, Count: 1518
Family: Psammobiidae, Count: 1459
Family: Nuculanidae, Count: 1164
Family: Semelidae, Count: 1136
Family: Ostreidae, Count: 1117
Family: Nuculidae, Count: 1055
Family: Cuspidariidae, Count: 978
Family: Corbulidae, Count: 946
Family: Lasaeidae, Count: 944
Family: Propeamussiidae, Count: 931
Family: Chamidae, Count: 834
Family: Pinnidae, Count: 824
Family: Pharidae, Count: 799
Family: Anomiidae, Count: 761
Family: Pholadidae, Count: 757
Family: Astartidae, Count: 665
Family: Yoldiidae, Count: 663
Family: Crassatellidae, Count: 573
Family: Glossidae, C

In [6]:
# Print the number of samples for each order (most frequent first).
order_counts = labels_df['order'].value_counts()
print("\nNumber of samples for each order:")
for order, count in order_counts.items():
    print(f"Order: {order}, Count: {count}")


Number of samples for each order:
Order: Cardiida, Count: 15237
Order: Pectinida, Count: 13344
Order: Venerida, Count: 9826
Order: Arcida, Count: 6826
Order: Mytilida, Count: 4141
Order: Lucinida, Count: 3605
Order: Ostreida, Count: 3282
Order: Carditida, Count: 2945
Order: Myida, Count: 2275
Order: Nuculanida, Count: 1996
Order: Adapedonta, Count: 1577
Order: Limida, Count: 1552
Order: Galeommatida, Count: 1144
Order: Nuculida, Count: 1055
Order: Cuspidarioidea, Count: 978
Order: Thracioidea, Count: 536
Order: Pandoroidea, Count: 526
Order: Solemyida, Count: 222
Order: Laternuloidea, Count: 193
Order: Poromyoidea, Count: 163
Order: Myochamoidea, Count: 123
Order: Gaimardioidea, Count: 110
Order: Verticordioidea, Count: 87
Order: Gastrochaenida, Count: 71
Order: Trigoniida, Count: 67
Order: Cyamioidea, Count: 7


In [7]:
# Print the number of samples for each subclass (most frequent first).
subclass_counts = labels_df['subclass'].value_counts()
print("\nNumber of samples for each subclass:")
for subclass, count in subclass_counts.items():
    print(f"Subclass: {subclass}, Count: {count}")


Number of samples for each subclass:
Subclass: Imparidentia, Count: 33852
Subclass: Pteriomorphia, Count: 29145
Subclass: Protobranchia, Count: 3273
Subclass: Archiheterodonta, Count: 2945
Subclass: Anomalodesmata, Count: 2606
Subclass: Paleoheterodonta, Count: 67


In [5]:
def build_augmentation_model(shape=(224, 224, 3)):
    """Build a Keras Sequential model that chains random image augmentations.

    Args:
        shape: (height, width, channels) of the images fed to the model.

    Returns:
        A ``tf.keras.Sequential`` applying, in order: flip, rotation, zoom,
        brightness, contrast, Gaussian noise, translation, a crop to
        ``shape[0]`` x ``shape[1]`` and an identity Lambda layer.

    NOTE(review): Keras random/noise layers are normally only active when the
    model is called with ``training=True`` — confirm callers do so.
    NOTE(review): the brightness layer is configured for a [0, 255] value range,
    so a noise stddev of 0.02 looks very small on that scale — confirm intent.
    """
    keras_layers = tf.keras.layers
    crop_height, crop_width = shape[0], shape[1]

    pipeline = [
        keras_layers.InputLayer(shape=shape),
        keras_layers.RandomFlip("horizontal_and_vertical"),
        keras_layers.RandomRotation(0.2),
        keras_layers.RandomZoom(0.2),
        keras_layers.RandomBrightness(0.1, value_range=[0, 255]),
        keras_layers.RandomContrast(0.2),
        keras_layers.GaussianNoise(0.02),
        keras_layers.RandomTranslation(0.1, 0.1),
        keras_layers.RandomCrop(height=crop_height, width=crop_width),
        keras_layers.Lambda(lambda x: x, output_shape=shape),
    ]
    return tf.keras.Sequential(pipeline)


In [6]:
# Shared augmentation pipeline; reused for the on-disk augmentation and inside
# the feature extraction model built later in the notebook.
augmentation_model = build_augmentation_model(shape=(224, 224, 3))

In [7]:
IMG_SIZE = (224, 224)

def augment_image(image_path, augmentation_model):
    """Load an image from disk and return one randomly augmented RGB copy.

    Args:
        image_path: Path of the image file to read.
        augmentation_model: Keras model applying the random augmentations.

    Returns:
        A float32 NumPy array of shape ``IMG_SIZE + (3,)`` in RGB channel
        order with values clipped to [0, 255], or None when the file cannot
        be read (the caller skips such samples).
    """
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None
        # instead of raising; without this guard cv2.resize would crash.
        print(f"Warning: could not read image at {image_path}")
        return None

    # OpenCV decodes to BGR. The caller converts RGB -> BGR before saving, so
    # hand back RGB; otherwise the saved file has its colour channels swapped.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = cv2.resize(img, IMG_SIZE)
    batch = tf.expand_dims(tf.convert_to_tensor(img, dtype=tf.float32), axis=0)

    # training=True is required: in inference mode the Keras random layers are
    # pass-through and the "augmented" image would equal the input.
    augmented = augmentation_model(batch, training=True)

    # Drop only the batch axis, then keep pixel values in the valid range since
    # contrast/noise layers can push them outside [0, 255].
    augmented = tf.squeeze(augmented, axis=0).numpy()
    return np.clip(augmented, 0.0, 255.0)

def augment_dataset(df, image_dir, augmentation_model, target_samples=5):
    """Create augmented image files for rare classes and add them to the table.

    Only the "order" level is processed. For every class reported by
    ``find_rare_classes`` (default threshold: fewer than 10 samples), random
    existing samples are augmented until the class has ``target_samples`` rows.

    Args:
        df: Label table with at least the columns "order" and "filename".
        image_dir: Directory holding the images; augmented images are written
            into the same directory as ``aug_{i}_{original filename}``.
        augmentation_model: Keras model passed on to ``augment_image``.
        target_samples: Number of rows each rare class should reach.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the original rows followed
        by one row per augmented image that was written successfully.

    Note:
        With the default ``target_samples=5`` a rare class that already has
        5-9 samples gets no new images, because ``target_samples - len(class)``
        is not positive.
    """
    augmented_samples = []

    for level in ["order"]:
        for cls in find_rare_classes(df, level):
            class_samples = df[df[level] == cls]
            num_missing = target_samples - len(class_samples)

            for i in range(num_missing):
                sample = class_samples.sample(1).iloc[0]
                img_path = os.path.join(image_dir, sample["filename"])

                aug_img = augment_image(img_path, augmentation_model)
                if aug_img is None:
                    continue

                aug_filename = f"aug_{i}_{sample['filename']}"
                aug_path = os.path.join(image_dir, aug_filename)
                written = cv2.imwrite(aug_path, cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR))
                if not written:
                    # Do not register a row whose image file does not exist.
                    print(f"Warning: could not write augmented image to {aug_path}")
                    continue

                new_sample = sample.copy()
                new_sample["filename"] = aug_filename
                augmented_samples.append(new_sample)

    if not augmented_samples:
        # Nothing generated: avoid concatenating an empty frame, but keep the
        # fresh 0..n-1 index that the concat path produces.
        return df.reset_index(drop=True)

    return pd.concat([df, pd.DataFrame(augmented_samples)], ignore_index=True)


# Add augmented rows for rare orders; this also writes new image files into the
# image folder and rebinds labels_df, so re-running the cell is not idempotent.
labels_df = augment_dataset(labels_df, "/content/drive/MyDrive/IMAGES_final", augmentation_model)




In [8]:
def split_dataset(df):
    """Produce an 80/20 stratified train/test split of ``df``.

    Returns:
        A dict mapping the level name ("order") to a ``(train, test)`` tuple.
        The split is stratified on that level's column and uses a fixed
        ``random_state`` of 42.
    """
    return {
        level: tuple(
            train_test_split(df, test_size=0.2, stratify=df[level], random_state=42)
        )
        for level in ["order"]
    }

# data_splits["order"] is a (train_df, test_df) tuple stratified by order.
data_splits = split_dataset(labels_df)



In [9]:
def encode_labels(df, level):
    """Replace the categorical column ``level`` with integer codes.

    A new ``LabelEncoder`` is fitted on every call, so the codes depend on the
    set of values present in ``df``. The input frame is left untouched.

    Returns:
        ``(encoded_df, encoder)`` — a copy of ``df`` whose ``level`` column holds
        the integer codes, and the fitted encoder.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(df[level])

    encoded_df = df.copy()  # keep the caller's DataFrame unmodified
    encoded_df[level] = codes
    return encoded_df, encoder




In [10]:
import os
import tensorflow as tf
import numpy as np
import pandas as pd

def add_gaussian_noise(image, stddev=10.0):  # stddev is expressed on the [0,255] scale
    """Return ``image`` plus zero-mean Gaussian noise, clipped to [0, 255]."""
    sigma = tf.cast(stddev, tf.float32)  # normalise the dtype of the noise scale
    noisy = image + tf.random.normal(
        shape=tf.shape(image), mean=0.0, stddev=sigma, dtype=image.dtype
    )
    return tf.clip_by_value(noisy, 0.0, 255.0)


def augment_rare_samples(df, level, target_samples=1000, stddev=10):
    """Oversample rare classes by duplicating rows until each has ``target_samples``.

    The ``level`` column is label-encoded first. For every class with fewer than
    ``target_samples`` rows, existing rows of that class are drawn with
    replacement and appended. No image is modified here: the appended rows
    reference the original filenames.

    Args:
        df: Table with at least the columns "filename" and ``level``.
        level: Name of the label column to balance.
        target_samples: Minimum number of rows per class after oversampling.
        stddev: Unused in this function; kept so existing callers keep working.

    Returns:
        A DataFrame with a fresh 0..n-1 index containing the encoded original
        rows first, followed by the duplicated rows. Duplicated rows only carry
        "filename" and ``level``; every other column is NaN for them.
    """
    df_encoded, _ = encode_labels(df, level)

    # Classes that are below the target size.
    class_counts = df_encoded[level].value_counts()
    rare_classes = class_counts[class_counts < target_samples].index.tolist()

    duplicated_frames = []
    for rare_class in rare_classes:
        rare_class_samples = df_encoded[df_encoded[level] == rare_class]
        num_needed = target_samples - len(rare_class_samples)

        # Draw the missing rows from the class itself (with replacement) and keep
        # only the two columns the downstream pipeline needs.
        sampled = rare_class_samples.sample(num_needed, replace=True)
        duplicated_frames.append(
            pd.DataFrame({
                "filename": sampled["filename"].to_numpy(),
                level: rare_class,
            })
        )

    if not duplicated_frames:
        # Nothing to add: avoid concatenating with an empty frame.
        return df_encoded.reset_index(drop=True)

    return pd.concat([df_encoded, *duplicated_frames], ignore_index=True)

def create_stratified_dataset(df, image_dir, level, batch_size=128, stddev=10.0):
    """Build a class-balanced, batched ``tf.data.Dataset`` of (image, label) pairs.

    Rare classes are first oversampled with ``augment_rare_samples``. One
    dataset per class is then created and the classes are interleaved with
    equal sampling weights. Rows that were added by the oversampling step get
    Gaussian noise so they are not exact copies of their source image.

    Args:
        df: Table with at least the columns "filename" and ``level``.
        image_dir: Directory containing the image files.
        level: Name of the label column.
        batch_size: Number of samples per batch; incomplete last batch is dropped.
        stddev: Standard deviation of the noise on the [0, 255] pixel scale.

    Returns:
        A dataset yielding ``(float32 images of shape (batch, 224, 224, 3),
        int64 labels of shape (batch,))``.

    Note:
        Equal sampling weights make every class equally likely per draw; they do
        not guarantee that each batch contains every class.
    """
    df_balanced = augment_rare_samples(df, level, stddev=stddev)

    filepaths = np.array([os.path.join(image_dir, name) for name in df_balanced["filename"]])
    labels = df_balanced[level].values.astype(np.int64)

    # augment_rare_samples keeps the len(df) original rows first and appends the
    # duplicated rows after them, so position alone identifies the duplicates.
    # (The previous check summed the indices of all rows sharing the label,
    # which says nothing about duplication, so noise was effectively never applied.)
    is_duplicate = np.arange(len(df_balanced)) >= len(df)

    def load_and_preprocess(filepath, label, duplicated):
        """Decode and resize one image; add noise only to duplicated rows."""
        image = tf.io.read_file(filepath)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, (224, 224))
        image = tf.cast(image, tf.float32)

        image = tf.cond(
            duplicated,
            lambda: add_gaussian_noise(image, stddev),
            lambda: image,
        )
        return image, label

    # One lightweight (path, label, flag) dataset per class. Splitting before any
    # image is decoded avoids re-reading the whole image set once per class.
    unique_classes = np.unique(labels)
    class_datasets = []
    for class_label in unique_classes:
        mask = labels == class_label
        class_datasets.append(
            tf.data.Dataset.from_tensor_slices(
                (filepaths[mask], labels[mask], is_duplicate[mask])
            )
        )

    # Draw from the classes with equal probability.
    stratified_dataset = tf.data.Dataset.sample_from_datasets(
        class_datasets, weights=[1/len(unique_classes)]*len(unique_classes)
    )

    # Decode/augment after sampling, then batch.
    stratified_dataset = stratified_dataset.map(
        load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE
    )
    stratified_dataset = stratified_dataset.batch(batch_size, drop_remainder=True).prefetch(tf.data.AUTOTUNE)

    return stratified_dataset




In [11]:
def preprocess_image(image_path):
    """Read a JPEG file, decode it to 3 channels and resize it to ``IMG_SIZE``."""
    raw_bytes = tf.io.read_file(image_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    return tf.image.resize(decoded, IMG_SIZE)

def create_dataset(df, image_dir, level):
    """Build a batched ``tf.data.Dataset`` of (image, encoded label) pairs.

    File paths are formed by joining ``image_dir`` with the "filename" column,
    labels come from ``encode_labels`` applied to ``df``. Batches hold 128 samples.

    NOTE(review): the encoder is fitted on ``df`` itself, so the codes only match
    those of another split when both contain the same set of classes — confirm.
    """
    filepaths = [os.path.join(image_dir, name) for name in df["filename"]]
    encoded_df, _ = encode_labels(df, level)
    labels = encoded_df[level].values

    def _load(path, label):
        return preprocess_image(path), label

    return (
        tf.data.Dataset.from_tensor_slices((filepaths, labels))
        .map(_load, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(128)
        .prefetch(tf.data.AUTOTUNE)
    )





In [12]:
# Training data: oversampled, class-interleaved pipeline. Test data: plain pipeline
# over the untouched test split. Both are built for the "order" level.
train_dataset = create_stratified_dataset(data_splits["order"][0], "/content/drive/MyDrive/IMAGES_final", "order")
test_dataset = create_dataset(data_splits["order"][1], "/content/drive/MyDrive/IMAGES_final", "order")

In [13]:
# Cast image batches to uint8 and labels to int64 so both datasets share the
# same element dtypes.
train_dataset = train_dataset.map(
    lambda x, y: (tf.cast(x, tf.uint8), tf.cast(y, tf.int64))
)

test_dataset = test_dataset.map(
    lambda x, y: (tf.cast(x, tf.uint8), tf.cast(y, tf.int64))
)


In [17]:
import tensorflow as tf

def get_label_counts(dataset, num_classes):
    """Count how often each label occurs in ``dataset`` in a single pass.

    Args:
        dataset: Dataset yielding ``(features, labels)`` elements.
        num_classes: Length of the returned count vector.

    Returns:
        An int64 tensor of shape ``[num_classes]`` with the per-label counts.
    """
    def _accumulate(running_total, labels):
        # Flatten the batch and histogram it into num_classes bins.
        batch_counts = tf.math.bincount(
            tf.reshape(labels, [-1]),
            minlength=num_classes,
            dtype=tf.int64,  # keep the accumulator dtype stable
        )
        return running_total + batch_counts

    # Keep only the labels, as int64.
    label_stream = dataset.map(lambda x, y: tf.cast(y, tf.int64))

    zero_counts = tf.zeros([num_classes], dtype=tf.int64)
    return label_stream.reduce(zero_counts, _accumulate)




In [None]:
# Count how many training samples carry each encoded label.
# The "order" column has 26 distinct values in this dataset, hence 26 classes.
num_classes = 26
label_counts = get_label_counts(train_dataset, num_classes)

# Convert the count tensor to a NumPy array for printing.
label_counts_np = label_counts.numpy()

# Print the distribution
for label, count in enumerate(label_counts_np):
    print(f"Label {label}: {count} samples")

In [14]:
# Print shape of the first batch
for batch_images, batch_labels in train_dataset.take(1):
    print(f"Train Dataset - Batch Images Shape: {batch_images.shape}")
    print(f"Train Dataset - Batch Labels Shape: {batch_labels.shape}")
    print(f"Train Dataset - Batch Images dtype: {batch_images.dtype}")
    print(f"Train Dataset - Batch Labels dtype: {batch_labels.dtype}")

for batch_images, batch_labels in test_dataset.take(1):
    print(f"Test Dataset - Batch Images Shape: {batch_images.shape}")
    print(f"Test Dataset - Batch Labels Shape: {batch_labels.shape}")
    print(f"Test Dataset - Batch Images dtype: {batch_images.dtype}")
    print(f"Test Dataset - Batch Labels dtype: {batch_labels.dtype}")





Train Dataset - Batch Images Shape: (128, 224, 224, 3)
Train Dataset - Batch Labels Shape: (128,)
Test Dataset - Batch Images Shape: (128, 224, 224, 3)
Test Dataset - Batch Labels Shape: (128,)
Train Dataset - Batch Images dtype: <dtype: 'uint8'>
Train Dataset - Batch Labels dtype: <dtype: 'int64'>
Test Dataset - Batch Images dtype: <dtype: 'uint8'>
Test Dataset - Batch Labels dtype: <dtype: 'int64'>


In [28]:
def build_feature_extraction_model(shape=(224, 224, 3), summary=True):
    """Build a frozen ConvNeXtTiny feature extractor with a fixed input shape.

    The model applies the notebook-level ``augmentation_model`` to its input,
    runs the ImageNet-pretrained ConvNeXtTiny backbone without its
    classification head and reduces the feature map with global max pooling.

    Args:
        shape: (height, width, channels) of the input images.
        summary: When True, print the Keras model summary.

    Returns:
        A ``tf.keras.Model`` mapping image batches to one feature vector per image.
    """
    inputs = tf.keras.layers.Input(shape=shape)
    x = augmentation_model(inputs)

    pre_trained_model = tf.keras.applications.ConvNeXtTiny(
        include_top=False,
        weights='imagenet',
        input_shape=shape,
        pooling=None
    )
    # Freeze the backbone: it is used purely as a fixed feature extractor.
    pre_trained_model.trainable = False

    x = pre_trained_model(x)
    x = tf.keras.layers.GlobalMaxPooling2D()(x)

    feature_extraction_model = tf.keras.Model(inputs=inputs, outputs=x)

    if summary:
        # summary() prints by itself and returns None; wrapping it in print()
        # only added a stray "None" line to the output.
        feature_extraction_model.summary()

    return feature_extraction_model



In [16]:
# Full input shape: (height, width) from IMG_SIZE plus 3 colour channels.
INPUT_SHAPE = IMG_SIZE + (3,)
feature_extraction_model = build_feature_extraction_model(INPUT_SHAPE)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/convnext/convnext_tiny_notop.h5
[1m111650432/111650432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


None


In [21]:
@tf.function
def batched_feature_extraction(model, image_batch):
    """Run one forward pass of ``model`` on ``image_batch`` with ``training=True``."""
    features = model(image_batch, training=True)
    return features

def extract_features_in_batches(model, dataset, repetitions):
    """Extract features for every batch of ``dataset``, ``repetitions`` times over.

    Each batch goes through ``batched_feature_extraction`` (forward pass with
    ``training=True``), so features and labels are collected from the same
    iteration and stay aligned.

    Args:
        model: Feature extraction model.
        dataset: Dataset yielding ``(image_batch, label_batch)`` pairs.
        repetitions: Number of full passes over ``dataset``.

    Returns:
        ``(X, Y)`` NumPy arrays: features with one row per sample and the
        matching labels. Both are empty arrays when no batch was produced.
    """
    feature_batches, label_batches = [], []
    for i in range(repetitions):
        if i:
            print("\nRepetition", i)
        for image_batch, label_batch in dataset:
            features = batched_feature_extraction(model, image_batch)
            # Convert once per batch; collecting per-sample tensors and calling
            # np.array on the resulting list converts every sample individually.
            feature_batches.append(features.numpy())
            label_batches.append(label_batch.numpy())

    if not feature_batches:
        return np.array([]), np.array([])

    return np.concatenate(feature_batches, axis=0), np.concatenate(label_batches, axis=0)


In [22]:
def extract_features_with_predict(model, dataset):
    """Compute features with ``model.predict`` and collect the matching labels.

    The dataset is iterated twice: once by ``predict`` and once to gather labels.

    NOTE(review): features and labels only line up if ``dataset`` yields its
    elements in the same order on both passes — confirm for shuffled inputs.

    Returns:
        ``(X, Y)`` — the predicted feature array and a NumPy array of labels.
    """
    features = model.predict(dataset, verbose=1)
    label_batches = [labels for _, labels in dataset]
    return features, tf.concat(label_batches, axis=0).numpy()


In [23]:
# Extract backbone features for both splits.

# Training features and labels (one pass, forward pass run with training=True)
train_features, train_labels = extract_features_in_batches(feature_extraction_model, train_dataset, repetitions=1 )

# Test features and labels (via model.predict)
test_features, test_labels = extract_features_with_predict(feature_extraction_model, test_dataset)



[1m113/113[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m231s[0m 2s/step


In [24]:
# Report the size of the extracted feature matrices for both splits.
print("Shape of train_features:", train_features.shape)
print("Number of samples:", len(train_features))

# The label previously said "train_features" for the test matrix as well.
print("Shape of test_features:", test_features.shape)
print("Number of samples:", len(test_features))


Shape of train_features: (67200, 768)
Number of samples: 67200
Shape of train_features: (14378, 768)
Number of samples: 14378


In [31]:
# Balance the training features with SMOTE.
# sampling_strategy="auto" resamples every class except the majority class.
smote = SMOTE(sampling_strategy="auto", random_state=42)  # auto = balances all minority classes

# Apply SMOTE to training data (do not apply to test data!)
X_train_smote, y_train_smote = smote.fit_resample(train_features, train_labels)

# Check new class distribution
print("Class distribution before SMOTE:", np.bincount(train_labels))
print("Class distribution after SMOTE:", np.bincount(y_train_smote))

# Report the size of the resampled training set.
print("Shape of train_features:", X_train_smote.shape)
print("Number of samples:", len(X_train_smote))

print("Shape of train_labels:", y_train_smote.shape)
print("Number of samples:", len(y_train_smote))


Class distribution before SMOTE: [ 1261  5461 12105  2356  1000  1000  1000  1000  1000  1000  1242  2884
  1820  1000  3313  1597  1000  2625  1000 10675  1000  1000  1000  1000
  7861  1000]
Class distribution after SMOTE: [12105 12105 12105 12105 12105 12105 12105 12105 12105 12105 12105 12105
 12105 12105 12105 12105 12105 12105 12105 12105 12105 12105 12105 12105
 12105 12105]
Shape of train_features: (67200, 768)
Number of samples: 67200
Shape of train_features: (14378, 768)
Number of samples: 14378


In [32]:
# Train a random forest on the SMOTE-balanced features.
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest does not depend on the number of jobs.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train_smote, y_train_smote)

# Evaluate on the untouched test features.
predictions = rf.predict(test_features)
accuracy = accuracy_score(test_labels, predictions)

print(f"Test Accuracy: {accuracy}")
# zero_division=0 reports 0.0 for classes without predictions (the value the
# default already uses) without emitting an undefined-metric warning.
print(classification_report(test_labels, predictions, zero_division=0))


Test Accuracy: 0.3547085825566838
              precision    recall  f1-score   support

           0       0.67      0.17      0.27       316
           1       0.29      0.28      0.29      1365
           2       0.40      0.44      0.42      3048
           3       0.17      0.35      0.23       589
           4       0.26      0.19      0.22       196
           5       0.00      0.00      0.00         1
           6       0.09      0.55      0.15        22
           7       0.18      0.09      0.12       229
           8       0.04      0.36      0.07        14
           9       0.10      0.46      0.17        39
          10       0.26      0.15      0.19       310
          11       0.20      0.21      0.20       721
          12       0.14      0.04      0.06       455
          13       0.02      0.08      0.03        25
          14       0.49      0.25      0.33       828
          15       0.12      0.06      0.08       399
          16       0.10      0.07      0.08    