## 1. Import Necessary Libraries

In [10]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import metrics
from tensorflow.keras.applications import ResNet101
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.applications import ConvNeXtBase
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

## 2. Load Data + Data Preprocessing

In [3]:
# Load Data
train_df = pd.read_csv('/kaggle/input/bttai-ajl-2025/train.csv')
test_df = pd.read_csv('/kaggle/input/bttai-ajl-2025/test.csv')
print(train_df.shape[0])

# Generate file paths correctly
train_df['file_path'] = train_df.apply(
    lambda row: f"/kaggle/input/bttai-ajl-2025/train/train/{row['label']}/{row['md5hash']}.jpg", axis=1
)
test_df['file_path'] = test_df['md5hash'].apply(
    lambda x: f"/kaggle/input/bttai-ajl-2025/test/test/{x}.jpg"
)

# Data Preprocessing

# Remove invalid rows
train_df = train_df[(train_df['fitzpatrick_scale'] > 0) & (train_df['label'].notna())]
print(train_df.shape[0])

train_df = train_df[train_df['file_path'].apply(os.path.exists)]
print(train_df.shape[0])

test_df = test_df[test_df['file_path'].apply(os.path.exists)]
print(test_df.shape[0])
print()


# Encode the labels
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])


# Splitting dataset into training and validation datasets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['encoded_label'])

# Define image data generators for training and testing
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    brightness_range=[0.9, 1.1],
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    preprocessing_function=tf.keras.applications.efficientnet.preprocess_input
)

train_generator = train_datagen.flow_from_dataframe(
    train_df,
    x_col='file_path',
    y_col='encoded_label',
    target_size=(224, 224),
    batch_size=512,
    class_mode='raw',
    shuffle = True
)


val_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input)
val_generator = val_datagen.flow_from_dataframe(
    val_df,
    x_col='file_path',
    y_col='encoded_label',
    target_size=(224, 224),
    batch_size=512,
    class_mode='raw',
    shuffle=False
    
)


test_datagen = ImageDataGenerator(preprocessing_function=tf.keras.applications.efficientnet.preprocess_input)
test_generator = test_datagen.flow_from_dataframe(
    test_df,
    x_col='file_path',
    target_size=(224, 224),
    batch_size= 512,
    class_mode=None,
    shuffle=False
    
)


2860
2752
2752
1227

Found 2201 validated image filenames.
Found 551 validated image filenames.
Found 1227 validated image filenames.


In [4]:
# Common Model Parameters
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df['encoded_label']),
    y=train_df['encoded_label']
)
class_weights_dict = dict(enumerate(class_weights))

early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Initialize the ReduceLROnPlateau callback
lr_reduction = ReduceLROnPlateau(monitor='val_loss', 
                                 factor=0.5,  # Factor to reduce the learning rate
                                 patience=3,  # Number of epochs to wait before reducing
                                 min_lr=1e-6)  # Minimum learning rate

## 4. Model Training

In [20]:
# Load ConvNeXtTiny with pre-trained weights
base_model = ConvNeXtBase(weights="imagenet", include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False  # Freeze base model


# Build the model
model = models.Sequential([
    base_model,  # Base model (ConvNeXtTiny)
    layers.GlobalAveragePooling2D(),  # Pooling layer to reduce spatial dimensions
    layers.BatchNormalization(),  # Batch normalization to stabilize training
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.0005)),  # First dense layer with more units
    layers.Dropout(0.4),  # Increased dropout rate to prevent overfitting
    layers.BatchNormalization(),  # Another batch normalization layer
    layers.Dense(256, activation='relu', kernel_regularizer=regularizers.l2(0.0005)),  # Second dense layer
    layers.Dropout(0.4),  # Dropout layer for regularization
    layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.0005)),  # Third dense layer
    layers.Dropout(0.3),  # Dropout layer
    layers.Dense(21, activation='softmax')  # Output layer with 21 classes (adjust accordingly)
])

# Compile the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(train_generator, epochs=30, validation_data=val_generator, callbacks=[lr_reduction])

Epoch 1/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 8s/step - accuracy: 0.0567 - loss: 4.4915 - val_accuracy: 0.1815 - val_loss: 3.4528 - learning_rate: 0.0010
Epoch 2/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 4s/step - accuracy: 0.1713 - loss: 3.4934 - val_accuracy: 0.2795 - val_loss: 3.1992 - learning_rate: 0.0010
Epoch 3/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 4s/step - accuracy: 0.2574 - loss: 3.1648 - val_accuracy: 0.3031 - val_loss: 3.0556 - learning_rate: 0.0010
Epoch 4/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 4s/step - accuracy: 0.2867 - loss: 3.0152 - val_accuracy: 0.3249 - val_loss: 2.9915 - learning_rate: 0.0010
Epoch 5/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 4s/step - accuracy: 0.3570 - loss: 2.8068 - val_accuracy: 0.3412 - val_loss: 2.9416 - learning_rate: 0.0010
Epoch 6/30
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 4s/step - accura

<keras.src.callbacks.history.History at 0x7cc9361e8d90>

In [21]:
# Generate predictions
y_prob = model.predict(val_generator)
y_pred = np.argmax(y_prob, axis=1)
y_true = val_df['encoded_label'].values

# Print classification report
print(classification_report(y_true, y_pred))

f1 = f1_score(y_true, y_pred, average='weighted')
print("ConvNeXtTiny F1 Score:", f1)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5s/step
              precision    recall  f1-score   support

           0       0.32      0.23      0.27        26
           1       0.67      0.72      0.69        47
           2       0.35      0.29      0.32        24
           3       0.53      0.77      0.63        64
           4       0.33      0.25      0.29         8
           5       0.39      0.64      0.48        11
           6       0.62      0.48      0.54        21
           7       0.44      0.36      0.40        11
           8       0.55      0.21      0.31        28
           9       0.67      0.50      0.57        12
          10       0.76      0.77      0.76        44
          11       0.46      0.57      0.51        21
          12       0.55      0.52      0.54        21
          13       0.21      0.20      0.21        15
          14       0.65      0.50      0.57        34
          15       0.60      0.72      0.65        25
          

In [22]:
# SUBMISSION.CSV
y_pred = np.argmax(model.predict(test_generator), axis = 1)
test_df['label'] = label_encoder.inverse_transform(y_pred)

# Save submission
test_df[['md5hash', 'label']].to_csv('/kaggle/working/submission.csv', index=False)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4s/step
