In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing/manipulation, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input/bttai-ajl-2025'):
    print(dirname)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# 1. Import Necessary Libraries
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam, RMSprop

from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.applications import ResNet101
from tensorflow.keras.models import Model
from sklearn.model_selection import GridSearchCV


# Set random seed for reproducibility
np.random.seed(42)

# Explanation:
# - sklearn: for splitting data and encoding labels
# - tensorflow.keras: for building and training the neural network

In [3]:
# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Unzip Dataset
!unzip "/content/drive/My Drive/bttai-ajl-2025.zip" -d "/content/dataset"

# 3. Load Data
train_df = pd.read_csv('/content/dataset/train.csv')
test_df = pd.read_csv('/content/dataset/test.csv')

# 4. Add .jpg extension to md5hash column
train_df['md5hash'] = train_df['md5hash'].astype(str) + '.jpg'
test_df['md5hash'] = test_df['md5hash'].astype(str) + '.jpg'

# 5. Create file paths
train_df['file_path'] = '/content/dataset/train/train/' + train_df['label'] + '/' + train_df['md5hash']
test_df['file_path'] = '/content/dataset/test/test/' + test_df['md5hash']

Mounted at /content/drive
Archive:  /content/drive/My Drive/bttai-ajl-2025.zip
  inflating: /content/dataset/sample_submission.csv  
  inflating: /content/dataset/test.csv  
  inflating: /content/dataset/test/test/000e8dd5ee75dd6668e978e7a4e6fe54.jpg  
  inflating: /content/dataset/test/test/0097275da3cb707415d13d2c59cf8c8c.jpg  
  inflating: /content/dataset/test/test/009c75339a21bb84a6425be6a95938d6.jpg  
  inflating: /content/dataset/test/test/016abe4884715af85cd3f309f93b9641.jpg  
  inflating: /content/dataset/test/test/018504389f4fa566232eb6e3ff838cb8.jpg  
  inflating: /content/dataset/test/test/01ba602def4506d8bfd5900cccd2ab4d.jpg  
  inflating: /content/dataset/test/test/024a737cf57eda5493e8cb30551b4e97.jpg  
  inflating: /content/dataset/test/test/0325ba9f88358e11a6abc3a63e584bf6.jpg  
  inflating: /content/dataset/test/test/032a4ac5c1a3a8a90f6e7aede2d1ab64.jpg  
  inflating: /content/dataset/test/test/0353409eed089c8165db5a968cf2c43f.jpg  
  inflating: /content/dataset/test/t

In [4]:
# Check the first few rows to understand the structure
print(train_df.head())

                                md5hash  fitzpatrick_scale  \
0  fd06d13de341cc75ad679916c5d7e6a6.jpg                  4   
1  a4bb4e5206c4e89a303f470576fc5253.jpg                  1   
2  c94ce27e389f96bda998e7c3fa5c4a2e.jpg                  5   
3  ebcf2b50dd943c700d4e2b586fcd4425.jpg                  3   
4  c77d6c895f05fea73a8f3704307036c0.jpg                  1   

   fitzpatrick_centaur                             label nine_partition_label  \
0                    4                 prurigo-nodularis     benign-epidermal   
1                    1  basal-cell-carcinoma-morpheiform  malignant-epidermal   
2                    5                            keloid         inflammatory   
3                    3              basal-cell-carcinoma  malignant-epidermal   
4                    1                 prurigo-nodularis     benign-epidermal   

  three_partition_label            qc  ddi_scale  \
0                benign           NaN         34   
1             malignant           Na

In [5]:
# # Filter the dataset

# skin_conditions = [
#     'prurigo-nodularis', 'basal-cell-carcinoma-morpheiform', 'keloid',
#     'basal-cell-carcinoma', 'seborrheic-keratosis', 'eczema', 'folliculitis',
#     'squamous-cell-carcinoma', 'actinic-keratosis', 'mycosis-fungoides',
#     'acne-vulgaris', 'dyshidrotic-eczema', 'melanoma', 'epidermal-nevus',
#     'malignant-melanoma', 'pyogenic-granuloma'
# ]

# train_df = train_df[train_df["label"].isin(skin_conditions)]

In [6]:
# 3. Data Preprocessing
# Encode the labels
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])

# Split data into training and validation sets
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['encoded_label'])

# Compute class weights to handle imbalance
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['encoded_label']),
    y=train_df['encoded_label']
)
class_weight_dict = dict(enumerate(class_weights))
print("Class weights:", class_weight_dict)

# Data generators with augmentation to prevent overfitting
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

val_datagen = ImageDataGenerator(rescale=1./255)

# Directory paths
train_dir = '/kaggle/input/bttai-ajl-2025/train/train/'

Class weights: {0: np.float64(1.0639880952380953), 1: np.float64(0.582010582010582), 2: np.float64(1.1163153786104605), 3: np.float64(0.4152148664343786), 4: np.float64(3.1672203765227023), 5: np.float64(2.4761904761904763), 6: np.float64(1.2848158131176999), 7: np.float64(2.348111658456486), 8: np.float64(0.9523809523809523), 9: np.float64(2.1279761904761907), 10: np.float64(0.5698346284120342), 11: np.float64(1.2494539100043687), 12: np.float64(1.2494539100043687), 13: np.float64(1.746031746031746), 14: np.float64(0.7524335701131282), 15: np.float64(1.072365954255718), 16: np.float64(1.1444577831132452), 17: np.float64(1.723930078360458), 18: np.float64(2.8373015873015874), 19: np.float64(0.33462033462033464), 20: np.float64(1.6408491107286287)}


In [7]:
def create_generator(datagen, dataframe, directory, batch_size=32, target_size=(224, 224)): # Enable different generators for train, val, test sets
    """
    Template function to create image generators.
    Students should complete this function to load images and labels properly.
    """
    # Fill in the correct flow_from_dataframe parameters
    generator = train_datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col='file_path',  # Use combined path
        y_col='encoded_label',
        target_size=target_size,
        batch_size=batch_size,
        class_mode='raw',
        validate_filenames=False  # Disable strict filename validation
    )
    return generator

In [8]:
# Create generators
train_generator = create_generator(train_datagen, train_data, train_dir)
val_generator = create_generator(val_datagen, val_data, train_dir)

Found 2288 non-validated image filenames.
Found 572 non-validated image filenames.


In [9]:
def create_resnet_model(learning_rate=0.001, dropout_rate=0.5, optimizer='adam'):
    # Base ResNet101 model
    base_model = ResNet101(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

    # Freeze the base model
    base_model.trainable = False

    # Add custom layers
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dropout(dropout_rate)(x)
    x = Dense(512, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    output = Dense(len(label_encoder.classes_), activation='softmax')(x)

    model = Model(inputs=base_model.input, outputs=output)

    # Compile the model
    if optimizer == 'adam':
        opt = Adam(learning_rate=learning_rate)
    else:
        opt = RMSprop(learning_rate=learning_rate)

    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [10]:
# Define the hyperparameter grid
param_grid = {
    'learning_rate': [0.001, 0.0005],
    'dropout_rate': [0.3, 0.5],
    'optimizer': ['adam', 'rmsprop'],
    'batch_size': [32, 64]
}

In [11]:
# Get all combinations of hyperparameters
import itertools
param_combinations = list(itertools.product(*param_grid.values()))

best_f1 = 0
best_params = None
best_model = None

# Loop through each combination
for params in param_combinations:
    lr, dropout, opt, batch_size = params
    print(f"Training with params: LR={lr}, Dropout={dropout}, Optimizer={opt}, Batch Size={batch_size}")

    # Create the model
    model = create_resnet_model(learning_rate=lr, dropout_rate=dropout, optimizer=opt)

    # Train the model
    history = model.fit(
        train_generator,
        validation_data=val_generator,
        epochs=3,
        batch_size=batch_size,
        class_weight=class_weight_dict,
        verbose=1
    )

    # Evaluate on validation set
    val_generator.reset()
    val_preds = model.predict(val_generator)
    val_preds_labels = np.argmax(val_preds, axis=1)

    f1 = f1_score(val_data['encoded_label'], val_preds_labels, average='macro')
    print(f"F1 Score: {f1:.4f}")

    # Store the best model
    if f1 > best_f1:
        best_f1 = f1
        best_params = params
        best_model = model

print(f"Best F1 Score: {best_f1:.4f}")
print(f"Best Hyperparameters: {best_params}")


Training with params: LR=0.001, Dropout=0.3, Optimizer=adam, Batch Size=32
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet101_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m171446536/171446536[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


  self._warn_if_super_not_called()


Epoch 1/3
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1012s[0m 14s/step - accuracy: 0.0587 - loss: 3.3048 - val_accuracy: 0.0297 - val_loss: 3.0514
Epoch 2/3
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1048s[0m 14s/step - accuracy: 0.0417 - loss: 3.1345 - val_accuracy: 0.0315 - val_loss: 3.0241
Epoch 3/3
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m970s[0m 13s/step - accuracy: 0.0415 - loss: 3.0927 - val_accuracy: 0.0280 - val_loss: 3.0549
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 11s/step
F1 Score: 0.0089
Training with params: LR=0.001, Dropout=0.3, Optimizer=adam, Batch Size=64
Epoch 1/3
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m991s[0m 14s/step - accuracy: 0.0307 - loss: 3.3880 - val_accuracy: 0.0315 - val_loss: 3.0461
Epoch 2/3
[1m72/72[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m963s[0m 13s/step - accuracy: 0.0385 - loss: 3.0717 - val_accuracy: 0.0962 - val_loss: 3.0379
Epoch 3/3
[1m72/72[0m [

KeyboardInterrupt: 

In [12]:
# Final evaluation on validation set
val_generator.reset()
val_preds = best_model.predict(val_generator)
val_preds_labels = np.argmax(val_preds, axis=1)

# Calculate F1 score and accuracy
f1 = f1_score(val_data['encoded_label'], val_preds_labels, average='macro')
accuracy = accuracy_score(val_data['encoded_label'], val_preds_labels)

print(f"Validation F1 Score: {f1:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 11s/step
Validation F1 Score: 0.0062
Validation Accuracy: 0.0280


In [13]:
# Preprocess the test data
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    x_col='file_path',
    target_size=(224, 224),
    batch_size=32,
    class_mode=None,
    shuffle=False,
    validate_filenames=False
)


Found 1227 non-validated image filenames.


In [1]:
# Make predictions
test_generator.reset()
test_preds = best_model.predict(test_generator)
test_labels = np.argmax(test_preds, axis=1)

# Map predictions to class labels
test_df['prediction'] = label_encoder.inverse_transform(test_labels)

# Save to CSV
submission = test_df[['md5hash', 'prediction']]
submission.to_csv('resnet101_submission.csv', index=False)
print("Test predictions saved.")


NameError: name 'test_generator' is not defined