## 1. Import Necessary Libraries

In [20]:
# Import Necessary Libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import metrics

## 2. Load Data

Make sure to verify the file paths if you're running on a different platform.

In [21]:
# Load Data
train_df = pd.read_csv('/kaggle/input/bttai-ajl-2025/train.csv')
test_df = pd.read_csv('/kaggle/input/bttai-ajl-2025/test.csv')

# Generate file paths correctly
train_df['file_path'] = train_df.apply(
    lambda row: f"/kaggle/input/bttai-ajl-2025/train/train/{row['label']}/{row['md5hash']}.jpg", axis=1
)
test_df['file_path'] = test_df['md5hash'].apply(
    lambda x: f"/kaggle/input/bttai-ajl-2025/test/test/{x}.jpg"
)

# Remove invalid rows
train_df = train_df[(train_df['fitzpatrick_scale'] > 0) & (train_df['label'].notna())]
train_df = train_df[train_df['file_path'].apply(os.path.exists)]
test_df = test_df[test_df['file_path'].apply(os.path.exists)]


## 3. Data Preprocessing


This section demonstrates basic preprocessing techniques. To enhance data quality and model performance, consider incorporating more advanced preprocessing methods.

For further guidance, feel free to take a look at the [Image Preprocessing tutorial](https://colab.research.google.com/drive/1-ItNcRMbZBE6BCwPT-wD8m3YmHqwHxme?usp=sharing)  available in the 'Resources' section of this Kaggle competition.


In [29]:
# Data Preprocessing

# Encode the labels
label_encoder = LabelEncoder()
train_df['encoded_label'] = label_encoder.fit_transform(train_df['label'])


# Splitting dataset into training and validation datasets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42, stratify=train_df['encoded_label'])



# Define image data generators for training and testing
train_datagen = ImageDataGenerator(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    brightness_range=[0.9, 1.1],
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

train_generator = train_datagen.flow_from_dataframe(
    train_df,
    x_col='file_path',
    y_col='encoded_label',
    target_size=(224, 224),
    batch_size=128,
    class_mode='raw'
)


val_datagen = ImageDataGenerator()
val_generator = val_datagen.flow_from_dataframe(
    val_df,
    x_col='file_path',
    target_size=(224, 224),
    batch_size=128,
    class_mode=None,
    shuffle=False
    
)

# Compute Class Weights
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_df['encoded_label']),
    y=train_df['encoded_label']
)
class_weights_dict = dict(enumerate(class_weights))

test_datagen = ImageDataGenerator()
test_generator = test_datagen.flow_from_dataframe(
    test_df,
    x_col='file_path',
    target_size=(224, 224),
    batch_size=32,
    class_mode=None,
    shuffle=False
    
)

Found 1760 validated image filenames.
Found 441 validated image filenames.
Found 1227 validated image filenames.


### ResNet50 Model

In [23]:
# Import Necessary Libraries
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import Dense, Flatten, Dropout, GlobalAveragePooling2D

In [25]:
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
base_model.trainable = False

model = Sequential([
    base_model,
    GlobalAveragePooling2D(),  # Reduces feature maps to a vector
    Dense(128, activation='relu'),
    Dropout(0.25),  # Dropout to prevent overfitting
    Dense(21, activation='softmax')  # Output layer (21 classes)
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.fit(train_generator, epochs=20,class_weight=class_weights_dict)


Epoch 1/20


  self._warn_if_super_not_called()


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 1s/step - accuracy: 0.0789 - loss: 3.2266
Epoch 2/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1s/step - accuracy: 0.1968 - loss: 2.5984
Epoch 3/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1s/step - accuracy: 0.2530 - loss: 2.2901
Epoch 4/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1s/step - accuracy: 0.3239 - loss: 2.1718
Epoch 5/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1s/step - accuracy: 0.3717 - loss: 1.9918
Epoch 6/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1s/step - accuracy: 0.4081 - loss: 1.7626
Epoch 7/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1s/step - accuracy: 0.4203 - loss: 1.6589
Epoch 8/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 1s/step - accuracy: 0.4377 - loss: 1.6243
Epoch 9/20
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x7aeb9c755810>

In [30]:

# Generate predictions
y_pred = np.argmax(model.predict(val_generator), axis=1)  # Convert probabilities to class indices
y_true = val_df['encoded_label'].values  # Get actual labels from validation dataset

# Calculate F1 Score
f1 = f1_score(y_true, y_pred, average='weighted')  # Use 'weighted' for imbalanced data
print("F1 Score:", f1)


# SUBMISSION.CSV
y_pred = np.argmax(model.predict(test_generator), axis = 1)
test_df['label'] = label_encoder.inverse_transform(y_pred)

# Save submission
test_df[['md5hash', 'label']].to_csv('/kaggle/working/submission.csv', index=False)

  self._warn_if_super_not_called()


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2s/step  
F1 Score: 0.7524760398093262


  self._warn_if_super_not_called()


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 321ms/step
