In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

# Đường dẫn đến dữ liệu
train_dir = 'C:/Users/pagmas.saint/Downloads/ai-1810-dpl-302-m-butterfly-image-classification/train/train'
test_dir = 'C:/Users/pagmas.saint/Downloads/ai-1810-dpl-302-m-butterfly-image-classification/test/test'
train_csv = 'C:/Users/pagmas.saint/Downloads/ai-1810-dpl-302-m-butterfly-image-classification/Training_set.csv'

# Đọc dữ liệu train từ CSV
df_train = pd.read_csv(train_csv)

In [5]:
# Tạo ImageDataGenerator cho augmentation và preprocessing
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    horizontal_flip=True,
    validation_split=0.1  # Chia dữ liệu train/validation
)

# Tạo generator cho dữ liệu train và validation
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    directory=train_dir,
    x_col='filename',
    y_col='label',
    target_size=(224, 224), # Adjust size as needed
    batch_size=64, # Adjust batch size as needed
    class_mode='categorical',
    subset='training'
)

validation_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    directory=train_dir,
    x_col='filename',
    y_col='label',
    target_size=(224, 224), # Adjust size as needed
    batch_size=64, # Adjust batch size as needed
    class_mode='categorical',
    subset='validation'
)

Found 4500 validated image filenames belonging to 75 classes.
Found 500 validated image filenames belonging to 75 classes.


In [3]:
# Xây dựng mô hình CNN
model = keras.Sequential([
    keras.layers.Input(shape=(224, 224, 3)),
    keras.layers.Conv2D(32, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(64, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(128, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(256, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Conv2D(512, (3, 3), activation='relu'),
    keras.layers.BatchNormalization(),
    keras.layers.MaxPooling2D((2, 2)),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation='relu'),  # Adjust units as needed
    keras.layers.Dropout(0.3), # Add dropout for regularization
    keras.layers.Dense(75, activation='softmax') # 75 classes
])
model.summary()

In [10]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.000125),  # Set initial learning rate
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Callback: Lưu lại mô hình tốt nhất dựa trên độ chính xác trên tập validation
checkpoint = ModelCheckpoint("best_model.keras", 
                             monitor="val_accuracy", 
                             save_best_only=True, 
                             mode="max", 
                             verbose=1)

# Callback: Giảm learning rate nếu validation loss không giảm
reduce_lr = ReduceLROnPlateau(monitor="val_loss", 
                              factor=0.5, 
                              patience=5, 
                              min_lr=1e-6, 
                              verbose=1)

# Callback: Dừng huấn luyện nếu mô hình không cải thiện sau nhiều epoch
early_stopping = EarlyStopping(monitor="val_accuracy", 
                               patience=10, 
                               restore_best_weights=True, 
                               verbose=1)

# Huấn luyện mô hình
history = model.fit(
    train_generator,
    epochs= 100,  # Increase epochs to let callbacks work
    validation_data=validation_generator,
    callbacks=[checkpoint, reduce_lr, early_stopping],  # Add callbacks
)

Epoch 1/100
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 726ms/step - accuracy: 0.8586 - loss: 0.4364
Epoch 1: val_accuracy improved from -inf to 0.80000, saving model to best_model.keras
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 838ms/step - accuracy: 0.8585 - loss: 0.4366 - val_accuracy: 0.8000 - val_loss: 0.9056 - learning_rate: 1.2500e-04
Epoch 2/100
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 647ms/step - accuracy: 0.8664 - loss: 0.4126
Epoch 2: val_accuracy did not improve from 0.80000
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 727ms/step - accuracy: 0.8663 - loss: 0.4127 - val_accuracy: 0.7960 - val_loss: 0.9199 - learning_rate: 1.2500e-04
Epoch 3/100
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 653ms/step - accuracy: 0.8570 - loss: 0.4480
Epoch 3: val_accuracy did not improve from 0.80000
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 733ms/step - accuracy: 0

In [None]:
# Đọc dữ liệu test và dự đoán
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    test_dir,
    target_size=(224, 224),
    batch_size=1,  # Predict one image at a time
    class_mode=None,  # No labels for test data
    shuffle=False  # Important for maintaining order
)

#load best model
model = keras.models.load_model("best_model.keras")

# Dự đoán trên tập test
predictions = model.predict(test_generator, steps=len(test_generator))
predicted_classes = np.argmax(predictions, axis=1)

# Mapping số lớp sang tên loài bướm
class_indices = train_generator.class_indices  # {'name1': 0, 'name2': 1, ...}
index_to_label = {v: k for k, v in class_indices.items()}  # Đảo ngược dict

# Chuyển class số sang tên bướm
predicted_labels = [index_to_label[idx] for idx in predicted_classes]

# Tạo submission file
filenames = test_generator.filenames
filenames = [filename.split('/')[-1] for filename in filenames]  # Extract filenames
df_submission = pd.DataFrame({'ID': filenames, 'label': predicted_labels})
df_submission.to_csv('submission.csv', index=False)

Found 1499 images belonging to 1 classes.


  self._warn_if_super_not_called()


[1m1499/1499[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step
