# Define and train the CNN

- Expects the training, validation, and test data to be stored in `train_data`, `validation_data`, and `test_data`, respectively.
- the trained model will be saved as `major-minor-model.h5`. A pretrained model is already included

In [None]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint


# --- Input pipeline configuration -------------------------------------------
BATCH_SIZE = 32
SHAPE = (1025, 94)  # handed to `target_size`, which Keras reads as (height, width)
TRAIN_FOLDER = 'train_data'
VALIDATION_FOLDER = 'validation_data'
TEST_FOLDER = 'test_data'

# One generator per split. None of them augments; each only rescales pixel
# values by 1/255.
train_gen = ImageDataGenerator(rescale=1. / 255)
val_gen = ImageDataGenerator(rescale=1. / 255)
test_gen = ImageDataGenerator(rescale=1. / 255)

# Options shared by every split: single-channel images resized to SHAPE, with
# one binary label per image taken from the class sub-directory it lives in.
flow_options = dict(
    batch_size=BATCH_SIZE,
    target_size=SHAPE,
    color_mode="grayscale",
    class_mode="binary",
)

train_dataset = train_gen.flow_from_directory(directory=TRAIN_FOLDER, **flow_options)

validation_dataset = val_gen.flow_from_directory(directory=VALIDATION_FOLDER, **flow_options)

# The test iterator is not shuffled, so its batches follow the same order as
# `test_dataset.filenames` / `test_dataset.classes`.
test_dataset = test_gen.flow_from_directory(
    directory=TEST_FOLDER,
    shuffle=False,
    **flow_options,
)

# Adam optimizer with an explicit learning rate.
optimizer = keras.optimizers.Adam(learning_rate=1e-3)

# Network input: one grayscale channel on top of the SHAPE image dimensions.
input_dims = [SHAPE[0], SHAPE[1], 1]

# Convolutional part: three 3x3 ReLU convolutions (32 -> 64 -> 128 filters),
# each followed by a 2x2 max-pool with stride 2.
feature_layers = [
    keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=input_dims),
    keras.layers.MaxPooling2D(pool_size=2, strides=2),
    keras.layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    keras.layers.MaxPooling2D(pool_size=2, strides=2),
    keras.layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    keras.layers.MaxPooling2D(pool_size=2, strides=2),
]

# Classifier part: flatten, a 64-unit ReLU layer with 50% dropout, then a
# single sigmoid unit producing the binary class probability.
classifier_layers = [
    keras.layers.Flatten(),
    keras.layers.Dense(units=64, activation='relu'),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(units=1, activation='sigmoid'),
]

model = keras.models.Sequential(feature_layers + classifier_layers)

model.compile(
    optimizer=optimizer,
    loss='BinaryCrossentropy',
    metrics=['accuracy'],
)

# Number of batches needed to walk through each split once; ceil() keeps the
# final, possibly smaller, batch.
steps_per_epochs = int(np.ceil(train_dataset.samples / BATCH_SIZE))
validation_steps = int(np.ceil(validation_dataset.samples / BATCH_SIZE))

# Checkpoint target. The file name must carry the full `.h5` extension so that
# it matches the documented output name and the file the evaluation cell loads
# (`major-minor-model.h5`); the previous `.h` suffix wrote to a different path.
model_file = 'major-minor-model.h5'
# Only overwrite the file when validation accuracy improves.
checkpoint = ModelCheckpoint(model_file, monitor='val_accuracy', verbose=1, save_best_only=True)

model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epochs,
    epochs=100,
    validation_data=validation_dataset,
    validation_steps=validation_steps,
    callbacks=[checkpoint]
)

# NOTE(review): `model` still holds the weights from the last epoch here, which
# are not necessarily the best-validation weights stored in `model_file`.
test_steps = int(np.ceil(test_dataset.samples / BATCH_SIZE))
test_loss, test_accuracy = model.evaluate(test_dataset, steps=test_steps)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Raw sigmoid outputs for the test split (one value per sample).
predictions = model.predict(test_dataset, steps=test_steps)
print("Predictions: ", predictions)

## Generate performance data for the model

In [None]:
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# --- Evaluation configuration ------------------------------------------------
BATCH_SIZE = 32
SHAPE = (1025, 94)
TEST_FOLDER = 'test_data'
MODEL_FILE = 'major-minor-model.h5'

# Restore the saved network from disk.
model = load_model(MODEL_FILE)

# Rebuild the test iterator: pixel values rescaled by 1/255, grayscale input,
# binary labels, and no shuffling so predictions line up with
# `test_dataset.classes` and `test_dataset.filenames`.
test_gen = ImageDataGenerator(rescale=1. / 255)
test_dataset = test_gen.flow_from_directory(
    TEST_FOLDER,
    target_size=SHAPE,
    color_mode="grayscale",
    class_mode="binary",
    batch_size=BATCH_SIZE,
    shuffle=False,
)

# Enough batches to cover every test sample, including a partial last batch.
batches_needed = np.ceil(test_dataset.samples / BATCH_SIZE)
test_steps = int(batches_needed)

# Threshold the model outputs at 0.5 to get hard 0/1 predictions.
predictions = model.predict(test_dataset, steps=test_steps)
above_threshold = predictions > 0.5
y_pred = above_threshold.astype(int)
y_true = test_dataset.classes

# List the test files whose thresholded prediction differs from the true label.
# The index-based lookup relies on the test iterator being unshuffled, so that
# predictions, `classes` and `filenames` share the same order.
filenames = test_dataset.filenames
misclassified_indices = np.where(y_true != y_pred.flatten())[0]
misclassified_files = {filenames[i] for i in misclassified_indices}
# Fix: the count was previously printed with a stray literal `$` in front of it.
print(f"number misclassified: {len(misclassified_files)}")
print("Misclassified Samples:")
# Sort so the listing is stable between runs (set iteration order is arbitrary).
for filename in sorted(misclassified_files):
    print(filename)

# Compute all summary metrics from the true labels and hard predictions first,
# then print them.
# NOTE(review): the display names assume class index 0 is "Minor" and index 1
# is "Major"; confirm against `test_dataset.class_indices`.
conf_matrix = confusion_matrix(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
report = classification_report(
    y_true,
    y_pred,
    target_names=['Minor', 'Major'],
)

print("Confusion Matrix:")
print(conf_matrix)

print(f"F1 Score: {f1:.2f}")

print("Classification Report:")
print(report)

# Draw the confusion matrix as an annotated heatmap with integer cell counts.
class_labels = ['Minor', 'Major']
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set_xlabel('Predicted Label')
heatmap_ax.set_ylabel('True Label')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Persist the text report next to the notebook.
with open("classification_report.txt", "w") as report_file:
    report_file.write(report)