# Imports and Constants
This cell imports the necessary libraries and sets up constants for paths, image size, batch size, number of epochs, and K-Fold splits. 
- Libraries include **TensorFlow/Keras** for deep learning, OpenCV for image processing, and utilities for metrics and plotting.
- Paths specify the directories for training, testing data, and labels.


In [None]:
import os
import numpy as np
import pandas as pd
import cv2
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.callbacks import EarlyStopping
import json
import itertools

# Input locations: label table first, then the image directories
LABELS_PATH = '/kaggle/input/labels-csv/labels.csv'
TRAIN_DIR = '/kaggle/input/preprocessed-data/traffic/trainNew'
TEST_DIR = '/kaggle/input/preprocessed-data/traffic/testNew'

# Training configuration
IMAGE_SIZE = (100, 100)  # every image is resized to this size when loaded
BATCH_SIZE, EPOCHS, KFOLDS = 32, 100, 10

print("Imports and constants loaded.")

# Load Labels and Helper Functions
- Loads the label mapping from a CSV file, mapping each class ID to its corresponding name.
- Defines a helper function `load_data` to load images from the training or test directory. The function:
  - Reads each image in grayscale and resizes it to a standard size.
  - Normalizes the pixel values to the [0, 1] range by dividing by 255.
  - Returns NumPy arrays of image data and corresponding labels.


In [None]:
# Read the label table and build a ClassId -> Name lookup
labels_df = pd.read_csv(LABELS_PATH)
class_mapping = {
    class_id: class_name
    for class_id, class_name in zip(labels_df['ClassId'], labels_df['Name'])
}

# Helpers to load image data
def _read_gray_resized(img_path, height, width):
    """Read one image as grayscale and resize it; return None if it cannot be read."""
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return None
    # cv2.resize expects dsize as (width, height), while the resulting array
    # is laid out as (height, width).
    return cv2.resize(img, (width, height))


def load_data(data_dir, image_size, is_test=False):
    """Load grayscale images and integer labels from a directory.

    Args:
        data_dir: Directory to read. With ``is_test=False`` it must contain
            one sub-directory per class, named with the integer class id.
            With ``is_test=True`` it contains image files directly and the
            label is the second "_"-separated token of the file name.
        image_size: Target size as (height, width). For square sizes this is
            identical to the previous behaviour.
        is_test: Selects the flat test layout instead of the per-class layout.

    Returns:
        A tuple ``(data, labels)``: ``data`` has shape
        ``(n, image_size[0], image_size[1], 1)`` with values divided by 255,
        ``labels`` is a 1-D array with one label per loaded image.

    Raises:
        ValueError: If a class sub-directory name is not an integer.
    """
    height, width = image_size[0], image_size[1]
    data, labels = [], []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded split of it) reproducible.
    if is_test:
        for img_name in sorted(os.listdir(data_dir)):
            try:
                label = int(img_name.split("_")[1])  # Adjust based on filename structure
            except (IndexError, ValueError):
                # No "_" in the name, or a non-numeric label token: not a labelled sample.
                continue
            img = _read_gray_resized(os.path.join(data_dir, img_name), height, width)
            if img is not None:
                data.append(img)
                labels.append(label)
    else:
        for class_name in sorted(os.listdir(data_dir)):
            class_dir = os.path.join(data_dir, class_name)
            if not os.path.isdir(class_dir):
                # Stray files next to the class folders are not classes.
                continue
            label = int(class_name)
            for img_name in sorted(os.listdir(class_dir)):
                img = _read_gray_resized(os.path.join(class_dir, img_name), height, width)
                if img is not None:
                    data.append(img)
                    labels.append(label)

    # Add the trailing channel axis and scale pixel values by 1/255.
    data = np.array(data).reshape(-1, height, width, 1) / 255.0
    labels = np.array(labels)
    return data, labels

print("Helper functions loaded.")

# Load Data
- Loads the training and testing datasets using the `load_data` function.
- One-hot encodes the training and testing labels for use in categorical classification.
- Splits the training data into training and validation sets (80/20 split).
- Prints the shapes of the training, validation, and test datasets, along with the number of classes.


In [None]:
print("Loading training data...")
X_train, Y_train = load_data(TRAIN_DIR, IMAGE_SIZE)
# The class count is taken as the largest training label + 1 (assumes labels
# are 0-based integer ids); int() turns the NumPy scalar into a plain int.
NUM_CLASSES = int(np.max(Y_train)) + 1
Y_train_one_hot = to_categorical(Y_train, NUM_CLASSES)

print("Loading test data...")
X_test, Y_test = load_data(TEST_DIR, IMAGE_SIZE, is_test=True)
# Test labels are encoded with the class count derived from the training labels.
Y_test_one_hot = to_categorical(Y_test, NUM_CLASSES)

# Split training data for validation (80/20); from here on Y_train holds the
# one-hot labels of the training split, not the integer labels.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train_one_hot, test_size=0.2, random_state=42)

# Report every split, including the validation split created above.
print(f"Training data: {X_train.shape}, Validation data: {X_val.shape}, Test data: {X_test.shape}, Classes: {NUM_CLASSES}")


# CNN Model Creator Function
Defines the function `create_cnn_model` to create a customizable CNN architecture. It allows for:
- Adjustable number of convolutional layers and filters.
- Configurable dense layers, units, kernel size, and dropout rate.
- Compilation using the Adam optimizer with categorical cross-entropy loss and accuracy as a metric.


In [None]:
# Builds and compiles a CNN from the given hyperparameters
def create_cnn_model(input_shape, num_classes, conv_layers, filters, kernel_size, dense_layers, dense_units, dropout_rate):
    """Build and compile a Sequential CNN classifier.

    The network is ``conv_layers`` blocks of Conv2D -> MaxPooling2D -> Dropout
    (the filter count doubles with every block), then Flatten, then
    ``dense_layers`` blocks of Dense -> Dropout, and a softmax output layer
    with ``num_classes`` units. It is compiled with the Adam optimizer,
    categorical cross-entropy loss and the accuracy metric.
    """
    network = Sequential()

    for depth in range(conv_layers):
        # Only the first convolution declares the input shape.
        first_layer_kwargs = {'input_shape': input_shape} if depth == 0 else {}
        network.add(Conv2D(filters * (2 ** depth), kernel_size, activation='relu', **first_layer_kwargs))
        network.add(MaxPooling2D((2, 2)))
        network.add(Dropout(dropout_rate))

    network.add(Flatten())

    remaining_dense = dense_layers
    while remaining_dense > 0:
        network.add(Dense(dense_units, activation='relu'))
        network.add(Dropout(dropout_rate))
        remaining_dense -= 1

    network.add(Dense(num_classes, activation='softmax'))
    network.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return network

print("Model generator ready.")


# Hyperparameter Grid
- Defines a grid of hyperparameters for tuning the CNN model.
- Uses `itertools.product` to generate all possible combinations of parameters (e.g., layers, filters, dropout rate).
- Saves the combinations to a JSON file for subsequent testing.


In [None]:
# Search space: one list of candidate values per hyperparameter
param_grid = dict(
    conv_layers=[4],
    filters=[32, 64],
    kernel_size=[(3, 3)],
    dense_layers=[1, 2],
    dense_units=[512],
    dropout_rate=[0.5],
)

# Cartesian product of the candidate lists, in the positional order that
# evaluate_combination unpacks, written to disk for the search cell.
_param_order = ("conv_layers", "filters", "kernel_size", "dense_layers", "dense_units", "dropout_rate")
param_combinations = list(itertools.product(*(param_grid[name] for name in _param_order)))
with open("/kaggle/working/param_combinations.json", "w") as f:
    f.write(json.dumps(param_combinations))

print(f"{len(param_combinations)} parameter combinations saved.")


# Evaluate a Parameter Combination with K-Fold Cross-Validation
Defines the `evaluate_combination` function, which:
- Performs K-Fold Cross-Validation to split data into training and validation folds.
- Trains the model on `K-1` folds and evaluates it on the remaining fold (with `K = KFOLDS`).
- Tracks metrics (accuracy, precision, recall, F1 score) for each fold.
- Returns the average metrics across folds.


In [None]:
# Function to evaluate a parameter combination using K-Fold Cross-Validation
def evaluate_combination(params, X, Y, input_shape, num_classes):
    """Cross-validate one hyperparameter combination and average the fold metrics.

    Args:
        params: Sequence of six values, in order: conv_layers, filters,
            kernel_size, dense_layers, dense_units, dropout_rate.
        X: Sample array, indexed with the integer index arrays from KFold.
        Y: One-hot label array aligned with ``X`` (classes are recovered
            with ``argmax`` along axis 1).
        input_shape: Shape of a single sample, forwarded to ``create_cnn_model``.
        num_classes: Number of output classes, forwarded to ``create_cnn_model``.

    Returns:
        Dict with the mean "accuracy", "precision", "recall" and "f1_score"
        over the ``KFOLDS`` folds (precision/recall/F1 are weighted averages).
    """
    # Imported here because the notebook's import cell does not bring the
    # Keras backend into scope.
    from tensorflow.keras import backend as keras_backend

    conv_layers, filters, kernel_size, dense_layers, dense_units, dropout_rate = params
    kfold = KFold(n_splits=KFOLDS, shuffle=True, random_state=42)
    fold_metrics = []

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X)):
        print(f"Fold {fold + 1}/{KFOLDS}")
        X_fold_train, X_fold_val = X[train_idx], X[val_idx]
        Y_fold_train, Y_fold_val = Y[train_idx], Y[val_idx]

        # A new model is built for every fold of every combination; reset the
        # global Keras state left by the previous one so memory does not keep
        # growing over the whole search.
        keras_backend.clear_session()

        model = create_cnn_model(input_shape, num_classes, conv_layers, filters, kernel_size, dense_layers, dense_units, dropout_rate)
        # Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
        early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

        model.fit(X_fold_train, Y_fold_train, validation_data=(X_fold_val, Y_fold_val), epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=0, callbacks=[early_stopping])

        # Convert predicted probabilities and one-hot targets to class indices.
        Y_val_pred = model.predict(X_fold_val)
        Y_val_pred_classes = np.argmax(Y_val_pred, axis=1)
        Y_val_true_classes = np.argmax(Y_fold_val, axis=1)

        metrics = {
            "accuracy": accuracy_score(Y_val_true_classes, Y_val_pred_classes),
            "precision": precision_score(Y_val_true_classes, Y_val_pred_classes, average='weighted'),
            "recall": recall_score(Y_val_true_classes, Y_val_pred_classes, average='weighted'),
            "f1_score": f1_score(Y_val_true_classes, Y_val_pred_classes, average='weighted'),
        }
        fold_metrics.append(metrics)
        print(f"Metrics for Fold {fold + 1}: {metrics}")

    avg_metrics = {k: np.mean([m[k] for m in fold_metrics]) for k in fold_metrics[0]}
    return avg_metrics

print("Evaluation function ready.")


# Load Parameter Combinations and Evaluate
- Loads hyperparameter combinations from the JSON file.
- Evaluates the first five parameter combinations using the `evaluate_combination` function.
- Saves the results and metrics for each combination to separate JSON files.


In [None]:
# Load parameter combinations (after the JSON round-trip each combination is a
# list, and tuple values such as the kernel size come back as lists)
with open("/kaggle/working/param_combinations.json", "r") as f:
    param_combinations = json.load(f)

# Limit to the first 5 combinations for faster testing. The progress message
# counts against this subset, not against the full list.
MAX_COMBINATIONS = 5
selected_combinations = param_combinations[:MAX_COMBINATIONS]

results = []
for idx, params in enumerate(selected_combinations):
    print(f"Evaluating combination {idx + 1}/{len(selected_combinations)}: {params}")
    metrics = evaluate_combination(params, X_train, Y_train, X_train[0].shape, NUM_CLASSES)
    result = {"params": params, "metrics": metrics}
    results.append(result)
    # One result file per combination, so a partial run still leaves its output on disk.
    with open(f"/kaggle/working/results_{idx + 1}.json", "w") as f:
        json.dump(result, f)

print("Grid search complete.")


# Test the Best Model
- Identifies the best parameter combination based on F1 score from the results.
- Trains the model with the best parameters on the training split, using the held-out validation split for monitoring.
- Evaluates the model on the test set and prints the test metrics.


In [None]:
# Pick the combination with the highest cross-validated F1 score
best_result = max(results, key=lambda entry: entry["metrics"]["f1_score"])
best_params = best_result["params"]
print(f"Best parameters: {best_params}")

# Train a fresh model with those parameters, then score it on the test set
model = create_cnn_model(X_train[0].shape, NUM_CLASSES, *best_params)
model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)
test_metrics = model.evaluate(X_test, Y_test_one_hot, verbose=1)
print(f"Test metrics: {test_metrics}")


# Save the Best Model
- Trains a model using the best parameter combination on the training split, using the held-out validation split for monitoring.
- Saves the trained model to a file for future use.


In [None]:
from tensorflow.keras.models import load_model

# Re-select the winning combination (highest cross-validated F1 score)
def _f1_of(entry):
    return entry["metrics"]["f1_score"]

best_params = max(results, key=_f1_of)["params"]
print(f"Best parameters: {best_params}")

# Build and train a model with those parameters on the training split
best_model = create_cnn_model(X_train[0].shape, NUM_CLASSES, *best_params)
fit_options = dict(validation_data=(X_val, Y_val), epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
best_model.fit(X_train, Y_train, **fit_options)

# Persist the trained model to disk
model_save_path = "/kaggle/working/best_model.h5"
best_model.save(model_save_path)
print(f"Best model saved to {model_save_path}")


# Evaluate Model on Validation Data
Defines the `evaluate_model_on_validation` function to:
- Predict class labels for validation or test data.
- Compute accuracy, precision, recall, and F1 score.
- Return the metrics for evaluation.


In [None]:
# Scores a trained model on a labelled dataset (validation or test)
def evaluate_model_on_validation(model, X_val, Y_val):
    """Return accuracy and weighted precision, recall and F1 of ``model`` on ``(X_val, Y_val)``.

    ``Y_val`` is expected to be one-hot encoded: both the predictions and the
    targets are reduced to class indices with ``argmax`` along axis 1.
    """
    predicted_classes = np.argmax(model.predict(X_val), axis=1)
    true_classes = np.argmax(Y_val, axis=1)

    metrics = {"accuracy": accuracy_score(true_classes, predicted_classes)}
    weighted_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1_score", f1_score),
    )
    for metric_name, scorer in weighted_scorers:
        metrics[metric_name] = scorer(true_classes, predicted_classes, average='weighted')
    return metrics


# Compare Saved and Current Models
- Loads the previously saved model and evaluates it on the validation set.
- Compares its F1 score with the current model's score.
- Replaces the saved model if the current model performs better.


In [None]:
import os
from tensorflow.keras.models import load_model

# Location of the persisted best model
model_save_path = "/kaggle/working/best_model.h5"

# Score the previously saved model on the validation split, if one exists
last_metrics = None
if os.path.exists(model_save_path):
    print("Loading last saved model for comparison...")
    last_saved_model = load_model(model_save_path)
    last_metrics = evaluate_model_on_validation(last_saved_model, X_val, Y_val)
    print(f"Last saved model metrics: {last_metrics}")
else:
    print("No previously saved model found.")

# Train a new model with the best parameters and score it the same way
current_model = create_cnn_model(X_train[0].shape, NUM_CLASSES, *best_params)
current_model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)
current_metrics = evaluate_model_on_validation(current_model, X_val, Y_val)
print(f"Current model metrics: {current_metrics}")

# The new model replaces the saved one only when nothing was saved before or
# its F1 score is strictly higher
current_is_better = last_metrics is None or current_metrics["f1_score"] > last_metrics["f1_score"]
if not current_is_better:
    print("Last saved model is better. Keeping the previous model.")
else:
    print("Current model is better. Replacing the last saved model.")
    current_model.save(model_save_path)


# Plot Training History
Defines a function `plot_training_history` to visualize:
- Training vs. validation accuracy over epochs.
- Training vs. validation loss over epochs.
This helps analyze model convergence and diagnose overfitting or underfitting.


In [None]:
import matplotlib.pyplot as plt

# Assuming history is the training history object from the final fold

def plot_training_history(history):
    """Show training vs. validation curves for accuracy (left) and loss (right)."""
    # Pull the four recorded series up front, in the order they are plotted
    series = {
        key: history.history[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }
    epochs = range(1, len(series['accuracy']) + 1)

    # One row, two panels: (train key, val key, train label, val label, title, y label)
    panels = (
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
         'Training and Validation Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
         'Training and Validation Loss', 'Loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_key, val_key, train_label, val_label, title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, series[train_key], label=train_label)
        plt.plot(epochs, series[val_key], label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()


# `final_fold_history` is not assigned by any cell visible above
# (evaluate_combination does not keep the object returned by `fit`), so look
# it up defensively instead of failing with a NameError.
_history_to_plot = globals().get("final_fold_history")
if _history_to_plot is None:
    print("final_fold_history is not defined; keep the object returned by model.fit() under that name to plot it.")
else:
    plot_training_history(_history_to_plot)
