In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import os
import numpy as np

# ---- 1. TPU Configuration ----
# Select a tf.distribute strategy: a TPU when one can be resolved, otherwise
# every visible GPU, otherwise the default (single-device) strategy.
# Leaves `strategy` bound in all cases; `tpu` is only bound on the TPU path.
try:
    # Detect and initialize TPU
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU:', tpu.cluster_spec().as_dict()['worker'])

    # Connect to TPU cluster
    tf.config.experimental_connect_to_cluster(tpu)

    # Initialize TPU system
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # Create distribution strategy for TPU
    strategy = tf.distribute.TPUStrategy(tpu)

    print("TPU detected and configured successfully!")
    print(f"Number of accelerators: {strategy.num_replicas_in_sync}")

    # TPUs compute in bfloat16; float16 is the GPU half-precision format.
    # bfloat16 keeps the float32 exponent range, so no loss scaling is needed.
    tf.keras.mixed_precision.set_global_policy('mixed_bfloat16')
    print("Using mixed precision bfloat16 policy for TPU")

    # Print TPU device information
    print("TPU device information:")
    for device in tf.config.list_logical_devices('TPU'):
        print(f" - {device}")

except ValueError:
    print("No TPU detected, falling back to GPU/CPU.")
    # Fallback to GPU configuration
    physical_devices = tf.config.list_physical_devices('GPU')
    if physical_devices:
        # MirroredStrategy uses every visible GPU, and TensorFlow requires the
        # memory-growth setting to be the same on all of them, so set it on
        # each device rather than only the first.
        try:
            for gpu in physical_devices:
                tf.config.experimental.set_memory_growth(gpu, True)
        except RuntimeError as growth_err:
            # Raised when the GPUs were already initialized (e.g. the cell is
            # re-run in the same kernel); the earlier setting stays in effect.
            print(f"Could not change GPU memory growth: {growth_err}")
        tf.keras.mixed_precision.set_global_policy('mixed_float16')
        strategy = tf.distribute.MirroredStrategy()
        print(f"Using GPU with {strategy.num_replicas_in_sync} device(s)")
    else:
        strategy = tf.distribute.get_strategy()
        print("Using CPU")

# ---- 2. Kaggle Dataset & Model Paths ----
# Default locations: the dataset under Kaggle's input directory, the trained
# model and its checkpoints under the working directory.
DATASET_PATH = "../input/fruits"  # Adjust if needed
MODEL_PATH = "./mobilenet_fruits360_optimized.h5"
CHECKPOINT_PATH = "./checkpoints/model_checkpoint.h5"

# The checkpoint file's parent directory must exist before training starts.
checkpoint_dir = os.path.dirname(CHECKPOINT_PATH)
os.makedirs(checkpoint_dir, exist_ok=True)

# ---- 3. Check if Dataset exists ----
# When the default location is missing, report it and adopt the first of a
# few alternate locations that exists; if none exists DATASET_PATH is kept.
dataset_missing = not os.path.exists(DATASET_PATH)
if dataset_missing:
    print(f"Dataset not found at {DATASET_PATH}")
    print("Please make sure to add the 'fruits-360-dataset' to your Kaggle notebook.")
    alternate_locations = (
        "../input/fruits-360",
        "../input/fruit-images-for-object-detection",
        "../input/fruits-360_dataset",
    )
    first_existing = next(
        (candidate for candidate in alternate_locations if os.path.exists(candidate)),
        None,
    )
    if first_existing is not None:
        print(f"Found dataset at alternate location: {first_existing}")
        DATASET_PATH = first_existing

# ---- 4. Adjust batch size to be divisible by TPU cores ----
# The global batch is a fixed per-replica batch of 128 multiplied by the
# number of replicas, so it always divides evenly across them.
IMG_SIZE = 96
replica_count = strategy.num_replicas_in_sync
BATCH_SIZE = replica_count * 128
print(f"Using TPU-optimized batch size: {BATCH_SIZE}")

# ---- 5. TPU-Optimized Dataset Loading ----
# Try to find the correct training data directory
def _list_subdirs(dir_path):
    """Return the names of the immediate subdirectories of ``dir_path``."""
    return [
        name for name in os.listdir(dir_path)
        if os.path.isdir(os.path.join(dir_path, name))
    ]


def _is_loadable_dataset_root(dir_path):
    """Return True if ``dir_path`` has a layout the loading code can read.

    Two layouts qualify: a ``Training``/``Test`` pair of subdirectories, or
    class subdirectories that directly contain ``.jpg`` files.
    """
    subdirs = _list_subdirs(dir_path)
    if "Training" in subdirs and "Test" in subdirs:
        return True
    for name in subdirs:
        with os.scandir(os.path.join(dir_path, name)) as entries:
            # any() stops at the first image, so large folders are not fully listed.
            if any(entry.name.endswith(".jpg") for entry in entries):
                return True
    return False


possible_data_dirs = [
    DATASET_PATH,
    os.path.join(DATASET_PATH, "fruits-360"),
    os.path.join(DATASET_PATH, "fruits-360_dataset", "fruits-360"),
    os.path.join(DATASET_PATH, "Training")
]
existing_data_dirs = [d for d in possible_data_dirs if os.path.isdir(d)]

# Prefer a candidate whose layout is actually loadable. Accepting "any
# directory with subdirectories" would pick a wrapper folder (the first
# candidate is the root itself) ahead of the real dataset root listed later.
selected_dir = next(
    (d for d in existing_data_dirs if _is_loadable_dataset_root(d)), None
)
if selected_dir is None:
    # Nothing matched a known layout: fall back to the looser heuristic of
    # the first candidate that has any subdirectories at all.
    selected_dir = next((d for d in existing_data_dirs if _list_subdirs(d)), None)

data_dir_found = selected_dir is not None
if data_dir_found:
    print(f"Found valid dataset directory: {selected_dir}")
    DATASET_PATH = selected_dir
else:
    print("Warning: Could not automatically find the correct dataset directory structure.")
    print("Please verify the dataset path and structure manually.")

# ---- 6. TPU-Optimized Dataset Creation ----
# Builds `train_ds` / `val_ds` (tf.data pipelines of preprocessed image batches
# with one-hot labels) from image files on disk, and defines `num_classes`,
# `class_to_idx`, `steps_per_epoch` and `validation_steps` for later sections.
# Any failure is reported and re-raised.
try:
    # Imported before the layout branches so both layouts can use them.
    import glob
    from sklearn.model_selection import train_test_split

    def _collect_labeled_images(root_dir):
        """Scan ``root_dir`` for class folders and their ``*.jpg`` files.

        Returns ``(image_paths, labels, class_names, class_index)`` where
        ``labels[i]`` is the integer class index of ``image_paths[i]``.
        Class folders and files are sorted so the class indices and the
        seeded train/validation split are reproducible between runs.
        """
        class_names = sorted(
            d for d in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, d))
        )
        class_index = {cls_name: i for i, cls_name in enumerate(class_names)}

        image_paths = []
        labels = []
        for cls_name in class_names:
            # Escape the folder part so glob metacharacters in a class name
            # are matched literally.
            cls_pattern = os.path.join(
                glob.escape(os.path.join(root_dir, cls_name)), "*.jpg"
            )
            cls_images = sorted(glob.glob(cls_pattern))
            image_paths.extend(cls_images)
            labels.extend([class_index[cls_name]] * len(cls_images))
        return image_paths, labels, class_names, class_index

    # Check if we have a Training/Test directory structure first
    TRAIN_DIR = os.path.join(DATASET_PATH, "Training")
    TEST_DIR = os.path.join(DATASET_PATH, "Test")

    if os.path.exists(TRAIN_DIR) and os.path.exists(TEST_DIR):
        print("Using Training/Test directory structure")
        image_root = TRAIN_DIR
    else:
        # Alternate approach - flat directory with all classes
        print("Using flat directory structure with all classes")
        image_root = DATASET_PATH

    all_images, all_labels, class_dirs, class_to_idx = _collect_labeled_images(image_root)

    # Fail early, for either layout, instead of letting the split fail on empty input
    if not all_images:
        raise FileNotFoundError("No images found in the dataset directory")

    # Split with 90% training, 10% validation
    train_imgs, val_imgs, train_labels, val_labels = train_test_split(
        all_images, all_labels, test_size=0.1, stratify=all_labels, random_state=42
    )

    print(f"Manual split created: {len(train_imgs)} training images, {len(val_imgs)} validation images")
    num_classes = len(class_dirs)

    # TPU-optimized function to load and process images
    def decode_img(file_path):
        """Read one image file and return a float32 RGB tensor of size IMG_SIZE x IMG_SIZE in [0, 1]."""
        img = tf.io.read_file(file_path)
        # Detect the image format
        img = tf.image.decode_image(img, channels=3, expand_animations=False)
        img = tf.image.resize(img, [IMG_SIZE, IMG_SIZE])
        img = tf.cast(img, tf.float32) / 255.0  # Normalize to [0,1]
        return img

    # Create TPU-optimized datasets from file paths
    def create_tpu_optimized_dataset(image_paths, labels, num_classes, is_training=True):
        """Build a batched tf.data pipeline from file paths and integer labels.

        Args:
            image_paths: list of image file paths.
            labels: list of integer class indices, aligned with ``image_paths``.
            num_classes: depth of the one-hot label encoding.
            is_training: when True, shuffle and apply random augmentation.

        Returns:
            A dataset of ``(images, one_hot_labels)`` batches of BATCH_SIZE
            items; a final partial batch is dropped.
        """
        # Convert Python lists to TensorFlow tensors
        paths_ds = tf.data.Dataset.from_tensor_slices(image_paths)
        labels_ds = tf.data.Dataset.from_tensor_slices(labels)

        # Create a dataset of (path, label) pairs
        dataset = tf.data.Dataset.zip((paths_ds, labels_ds))

        # Shuffle if training. Only (path, label) pairs are buffered at this
        # point, so a buffer covering the whole list is cheap and gives a
        # uniform shuffle.
        if is_training:
            dataset = dataset.shuffle(buffer_size=len(image_paths))

        # Map function to process each item
        def process_path(file_path, label):
            img = decode_img(file_path)

            # Data augmentation for training
            if is_training:
                img = tf.image.random_flip_left_right(img)
                img = tf.image.random_brightness(img, 0.2)
                img = tf.image.random_contrast(img, 0.8, 1.2)
                # Brightness/contrast changes can push values outside [0, 1];
                # clip so the rescaled image stays within [0, 255].
                img = tf.clip_by_value(img, 0.0, 1.0)

            # Apply MobileNetV2 preprocessing (expects pixel values in [0, 255])
            img = tf.keras.applications.mobilenet_v2.preprocess_input(img * 255.0)

            # One-hot encode the label
            label = tf.one_hot(label, depth=num_classes)
            return img, label

        # Apply processing function to each item
        dataset = dataset.map(process_path, num_parallel_calls=tf.data.AUTOTUNE)

        # Batch the data
        dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)  # Important for TPU: drop_remainder=True

        # Cache only the deterministic (non-training) pipeline. Caching after
        # shuffle + random augmentation would replay the first epoch's order
        # and augmentations in every later epoch.
        if not is_training:
            dataset = dataset.cache()

        # Prefetch for better performance
        dataset = dataset.prefetch(tf.data.AUTOTUNE)

        return dataset

    # Create the datasets
    train_ds = create_tpu_optimized_dataset(train_imgs, train_labels, num_classes, is_training=True)
    val_ds = create_tpu_optimized_dataset(val_imgs, val_labels, num_classes, is_training=False)

    # Calculate steps (whole batches only, matching drop_remainder=True)
    steps_per_epoch = len(train_imgs) // BATCH_SIZE
    validation_steps = len(val_imgs) // BATCH_SIZE

    # Ensure at least one step
    steps_per_epoch = max(1, steps_per_epoch)
    validation_steps = max(1, validation_steps)

    print(f"Dataset prepared for TPU:")
    print(f"Number of classes: {num_classes}")
    print(f"Training samples: {len(train_imgs)}")
    print(f"Validation samples: {len(val_imgs)}")
    print(f"Steps per epoch: {steps_per_epoch}")
    print(f"Validation steps: {validation_steps}")

except Exception as e:
    print(f"Error setting up dataset: {e}")
    raise

# ---- 7. Define Model Creation Function within Strategy Scope ----
# Intended to be called inside `strategy.scope()`.
def create_model():
    """Build and compile the MobileNetV2-based classifier.

    Reads the module-level IMG_SIZE, num_classes and steps_per_epoch.

    Returns:
        A ``(model, base_model)`` tuple: the compiled classifier and the
        frozen MobileNetV2 backbone whose layers it is built on.
    """
    # ImageNet-pretrained backbone without its classification head;
    # alpha=0.75 selects the narrower (75% of filters) variant.
    backbone = MobileNetV2(
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
        alpha=0.75,
        include_top=False,
        weights="imagenet",
    )
    # Frozen for the initial training phase
    backbone.trainable = False

    # Classification head: pooling -> dense -> dropout -> softmax
    features = GlobalAveragePooling2D(name="gap")(backbone.output)
    features = Dense(128, activation="relu", name="dense_1")(features)
    features = Dropout(0.4, name="dropout_1")(features)
    # The output layer is pinned to float32 regardless of the global dtype policy
    probabilities = Dense(
        num_classes, activation="softmax", dtype='float32', name="output"
    )(features)

    classifier = Model(inputs=backbone.input, outputs=probabilities)

    # Staircase decay: multiply the rate by 0.9 every two epochs' worth of steps
    decayed_lr = tf.keras.optimizers.schedules.ExponentialDecay(
        0.001,
        decay_steps=steps_per_epoch * 2,
        decay_rate=0.9,
        staircase=True,
    )

    tracked_metrics = [
        "accuracy",
        tf.keras.metrics.TopKCategoricalAccuracy(k=3, name="top3_acc"),
    ]
    classifier.compile(
        optimizer=keras.optimizers.Adam(learning_rate=decayed_lr),
        loss="categorical_crossentropy",
        metrics=tracked_metrics,
    )

    return classifier, backbone

# Build and compile the model under the distribution strategy's scope
with strategy.scope():
    model, base_model = create_model()

# Summary of model architecture
print("Model Architecture Summary:")
model.summary()

# ---- 8. Callbacks for Better Training ----
# Keep only the checkpoint with the highest validation accuracy
checkpoint_callback = ModelCheckpoint(
    filepath=CHECKPOINT_PATH,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)

# Cut the learning rate to 20% after one epoch without val_loss improvement,
# never going below 1e-6
plateau_callback = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=1,
    min_lr=1e-6,
)

callbacks = [checkpoint_callback, plateau_callback]

# ---- 9. First try a minimal test to check TPU compatibility ----
# Runs one training step and one validation step on a single batch. If that
# fails, a TPU strategy is re-initialized and retried; if there is no TPU or
# the retry fails too, the model is rebuilt under the default strategy.
# Note: the smoke test is a real `fit` call, so it applies one optimizer step.
print("\nRunning a minimal test on TPU to check compatibility...")
try:
    # Take just one batch and run for one epoch as a test
    test_train_ds = train_ds.take(1)
    test_val_ds = val_ds.take(1)

    test_history = model.fit(
        test_train_ds,
        epochs=1,
        steps_per_epoch=1,
        validation_data=test_val_ds,
        validation_steps=1
    )

    print("TPU test successful! Proceeding with full training.")
except Exception as e:
    print(f"TPU test failed: {e}")

    workaround_succeeded = False
    # `tpu` is only bound when a TPU was detected during setup, so the
    # re-initialization is attempted only under a TPU strategy instead of
    # relying on a swallowed NameError.
    if isinstance(strategy, tf.distribute.TPUStrategy):
        print("Trying workaround for TPU compatibility...")

        # Try re-initializing TPU strategy with different settings
        try:
            tf.tpu.experimental.initialize_tpu_system(tpu)
            strategy = tf.distribute.TPUStrategy(tpu)

            # Recreate model with new strategy
            with strategy.scope():
                model, base_model = create_model()

            # Try test again, this time without validation
            test_train_ds = train_ds.take(1)
            test_history = model.fit(
                test_train_ds,
                epochs=1,
                steps_per_epoch=1,
                validation_data=None
            )

            print("TPU workaround successful! Proceeding with full training.")
            workaround_succeeded = True
        except Exception as e2:
            print(f"TPU workaround also failed: {e2}")

    if not workaround_succeeded:
        print("Falling back to CPU training...")
        strategy = tf.distribute.get_strategy()

        # Recreate model with CPU strategy
        with strategy.scope():
            model, base_model = create_model()

# ---- 10. Initial Training Phase ----
# Trains the classification head for 5 epochs with the backbone frozen. On
# failure, runs two diagnostics (single-batch step, trivial model) and then
# raises, chaining the original error.
print("\nStarting initial training phase...")

# create_model() compiles Adam with an ExponentialDecay schedule. A
# schedule-driven learning rate cannot be reassigned by ReduceLROnPlateau
# (the callback errors as soon as it tries to lower the rate), so that
# callback is left out of this phase; the schedule already handles decay.
initial_callbacks = [cb for cb in callbacks if not isinstance(cb, ReduceLROnPlateau)]

try:
    history = model.fit(
        train_ds,
        epochs=5,
        validation_data=val_ds,
        callbacks=initial_callbacks,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps
    )

    print("Initial training phase completed successfully!")
except Exception as e:
    print(f"Error during initial training: {e}")
    # Try to diagnose the issue
    print("\nTrying to diagnose the issue...")

    print("\nAttempting to run a single batch training with no validation:")
    try:
        # Take a single batch and run one step
        single_batch = next(iter(train_ds))
        loss = model.train_on_batch(single_batch[0], single_batch[1])
        print(f"Single batch training successful! Loss: {loss}")
    except Exception as batch_err:
        print(f"Single batch training failed: {batch_err}")

    # Second diagnostic: does a trivial model train on the same pipeline?
    print("\nAttempting to create a simpler model:")
    try:
        with strategy.scope():
            simple_model = tf.keras.Sequential([
                tf.keras.layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)),
                tf.keras.layers.GlobalAveragePooling2D(),
                tf.keras.layers.Dense(num_classes, activation='softmax', dtype='float32')
            ])

            simple_model.compile(
                optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )

            # Try to fit the simple model
            simple_history = simple_model.fit(
                train_ds,
                epochs=1,
                steps_per_epoch=1
            )
            print("Simple model training successful!")
    except Exception as simple_err:
        print(f"Simple model training failed: {simple_err}")

    # The main training run failed regardless of what the diagnostics showed;
    # chain the original error so its traceback is not lost.
    raise RuntimeError(
        "All training attempts failed. Please check your TPU configuration and dataset."
    ) from e

# ---- 11. Selective Fine-Tuning ----
# Unfreezes the tail of the backbone and continues training at a much lower
# learning rate. Best-effort: on any error the phase is skipped and
# `history_finetune` is set to None.
print("\nStarting fine-tuning phase...")
try:
    # Need to update the model inside TPU strategy scope
    with strategy.scope():
        # Unfreeze the last block of the MobileNetV2 model. BatchNormalization
        # layers are left frozen: the Keras transfer-learning guide recommends
        # keeping them frozen so that updates to their statistics do not undo
        # what the pretrained backbone learned.
        for layer in base_model.layers[-12:]:
            if not isinstance(layer, tf.keras.layers.BatchNormalization):
                layer.trainable = True

        # Count trainable parameters
        trainable_count = sum(tf.keras.backend.count_params(w) for w in model.trainable_weights)
        non_trainable_count = sum(tf.keras.backend.count_params(w) for w in model.non_trainable_weights)
        print(f"Trainable parameters: {trainable_count:,}")
        print(f"Non-trainable parameters: {non_trainable_count:,}")

        # Use a much smaller learning rate for fine-tuning. Recompiling is
        # required for the trainable changes to take effect; the constant
        # rate also lets ReduceLROnPlateau adjust it.
        model.compile(
            optimizer=keras.optimizers.Adam(learning_rate=1e-5),
            loss="categorical_crossentropy",
            metrics=["accuracy", tf.keras.metrics.TopKCategoricalAccuracy(k=3, name="top3_acc")]
        )

    # Fine-tune with early stopping: stop after two epochs without val_loss
    # improvement and roll back to the best weights seen in this phase.
    finetune_callbacks = callbacks + [
        EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
    ]
    history_finetune = model.fit(
        train_ds,
        epochs=5,
        validation_data=val_ds,
        callbacks=finetune_callbacks,
        steps_per_epoch=steps_per_epoch,
        validation_steps=validation_steps
    )

    print("Fine-tuning phase completed successfully!")
except Exception as e:
    print(f"Error during fine-tuning: {e}")
    print("Skipping fine-tuning phase.")
    history_finetune = None

# ---- 12. Evaluation ----
# Reports loss, accuracy and top-3 accuracy on the validation set, in the
# order the metrics were compiled. Errors are printed, not raised.
print("\nEvaluating model on validation set...")
try:
    evaluation = model.evaluate(val_ds, steps=validation_steps)
    for position, metric_label in enumerate(("loss", "accuracy", "top-3 accuracy")):
        print(f"Final validation {metric_label}: {evaluation[position]:.4f}")
except Exception as e:
    print(f"Error during evaluation: {e}")

# ---- 13. Save Models ----
# Writes the Keras model to MODEL_PATH, then a TensorFlow Lite conversion of
# it (default optimizations) as `model.tflite` in the same directory.
# Any failure along the way is printed, not raised.
try:
    model.save(MODEL_PATH)
    print(f"Saved Keras model to {MODEL_PATH}")

    # Convert to TensorFlow Lite for deployment
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_model = converter.convert()

    # Place the TF Lite file next to the Keras model
    output_dir = os.path.split(MODEL_PATH)[0]
    tflite_path = os.path.join(output_dir, 'model.tflite')
    with open(tflite_path, 'wb') as tflite_file:
        tflite_file.write(tflite_model)
    print(f"Saved TFLite model to {tflite_path}")
except Exception as e:
    print(f"Error saving models: {e}")

# ---- 14. Output class indices for later use ----
# Writes the class-name -> index mapping to class_indices.json so inference
# code can translate predictions back to names. An empty mapping is written
# when `class_to_idx` was never defined. Errors are printed, not raised.
try:
    import json

    # Copy the mapping if it exists in this scope, otherwise use an empty one
    class_indices = dict(class_to_idx) if 'class_to_idx' in locals() else {}

    with open('class_indices.json', 'w') as indices_file:
        json.dump(class_indices, indices_file)
    print("Saved class indices to class_indices.json")
except Exception as e:
    print(f"Error saving class indices: {e}")

# ---- 15. Sample prediction code ----
# Prints a ready-to-copy inference snippet; nothing in the string below is
# executed here. The snippet loads the saved .h5 model and class_indices.json,
# resizes the image to 96x96 and applies MobileNetV2 preprocess_input.
print("\nSample code for making predictions:")
print("""
# Code to load and use the model for prediction
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing import image
import json

# Load the model
model = tf.keras.models.load_model('mobilenet_fruits360_optimized.h5')

# Load class indices
with open('class_indices.json', 'r') as f:
    class_indices = json.load(f)
    
# Invert the dictionary to map indices to class names
idx_to_class = {v: k for k, v in class_indices.items()}

# Function to preprocess and predict
def predict_fruit(img_path):
    img = image.load_img(img_path, target_size=(96, 96))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = tf.keras.applications.mobilenet_v2.preprocess_input(img_array)
    
    predictions = model.predict(img_array)
    predicted_class_idx = np.argmax(predictions[0])
    confidence = predictions[0][predicted_class_idx] * 100
    
    return idx_to_class[predicted_class_idx], confidence

# Example usage
# fruit_name, confidence = predict_fruit('path/to/your/fruit/image.jpg')
# print(f'Predicted fruit: {fruit_name} with {confidence:.2f}% confidence')
""")

# Final status line for the training part of the script
print("\nTraining and optimization complete!")

# ---- 16. Create a simple visualization of training history ----
# Saves training_history.png with train/validation accuracy curves: the
# initial phase on the left and, if it ran, the fine-tuning phase on the
# right. Any failure (including a missing matplotlib) is printed, not raised.
try:
    import matplotlib.pyplot as plt

    def _draw_accuracy_panel(panel_index, fit_history, panel_title):
        """Plot train and validation accuracy of one fit() run in a 1x2 grid cell."""
        plt.subplot(1, 2, panel_index)
        plt.plot(fit_history.history['accuracy'])
        plt.plot(fit_history.history['val_accuracy'])
        plt.title(panel_title)
        plt.ylabel('Accuracy')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Validation'], loc='upper left')

    # Only create plots if training was successful
    training_succeeded = 'history' in locals() and history is not None
    if training_succeeded:
        plt.figure(figsize=(12, 4))

        _draw_accuracy_panel(1, history, 'Model Accuracy (Initial Training)')
        if history_finetune is not None:
            _draw_accuracy_panel(2, history_finetune, 'Model Accuracy (Fine-tuning)')

        plt.tight_layout()
        plt.savefig('training_history.png')
        print("Saved training history visualization to 'training_history.png'")
except Exception as e:
    print(f"Could not create visualization: {e}")

No TPU detected, falling back to GPU/CPU.
Using CPU
Found valid dataset directory: ../input/fruits
Found 124595 images belonging to 4 classes.
Found 13842 images belonging to 4 classes.
Number of classes: 4
Training samples: 124595
Validation samples: 13842
Steps per epoch: 243
Validation steps: 27
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_0.75_96_no_top.h5
[1m5903360/5903360[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Model Architecture Summary:



Starting initial training phase on TPU...
Epoch 1/5
Found 124595 images belonging to 4 classes.


NotFoundError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code

  File "/usr/local/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/usr/local/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 737, in start

  File "/usr/local/lib/python3.10/site-packages/anyio/_core/_eventloop.py", line 74, in run

  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 2303, in run

  File "/usr/local/lib/python3.10/site-packages/anyio/_backends/_asyncio.py", line 216, in run

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 636, in run_until_complete

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/local/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/local/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 428, in process_shell

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 501, in process_shell_message

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 337, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 752, in execute_request

  File "/usr/local/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 433, in do_execute

  File "/usr/local/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 582, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3077, in run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3132, in _run_cell

  File "/usr/local/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3336, in run_cell_async

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3519, in run_ast_nodes

  File "/usr/local/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code

  File "/tmp/ipykernel_10/1821820589.py", line 337, in <module>

  File "/usr/local/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

could not find registered transfer manager for platform Host -- check target linkage
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_8970]