In [None]:
from datasets import load_dataset

# Fetch the "dpdl-benchmark/plant_village" dataset from the Hugging Face Hub.
# The recorded output of this cell lists 13 parquet shards of roughly 480 MB
# each and a single train split of 54,303 examples, so expect a multi-gigabyte
# download on a cold cache.
# NOTE(review): `ds` is not referenced by the other cells visible here, which
# train on the Kaggle "new-plant-diseases-dataset" instead — confirm this
# download is still needed.
ds = load_dataset("dpdl-benchmark/plant_village")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/368 [00:00<?, ?B/s]



data/train-00000-of-00013.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00001-of-00013.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

data/train-00002-of-00013.parquet:   0%|          | 0.00/483M [00:00<?, ?B/s]

data/train-00003-of-00013.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

data/train-00004-of-00013.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

data/train-00005-of-00013.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

data/train-00006-of-00013.parquet:   0%|          | 0.00/487M [00:00<?, ?B/s]

data/train-00007-of-00013.parquet:   0%|          | 0.00/483M [00:00<?, ?B/s]

data/train-00008-of-00013.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00009-of-00013.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

data/train-00010-of-00013.parquet:   0%|          | 0.00/482M [00:00<?, ?B/s]

data/train-00011-of-00013.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00012-of-00013.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/54303 [00:00<?, ? examples/s]

In [None]:
!pip install flask flask-cors pyngrok tensorflow pillow numpy

Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0


In [None]:
import os
import sys
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
import kagglehub

# ==========================================
# 1. Data Collection & Path Configuration
# ==========================================
print("Downloading dataset via kagglehub...")
dataset_path = kagglehub.dataset_download("vipoooool/new-plant-diseases-dataset")
print("Path to dataset files:", dataset_path)

train_dir = None
valid_dir = None

# Walk the download and take the first directory that contains both a 'train'
# and a 'valid' sub-directory, so the nested folder names are not hard-coded.
for root, dirs, files in os.walk(dataset_path):
    if 'train' in dirs and 'valid' in dirs:
        train_dir = os.path.join(root, 'train')
        valid_dir = os.path.join(root, 'valid')
        break

# Raise a regular exception instead of calling sys.exit(): inside a notebook
# kernel sys.exit() only surfaces as a bare SystemExit, detached from the
# diagnostic text. The exception carries the message and the searched path.
if not train_dir or not valid_dir:
    raise FileNotFoundError(
        "Error: Could not find 'train' and 'valid' directories within the downloaded dataset."
        f" Searched under: {dataset_path}"
    )

print(f"Train directory found: {train_dir}")
print(f"Valid directory found: {valid_dir}")

# Configuration variables
IMG_SIZE = (224, 224)   # spatial size used for generator target_size and the model input
BATCH_SIZE = 32         # images per batch yielded by both directory iterators
EPOCHS = 5              # upper bound on training epochs passed to model.fit
N_LAST_LAYERS = 10      # number of trailing MobileNetV2 layers left trainable
NUM_CLASSES = 38        # width of the final softmax layer
SEED = 1337             # seed for the training iterator and for the global RNGs below

# SEED was otherwise only handed to the training iterator, leaving weight
# initialisation and dropout unseeded. Seed the Python, NumPy and backend RNGs
# in one call so repeated runs start from the same random state.
keras.utils.set_random_seed(SEED)

# ==========================================
# 2. Data Preprocessing & Augmentation
# ==========================================
# Random augmentations applied to training images only. Both generators also
# run every image through MobileNetV2's preprocess_input.
_augmentation = dict(
    horizontal_flip=True,
    rotation_range=20,
    zoom_range=0.15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    fill_mode='reflect',
)
train_datagen = ImageDataGenerator(preprocessing_function=preprocess_input, **_augmentation)
valid_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Keyword arguments shared by the training and validation directory iterators.
_flow_common = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

# Training batches are shuffled with a fixed seed; validation keeps directory order.
train_gen = train_datagen.flow_from_directory(train_dir, shuffle=True, seed=SEED, **_flow_common)
valid_gen = valid_datagen.flow_from_directory(valid_dir, shuffle=False, **_flow_common)

# ==========================================
# 3. Model Building (Transfer Learning)
# ==========================================
_input_shape = IMG_SIZE + (3,)

# ImageNet-pretrained MobileNetV2 backbone without its classification top.
base_model = MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=_input_shape,
)

# Single pass over the backbone: only the last N_LAST_LAYERS layers end up
# trainable, everything before them is frozen. A non-positive N_LAST_LAYERS
# freezes the whole backbone.
_n_backbone_layers = len(base_model.layers)
_first_trainable = (
    _n_backbone_layers - N_LAST_LAYERS if N_LAST_LAYERS > 0 else _n_backbone_layers
)
for _idx, layer in enumerate(base_model.layers):
    layer.trainable = _idx >= _first_trainable

# Classification head stacked on the backbone, which is called with training=False.
inputs = keras.Input(shape=_input_shape)
x = base_model(inputs, training=False)
for _head_layer in (
    layers.GlobalAveragePooling2D(),
    layers.Dropout(0.35),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.25),
):
    x = _head_layer(x)
outputs = layers.Dense(NUM_CLASSES, activation='softmax')(x)

model = keras.Model(inputs, outputs, name="mobilenetv2_plant_disease_classifier")

# Adam with a small learning rate; one-hot labels, hence categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
)

# ==========================================
# 4. Model Training & Callbacks
# ==========================================
# Write the model to the working directory whenever validation accuracy improves.
_checkpoint_cb = keras.callbacks.ModelCheckpoint(
    'mobilenetv2_best.keras',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# Halve the learning rate after 3 epochs without a validation-loss improvement.
_lr_plateau_cb = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    verbose=1,
)

# NOTE(review): patience=6 exceeds the 5 epochs configured via EPOCHS in this
# notebook, so this callback cannot stop training early unless EPOCHS is raised.
_early_stop_cb = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=6,
    restore_best_weights=True,
    verbose=1,
)

callbacks = [_checkpoint_cb, _lr_plateau_cb, _early_stop_cb]

history = model.fit(
    train_gen,
    validation_data=valid_gen,
    epochs=EPOCHS,
    callbacks=callbacks,
)

# Persist the model as it stands once fit() returns, separately from the best checkpoint.
final_path = "mobilenetv2_final.keras"
model.save(final_path)
print(f"Saved final model to: {final_path}")

Downloading dataset via kagglehub...
Using Colab cache for faster access to the 'new-plant-diseases-dataset' dataset.
Path to dataset files: /kaggle/input/new-plant-diseases-dataset
Train directory found: /kaggle/input/new-plant-diseases-dataset/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)/train
Valid directory found: /kaggle/input/new-plant-diseases-dataset/New Plant Diseases Dataset(Augmented)/New Plant Diseases Dataset(Augmented)/valid
Found 70295 images belonging to 38 classes.
Found 17572 images belonging to 38 classes.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5
9406464/9406464 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step


  self._warn_if_super_not_called()


Epoch 1/5
2197/2197 ━━━━━━━━━━━━━━━━━━━━ 0s 516ms/step - accuracy: 0.6669 - loss: 1.2280
Epoch 1: val_accuracy improved from -inf to 0.88550, saving model to mobilenetv2_best.keras
2197/2197 ━━━━━━━━━━━━━━━━━━━━ 1259s 563ms/step - accuracy: 0.6670 - loss: 1.2277 - val_accuracy: 0.8855 - val_loss: 0.4073 - learning_rate: 1.0000e-04
Epoch 2/5
1679/2197 ━━━━━━━━━━━━━━━━━━━━ 3:29 404ms/step - accuracy: 0.9174 - loss: 0.2626