In [11]:
import argparse
import os
import zipfile
import requests
import pandas as pd
import numpy as np
from time import time
from datetime import datetime
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard, CSVLogger
from keras.optimizers import Adam
from keras.models import Model, load_model
from keras import backend as K
from skimage.io import imread
from skimage.transform import resize
from sklearn.metrics import confusion_matrix, classification_report
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.inception_v3 import InceptionV3
from keras.layers import Dense, GlobalAveragePooling2D
import csv

# Global paths
OUTPUT_DIRECTORY = "./outputs/"
LABEL_DIRECTORY = "./labels/"
MODEL_DIRECTORY = "./models/"
MODEL_GD_ID = "1MRbN5hXOTYnw7-71K-2vjY01uJ9GkQM5"
MODEL_ZIP_FILE = "./models/models.zip"
IMG_DIRECTORY = "./images/"
IMG_GD_ID = "1xnK3B6K6KekDI55vwJ0vnc2IGoDga9cj"
IMG_ZIP_FILE = "./images/images.zip"

# Global variables
RAW_IMG_SIZE = (256, 256)
IMG_SIZE = (224, 224)
INPUT_SHAPE = (IMG_SIZE[0], IMG_SIZE[1], 3)
MAX_EPOCH = 200
BATCH_SIZE = 32
FOLDS = 5
STOPPING_PATIENCE = 32
LR_PATIENCE = 16
INITIAL_LR = 0.0001
CLASSES = [0, 1, 2, 3, 4, 5, 6, 7, 8]
CLASS_NAMES = ['Chinee Apple', 'Lantana', 'Parkinsonia', 'Parthenium', 
               'Prickly Acacia', 'Rubber Vine', 'Siam Weed', 'Snake Weed', 'Negatives']

def download_google_drive_file(file_id, destination):
    """Download a file from Google Drive."""
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': file_id}, stream=True)
    token = get_confirm_token(response)
    if token:
        params = {'id': file_id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    """Extract confirmation token from response cookies."""
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    """Save response content to a file."""
    CHUNK_SIZE = 32768
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    try:
        with open(destination, "wb") as f:
            for chunk in response.iter_content(CHUNK_SIZE):
                if chunk:
                    f.write(chunk)
    except Exception as e:
        raise Exception(f"Failed to save file to {destination}: {e}")

def validate_zip_file(file_path):
    """Check if a file is a valid ZIP file."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} does not exist.")
    if not zipfile.is_zipfile(file_path):
        raise ValueError(f"{file_path} is not a valid ZIP file. Check the download URL or file integrity.")

def download_images():
    """Download and unzip DeepWeeds images."""
    if not os.path.exists(IMG_DIRECTORY):
        os.makedirs(IMG_DIRECTORY)
        print(f"Downloading DeepWeeds images to {IMG_ZIP_FILE}")
        try:
            download_google_drive_file(IMG_GD_ID, IMG_ZIP_FILE)
            print("Finished downloading images.")
            validate_zip_file(IMG_ZIP_FILE)
            print(f"Unzipping {IMG_ZIP_FILE}")
            with zipfile.ZipFile(IMG_ZIP_FILE, "r") as zip_ref:
                zip_ref.extractall(IMG_DIRECTORY)
            print("Finished unzipping images.")
        except Exception as e:
            raise Exception(f"Error downloading or unzipping images: {e}")

def download_models():
    """Download and unzip DeepWeeds models."""
    if not os.path.exists(MODEL_DIRECTORY):
        os.makedirs(MODEL_DIRECTORY)
        print(f"Downloading DeepWeeds models to {MODEL_ZIP_FILE}")
        try:
            download_google_drive_file(MODEL_GD_ID, MODEL_ZIP_FILE)
            print("Finished downloading models.")
            validate_zip_file(MODEL_ZIP_FILE)
            print(f"Unzipping {MODEL_ZIP_FILE}")
            with zipfile.ZipFile(MODEL_ZIP_FILE, "r") as zip_ref:
                zip_ref.extractall(MODEL_DIRECTORY)
            print("Finished unzipping models.")
        except Exception as e:
            raise Exception(f"Error downloading or unzipping models: {e}")

def crop(img, size):
    """Crop the image concentrically to the desired size."""
    h, w, c = img.shape
    x = int((w - size[0]) / 2)
    y = int((h - size[1]) / 2)
    return img[y:(y + size[1]), x:(x + size[0]), :]

def crop_generator(batches, size):
    """Generate random crops from image batches."""
    while True:
        batch_x, batch_y = next(batches)
        b, h, w, c = batch_x.shape
        batch_crops = np.zeros((b, size[0], size[1], c))
        for i in range(b):
            batch_crops[i] = crop(batch_x[i], (size[0], size[1]))
        yield batch_crops, batch_y

def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description='Train and test ResNet50, InceptionV3, or custom model on DeepWeeds.')
    parser.add_argument("command", nargs='?', default='train', help="'cross_validate' or 'inference'")
    parser.add_argument('--model', default='resnet', help="'resnet', 'inception', or path to .hdf5 file.")
    
    import sys
    if any('ipykernel_launcher' in arg for arg in sys.argv) or any('jupyter' in arg for arg in sys.argv):
        args, _ = parser.parse_known_args()
    else:
        args = parser.parse_args()
    
    return args.command, args.model

def cross_validate(model_name):
    """Perform k-fold cross-validation."""
    for k in range(FOLDS):
        timestamp = datetime.fromtimestamp(time()).strftime('%Y%m%d-%H%M%S')
        print(f'Fold {k + 1}/{FOLDS} - {timestamp}')
        output_directory = f"{OUTPUT_DIRECTORY}{timestamp}/"
        os.makedirs(output_directory, exist_ok=True)

        # Load dataframes
        train_label_file = f"{LABEL_DIRECTORY}train_subset{k}.csv"
        val_label_file = f"{LABEL_DIRECTORY}val_subset{k}.csv"
        test_label_file = f"{LABEL_DIRECTORY}test_subset{k}.csv"
        
        if not all(os.path.exists(f) for f in [train_label_file, val_label_file, test_label_file]):
            raise FileNotFoundError(f"Label files for fold {k} are missing.")
        
        train_dataframe = pd.read_csv(train_label_file)
        val_dataframe = pd.read_csv(val_label_file)
        test_dataframe = pd.read_csv(test_label_file)
        train_image_count = train_dataframe.shape[0]
        val_image_count = val_dataframe.shape[0]
        test_image_count = test_dataframe.shape[0]

        # Training image augmentation
        train_data_generator = ImageDataGenerator(
            rescale=1./255,
            fill_mode="constant",
            shear_range=0.2,
            zoom_range=(0.5, 1),
            horizontal_flip=True,
            rotation_range=360,
            channel_shift_range=25,
            brightness_range=(0.75, 1.25))

        # Validation image augmentation
        val_data_generator = ImageDataGenerator(
            rescale=1./255,
            fill_mode="constant",
            shear_range=0.2,
            zoom_range=(0.5, 1),
            horizontal_flip=True,
            rotation_range=360,
            channel_shift_range=25,
            brightness_range=(0.75, 1.25))

        # Test image augmentation
        test_data_generator = ImageDataGenerator(rescale=1./255)

        # Load data generators
        train_data_generator = train_data_generator.flow_from_dataframe(
            train_dataframe,
            IMG_DIRECTORY,
            x_col='Filename',
            y_col='Label',
            target_size=RAW_IMG_SIZE,
            batch_size=BATCH_SIZE,
            has_ext=True,
            classes=CLASSES,
            class_mode='categorical')

        val_data_generator = val_data_generator.flow_from_dataframe(
            val_dataframe,
            IMG_DIRECTORY,
            x_col="Filename",
            y_col="Label",
            target_size=RAW_IMG_SIZE,
            batch_size=BATCH_SIZE,
            has_ext=True,
            classes=CLASSES,
            class_mode='categorical')

        test_data_generator = test_data_generator.flow_from_dataframe(
            test_dataframe,
            IMG_DIRECTORY,
            x_col="Filename",
            y_col="Label",
            target_size=IMG_SIZE,
            batch_size=BATCH_SIZE,
            has_ext=True,
            shuffle=False,
            classes=CLASSES,
            class_mode='categorical')

        # Crop images
        train_data_generator = crop_generator(train_data_generator, IMG_SIZE)
        val_data_generator = crop_generator(val_data_generator, IMG_SIZE)

        # Load pre-trained model
        if model_name == "resnet":
            base_model = ResNet50(weights='imagenet', include_top=False, input_shape=INPUT_SHAPE)
        elif model_name == "inception":
            base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=INPUT_SHAPE)
        else:
            raise ValueError("Model must be 'resnet' or 'inception'.")
        
        x = base_model.output
        x = GlobalAveragePooling2D(name='avg_pool')(x)
        outputs = Dense(len(CLASSES), activation='sigmoid', name='fc9')(x)
        model = Model(inputs=base_model.input, outputs=outputs)

        # Define callbacks
        model_checkpoint = ModelCheckpoint(output_directory + "lastbest-0.hdf5", verbose=1, save_best_only=True)
        early_stopping = EarlyStopping(patience=STOPPING_PATIENCE, restore_best_weights=True)
        tensorboard = TensorBoard(log_dir=output_directory, histogram_freq=0, write_graph=True, write_images=False)
        reduce_lr = ReduceLROnPlateau('val_loss', factor=0.5, patience=LR_PATIENCE, min_lr=0.000003125)
        csv_logger = CSVLogger(output_directory + "training_metrics.csv")
        model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=INITIAL_LR), 
                      metrics=['categorical_accuracy'])

        # Train model
        global_epoch = 0
        restarts = 0
        last_best_losses = []
        last_best_epochs = []
        while global_epoch < MAX_EPOCH:
            history = model.fit(
                train_data_generator,
                steps_per_epoch=train_image_count // BATCH_SIZE,
                epochs=MAX_EPOCH - global_epoch,
                validation_data=val_data_generator,
                validation_steps=val_image_count // BATCH_SIZE,
                callbacks=[tensorboard, model_checkpoint, early_stopping, reduce_lr, csv_logger],
                verbose=1)
            last_best_losses.append(min(history.history['val_loss']))
            last_best_local_epoch = history.history['val_loss'].index(min(history.history['val_loss']))
            last_best_epochs.append(global_epoch + last_best_local_epoch)
            if early_stopping.stopped_epoch == 0:
                print(f"Completed training after {MAX_EPOCH} epochs.")
                break
            else:
                global_epoch = global_epoch + early_stopping.stopped_epoch - STOPPING_PATIENCE + 1
                print(f"Early stopping triggered after local epoch {early_stopping.stopped_epoch} "
                      f"(global epoch {global_epoch}).")
                print(f"Restarting from last best val_loss at local epoch "
                      f"{early_stopping.stopped_epoch - STOPPING_PATIENCE} "
                      f"(global epoch {global_epoch - STOPPING_PATIENCE}).")
                restarts += 1
                model.compile(loss='binary_crossentropy', 
                              optimizer=Adam(learning_rate=INITIAL_LR / 2 ** restarts),
                              metrics=['categorical_accuracy'])
                model_checkpoint = ModelCheckpoint(output_directory + f"lastbest-{restarts}.hdf5",
                                                   monitor='val_loss', verbose=1, save_best_only=True, mode='min')

        # Save last best model info
        with open(output_directory + "last_best_models.csv", 'w', newline='') as file:
            writer = csv.writer(file, delimiter=',')
            writer.writerow(['Model file', 'Global epoch', 'Validation loss'])
            for i in range(restarts + 1):
                writer.writerow([f"lastbest-{i}.hdf5", last_best_epochs[i], last_best_losses[i]])

        # Load the best model
        model = load_model(output_directory + f"lastbest-{last_best_losses.index(min(last_best_losses))}.hdf5")

        # Evaluate model
        predictions = model.predict(test_data_generator, steps=test_image_count // BATCH_SIZE + 1)
        y_true = test_data_generator.classes
        y_pred = np.argmax(predictions, axis=1)
        y_pred[np.max(predictions, axis=1) < 1 / 9] = 8

        # Save classification report and confusion matrix
        print(classification_report(y_true, y_pred, labels=CLASSES, target_names=CLASS_NAMES))
        report = classification_report(y_true, y_pred, labels=CLASSES, target_names=CLASS_NAMES, output_dict=True)
        with open(output_directory + 'classification_report.csv', 'w') as f:
            for key in report.keys():
                f.write(f"{key},{report[key]}\n")
        conf_arr = confusion_matrix(y_true, y_pred, labels=CLASSES)
        print(conf_arr)
        np.savetxt(output_directory + "confusion_matrix.csv", conf_arr, delimiter=",")

        print(f"Finished testing fold {k + 1}\n")
        K.clear_session()

def inference(model):
    """Perform inference on DeepWeeds images."""
    timestamp = datetime.fromtimestamp(time()).strftime('%Y%m%d-%H%M%S')
    output_directory = f"{OUTPUT_DIRECTORY}{timestamp}/"
    os.makedirs(output_directory, exist_ok=True)

    # Load dataframe
    label_file = f"{LABEL_DIRECTORY}labels.csv"
    if not os.path.exists(label_file):
        raise FileNotFoundError(f"Label file {label_file} is missing.")
    
    dataframe = pd.read_csv(label_file)
    image_count = dataframe.shape[0]
    filenames = dataframe.Filename

    preprocessing_times = []
    inference_times = []
    for i in range(image_count):
        try:
            start_time = time()
            img = imread(os.path.join(IMG_DIRECTORY, filenames[i]))
            img = resize(img, (224, 224))
            img = np.expand_dims(img, axis=0)
            img = img * 1./255
            preprocessing_time = time() - start_time
            start_time = time()
            prediction = model.predict(img, batch_size=1, verbose=0)
            y_pred = np.argmax(prediction, axis=1)
            y_pred[np.max(prediction, axis=1) < 1/9] = 8
            inference_time = time() - start_time
            preprocessing_times.append(preprocessing_time)
            inference_times.append(inference_time)
        except Exception as e:
            print(f"Error processing image {filenames[i]}: {e}")
            continue

    # Save inference times
    with open(output_directory + "tf_inference_times.csv", 'w', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(['Filename', 'Preprocessing time (ms)', 'Inference time (ms)'])
        for i in range(image_count):
            writer.writerow([filenames[i], preprocessing_times[i] * 1000, inference_times[i] * 1000])

if __name__ == '__main__':
    # Parse arguments
    command, model = parse_args()

    # Download images and models
    try:
        download_images()
        download_models()
    except Exception as e:
        print(f"Error in downloading resources: {e}")
        exit(1)

    # Execute command
    if command == "cross_validate":
        if model not in ["resnet", "inception"]:
            print("Error: Model must be 'resnet' or 'inception' for cross_validate command.")
            exit(1)
        cross_validate(model)
    elif command == "inference":
        if not model.endswith(".hdf5"):
            print("Error: You must supply a valid .hdf5 model file for inference (e.g., 'path/to/model.hdf5').")
            exit(1)
        if not os.path.exists(model):
            print(f"Error: Model file '{model}' does not exist.")
            exit(1)
        try:
            model = load_model(model)
            inference(model)
        except Exception as e:
            print(f"Error loading model '{model}': {e}")
            exit(1)
    else:
        print(f"Error: Invalid command '{command}'. Use 'cross_validate' or 'inference'.")
        exit(1)

Downloading DeepWeeds models to ./models/models.zip
Finished downloading models.
Error in downloading resources: Error downloading or unzipping models: ./models/models.zip is not a valid ZIP file. Check the download URL or file integrity.
Error: You must supply a .hdf5 model file for inference.
Error: Model file resnet does not exist.


OSError: No file or directory found at resnet

In [7]:
import os
print(IMG_ZIP_FILE, os.path.exists(IMG_ZIP_FILE), os.path.getsize(IMG_ZIP_FILE))


./images/images.zip True 2422


In [8]:
with open(IMG_ZIP_FILE, 'rb') as f:
    print(f.read(200))  # peek at first 200 bytes




In [9]:
os.remove(IMG_ZIP_FILE)
