# Plant Disease Detection - Deep Learning Project

## Initial Setup

In [1]:
# Imports
import torch
import torchvision
import matplotlib.pyplot as plt
import os
import requests
import zipfile
import shutil
import random
import importlib
import pandas as pd

from tqdm.auto import tqdm
from torch import nn
from torchvision import transforms
from torchinfo import summary
from torch.utils.tensorboard import SummaryWriter
from typing import Dict, List

import helpers


In [None]:
# Load the submodule explicitly before reloading it: `import helpers` on its own
# does not guarantee that `helpers.setup` is available as an attribute, so the
# reload could raise AttributeError on a fresh kernel.
import helpers.setup
importlib.reload(helpers.setup)  # re-execute the module so local edits are picked up
from helpers.setup import set_device, set_seeds, set_pandas_options, reset_pandas_options

# Set the device
device = set_device()

# Set random seeds
set_seeds(seed=42, device=device)

# Pandas display setup to avoid truncation
set_pandas_options()
# NOTE(review): reset_pandas_options() used to be called immediately after
# set_pandas_options(), which presumably reverted the display options before any
# table was shown — confirm against helpers/setup.py. It remains imported so it
# can be called explicitly once the wide tables have been displayed.

## Data Acquisition

In [5]:
# Load the submodule explicitly so the reload below cannot fail on a fresh kernel
# (a bare `import helpers` does not guarantee `helpers.acquisition` is loaded).
import helpers.acquisition
importlib.reload(helpers.acquisition)
from helpers.acquisition import create_dataset

# Remote archive and local layout
url = "https://data.mendeley.com/public-files/datasets/tywbtsjrjv/files/d5652a28-c1d8-4b76-97f3-72fb80f94efc/file_downloaded"
data_dir = "data"
zip_path = os.path.join(data_dir, "data_compressed.zip")
extracted_dir = "Plant_leave_diseases_dataset_without_augmentation"

# Split directory name -> [train, test, holdout] fractions
splits = {
    'data_10_10_80': [0.1, 0.1, 0.8],  # train 10%, test 10%, holdout 80%
    'data_20_10_70': [0.2, 0.1, 0.7],  # train 20%, test 10%, holdout 70%
    'data_30_10_60': [0.3, 0.1, 0.6],  # train 30%, test 10%, holdout 60%
    'data_70_20_10': [0.7, 0.2, 0.1]   # train 70%, test 20%, holdout 10%
}

# Fail fast on a mistyped ratio: the fractions of every split must cover the
# whole dataset. A small tolerance absorbs floating-point rounding in the sum.
invalid_splits = {
    name: ratios for name, ratios in splits.items()
    if abs(sum(ratios) - 1.0) > 1e-9
}
assert not invalid_splits, f"Split fractions must sum to 1: {invalid_splits}"

create_dataset(url, zip_path, splits, data_dir, extracted_dir)

Data folder already exists. Skipping download...
Nothing to extract. Skipping extraction...
Directory data_10_10_80 already exists. Skipping split...
Directory data_20_10_70 already exists. Skipping split...
Directory data_30_10_60 already exists. Skipping split...
Directory data_70_20_10 already exists. Skipping split...
Dataset splits generated successfully!


## Data Exploration

In [19]:
# Load the submodule explicitly so the reload below cannot fail on a fresh kernel
# (a bare `import helpers` does not guarantee `helpers.exploration` is loaded).
import helpers.exploration
importlib.reload(helpers.exploration)
from helpers.exploration import traverse_dirs

# One entry per split directory configured in `splits`
data_dir_names = list(splits.keys())

# Raw listing returned by the helper; presumably one row per directory visited,
# carrying the grouping columns used below — verify against helpers/exploration.py.
classes_raw_df = traverse_dirs(data_dir_names)

# Aggregate per (dataset, split_type, class). The raw frame keeps its own name
# so this cell's lineage stays explicit and the raw listing remains inspectable.
classes_df = classes_raw_df.groupby(["dataset", "split_type", "class"]).sum()
classes_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,image_count
dataset,split_type,class,Unnamed: 3_level_1
data_10_10_80,holdout,Apple___Apple_scab,5
data_10_10_80,holdout,Apple___Black_rot,5
data_10_10_80,holdout,Apple___Cedar_apple_rust,5
data_10_10_80,holdout,Apple___healthy,5
data_10_10_80,holdout,Background_without_leaves,5
...,...,...,...
data_70_20_10,train,Tomato___Spider_mites Two-spotted_spider_mite,5
data_70_20_10,train,Tomato___Target_Spot,5
data_70_20_10,train,Tomato___Tomato_Yellow_Leaf_Curl_Virus,5
data_70_20_10,train,Tomato___Tomato_mosaic_virus,5


## Data Preparation

In [8]:
# GAN and NoGAN

# Data Augmentation

# Different data splits 
    # 10% train 10% test 80% holdout (GAN and NoGAN)
    # 20% train 10% test 70% holdout (GAN and NoGAN)
    # 30% train 10% test 60% holdout (GAN and NoGAN)
    # 70% train 20% test 10% holdout (GAN and NoGAN) - To be used only after experiment tracking

# Create Dataloaders for each split

## Modeling

In [9]:
# Getting Pre-trained Models
    # EfficientNetB0 - the highest-performing model overall in Hassan et al. (99.56%)
    # ViT - transformer based approach

# Create the architecture for my own model
    # Combine EfficientNetB0 with Attention blocks like in Thakur et al. (2022)

## Model Training and Experiment Tracking

In [10]:
# Setup Summary Writer

# Train Model while recording experiment run and saving models
    # different epochs, learning rates and dropout rates

# Display tensorboard and select highest performing model to train further

    # TODO: maybe explore MLflow if time allows

## Model Selection and Improvement

In [11]:
# Load the most promising model 

# Train it on the 70% train / 20% test / 10% holdout split (GAN or NoGAN, whichever performed better)

# Calculate accuracy, precision, recall for the model (+ confusion matrix?)

# Save the model

## Make Predictions

In [12]:
# Predict on test set and display

# Predict on holdout set and display

# THE END