In [None]:
import numpy as np
import os
import re
import json
import torch
import cv2 as cv
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle
import pandas as pd
import seaborn as sn
from torchmetrics import ConfusionMatrix



# Record the matplotlib backend that is active right after the imports,
# then report the import status and that backend name (one item per line).
default_matplotlib_backend = matplotlib.get_backend()
print(
    'imported',
    'default_matplotlib_backend: {}'.format(default_matplotlib_backend),
    sep='\n',
)


In [None]:
# Execution profile: index 0 selects 'DEV', index 1 selects 'LIVE'.
# 'DEV' shrinks the class list, batch size and epoch count in the setup cell.
RUN_MODE = ('DEV', 'LIVE')[1]

# True  -> every directory is derived from the current working directory.
# False -> the custom paths written by hand in the setup cell are used
#          (Windows always takes the working-directory layout).
SIMPLE_PATH = True

In [None]:
# Set seeds for reproducibility.
# NumPy alone is not enough: torch has its own RNG (used e.g. by DataLoader shuffling).
np.random.seed(42)
torch.manual_seed(42)

# Check if running on Windows
windows = (os.name == 'nt')

# Define paths to data and directories
if windows or SIMPLE_PATH:
    # Paths for Windows or using root paths
    root_path = os.getcwd()
    # The trailing "" keeps a path separator at the end of the dataset directory.
    extracted_data_path = os.path.join(root_path, "datasets", "")
    weights_path = os.path.join(root_path, "weights")
    stats_path = os.path.join(root_path, "stats")
    npy_data_path = os.path.join(root_path, "npy_data")
    model_save_path = os.path.join(root_path, "model", "model.pth")
else:
    # Paths for Linux or custom paths (placeholders: fill these in before running)
    root_path = ''
    extracted_data_path = ''
    weights_path =  ''
    stats_path = ''
    npy_data_path = ''
    model_save_path = ''

# Create directories if they don't exist
for path in [npy_data_path, weights_path, stats_path, os.path.dirname(model_save_path)]:
    os.makedirs(path, exist_ok=True)

# Get list of classes.
# Sort BEFORE truncating: os.listdir() returns entries in arbitrary order, so slicing
# first would select a different 3-class DEV subset from one machine to the next.
class_list = sorted(os.listdir(extracted_data_path))
if RUN_MODE == "DEV":
    class_list = class_list[:3]
print('Number of classes: {}'.format(len(class_list)))
print('Classes: {}'.format(class_list))

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Selected device: {}'.format(device))
# Check the device *type*: comparing a torch.device object with the plain string
# 'cuda' is not a reliable test and would skip the device-name report.
if device.type == 'cuda':
    print('Device name: {}'.format(torch.cuda.get_device_name(device)))

# Constants
BATCH_SIZE = 16 if RUN_MODE == "DEV" else 32
EPOCHS = 3 if RUN_MODE == "DEV" else 30
NUM_CHANNELS = 3

## Data loading
1. loading all paths to files with corresponding class    
2. if npy already exists, then load it, otherwise load the dataset in memory  and save it as npy  
3. load npy  
4. split it as train, test, validation sets  

In [None]:
# Specify which sub-dataset to use.
DATASE_GROUP_IDX = None  # could be: None, 0, 1, 2
train_valid_test_split_json_name = 'splitNEW8.json'

# Available sub-datasets: each group is a list of class indices into the sorted class list.
indep1 = [18, 9, 26, 29, 34]
dataset_groups = [
    [1, 7, 10, 14, 11, 32, 13, 4, 8, 17, 20, 25, 28, 31, 36],
    [0, 15, 21, 22, 23, 5],
    [37, 33, 16, 2, 3, 6, 12, 19, 24, 27, 30, 35] + indep1,
]


def _remap_classes(targets, image_files, image_idxs, old_to_new):
    """Keep only the samples whose class is in ``old_to_new`` and renumber them.

    Args:
        targets: per-sample class indices (current numbering).
        image_files: per-sample file paths, parallel to ``targets``.
        image_idxs: per-sample row index into the full ``images`` array,
            parallel to ``targets``.
        old_to_new: dict mapping a current class index to its new class index.

    Returns:
        ``(targets, image_files, image_idxs)`` restricted to the kept samples,
        with ``targets`` expressed in the new numbering. ``image_idxs`` is what
        keeps labels and pixel data aligned after samples are dropped.
    """
    kept_targets, kept_files, kept_idxs = [], [], []
    for target, file_path, image_idx in zip(targets, image_files, image_idxs):
        target = int(target)
        if target in old_to_new:
            kept_targets.append(old_to_new[target])
            kept_files.append(file_path)
            kept_idxs.append(image_idx)
    return (
        np.array(kept_targets, dtype=np.int64),
        kept_files,
        np.array(kept_idxs, dtype=np.intp),
    )


# getting the list of the classes
class_list = sorted(os.listdir(extracted_data_path))
print('Number of classes: {}'.format(len(class_list)))
print('Classes: {}'.format(class_list))

print('Loading data ...')
image_files = []
targets = []

# dict helps to go from class_name to class_index
class_dict = {class_name: class_idx for class_idx, class_name in enumerate(class_list)}

# loading all image_paths (IN A SORTED ORDER: the cached .npy rows rely on this exact order)
for class_name in class_list:
    class_dir = os.path.join(extracted_data_path, class_name)
    for repetition in sorted(os.listdir(class_dir)):
        repetition_dir = os.path.join(class_dir, repetition)
        image_list = sorted(os.listdir(repetition_dir))
        image_files.extend(os.path.join(repetition_dir, img) for img in image_list)
        targets.extend([class_dict[class_name]] * len(image_list))

targets = np.array(targets)

images_npy_path = os.path.join(npy_data_path, 'images.npy')
targets_npy_path = os.path.join(npy_data_path, 'targets.npy')

if not os.path.exists(images_npy_path):

    # saving the data on .npy files
    # will read each image and resize it to (3, 350, 350) and push it an array
    print('Saving data as .npy files tp {}'.format(npy_data_path))

    # loading everything into memory since we have enough space
    images = np.empty((len(image_files), NUM_CHANNELS, 350, 350), dtype=np.uint8)
    for idx, img_path in enumerate(tqdm(image_files, position=0, leave=True)):
        img = cv.imread(img_path)
        if img is None:
            # cv.imread signals failure by returning None instead of raising
            raise ValueError('Could not read image file: {}'.format(img_path))
        img = cv.cvtColor(img, cv.COLOR_BGR2RGB)
        img = cv.resize(img, (350, 350)) # saving with 350x350 size, downsample in dataloader transform= property if you want

        # (H, W, C) -> (C, H, W)
        images[idx] = img.transpose(2, 0, 1)

    # saving files for next trainings (saves a lot of time, accessing drive files from colab takes huge time)
    with open(images_npy_path, 'wb') as npy_images_file, open(targets_npy_path, 'wb') as npy_targets_file:
        np.save(npy_images_file, images, allow_pickle=True)
        np.save(npy_targets_file, targets, allow_pickle=True)
    with open(os.path.join(npy_data_path, 'image_files.json'), 'w') as f:
        json.dump(image_files, f)

    print('Data files saved')
else:
    images = np.load(images_npy_path)
    targets = np.load(targets_npy_path)

    print('Data files loaded')

# The cache is matched to the directory listing purely by position, so a stale
# cache would silently attach labels/paths to the wrong pixels.
if not (len(images) == len(targets) == len(image_files)):
    raise ValueError(
        'Cached arrays do not match the dataset on disk '
        '(images: {}, targets: {}, files: {}); delete the .npy files in {} to rebuild them'.format(
            len(images), len(targets), len(image_files), npy_data_path
        )
    )

# Row index of every (still selected) sample inside the full `images` array.
image_idxs = np.arange(len(image_files))


# Apply train/valid/test split from json.
# The json maps class name -> repetition folder name -> 'train' | 'valid' | 'test'.
# The json split is built with the following rules:
#   1. each class is split separately
#   2. one rep_<> can only be in one dataset part (train, valid or test)
#   3. to decide which dataset part each rep_<> is linked to:
#       3.1 assign a probability to each rep_<> from the rep's length (more length - more prob)
#           and define lengths for the train (70%), valid (10%) and test (20%) parts.
#           Min length for each dataset part is 1.
#       3.2 randomly select a rep_<> using the probs and link it to the test part
#       3.3 randomly select a rep_<> using the probs and link it to the valid part
#       3.4 randomly select a rep_<> using the probs and link it to the train part
#       3.5 repeat 3.2, 3.3, 3.4 in a loop until all dataset parts have enough reps

if DATASE_GROUP_IDX is not None:
    dataset_groups = [sorted(el) for el in dataset_groups]
    assert 0 <= DATASE_GROUP_IDX < len(dataset_groups), \
        'DATASE_GROUP_IDX must be None or a valid index into dataset_groups'

    group = dataset_groups[DATASE_GROUP_IDX]
    mapper_allcls_to_subcls = {old_idx: new_idx for new_idx, old_idx in enumerate(group)}
    # class_dict must be rebuilt before class_list is overwritten (it reads the old list)
    class_dict = {class_list[old_idx]: new_idx for new_idx, old_idx in enumerate(group)}
    class_list = [class_list[old_idx] for old_idx in group]

    targets, image_files, image_idxs = _remap_classes(
        targets, image_files, image_idxs, mapper_allcls_to_subcls
    )


with open(os.path.join(root_path, train_valid_test_split_json_name), 'r') as f:
    image_files_dict_cut = json.load(f)

# Restrict (and renumber) the classes to those present in the split json.
classes_to_use = sorted(image_files_dict_cut.keys())
mapper_allcls_to_subcls = {class_dict[cls_name]: i for i, cls_name in enumerate(classes_to_use)}
class_dict = {cls_name: i for i, cls_name in enumerate(classes_to_use)}
class_list = list(classes_to_use)

targets, image_files, image_idxs = _remap_classes(
    targets, image_files, image_idxs, mapper_allcls_to_subcls
)


# Positions (into the filtered targets / image_files) of each dataset part.
train_images_idxs, val_images_idxs, test_images_idxs = [], [], []
split_to_idxs = {'train': train_images_idxs, 'valid': val_images_idxs, 'test': test_images_idxs}

for image_file_idx, image_file_name in enumerate(image_files):
    # paths end with <class>/<repetition>/<image file>
    _class, rep_num, im_name = image_file_name.split(os.sep)[-3:]

    if (_class not in image_files_dict_cut) or (rep_num not in image_files_dict_cut[_class]):
        continue
    split_name = image_files_dict_cut[_class][rep_num]
    if split_name not in split_to_idxs:
        raise ValueError(
            "Unknown dataset part {!r} for {}/{} in {} (expected 'train', 'valid' or 'test')".format(
                split_name, _class, rep_num, train_valid_test_split_json_name
            )
        )
    split_to_idxs[split_name].append(image_file_idx)

# `images` still holds every sample, while targets/image_files may have been filtered,
# so the pixel data must be looked up through image_idxs to stay aligned with the labels.
train_images = images[image_idxs[train_images_idxs]]
val_images   = images[image_idxs[val_images_idxs]]
test_images  = images[image_idxs[test_images_idxs]]

train_targets = targets[train_images_idxs]
val_targets   = targets[val_images_idxs]
test_targets  = targets[test_images_idxs]


assert len(train_images) == len(train_targets)
assert len(test_images) == len(test_targets)
assert len(val_images) == len(val_targets)

print('Number of training images: {}'.format(len(train_images)))
print('Number of test images: {}'.format(len(test_images)))
print('Number of validation images: {}'.format(len(val_images)))

In [None]:
# Preview the first few training images together with their class names.
to_plot_from_train = 6
# Never index past the end of a small training set.
num_to_plot = min(to_plot_from_train, len(train_images))

plt.figure(figsize=(10, 10))
for i in range(num_to_plot):
    plt.subplot(1, to_plot_from_train, i + 1)
    # Images are stored as (C, H, W) while imshow expects (H, W, C).
    # transpose(1, 2, 0) restores that layout; swapping axes 0 and 2 would
    # yield (W, H, C) and draw every picture flipped along its diagonal.
    img = train_images[i].transpose(1, 2, 0)
    plt.imshow(img.astype(np.uint8))
    plt.title('Class: {}'.format(class_list[train_targets[i]]))
    plt.axis('off')

plt.show()

#### Dataset class and data loaders
By loading the whole dataset into memory, it becomes very easy to train the model. All we have to do is normalise the images (divide by 255 and change into C, H, W format), then return each image with its corresponding target class.

- Data augmentation can be turned on/off by modifying this code (more details in comments at the end)

In [None]:
from pytorch_utils.data_utils import *

# Wrap the held-out images and labels in the project's dataset class.
test_dataset = ClassificationPlantDataset(test_images, test_targets)

# Loader over the test split: one sample per batch, shuffled, no batch dropped.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=True,
    drop_last=False,
)

print('Data loaders created')


## Evaluation functions

In [None]:
def mathews_correlation_coefficient_np(tp, fp, fn, tn, eps=1e-11):
    """Compute the Matthews correlation coefficient from confusion counts.

    Each count may be a plain Python number, a NumPy scalar, or an array /
    sequence of counts; arrays and sequences are summed to a single total first.

    Args:
        tp: true-positive count(s).
        fp: false-positive count(s).
        fn: false-negative count(s).
        tn: true-negative count(s).
        eps: small constant added to the denominator so that a degenerate
            confusion table (zero denominator) yields 0.0 instead of a
            division by zero.

    Returns:
        ``(tp*tn - fp*fn) / (sqrt((tp+fp)(tp+fn)(tn+fp)(tn+fn)) + eps)`` as a
        NumPy float64.
    """
    def _total(counts):
        # np.asarray lets plain numbers and sequences through; float64 keeps the
        # four-way product below from overflowing an integer dtype.
        return np.asarray(counts).sum().astype(np.float64)

    tp, tn, fp, fn = _total(tp), _total(tn), _total(fp), _total(fn)
    numerator = tp * tn - fp * fn
    denominator = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    return numerator / (denominator + eps)

### Eval Loop

In [None]:
def evaluate_model(model, test_loader, verbose=True, eps=1e-10):
    """Run ``model`` over ``test_loader`` and compute classification metrics.

    Predictions are the arg-max class of the model output along dim 1. A
    confusion matrix is built from them and every per-class metric is derived
    one-vs-rest from that matrix.

    The matrix layout is assumed to be the torchmetrics one: ``matrix[i, j]``
    counts samples whose true class is ``i`` and predicted class is ``j``.
    For class ``k``::

        TP = matrix[k, k]
        FP = column k total - TP      (predicted k, actually another class)
        FN = row k total    - TP      (actually k, predicted another class)
        TN = grand total - TP - FP - FN

    Args:
        model: network to evaluate; it is switched to eval mode. Inputs are
            moved to the module-level ``device``; the model itself is not moved.
        test_loader: iterable yielding ``(images, targets)`` batches.
        verbose: when True, print the metric summary, the per-class table and
            draw the confusion-matrix heatmap.
        eps: small constant added to denominators to avoid division by zero.

    Returns:
        ``(accuracy, precision, recall, f1_score)`` where ``accuracy`` is a
        0-d torch tensor and the other three are NumPy arrays with one entry
        per class (ordered like the module-level ``class_list``).
    """
    if verbose:
        print('--------------------------------------------')
        print('Test metrics (on test set)')

    model.eval()

    num_classes = len(class_list)
    confusion_matrix = ConfusionMatrix(num_classes=num_classes)
    eval_preds = list()
    eval_targs = list()

    # computing predictions; inference only, so no autograd bookkeeping is needed
    with torch.no_grad():
        for images, targets in tqdm(test_loader, position=0, leave=True):
            images = images.to(device, dtype=torch.float)
            # arg-max of the raw outputs equals arg-max of their (log-)softmax
            batch_preds = model(images).argmax(dim=1)
            eval_preds.extend(batch_preds.cpu().numpy())
            eval_targs.extend(torch.as_tensor(targets).cpu().numpy().reshape(-1))

    # class indices must be integers for the confusion matrix, whatever dtype the loader produced
    preds_tensor = torch.as_tensor(np.asarray(eval_preds, dtype=np.int64))
    targs_tensor = torch.as_tensor(np.asarray(eval_targs, dtype=np.int64))

    # computing main metrics (acc, precision, recall and f1 score)
    matrix = confusion_matrix(preds_tensor, targs_tensor)
    accuracy = matrix.trace() / (matrix.sum() + eps)

    counts = matrix.cpu().numpy().astype(np.float64)
    tp = np.diag(counts)
    predicted_as_class = counts.sum(axis=0)   # column totals
    actual_class = counts.sum(axis=1)         # row totals
    fp = predicted_as_class - tp
    fn = actual_class - tp
    # everything that is neither in row k nor in column k, including
    # confusions between two *other* classes
    tn = counts.sum() - tp - fp - fn

    precision = tp / (predicted_as_class + eps)
    recall = tp / (actual_class + eps)
    f1_score = 2 * precision * recall / (precision + recall + eps)

    # false positive rate, false negative rate, false discovery rate, false omission rate
    fp_rate = fp / (fp + tn + eps)
    fn_rate = 1 - recall
    fd_rate = 1 - precision
    specificity = 1 - fp_rate
    # FOR is measured over the samples *predicted* as not-k: FN / (FN + TN)
    fo_rate = fn / (fn + tn + eps)

    missclassification_rate = 1 - accuracy
    npv = 1 - fo_rate

    mcc_per_class = [
        mathews_correlation_coefficient_np(tp[idx], fp[idx], fn[idx], tn[idx])
        for idx in range(num_classes)
    ]

    if verbose:
        print('--------------------------------------------')
        print('Accuracy: {:.3f}%'.format(accuracy * 100))
        print('Average precision: {:.3f}'.format(precision.mean()))
        print('Average recall: {:.3f}'.format(recall.mean()))
        print('Average F1 score: {:.3f}'.format(f1_score.mean()))
        print('Average specificity: {:.3f}'.format(specificity.mean()))
        print('Average false positive rate: {:.3f}'.format(fp_rate.mean()))
        print('Average false negative rate: {:.3f}'.format(fn_rate.mean()))
        print('Average false discovery rate: {:.3f}'.format(fd_rate.mean()))
        print('Average false omission rate: {:.3f}'.format(fo_rate.mean()))
        print('Missclassification rate: {:.2f}%'.format(missclassification_rate * 100))
        print('Mathews Correlation Coefficient: {:.2f}'.format(np.mean(mcc_per_class)))
        print('--------------------------------------------')
        print('Results by class :')    
        print('--------------------------------------------')
        print('{:<15}{:<12}{:<12}{:<12}{:<12}{:<12}{:<12}{:<12}{:<12}{:<12}{:<12}'.format('', 'Precision', 'Recall', 'F1 score', 'Specificity', 'FPR', 'FNR', 'FDR', 'FOR', 'NPV', 'MCC'))
        for idx, class_name in enumerate(class_list):
            print('{:<15}{:<12.2f}{:<12.2f}{:<12.2f}{:<12.3f}{:<12.3f}{:<12.3f}{:<12.3f}{:<12.3f}{:<12.3f}{:<12.3f}'.format(
                class_name, precision[idx], recall[idx], f1_score[idx], specificity[idx], fp_rate[idx], fn_rate[idx], fd_rate[idx], fo_rate[idx], npv[idx], mcc_per_class[idx]
            ))
        print('--------------------------------------------')
        print()

        # ploting confusion matrix (integer counts, as required by fmt='d')
        matrix_df = pd.DataFrame(matrix.cpu().numpy(), index=class_list, columns=class_list)
        plt.figure(figsize=(12, 8))
        sn.heatmap(matrix_df, annot=True, fmt='d', cmap='Blues')

    return accuracy, precision, recall, f1_score


## Define functions used to calculate accuracy of any dataset

In [None]:
from pytorch_utils.validating_utils import *

## Convert to Torch Script and save for deployment

In [None]:

# Both placeholders must be filled in before running this cell:
#   ORIGINAL_MODEL_PATH     - file holding the pickled model to convert
#   DEPLOY_TS_MODELS_FOLDER - directory that receives the TorchScript file
ORIGINAL_MODEL_PATH = ""
DEPLOY_TS_MODELS_FOLDER = ""
resized_shape = (350, 350)
##########################################################################################################################################################

# exist_ok makes the cell re-runnable without a separate (racy) existence check
os.makedirs(DEPLOY_TS_MODELS_FOLDER, exist_ok=True)

# NOTE: torch.load unpickles the file, which can execute arbitrary code - only load trusted models.
# map_location lets a model saved on a GPU machine be loaded when only a CPU is available.
model = torch.load(ORIGINAL_MODEL_PATH, map_location=device)
model.eval()
model = model.to(device)

# Tracing records the operations executed for this example input
dummy_input = torch.randn((1, 3, resized_shape[0], resized_shape[1]), device=device)
with torch.no_grad():
    traced_model = torch.jit.trace(model, dummy_input)

# <original stem>_ts.pt ; splitext only strips the last extension, so stems that
# contain dots (or the extension text itself) are left intact
model_stem = os.path.splitext(os.path.basename(ORIGINAL_MODEL_PATH))[0]
model_name = model_stem + "_ts.pt"

torch.jit.save(traced_model, os.path.join(DEPLOY_TS_MODELS_FOLDER, model_name))

# Report only after the file has actually been written
print("Model saved in:\n", DEPLOY_TS_MODELS_FOLDER)
print("saved file name:", model_name)

In [None]:
# Validation-accuracy curves: one line per stats_<label>.pkl file in stats_path.
plt.figure(figsize=(28, 14))
for filename in sorted(os.listdir(stats_path)):
    # The label is whatever sits between 'stats_' and '.pkl'; entries that do not
    # follow that naming (checkpoint folders, hidden files, ...) are skipped.
    label_match = re.search(r'(?<=stats_).*(?=\.pkl)', filename)
    if label_match is None:
        continue
    with open(os.path.join(stats_path, filename), 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code - only open trusted stats files.
        # Each file holds a 4-item sequence whose last item is the validation accuracy history.
        _, _, _, val_acc = pickle.load(f)
    val_acc = val_acc[:EPOCHS]
    plt.plot(np.arange(len(val_acc)), val_acc, label=label_match[0])
plt.legend(loc='lower right')
plt.show()

# Test accuracy of every saved model found in weights_path.
accuracies, model_names = [], []
for filename in sorted(os.listdir(weights_path)):
    weights_file = os.path.join(weights_path, filename)
    if not os.path.isfile(weights_file):
        continue
    # NOTE: torch.load unpickles the file - only load trusted weights.
    # evaluate_model sends the inputs to `device`, so the model has to live there too.
    model = torch.load(weights_file, map_location=device)
    model = model.to(device)
    accuracy, _, _, _ = evaluate_model(model, test_loader, verbose=False)
    # plain floats keep the bar heights independent of the tensor type returned
    accuracies.append(float(accuracy))
    model_names.append(type(model).__name__)

# Bars start at 0.9 so that small accuracy differences remain visible.
plt.figure(figsize=(20, 10))
plt.bar(model_names, np.array(accuracies) - 0.9, bottom=0.9, width=0.5)
plt.show()