In [None]:
'''
'classifyFolder_wVectors_v4.ipynb' script

Script for classifying each image patch pre-saved in a folder with 'extractingBB_labelledWSI_v3.py' and generating its feature vector.

Documentation:
This code mostly follows "miso.inference.classify import classify_folder as miso_classify_folder."
#for model_vector
https://github.com/microfossil/particle-classification/blob/master/miso/deploy/saving.py  
#for KNN
https://github.com/microfossil/particle-classification/blob/master/miso/inference/classify.py
#for generating mislabeled folder and TSNE with feature vectors
https://github.com/microfossil/particle-classification/blob/master/miso/training/trainer.py
#for extracting vector from compiled tf Keras model
https://github.com/microfossil/particle-classification/blob/master/miso/models/factory.py 
#for saving mislabeled images (uses feature vectors)
https://github.com/microfossil/particle-classification/blob/master/miso/stats/mislabelling.py

Created: 31-Jan-2025, Marco Acevedo
Updated: 4-Mar-2024, 7-Mar, M.A.

Notes:
Working environment: .conda (Python 3.9.20)
The requirements are listed in 'requirements.txt' ('requirements_base.txt' was for my whole operative system)

To activate conda environment (Command Prompt only):
check that it exists:       conda info --envs
activate:     conda activate "e:\Feb-March_2024_zircon imaging\ParticleTrieur_trial_cl_zircon_annotated\.conda"

'''

"\n'classifyFolder_wVectors_v4.ipynb' script\n\nScript following from miso.inference.classify import classify_folder as miso_classify_folder.\nIt also allows generating the feature vectors for each image.\n\nDocumentation:\n#for model_vector\nhttps://github.com/microfossil/particle-classification/blob/master/miso/deploy/saving.py  \n#for KNN\nhttps://github.com/microfossil/particle-classification/blob/master/miso/inference/classify.py\n#for generating mislabeled folder and TSNE with feature vectors\nhttps://github.com/microfossil/particle-classification/blob/master/miso/training/trainer.py\n#for extracting vector from compiled tf Keras model\nhttps://github.com/microfossil/particle-classification/blob/master/miso/models/factory.py \n#for saving mislabeled images (uses feature vectors)\nhttps://github.com/microfossil/particle-classification/blob/master/miso/stats/mislabelling.py\n\nCreated: 31-Jan-2025, M.A.\nUpdated: 4-Mar-2024, 7-Mar, M.A.\n\n"

In [1]:
import os
import random
from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import onnx
import pandas as pd
import skimage.io
from PIL import Image
from tqdm import tqdm

from miso.models.factory import *
from miso.data.image_utils import load_image
from miso.deploy.model_info import ModelInfo
from miso.deploy.saving import load_frozen_model_tf2, load_from_xml, load_onnx_from_xml
import tensorflow as tf

import json
from munch import DefaultMunch
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils.extmath import weighted_mode
from sklearn.preprocessing import normalize

import lxml.etree as ET

#Helper functions 

def get_image_paths_and_samples(base_path, sample_name="unknown"):
    """Collect image files under ``base_path`` and a sample name for each one.

    Images found (recursively) inside a first-level sub-folder are tagged with
    that sub-folder's name; images lying directly in ``base_path`` are tagged
    with ``sample_name``.

    Args:
        base_path: Folder to scan (str or Path).
        sample_name: Sample label given to images located directly in
            ``base_path``.

    Returns:
        Tuple ``(image_paths, samples)`` of two equally long lists: the image
        paths as strings and the sample name of each image. The order follows
        the directory iteration order and is not sorted.
    """
    #Note 1: remaining issue with filename sorting (not numerical), probably needs grain number with leading zeros
    #Note 2: the input patches might not begin at 1 (depends on QuPath irregular segmentation) but 2 (which fixes Note 1 issue)

    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff', '.tif')

    def _is_image_file(candidate):
        # rglob('*') also yields directories; is_file() keeps a folder whose
        # name merely ends with an image extension out of the result.
        return candidate.suffix.lower() in image_suffixes and candidate.is_file()

    base_path = Path(base_path)
    image_paths = []
    samples = []
    for subdir in base_path.iterdir():
        if subdir.is_dir():
            for file in subdir.rglob('*'):
                if _is_image_file(file):
                    image_paths.append(str(file))
                    samples.append(subdir.name)
        elif _is_image_file(subdir):
            image_paths.append(str(subdir))
            samples.append(sample_name)
    return image_paths, samples

# Image loading functions
def load_and_preprocess_image(image_path):
    """Load one image as a float32 tensor for use inside a tf.data pipeline.

    The image is read with miso's ``load_image`` using the notebook-level
    ``img_size`` and ``img_type`` values (assigned in the "Load models" cell),
    so those globals must exist before the dataset is iterated.

    Args:
        image_path: String tensor holding the path of the image.

    Returns:
        float32 tensor whose static shape is set to ``img_size``.
    """
    def _read_as_float_tensor(path_tensor):
        # Inside py_function the argument is an eager tensor: decode it to str
        decoded_path = path_tensor.numpy().decode('utf-8')
        pixels = load_image(decoded_path, img_size, img_type)
        return tf.convert_to_tensor(pixels, dtype=tf.float32)

    loaded = tf.py_function(_read_as_float_tensor, [image_path], tf.float32)
    # py_function drops static shape information, so restore it explicitly
    loaded.set_shape(img_size)
    return loaded


User input

In [2]:
# --- Input data -------------------------------------------------------------
# Folder of the dataset to process; previously used datasets are kept below
# as commented-out alternatives.
#images_path = r'D:\Charlotte_spot proj\qupath_segmentation\CL_7-Apr-25\CL'
#images_path = r'D:\Justin Freeman collab\25-mar-2025_Apreo2\CA24MR-1_Redo_stitched and unstitched tiles\montages_final\segmentation\CL_28-Mar-25_row1\row1_stack_CL_greyscale'
images_path = r'C:\Users\acevedoz\OneDrive - Queensland University of Technology\Desktop\Ruby Creek_db_imagery\GA6141_Cross_Mole granite\registered\segmentation\RL_GA6141 200701550_1\CL_GA6141 200701550'

# XML descriptor of the trained network (read by load_from_xml in the next cell)
model_info_path = r'E:\Feb-March_2024_zircon imaging\ParticleTrieur_trial_cl_zircon_annotated\trained model_8-Oct-24\ResNet50_cyclic_test7_20241008-135142\model_tf2\network_info.xml'
# Sample label assigned to images lying directly in images_path (not in a sub-folder)
sample_name = "CA24MR-1_first"
num_neighbours = 11 # K-NN search, default= 11

#images
# c = name of the dataset folder, reused as prefix of the output CSV files
c = os.path.basename(images_path)
# Both CSV outputs are written inside the 'PT_patches' sub-folder of images_path
images_path_dir = os.path.join(images_path, 'PT_patches')
output_path = os.path.join(images_path_dir, c + '_vector.csv') #feature vectors
output_path2 = os.path.join(images_path_dir, c + '_vector-knn.csv') #knn pythonic idx

#model
batch_size = 32 #32
# Predictions whose top score is not above this value are relabelled "unsure" (index -1)
unsure_threshold = 0.0
# a = folder containing network_info.xml; b = its parent folder
a = os.path.dirname(model_info_path)
b = os.path.dirname(a)    
# Path of training_parameters.json in folder b
# NOTE(review): file_tp is not read anywhere in the visible cells - confirm it is still needed
file_tp = os.path.join(b, r"training_parameters.json")

Load models

In [3]:
# Loads two models from the same network description:
#   model        - used later to predict the class scores of each image
#   vector_model - used later to compute the feature vector of each image

#follows miso.inference.classify
# load_from_xml also returns the image size/type used when loading images and the class labels
model, img_size, img_type, labels = load_from_xml(model_info_path) 
#frozen tf model ('WrappedFunction' object)

##pre-check the right layer is chosen
#list_layers = model.graph.get_operations()
#test = list_layers[-14:] #-12:
#test1 = [op.values() for op in test]
#print("\n".join(map(str, test1)))

#follows miso.deploy.saving
# Re-read the XML by hand to obtain the protobuf file name and the input tensor name
filename = model_info_path
project = ET.parse(filename).getroot()
protobuf = project.find('protobuf').text
print(f"Loading model from {filename}")
print("- protobuf: " + protobuf)

# The protobuf file is resolved relative to the folder of the XML file
full_protobuf_path = os.path.join(os.path.dirname(filename), protobuf)
list_xml = project.find('inputs')
# Only the first <input> entry is used; ":0" is appended to its operation name to form the tensor name
for i, entry_xml in enumerate(list_xml.iter('input')):
    if i == 0:
        input_name = entry_xml.find('operation').text + ":0"

# Second frozen model that outputs the tensor 'model_2/dense/Relu:0' instead of the class scores.
# NOTE(review): this tensor name is hard-coded and presumably specific to this trained network;
# use the commented pre-check above to confirm it when switching models.
vector_model = load_frozen_model_tf2(full_protobuf_path, input_name, 'model_2/dense/Relu:0')

Loading model from E:\Feb-March_2024_zircon imaging\ParticleTrieur_trial_cl_zircon_annotated\trained model_8-Oct-24\ResNet50_cyclic_test7_20241008-135142\model_tf2\network_info.xml
- protobuf: frozen_model.pb
- labels:
  - blank-blank
  - blank-oscillatory
  - metamict
  - oscillatory-oscillatory
  - oscillatory-sector op
  - sector-oscillatory
  - sector-oscillatory sector op
  - sector-sector
  - stripy-stripy
  - wavy-wavy
- input: x:0
  - height: 224
  - width: 224
  - channels: 3
- output: Identity:0
Loading model from E:\Feb-March_2024_zircon imaging\ParticleTrieur_trial_cl_zircon_annotated\trained model_8-Oct-24\ResNet50_cyclic_test7_20241008-135142\model_tf2\network_info.xml
- protobuf: frozen_model.pb


Classify folder: producing feature vectors

In [4]:
# Classifies every image found under images_path, computes its feature vector,
# and writes one CSV row per image (prediction columns + L2-normalised vector) to output_path.

#follows particle-classification/miso/inference/classify.py > classify_folder

# Create a dataset of image paths
image_paths, sample_names = get_image_paths_and_samples(images_path, sample_name)

# Images are loaded lazily (in parallel) and grouped into batches of batch_size
image_dataset = tf.data.Dataset.from_tensor_slices(image_paths)
image_dataset = image_dataset.map(load_and_preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
image_dataset = image_dataset.batch(batch_size)

# Run predictions
# Per-image accumulators, filled batch by batch in the same order as image_paths
vector_list = []
predictions = []
idxs = []
cls = []
scores = []
for batch in tqdm(image_dataset):
   
    #Predict class
    preds = model(batch).numpy() #frozen model

    #Calculate feature vector
    batch_vectors = vector_model(batch).numpy() #not fine-tuned (issue)    

    # Top class per image: index, label name, and its score
    batch_idxs = np.argmax(preds, axis=1)
    batch_labels = [labels[idx] for idx in batch_idxs]
    batch_scores = np.max(preds, axis=1)
    
    #Append
    vector_list.extend(batch_vectors)
    predictions.extend(preds)
    idxs.extend(batch_idxs)
    cls.extend(batch_labels)
    scores.extend(batch_scores)

# Predictions whose score is not above unsure_threshold become -1 / "unsure".
# (The loop variable named `cls` in the second comprehension is local to it and
# does not clobber the outer list while it is being iterated.)
idxs = [idx if score > unsure_threshold else -1 for idx, score in zip(idxs, scores)]
cls = [cls if score > unsure_threshold else "unsure" for cls, score in zip(cls, scores)]

# Prediction table: one row per image; short_filename is the path relative to images_path
df = pd.DataFrame({
    "filename": image_paths,
    "short_filename": [Path(f).relative_to(images_path) for f in image_paths],
    "sample": sample_names,
    "class_index": idxs,
    "class": cls,
    "score": scores
})

#follows particle-classification.miso.stats.mislabelling.py > find_and_save_mislabelled
vector_array = np.array(vector_list)
vectors_n = normalize(vector_array, axis=1) #row-wise normalisation; default norm='l2' (Euclidean)

#Generating new table
# One column per vector component, named vector_0, vector_1, ...
new_df = pd.DataFrame(vectors_n, 
                      columns=['vector_'+str(i) for i in range(vectors_n.shape[1])])
# Column-wise concatenation: predictions followed by the normalised feature vector
df2 = pd.concat([df, new_df], axis=1)


df2.to_csv(output_path, index=False)

100%|██████████| 8/8 [00:12<00:00,  1.55s/it]


K-Nearest Neighbors using vectors (for sorting)

In [5]:
# K-nearest-neighbour analysis on the normalised feature vectors.
# Produces, for every image:
#   - idx_knn_*: the row indices of ALL other images, ordered from nearest to farthest
#   - pred_cls / pred_w: the distance-weighted majority class among its
#     num_neighbours nearest images, and the weight of that vote
# and writes them, appended to df2, to output_path2.

# cls_labels = df2["class"].to_list() #label names
cls = df2["class_index"].to_numpy() #label numbers
image_names = df2["filename"].to_list()

# Fit k-Nearest neighbours
neigh = KNeighborsClassifier(n_neighbors=num_neighbours + 1, algorithm='brute')
neigh.fit(vectors_n, cls)

# Find nearest vectors: every image is ranked against the whole dataset so the
# full ordering can be exported for sorting
num_neighbours_sort = len(cls)
knn = neigh.kneighbors(vectors_n, n_neighbors=num_neighbours_sort)
distance = knn[0][:, 1:] #n_images x (n_images - 1), ascending order
idx_knn = knn[1][:, 1:] #pythonic idx (same order as input dataframe; warning: filename sorting is not numerical)
# The first will be the same vector as passed in so ignore it

# Get the predicted class from knn (ignoring the same vector)
knn_cls = cls[idx_knn]
# Fix: vote only among the num_neighbours nearest images. Voting over every
# column (the whole dataset) ignored the num_neighbours setting.
num_vote = min(num_neighbours, idx_knn.shape[1])
distance_vote = distance[:, :num_vote]
# Closer neighbours get larger weights (largest retained distance -> weight 0)
wm = weighted_mode(knn_cls[:, :num_vote], np.max(distance_vote) - distance_vote, axis=1)
pred_cls = wm[0].astype(int).flatten() #label number (from Array of modal values)
pred_w = wm[1].flatten()  #Array of weighted counts for each mode. default=not flatten

#Generating output table 
dict_temp = {'pred_cls': pred_cls, 
             'pred_w': pred_w}
new_df2 = pd.DataFrame(data= dict_temp)

new_df3 = pd.DataFrame(idx_knn, 
                      columns=['idx_knn_'+str(i) for i in range(idx_knn.shape[1])])

df3 = pd.concat([df2, new_df2, new_df3], axis=1)
df3.to_csv(output_path2, index=False)

output_path2 #to search feature vectors with 'prototype_two_v9.m' script


'C:\\Users\\acevedoz\\OneDrive - Queensland University of Technology\\Desktop\\Ruby Creek_db_imagery\\GA6141_Cross_Mole granite\\registered\\segmentation\\RL_GA6141 200701550_1\\CL_GA6141 200701550\\PT_patches\\CL_GA6141 200701550_vector-knn.csv'

In [14]:
# Preview only the first few paths; displaying the whole list floods the notebook output
image_paths[:5]

['D:\\Justin Freeman collab\\25-mar-2025_Apreo2\\CA24MR-1_Redo_stitched and unstitched tiles\\montages_final\\segmentation\\CL_28-Mar-25_row1\\row1_stack_CL_greyscale\\PT_patches\\row1_stack-0001_null_100_[x=5046,y=3006,w=582,h=356].jpg',
 'D:\\Justin Freeman collab\\25-mar-2025_Apreo2\\CA24MR-1_Redo_stitched and unstitched tiles\\montages_final\\segmentation\\CL_28-Mar-25_row1\\row1_stack_CL_greyscale\\PT_patches\\row1_stack-0001_null_101_[x=53852,y=3016,w=276,h=436].jpg',
 'D:\\Justin Freeman collab\\25-mar-2025_Apreo2\\CA24MR-1_Redo_stitched and unstitched tiles\\montages_final\\segmentation\\CL_28-Mar-25_row1\\row1_stack_CL_greyscale\\PT_patches\\row1_stack-0001_null_102_[x=39024,y=3028,w=796,h=478].jpg',
 'D:\\Justin Freeman collab\\25-mar-2025_Apreo2\\CA24MR-1_Redo_stitched and unstitched tiles\\montages_final\\segmentation\\CL_28-Mar-25_row1\\row1_stack_CL_greyscale\\PT_patches\\row1_stack-0001_null_103_[x=53300,y=3054,w=388,h=352].jpg',
 'D:\\Justin Freeman collab\\25-mar-2025_