# Imports

In [93]:
# Files
import os

# Linear Algebra
import numpy as np

# Multi thread & processing
import multiprocessing

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning general and Data Modelling
import pandas as pd
import sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder

# Image processing
import cv2

# Evaluation
from sklearn.metrics import accuracy_score

In [65]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Prepare Dataset

In [66]:
def get_list_of_files(directory):
  allFiles = []
  for root, dirs, files in os.walk(directory):
    for file in files:
        fullPath = os.path.join(root, file)
        allFiles.append(fullPath)
  return allFiles

def load_image_with_path(image_path):
    img = cv2.imread(image_path)
    img = cv2.resize(img, (150, 150), interpolation=cv2.INTER_AREA)
    label = os.path.basename(os.path.dirname(image_path))
    return img, label, image_path

def prepare_data(dataset, num_workers=2):
    with multiprocessing.Pool(num_workers) as pool:
        results = pool.map(load_image_with_path, dataset)

    data = [res[0] for res in results]
    labels = [res[1] for res in results]
    paths = [res[2] for res in results]

    return np.array(data), np.array(labels), paths

In [67]:
train_ds = get_list_of_files("/content/drive/MyDrive/Research Leaf/ASHIIQ/compute_ashiq/Train")
test_ds = get_list_of_files("/content/drive/MyDrive/Research Leaf/ASHIIQ/compute_ashiq/Test")

In [68]:
X_train, y_train, train_paths = prepare_data(train_ds, 2)
X_test, y_test, test_paths = prepare_data(test_ds, 2)

# Image Processing

## Remove Background

In [69]:
def remove_background_binary(image):
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    saturation = hsv_image[:, :, 1]
    _, binary_mask = cv2.threshold(saturation, 85, 255, cv2.THRESH_BINARY)
    masked_image = cv2.bitwise_and(image, image, mask=binary_mask)
    return masked_image

## Find Optimal Cluster

In [70]:
def find_optimal_k(gaps, s_k, max_k):
    for k in range(1, max_k):
        if gaps[k - 1] >= gaps[k] - s_k[k]:
            return k
    return max_k

def gap_statistic_for_k(pixel_values, k, n_refs=10, random_state=42):
    kmeans = KMeans(n_clusters=k, random_state=random_state, n_init='auto')
    kmeans.fit(pixel_values)

    disp = np.sum(np.min(pairwise_distances(pixel_values, kmeans.cluster_centers_, metric='euclidean')**2, axis=1))

    ref_disps = np.zeros(n_refs)
    for i in range(n_refs):
        random_reference = np.random.uniform(
            low=pixel_values.min(axis=0),
            high=pixel_values.max(axis=0),
            size=pixel_values.shape
        )
        kmeans.fit(random_reference)
        ref_disps[i] = np.sum(np.min(pairwise_distances(random_reference, kmeans.cluster_centers_, metric='euclidean')**2, axis=1))

    log_ref_disp = np.log(ref_disps + 1e-10)
    gap = np.mean(log_ref_disp) - np.log(disp + 1e-10)
    s_k = np.sqrt(np.sum((log_ref_disp - np.mean(log_ref_disp))**2) / n_refs)
    return gap, s_k

def calculate_gap_statistic(image, max_k=10, n_refs=10, random_state=42):
    pixel_values = np.ascontiguousarray(image.reshape((-1, 3)))
    gaps = np.zeros(max_k)
    s_k = np.zeros(max_k)

    for k in range(1, max_k + 1):
        gap, sk = gap_statistic_for_k(pixel_values, k, n_refs=n_refs, random_state=random_state)
        gaps[k - 1] = gap
        s_k[k - 1] = sk

    return gaps, s_k

# Image Segmentation

In [71]:
def k_mean_segmentation(disease, attempts=10, K=2):
    disease = remove_background_binary(disease)
    leaf_image = disease.reshape((-1, 3)).astype(np.float32)
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    _, labels, centers = cv2.kmeans(leaf_image, K, None, criteria, attempts, cv2.KMEANS_PP_CENTERS)
    centers = np.uint8(centers)
    segmented_image = centers[labels.flatten()].reshape(disease.shape)
    return segmented_image, labels, centers

def segment_image(image, K=2):
    return k_mean_segmentation(image, K=K)

def process_image(args):
    image, max_k = args
    # gaps, s_k = calculate_gap_statistic(image, max_k=max_k)
    # optimal_k = find_optimal_k(gaps, s_k, max_k)
    return k_mean_segmentation(image, K=8)

def parallel_segmentation(images, max_k=10, num_workers=2):
    with multiprocessing.get_context("fork").Pool(processes=num_workers) as pool:
        args = [(image, max_k) for image in images]
        results = pool.map(process_image, args)
    return results

---

# Divider

---

In [72]:
train_segmented_images = parallel_segmentation(X_train, max_k=8, num_workers=2)

In [73]:
test_segmented_images = parallel_segmentation(X_test, max_k=8, num_workers=2)

In [74]:
def extract_features(segmented_results, labels, centers):
    """
    Extract mean HSV features from segmented images and flatten them.

    Parameters:
    - segmented_results: List of segmented images (output from k_mean_segmentation).
    - labels: List of label arrays corresponding to each image.
    - centers: List of cluster center arrays corresponding to each image.

    Returns:
    - features: Flattened HSV features for all images.
    """
    features = []

    for segmented_image, label_array, center_array in zip(segmented_results, labels, centers):
        hsv_image = cv2.cvtColor(segmented_image, cv2.COLOR_BGR2HSV)

        image_features = []
        num_clusters = len(center_array)

        for cluster_id in range(num_clusters):
            cluster_mask = (label_array.reshape(segmented_image.shape[:2]) == cluster_id)

            cluster_pixels = hsv_image[cluster_mask]

            if cluster_pixels.size > 0:
                mean_hsv = cluster_pixels.mean(axis=0)
            else:
                mean_hsv = [0, 0, 0]

            image_features.extend(mean_hsv)

        features.append(image_features)

    return np.array(features)

# Testing Divider

In [75]:
train_segmented = [segmented_image for segmented_image, _, _ in train_segmented_images]
train_labels = [label for _, label, _ in train_segmented_images]
train_centers = [center for _, _, center in train_segmented_images]

In [76]:
test_segmented = [segmented_image for segmented_image, _, _ in test_segmented_images]
test_labels = [label for _, label, _ in test_segmented_images]
test_centers = [center for _, _, center in test_segmented_images]

In [77]:
X_traine = extract_features(train_segmented, train_labels, train_centers)
X_teste = extract_features(test_segmented, test_labels, test_centers)

In [78]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [79]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [80]:
knn_model = KNeighborsClassifier(n_neighbors=3)

In [81]:
svm_model = SVC(kernel='linear', C=1.0)

In [82]:
nb_model = GaussianNB()

In [83]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [84]:
dt_model = DecisionTreeClassifier(random_state=42)

In [85]:
knn_model.fit(X_traine, y_train_encoded)

In [86]:
svm_model.fit(X_traine, y_train_encoded)

In [87]:
nb_model.fit(X_traine, y_train_encoded)

In [88]:
rf_model.fit(X_traine, y_train_encoded)

In [89]:
dt_model.fit(X_traine, y_train_encoded)

# Evaluation

In [98]:
y_pred_encoded = knn_model.predict(X_teste)

y_pred = label_encoder.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"KNN Accuracy: {accuracy * 100:.2f}%")

KNN Accuracy: 56.90%


In [94]:
y_pred_encoded = svm_model.predict(X_teste)

y_pred = label_encoder.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"SVM Accuracy: {accuracy * 100:.2f}%")

SVM Accuracy: 51.72%


In [95]:
y_pred_encoded = nb_model.predict(X_teste)

y_pred = label_encoder.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"Naive Bayes Accuracy: {accuracy * 100:.2f}%")

Naive Bayes Accuracy: 55.17%


In [96]:
y_pred_encoded = rf_model.predict(X_teste)

y_pred = label_encoder.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"Random Forest Accuracy: {accuracy * 100:.2f}%")

Random Forest Accuracy: 74.71%


In [97]:
y_pred_encoded = dt_model.predict(X_teste)

y_pred = label_encoder.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
print(f"Decision Tree Accuracy: {accuracy * 100:.2f}%")

Decision Tree Accuracy: 61.49%
