In [1]:
'''
In this notebook, we generate features and come up with the models
'''

'\nIn this notebook, we generate features and come up with the models\n'

In [1]:
#for array manipulations
import numpy as np
#for image processing
import cv2 
#for displaying images
import matplotlib.pyplot as plt
#to display images in this notebook, not in a separate window
%matplotlib inline
#to access system resources such as directories
import os
import pandas as pd

In [2]:
#Set this to point to the project root; all paths will be relative to this one.
#The MAIZE_PROJECT_DIR environment variable, when set, takes precedence over the default
#below, so the notebook can be run on another machine without editing this cell.
project_dir = os.environ.get(
    'MAIZE_PROJECT_DIR',
    '/home/lyle/tutorials/AI/scikit-learn/maize-disease-detection/',
)

In [4]:
def set_up_directories(project_dir=project_dir):
    """Builds the paths to the directories this notebook works with

    Parameters
    ----------
    project_dir : string; defaults to the value the global ``project_dir`` had when this function was defined
        The path to the project root i.e '/home/lyle/tutorials/AI/scikit-learn/maize-disease-detection/'

    returns
    -------
    base_dir : string
        The project directory path (``project_dir`` returned unchanged)
    data_folder : string
        The path of the 'data' subfolder of the project directory
    maize_data_folder : string
        The path of the 'maize' subfolder of the data folder, which contains the maize images

    example usage
    -------------
    base_dir, data_folder, maize_data_folder = set_up_directories()
    """

    #the data folder sits directly under the project root
    data_folder = os.path.join(project_dir, 'data')
    #the maize images have their own subfolder inside the data folder
    return project_dir, data_folder, os.path.join(data_folder, 'maize')

def get_32(disease):
    """Loads the images stored in the folder of a given maize disease

    Every file in ``<maize_data_folder>/<disease>`` that OpenCV can decode is loaded and
    converted from BGR to RGB. No limit is applied to the number of images; the name
    presumably reflects that each folder holds 32 sample images (verify against the data).

    parameters
    ----------
    disease: string
        A string that could be common_rust, healthy, leaf_spot, nothern_leaf_blight
    returns
    -------
    disease_images: list
        A list of RGB images (numpy arrays) for the selected disease, ordered by file name
    """

    #this list will contain the images returned
    disease_images = []
    #path to the images; maize_data_folder is a global set by the directories set-up cell
    disease_images_path = os.path.join(maize_data_folder, disease)
    #os.listdir gives no ordering guarantee, so sort to keep image indices stable between runs
    for file_name in sorted(os.listdir(disease_images_path)):
        image_path = os.path.join(disease_images_path, file_name)
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        #cv2.imread returns None for anything it cannot decode (sub-directories, hidden files,
        #corrupt images); passing that to cvtColor would raise, so skip the entry instead
        if image is None:
            print(f'Skipping unreadable file: {image_path}')
            continue
        disease_images.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    return disease_images

#This function will help us plot up to 10 images
def plot_images(images, title):
    """Plots up to 10 images of a particular disease category in a 2x5 grid

    parameters
    ----------
    images: list
        List of images(each image is an array); only the first 10 are shown
    title: string
        Title for each image i.e name of disease
    """

    plt.figure(figsize=(12,8))
    #slice rather than index with range(10) so that fewer than 10 images do not raise IndexError
    for position, image in enumerate(images[:10], start=1):
        plt.subplot(2, 5, position)
        plt.imshow(image)
        plt.title(title)
        #hide the axis ticks, they carry no information for a picture
        plt.xticks([])
        plt.yticks([])
    plt.show()
    
#Thin wrapper around cv2.resize with a default target size
def resize(image, new_size=(600,600)):
    """Returns a resized copy of the given image

    parameters
    ----------
    image : numpy array
        The image to be resized
    new_size : tuple
        The new image size, handed to cv2.resize as its second positional argument
    returns
    -------
    resized_image : numpy array
        The resized image
    """

    return cv2.resize(image, new_size)

#This function generates ORB features
def extract_features_orb(image, vector_size=32):
    """Extracts orb features for the given image

    The descriptors of the first ``vector_size`` detected keypoints are flattened into one
    vector, which is zero-padded at the end up to ``vector_size*128`` entries.

    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    vector_size : int
        The number of keypoints to use
    returns
    -------
        orb_descriptors : numpy array
            Flat feature vector; all zeros when no keypoint is detected.
            None is returned when OpenCV raises an error (the error is printed).
    """
    #Length of the returned vector. OpenCV documents ORB descriptors as 32 bytes per keypoint,
    #so the multiplier of 128 is larger than needed; it is kept so that the vector length
    #(4096 for the default vector_size) stays the same for existing callers.
    needed_size = (vector_size*128)
    try:
        feature_generator = cv2.ORB_create()
        orb_keypoints = feature_generator.detect(image)
        #honour vector_size instead of a hard-coded 32
        orb_keypoints = orb_keypoints[:vector_size]
        orb_keypoints, orb_descriptors = feature_generator.compute(image, orb_keypoints)
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    if orb_descriptors is None:
        #compute yields no descriptor array when there are no keypoints; treat that as an
        #empty vector so it is padded with zeros instead of failing on None.flatten()
        orb_descriptors = np.zeros(0)
    orb_descriptors = orb_descriptors.flatten()
    if orb_descriptors.size < needed_size:
        #If we have fewer values than needed, add zeros to the end of our vector
        orb_descriptors = np.concatenate([orb_descriptors, np.zeros(needed_size - orb_descriptors.size)])
    return orb_descriptors

#This function generates KAZE features
def extract_features_kaze(image, vector_size=32):
    """Extracts kaze features for the given image

    The descriptors of the first ``vector_size`` detected keypoints are flattened into one
    vector, which is zero-padded at the end up to ``vector_size*128`` entries.

    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    vector_size : int
        The number of keypoints to use
    returns
    -------
        kaze_descriptors : numpy array
            Flat feature vector; all zeros when no keypoint is detected.
            None is returned when OpenCV raises an error (the error is printed).
    """
    #Length of the returned vector (4096 for the default vector_size).
    #NOTE(review): 128 values per keypoint matches KAZE's extended descriptor; KAZE_create()
    #with default arguments presumably yields 64 per keypoint, in which case the remainder
    #is zero padding - confirm against the OpenCV KAZE documentation.
    needed_size = (vector_size*128)
    try:
        feature_generator = cv2.KAZE_create()
        kaze_keypoints = feature_generator.detect(image)
        #honour vector_size instead of a hard-coded 32
        kaze_keypoints = kaze_keypoints[:vector_size]
        kaze_keypoints, kaze_descriptors = feature_generator.compute(image, kaze_keypoints)
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    if kaze_descriptors is None:
        #compute yields no descriptor array when there are no keypoints; treat that as an
        #empty vector so it is padded with zeros instead of failing on None.flatten()
        kaze_descriptors = np.zeros(0)
    kaze_descriptors = kaze_descriptors.flatten()
    if kaze_descriptors.size < needed_size:
        #If we have fewer values than needed, add zeros to the end of our vector
        kaze_descriptors = np.concatenate([kaze_descriptors, np.zeros(needed_size - kaze_descriptors.size)])
    return kaze_descriptors

def extract_hog_features(image, feature_size=4096):
    """Extracts hog features for the image

    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    feature_size : int
        The maximum number of features to return
    returns
    -------
        hog_features : numpy array
            The first ``feature_size`` HOG values as a flat array; shorter if the
            descriptor produces fewer values (no padding is applied)

    raises
    ------
    cv2.error
        Not caught here; propagates if hog.compute rejects the image
    """
    hog = cv2.HOGDescriptor()
    #use the image that was passed in; the global common_rust_images[0] was used before,
    #which made every call return the features of that one image
    features = hog.compute(image)
    #ravel so the result is one-dimensional whatever shape compute returned
    required_features = features[:feature_size].ravel()
    return required_features

In [6]:
#Directories set up
#get_32 reads maize_data_folder as a global, so this cell has to run before any images are loaded
base_dir, data_folder, maize_data_folder = set_up_directories()

In [8]:
#Load the images of each category, in the order the names are unpacked on the left
(common_rust_images,
 healthy_images,
 leaf_spot_images,
 nothern_leaf_blight_images) = (
    get_32(category)
    for category in ('common_rust', 'healthy', 'leaf_spot', 'nothern_leaf_blight')
)

In [None]:
#In this example, we generate a sample dataset from the KAZE features of one image per category
#A dataset usually consists of an item's features and the corresponding labels

#Compute the sample features in this cell (from the first image of each category) so that
#it also works on a fresh kernel; these names are not defined anywhere else in the notebook
common_rust_image_kaze_features = extract_features_kaze(common_rust_images[0])
healthy_image_kaze_features = extract_features_kaze(healthy_images[0])
leaf_spot_image_kaze_features = extract_features_kaze(leaf_spot_images[0])
nothern_leaf_blight_image_kaze_features = extract_features_kaze(nothern_leaf_blight_images[0])

#This will be our feature set
sample_kaze_features = np.array([
    common_rust_image_kaze_features,
    healthy_image_kaze_features,
    leaf_spot_image_kaze_features,
    nothern_leaf_blight_image_kaze_features,
])
#This will contain our labels, in the same order as the features above
sample_kaze_labels = np.array(['common_rust', 'healthy', 'leaf_spot', 'nothern_leaf_blight'])
#In our dataset, we have four instances and each instance has 4096 features
sample_kaze_features.shape

In [None]:
#We have four labels, each label just tells us the disease type
sample_kaze_labels.shape

In [None]:
#Here, we create a sample dataset for the model. X and y are the standard variables used
X = sample_kaze_features
y = sample_kaze_labels

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#Scaling is one of the preprocessing steps that help our models work better
#Note that the scaler is fitted on the whole of X here, i.e. before the train/test split below
X = StandardScaler().fit_transform(X)
#We split into a train set and a test set
#The labels are split in a later cell using the same test_size and random_state
X_train, X_test = train_test_split(X, test_size=0.1, random_state=4)

In [None]:
print(f'Our train data set is of shape {X_train.shape}')
print(f'Our test dataset is of shape {X_test.shape}')

In [None]:
from sklearn.preprocessing import LabelEncoder

#We encode our labels into numerical values since that is the kind of dataset that models work with
y = LabelEncoder().fit_transform(y)
#We then split into a train and test set
#Features and labels go through one train_test_split call so every row is guaranteed to stay
#paired with its label; test_size and random_state are the same as in the feature-only split
#earlier, so X_train and X_test come out unchanged
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=4)

In [None]:
y_train.shape, y_test.shape

In [None]:
y_train

In [None]:
#Now let us perform these steps for all the images loaded
#The category names, in the same order as the image lists zipped with them below
labels = ['common_rust', 'healthy', 'leaf_spot', 'nothern_leaf_blight']
#all_images will contain all our images, all_labels the matching label of each image
all_images, all_labels = [], []
for label, image_folder in zip(labels, [common_rust_images, healthy_images, leaf_spot_images, nothern_leaf_blight_images]):
    all_images.extend(image_folder)
    #one copy of the category name per image of that category
    all_labels.extend([label] * len(image_folder))

In [None]:
len(all_images), len(all_labels)

In [None]:
#Let us extract KAZE features
def extract_features(random_state=4):
    """Builds a scaled, label-encoded train/test dataset from the loaded images

    Reads the globals ``all_images`` and ``all_labels``, extracts the KAZE feature vector of
    every image, standardises the features, encodes the labels as integers and splits the
    result 70/30 into a train and a test set.

    parameters
    ----------
    random_state : int or None
        Seed passed to train_test_split so the split is reproducible between runs;
        pass None for a different random split on every call
    returns
    -------
    X_train, X_test, y_train, y_test : numpy arrays
        Train/test features and their encoded labels
    """
    image_features, image_labels = [], []
    for image, image_label in zip(all_images, all_labels):
        kaze_features = extract_features_kaze(image)
        #extract_features_kaze returns None when OpenCV raised an error; leave such an image
        #(and its label) out, otherwise np.array would build an object array that cannot be scaled
        if kaze_features is None:
            continue
        image_features.append(kaze_features)
        image_labels.append(image_label)
    features = np.array(image_features)
    encoded_labels = np.array(image_labels)
    #NOTE(review): the scaler is fitted on all samples before the split, so test-set
    #statistics influence the training features; kept as in the rest of the notebook
    features = StandardScaler().fit_transform(features)
    encoded_labels = LabelEncoder().fit_transform(encoded_labels)
    X_train, X_test, y_train, y_test = train_test_split(
        features, encoded_labels, test_size=0.3, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = extract_features()

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

In [None]:
X_test[0]

In [None]:
X_train[0]

In [None]:
y_train[:10]

In [None]:
y_test[:10]

In [None]:
#Now these two, y_train and X_train can be fed into any classifier