In [1]:
#These are imports for models and utilities for working with models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
#for array manipulations
import numpy as np
import pandas as pd
#for image processing
import cv2 
#for displaying images
import matplotlib.pyplot as plt
#to display images in this notebook, not in a separate window
%matplotlib inline
#to access system resources such as directories
import os

In [3]:
#Set this to point to the project root; all paths will be relative to this one
project_dir = '/home/lyle/notebooks/maize-disease-detection/'

In [5]:
def set_up_directories(project_dir=project_dir):
    """Sets up the paths to important direcoties
    
    Parameters
    ----------
    project_dir : string; default is the current working directory
        The path to the project root i.e '/home/lyle/tutorials/AI/scikit-learn/maize-disease-detection/'
    
    returns
    -------
    base_dir : string
        The project directory path
    data_folder : string
        The data subfolder path
    maize_data_folder : 
        The path to the subdirectory containing the maize images
        
    example usage
    -------------
    base_dir, data_folder, maize_data_folder = set_up_directories()
    """
    
    #set our base directory. This should point to the location of the plant-diseases folder
    base_dir = project_dir
    #set the path to our data folder
    data_folder = os.path.join(base_dir, 'data')
    #set the path to the maize folder and list the various categories available
    maize_data_folder = os.path.join(data_folder, 'maize')

    return base_dir, data_folder, maize_data_folder

def get_32(disease):
    """Loads 32 images for a given maize disease
    
    parameters
    ----------
    disease: string
        A string that could be common_rust, healthy, leaf_spot, nothern_leaf_blight
    returns
    -------
    disease_images: list
        A list of images for the selected disease
    """
    
    #this list will contain the 20 images returned
    disease_images = []
    #path to the images
    disease_images_path = os.path.join(maize_data_folder, disease)
    for image_path in os.listdir(disease_images_path):
        image_path = os.path.join(disease_images_path, image_path)
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
        disease_images.append(image)
    return disease_images

#This function will help us plot 10 images
def plot_images(images, title):
    """Plots 10 images of a particular disease category
    
    parameters
    ----------
    images: list
        List of images(each image is an array)
    title: string
        Title for each image i.e name of disease
    """
    
    plt.figure(figsize=(12,8))
    for i in range(10):
        plt.subplot(2,5, i+1)
        plt.imshow(images[i])
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
    plt.show()
    
#This function allows us to resize images
def resize(image, new_size=(600,600)):
    """Resize the given image
    
    parameters
    ----------
    image : numpy array
        The image to be resized
    new_size : tuple
        The new image size
    returns
    -------
    resized_image : numpy arra
        The resized image
    """
    
    resized_image = cv2.resize(image, new_size)
    return resized_image

#This function generates ORB features
def extract_features_orb(image, vector_size=32):
    """Extracts orb features for the given image
    
    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    vector_size : int
        The number of keypoints to use
    returns
    -------
        orb_decriptors : 
        
    raises
    ------
    cv2.error
    """
    try:
        feature_generator = cv2.ORB_create()
        orb_keypoints = feature_generator.detect(image)
        orb_keypoints = orb_keypoints[:32]
        orb_keypoints, orb_descriptors = feature_generator.compute(image, orb_keypoints)
        orb_descriptors = orb_descriptors.flatten()
        #The descriptor vector size is 128
        needed_size = (vector_size*128)
        if orb_descriptors.size < needed_size:
            #If we have less than 32 keypoints, add zeros to the end of our vector
            orb_descriptors = np.concatenate([orb_descriptors, np.zeros(needed_size - orb_descriptors.size)])
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    return orb_descriptors

#This function generates KAZE features
def extract_features_kaze(image, vector_size=32):
    """Extracts kaze features for the given image
    
    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    vector_size : int
        The number of keypoints to use
    returns
    -------
        kaze_descriptors : 
        
    raises
    ------
    cv2.error
    """
    try:
        feature_generator = cv2.KAZE_create()
        kaze_keypoints = feature_generator.detect(image)
        kaze_keypoints = kaze_keypoints[:32]
        kaze_keypoints, kaze_descriptors = feature_generator.compute(image, kaze_keypoints)
        kaze_descriptors = kaze_descriptors.flatten()
        #The descriptor vector size is 128
        needed_size = (vector_size*128)
        if kaze_descriptors.size < needed_size:
            #If we have less than 32 keypoints, add zeros to the end of our vector
            kaze_descriptors = np.concatenate([kaze_descriptors, np.zeros(needed_size - kaze_descriptors.size)])
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    return kaze_descriptors

def extract_features_hog(image, feature_size=4096):
    """Extracts hog features for the image
    
    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    feature_size : int
        The number of features to generate
    returns
    -------
        hog_features : numpy array 
        
    raises
    ------
    cv2.error
    """
    hog = cv2.HOGDescriptor()
    features = hog.compute(image)
    required_features = features[:feature_size].ravel()
    return required_features

#This function generates SIFT features
def extract_features_sift(image, vector_size=32):
    """Extracts sift features for the given image
    
    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    vector_size : int
        The number of keypoints to use
    returns
    -------
        sift_descriptors : 
        
    raises
    ------
    cv2.error
    """
    try:
        feature_generator = cv2.SIFT_create()
        sift_keypoints = feature_generator.detect(image)
        sift_keypoints = sift_keypoints[:32]
        sift_keypoints, sift_descriptors = feature_generator.compute(image, sift_keypoints)
        sift_descriptors = sift_descriptors.flatten()
        #The descriptor vector size is 128
        needed_size = (vector_size*128)
        if sift_descriptors.size < needed_size:
            #If we have less than 32 keypoints, add zeros to the end of our vector
            sift_descriptors = np.concatenate([sift_descriptors, np.zeros(needed_size - sift_descriptors.size)])
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    return sift_descriptors

#This function generates SURF features
def extract_features_surf(image, vector_size=32):
    """Extracts surf features for the given image
    
    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    vector_size : int
        The number of keypoints to use
    returns
    -------
        surf_descriptors : 
        
    raises
    ------
    cv2.error
    """
    try:
        # Initiate FAST detector
        star = cv2.xfeatures2d.StarDetector_create()
        surf_keypoints = feature_generator.detect(image)
        surf_keypoints = surf_keypoints[:32]
        surf_keypoints, surf_descriptors = feature_generator.compute(image, surf_keypoints)
        surf_descriptors = surf_descriptors.flatten()
        #The descriptor vector size is 128
        needed_size = (vector_size*128)
        if surf_descriptors.size < needed_size:
            #If we have less than 32 keypoints, add zeros to the end of our vector
            sift_descriptors = np.concatenate([surf_descriptors, np.zeros(needed_size - surf_descriptors.size)])
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    return surf_descriptors

#This function generates BRIEF features
def extract_features_brief(image, vector_size=32, algorithm="star"):
    """Extracts features for the given image using BRIEF
    
    parameters
    ----------
    image : numpy array
        The image whose features are to be extracted
    vector_size : int
        The number of keypoints to use
    algorithm : string
        The algorithm to use; can be star or fast
    returns
    -------
        brief_descriptors : 
        
    raises
    ------
    cv2.error
    """
    try:
        alg = cv2.xfeatures2d.StarDetector_create()
        if algorithm == "fast":
            alg = cv2.FastFeatureDetector_create()
        brief = cv2.xfeatures2d.BriefDescriptorExtractor_create()
        
        kp = alg.detect(image, None)
        kp = kp[:32]
        kp, des = brief.compute(image, kp)
        des = des.flatten()
        #The descriptor vector size is 128
        needed_size = (vector_size*128)
        if des.size < needed_size:
            #If we have less than 32 keypoints, add zeros to the end of our vector
            des = np.concatenate([des, np.zeros(needed_size - des.size)])
    except cv2.error as e:
        print(f'Error: {e}')
        return np.array([])
    except AttributeError:
        return np.array([])
    return des

In [6]:
#Directories set up
base_dir, data_folder, maize_data_folder = set_up_directories()

In [30]:
def load_all_images():
    """Loads all the images in our maize data folder
    
    returns
    -------
    all_images : list
        List of images
    all_labels : list
        List of corresponding image labels
    """
    
    all_images = []
    all_labels = []
    
    labels = ['common_rust', 'healthy', 'leaf_spot', 'nothern_leaf_blight']
    common_rust_images = get_32('common_rust')
    healthy_images = get_32('healthy')
    leaf_spot_images = get_32('leaf_spot')
    nothern_leaf_blight_images = get_32('nothern_leaf_blight')
    
    for i, image_folder in enumerate([common_rust_images, healthy_images, leaf_spot_images, nothern_leaf_blight_images]):
        for image in image_folder:
            all_images.append(image)
            all_labels.append(labels[i])
            
    return all_images, all_labels

#Function to extract features using a particular algorithm
def extract_features(algorithm=0):
    """Extract features for all images in our dataset
    
    parameters
    ----------
    algorithm : int
        An integer that could be 0, 1, 2, 3, 4
        0 for KAZE
        1 for ORB
        2 for HOG
        3 for SIFT
        4 for BRIEF
    return
    ------
    X_train : numpy array
        An array of shape (n, 4096) containing the features used in training 
    X_test : numpy array
        An array of shape (m, 4096) containing the features used for testing
    y_train : numpy array
        An array of labels for the trainig set
    y_test : numpy array
        An array of labels for the test set
    """
    
    all_images, all_labels = load_all_images()
    
    features, labels = [], []
    
    for i, image in enumerate(all_images):
        image_features = []
        try:
            if algorithm == 0:
                image_features = extract_features_kaze(image)
            elif algorithm == 1:
                image_features = extract_features_orb(image)
            elif algorithm == 2:
                image_features = extract_features_hog(image)
            elif algorithm == 3:
                image_features = extract_features_sift(image)
            elif algorithm == 4:
                image_features = extract_features_brief(image)
            if image_features.shape[0]:
                image_label = all_labels[i]
                features.append(image_features)
                labels.append(image_label)
        except AttributeError as e:
            print(e)
    features = np.array(features)
    labels = np.array(labels)
    features = StandardScaler().fit_transform(features)
    labels = LabelEncoder().fit_transform(labels)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = extract_features()

In [9]:
X_train.shape, y_train.shape

((89, 4096), (89,))

In [10]:
X_test.shape, y_test.shape

((39, 4096), (39,))

In [11]:
models = [
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(solver='lbfgs', multi_class='auto'),
    AdaBoostClassifier(),
    BaggingClassifier(),
    GradientBoostingClassifier(),
    BernoulliNB(),
    GaussianNB(),
    KNeighborsClassifier(),
    MLPClassifier(),
    LinearSVC(),
    SVC(gamma='scale')
]
names = [
    'Random Forest',
    'Logistic Regression',
    'AdaBoost',
    'Bagging',
    'Gradient Boosting',
    'Bernoulli NB',
    'Gaussian NB',
    'K-Nearest Neighbors',
    'Neural Network',
    'Linear SVC',
    'Support Vector Machine'
]

def train_base_models(X_train, y_train, X_test, y_test):
    """Used to train the above given list of models
    
    parameters
    ----------
    X_train : numpy array
        The training set values
    y_train : numpy array
        The training set labels
    X_test : numpy array
        The testing set values
    y_test : numpy array
        The testing set labels
    returns
    -------
    df : Pandas DataFrame
        A DataFrame showing model accuracy
    """
    
    model_accuracy = []
    model_names = []
    
    for i, classifier in enumerate(models):
        try:
            classifier.fit(X_train, y_train)
            predictions = classifier.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            model_accuracy.append(round(accuracy, 3))
            model_names.append(names[i])
            print(f'{names[i]}: {round(accuracy, 3)}')
        except Exception as e:
            print(f'Could not train {names[i]} because of {e}')
            
    df = pd.DataFrame({'Model':model_names, 'Accuracy':model_accuracy})
    df = df.sort_values(by=['Accuracy'], ascending=False)
    
    return df

In [12]:
#Model perfomance on features generated using KAZE
df = train_base_models(X_train, y_train, X_test, y_test)
df

Random Forest: 0.718
Logistic Regression: 0.692
AdaBoost: 0.615
Bagging: 0.744
Gradient Boosting: 0.564
Bernoulli NB: 0.692
Gaussian NB: 0.615
K-Nearest Neighbors: 0.718
Neural Network: 0.718
Linear SVC: 0.641
Support Vector Machine: 0.667


Unnamed: 0,Model,Accuracy
3,Bagging,0.744
0,Random Forest,0.718
7,K-Nearest Neighbors,0.718
8,Neural Network,0.718
1,Logistic Regression,0.692
5,Bernoulli NB,0.692
10,Support Vector Machine,0.667
9,Linear SVC,0.641
2,AdaBoost,0.615
6,Gaussian NB,0.615


In [13]:
#These features are gnerated using ORB
X_train2, X_test2, y_train2, y_test2 = extract_features(algorithm=1)

'NoneType' object has no attribute 'flatten'
'NoneType' object has no attribute 'flatten'
'NoneType' object has no attribute 'flatten'


In [14]:
X_train2.shape, y_train2.shape

((87, 4096), (87,))

In [15]:
X_test2.shape, y_test2.shape

((38, 4096), (38,))

In [16]:
#Model perfomance on features generated using ORB
df2 = train_base_models(X_train2, y_train2, X_test2, y_test2)
df2

Random Forest: 0.447
Logistic Regression: 0.5
AdaBoost: 0.289
Bagging: 0.342
Gradient Boosting: 0.342
Bernoulli NB: 0.474
Gaussian NB: 0.5
K-Nearest Neighbors: 0.447
Neural Network: 0.579
Linear SVC: 0.605
Support Vector Machine: 0.395


Unnamed: 0,Model,Accuracy
9,Linear SVC,0.605
8,Neural Network,0.579
1,Logistic Regression,0.5
6,Gaussian NB,0.5
5,Bernoulli NB,0.474
0,Random Forest,0.447
7,K-Nearest Neighbors,0.447
10,Support Vector Machine,0.395
3,Bagging,0.342
4,Gradient Boosting,0.342


In [None]:
#The base models perform so badly with ORB.

In [17]:
#These fatures are generated using HOG
X_train3, X_test3, y_train3, y_test3 = extract_features(algorithm=2) 

In [18]:
X_train3.shape, y_train3.shape

((89, 4096), (89,))

In [19]:
X_test3.shape, y_test3.shape

((39, 4096), (39,))

In [20]:
#Model perfomance on features generated using HOG
df3 = train_base_models(X_train3, y_train3, X_test3, y_test3)
df3

Random Forest: 0.667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: 0.667
AdaBoost: 0.359
Bagging: 0.538
Gradient Boosting: 0.538
Bernoulli NB: 0.487
Gaussian NB: 0.538
K-Nearest Neighbors: 0.487
Neural Network: 0.59
Linear SVC: 0.692
Support Vector Machine: 0.59


Unnamed: 0,Model,Accuracy
9,Linear SVC,0.692
0,Random Forest,0.667
1,Logistic Regression,0.667
8,Neural Network,0.59
10,Support Vector Machine,0.59
3,Bagging,0.538
4,Gradient Boosting,0.538
6,Gaussian NB,0.538
5,Bernoulli NB,0.487
7,K-Nearest Neighbors,0.487


In [22]:
#These fatures are generated using SIFT
X_train4, X_test4, y_train4, y_test4 = extract_features(algorithm=3) 

In [23]:
X_train4.shape, y_train4.shape

((89, 4096), (89,))

In [24]:
X_test4.shape, y_test4.shape

((39, 4096), (39,))

In [25]:
#Model perfomance on features generated using SIFT
df4 = train_base_models(X_train4, y_train4, X_test4, y_test4)
df4

Random Forest: 0.538
Logistic Regression: 0.59
AdaBoost: 0.436
Bagging: 0.513
Gradient Boosting: 0.436
Bernoulli NB: 0.641
Gaussian NB: 0.436
K-Nearest Neighbors: 0.231
Neural Network: 0.436
Linear SVC: 0.641
Support Vector Machine: 0.359


Unnamed: 0,Model,Accuracy
5,Bernoulli NB,0.641
9,Linear SVC,0.641
1,Logistic Regression,0.59
0,Random Forest,0.538
3,Bagging,0.513
2,AdaBoost,0.436
4,Gradient Boosting,0.436
6,Gaussian NB,0.436
8,Neural Network,0.436
10,Support Vector Machine,0.359


In [31]:
#These fatures are generated using BRIEF
X_train5, X_test5, y_train5, y_test5 = extract_features(algorithm=4) 

In [32]:
X_train5.shape, y_train5.shape

((77, 4096), (77,))

In [33]:
X_test5.shape, y_test5.shape

((34, 4096), (34,))

In [34]:
#Model perfomance on features generated using BRIEF
df5 = train_base_models(X_train5, y_train5, X_test5, y_test5)
df5

Random Forest: 0.412


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression: 0.441
AdaBoost: 0.324
Bagging: 0.412
Gradient Boosting: 0.471
Bernoulli NB: 0.265
Gaussian NB: 0.529
K-Nearest Neighbors: 0.324
Neural Network: 0.471
Linear SVC: 0.324
Support Vector Machine: 0.382


Unnamed: 0,Model,Accuracy
6,Gaussian NB,0.529
4,Gradient Boosting,0.471
8,Neural Network,0.471
1,Logistic Regression,0.441
0,Random Forest,0.412
3,Bagging,0.412
10,Support Vector Machine,0.382
2,AdaBoost,0.324
7,K-Nearest Neighbors,0.324
9,Linear SVC,0.324
