In [38]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [8]:
#for array manipulations
import numpy as np
import pandas as pd
#for image processing
import cv2 
#for displaying images
import matplotlib.pyplot as plt
#to display images in this notebook, not in a separate window
%matplotlib inline
#to access system resources such as directories
import os
#This wilallow us to get the training time of each model
import time

In [4]:
os.getcwd()

'C:\\Users\\USER\\Documents\\GitHub\\maize-disease-detection\\notebooks'

In [5]:
#set our base directory. This should point to the location of the plant-diseases folder
base_dir = 'C:\\Users\\USER\\Documents\\GitHub\\maize-disease-detection'
data_folder = os.path.join(base_dir, 'data')
maize_data_folder = os.path.join(data_folder, 'maize')

In [21]:
#This function loads 32 images of a particular disease
def get_32(disease):
    '''
    disease:
        A string that could be common_rust, healthy, leaf_spot, nothern_leaf_blight
    ........
    disease_images:
        A list of images for the selected disease
    '''
    #this list will contain the 20 images returned
    disease_images = []
    #path to the images
    disease_images_path = os.path.join(maize_data_folder, disease)
    for image_path in os.listdir(disease_images_path):
        image_path = os.path.join(disease_images_path, image_path)
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image,cv2.COLOR_BGR2RGB)
        disease_images.append(image)
    return disease_images

#This function will help us plot 10 images
def plot_images(images, title):
    '''
    images: List
        List of images
    title: String
        Title for each image i.e name of disease
    '''
    plt.figure(figsize=(15,6))
    for i in range(10):
        plt.subplot(2,5, i+1)
        plt.imshow(images[i])
        plt.title(title)
        plt.xticks([])
        plt.yticks([])
    plt.show()
    
#This function generates ORB features
def extract_features_orb(image, vector_size=32):
    try:
        feature_generator = cv2.ORB_create()
        orb_keypoints = feature_generator.detect(image)
        orb_keypoints = orb_keypoints[:32]
        orb_keypoints, orb_descriptors = feature_generator.compute(image, orb_keypoints)
        orb_descriptors = orb_descriptors.flatten()
        #The descriptor vector size is 128
        needed_size = (vector_size*128)
        if orb_descriptors.size < needed_size:
            #If we have less than 32 keypoints, add zeros to the end of our vector
            orb_descriptors = np.concatenate([orb_descriptors, np.zeros(needed_size - orb_descriptors.size)])
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    return orb_descriptors

#This function generates KAZE features
def extract_features_kaze(image, vector_size=32):
    try:
        feature_generator = cv2.KAZE_create()
        kaze_keypoints = feature_generator.detect(image)
        kaze_keypoints = kaze_keypoints[:32]
        kaze_keypoints, kaze_descriptors = feature_generator.compute(image, kaze_keypoints)
        kaze_descriptors = kaze_descriptors.flatten()
        #The descriptor vector size is 128
        needed_size = (vector_size*128)
        if kaze_descriptors.size < needed_size:
            #If we have less than 32 keypoints, add zeros to the end of our vector
            kaze_descriptors = np.concatenate([kaze_descriptors, np.zeros(needed_size - kaze_descriptors.size)])
    except cv2.error as e:
        print(f'Error: {e}')
        return None
    return kaze_descriptors

def extract_features_hog(image, feature_size=4096):
    hog = cv2.HOGDescriptor()
    features = hog.compute(image)
    required_features = features[:feature_size].ravel()
    return required_features

#Let us extraxt KAZE features
def extract_features(algorithm=0):
    '''
    Algorithm:
        1 for ORB
        0 for KAZE
        2 for HOG
    '''
    #Now let us perform these steps for all the 32 images loaded
    #This will contain all our images
    all_images = []
    #This will contain all our labels
    all_labels = []
    labels = ['common_rust', 'healthy', 'leaf_spot', 'nothern_leaf_blight']
    for i, image_folder in enumerate([common_rust_images, healthy_images, leaf_spot_images, nothern_leaf_blight_images]):
        for image in image_folder:
            all_images.append(image)
            all_labels.append(labels[i])
    features, labels = [], []
    for i, image in enumerate(all_images):
        image_features = []
        try:
            if algorithm == 1:
                image_features = extract_features_orb(image)
            elif algorithm == 0:
                image_features = extract_features_kaze(image)
            else:
                image_features = extract_features_hog(image)
            image_label = all_labels[i]
            features.append(image_features)
            labels.append(image_label)
        except AttributeError as e:
            print(e)
    features = np.array(features)
    labels = np.array(labels)
    features = StandardScaler().fit_transform(features)
    labels = LabelEncoder().fit_transform(labels)
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)
    
    return X_train, X_test, y_train, y_test

#Let us extraxt KAZE features
def extract_features2(algorithm=0):
    '''
    Algorithm:
        1 for ORB
        0 for KAZE
        2 for HOG
    '''
    #Now let us perform these steps for all the 32 images loaded
    #This will contain all our images
    all_images = []
    #This will contain all our labels
    all_labels = []
    labels = ['common_rust', 'healthy', 'leaf_spot', 'nothern_leaf_blight']
    for i, image_folder in enumerate([common_rust_images, healthy_images, leaf_spot_images, nothern_leaf_blight_images]):
        for image in image_folder:
            all_images.append(image)
            all_labels.append(labels[i])
    features, labels = [], []
    for i, image in enumerate(all_images):
        image_features = []
        try:
            if algorithm == 1:
                image_features = extract_features_orb(image)
            elif algorithm == 0:
                image_features = extract_features_kaze(image)
            else:
                image_features = extract_features_hog(image)
            image_label = all_labels[i]
            features.append(image_features)
            labels.append(image_label)
        except AttributeError as e:
            print(e)
    features = np.array(features)
    labels = np.array(labels)
    features = StandardScaler().fit_transform(features)
    labels = LabelEncoder().fit_transform(labels)
    
    return features, labels

In [31]:
'''
We have decided to choose these 6 models. They are easy to understand and work with.
We will pick the five best for each data set, i,e five for HOG and five for KAZE.
We dropped ORB since its perfomance was not very good. 
'''
models = [
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(solver='lbfgs', multi_class='auto'),
    KNeighborsClassifier(),
    LinearSVC(),
    SVC(gamma='scale'),
    DecisionTreeClassifier()
]
names = [
    'Random Forest Classifier',
    'Logistic Regression',
    'K-Nearest Neighbors',
    'Linear SVC',
    'Support Vector Classifier',
    'Decision Tree'
]

#This method gives us a rough idea about the accuracy of the base models and their raining times
def train_base_models(X_train, y_train, X_test, y_test):
    '''
    '''
    model_accuracy = []
    train_time = []
    model_names = []
    for i, classifier in enumerate(models):
        try:
            #Let us train the model and get the training time
            start_time = time.time()
            classifier.fit(X_train, y_train)
            stop_time = time.time()
            train_time.append(stop_time - start_time)
            predictions = classifier.predict(X_test)
            accuracy = accuracy_score(y_test, predictions)
            model_accuracy.append(round(accuracy, 3))
            model_names.append(names[i])
            print(f'{names[i]}: {round(accuracy, 3)}')
        except Exception as e:
            print(f'Could not train {names[i]} because of {e}')
    df = pd.DataFrame({'Model':model_names, 'Accuracy':model_accuracy, 'Train Time':train_time})
    df = df.sort_values(by=['Accuracy'], ascending=False)
    return df

#This method gives us a rough idea about the accuracy of the base models and their raining times
def train_base_models2(features, labels):
    '''
    '''
    #We use cross-validation
    model_accuracy = []
    train_time = []
    model_names = []
    
    for i, classifier in enumerate(models):
        try:
            #Let us train the model and get the training time
            start_time = time.time()
            scores = cross_val_score(classifier, features, labels, scoring='accuracy', cv=10)
            stop_time = time.time()
            train_time.append(stop_time - start_time)
            model_accuracy.append(round(np.mean(scores), 3))
            model_names.append(names[i])
            print(f'{names[i]}: {round(np.mean(scores), 3)}')
        except Exception as e:
            print(f'Could not train {names[i]} because of {e}')
    df = pd.DataFrame({'Model':model_names, 'Accuracy':model_accuracy, 'Train Time':train_time})
    df = df.sort_values(by=['Accuracy'], ascending=False)
    return df

In [11]:
common_rust_images = get_32('common_rust')
healthy_images = get_32('healthy')
leaf_spot_images = get_32('leaf_spot')
nothern_leaf_blight_images = get_32('nothern_leaf_blight')

In [12]:
#These fatures are generated using KAZE
kaze_X_train, kaze_X_test, kaze_y_train, kaze_y_test = extract_features() 
#These fatures are generated using HOG
hog_X_train, hog_X_test, hog_y_train, hog_y_test = extract_features(algorithm=2) 

In [16]:
kaze = train_base_models(kaze_X_train, kaze_y_train, kaze_X_test, kaze_y_test)
kaze

Random Forest Classifier: 0.667
Logistic Regression: 0.564
K-Nearest Neighbors: 0.59
Linear SVC: 0.564
Support Vector Classifier: 0.513
Decision Tree: 0.564


Unnamed: 0,Model,Accuracy,Train Time
0,Random Forest Classifier,0.667,0.219866
2,K-Nearest Neighbors,0.59,0.021986
1,Logistic Regression,0.564,0.388759
3,Linear SVC,0.564,0.087945
5,Decision Tree,0.564,0.049968
4,Support Vector Classifier,0.513,0.052968


In [None]:
'''
The five best models are
1. Random Forest
2. K-Nearest Neighbors
3. Logistic Regression
4. Linear SVC
5. Decision Tree
'''

In [15]:
hog = train_base_models(hog_X_train, hog_y_train, hog_X_test, hog_y_test)
hog

Random Forest Classifier: 0.513


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression: 0.59
K-Nearest Neighbors: 0.615
Linear SVC: 0.615
Support Vector Classifier: 0.641
Decision Tree: 0.615


Unnamed: 0,Model,Accuracy,Train Time
4,Support Vector Classifier,0.641,0.055964
2,K-Nearest Neighbors,0.615,0.022987
3,Linear SVC,0.615,0.861413
5,Decision Tree,0.615,0.102062
1,Logistic Regression,0.59,0.421176
0,Random Forest Classifier,0.513,0.277828


In [None]:
'''
The five best models are:
1. Support Vector classifier
2. K-Nearest Neighbors
3. Linear SVC
4. Decision Tree
5. Logistic Regression
'''

In [22]:
X, y = extract_features2()

In [23]:
X.shape, y.shape

((128, 4096), (128,))

In [32]:
kaze2 = train_base_models2(X, y)
kaze2

Random Forest Classifier: 0.767
Logistic Regression: 0.681
K-Nearest Neighbors: 0.658
Linear SVC: 0.651
Support Vector Classifier: 0.649
Decision Tree: 0.586


Unnamed: 0,Model,Accuracy,Train Time
0,Random Forest Classifier,0.767,2.582177
1,Logistic Regression,0.681,3.15093
2,K-Nearest Neighbors,0.658,0.447729
3,Linear SVC,0.651,1.118498
4,Support Vector Classifier,0.649,0.943958
5,Decision Tree,0.586,0.735572


In [None]:
'''
Best 3 models using cross-validation:
1. Random Forest Classifier
2. Logistic Regression
3. K Nearest Neighbors
4. Linear svc
5. Support Vector clasifier
'''

In [33]:
hog2 = train_base_models2(X,y)
hog2

Random Forest Classifier: 0.721
Logistic Regression: 0.681
K-Nearest Neighbors: 0.658
Linear SVC: 0.651
Support Vector Classifier: 0.649
Decision Tree: 0.603


Unnamed: 0,Model,Accuracy,Train Time
0,Random Forest Classifier,0.721,2.626375
1,Logistic Regression,0.681,3.183029
2,K-Nearest Neighbors,0.658,0.436732
3,Linear SVC,0.651,1.170277
4,Support Vector Classifier,0.649,0.974397
5,Decision Tree,0.603,0.732579


In [None]:
'''
Top 5 best models using cross-validation:
1. Random Forest Classifier
2. Logistic Regression
3. K-Nearest Neighbor
4.linear svc
5 Support Vector Classifier
'''

In [None]:
'''
We will work with the 6 models generated through cross validation
'''

In [36]:
'''
We begin with the top models generated by cross vaidation for kaze.
We finnd the optimal hyperparameters for each model.
We then pick the top three. These will be analyzed in depth, including train time and errors made.
'''
random_forest = RandomForestClassifier()
logistic_regression = LogisticRegression()
k_nearest = KNeighborsClassifier()
linear_svc = LinearSVC()
svc = SVC()

kfold = KFold(n_splits=10, random_state=10, shuffle=True)

In [39]:
#Random forest possible parameters
random_forest

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [40]:
rf_param_grid = [
    {
        'n_estimators':[10,30,60,100,200],
        'criterion':['gini', 'entropy'],
        'max_features':[100, 500, 1000, 2000, 4000]
    }
    ,
    {
        'bootstrap':[False],
        'n_estimators':[10, 50, 300],
        'criterion':['gini', 'entropy'],
        'max_features':[100, 100, 4000]
    }
]
rf_grid_search = GridSearchCV(
    random_forest,
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
rf_grid_search.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,...
                                              warm_start=False),
             iid='deprecated', n_jobs=None,
  

In [42]:
rf_grid_search.best_params_

{'criterion': 'entropy', 'max_features': 100, 'n_estimators': 60}

In [43]:
rf_grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features=100,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=60,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [44]:
best_rf = grid_search.best_estimator_
scores = cross_val_score(best_rf, X,y, scoring='accuracy', cv=10)
scores

array([0.84615385, 0.84615385, 0.84615385, 0.69230769, 0.84615385,
       0.61538462, 0.76923077, 0.53846154, 0.66666667, 0.83333333])

In [45]:
scores.mean()

0.75

In [81]:
lr_param_grid = [
    {
        'solver':['liblinear'],
        'penalty':['l1', 'l2'],
        'tol':[1e-2, 1e-6],
        'C':[0.01, 0.1, 0.7],
        'max_iter':[500,1000,10000]
    },
    {
        'solver':['lbfgs'],
        'penalty':['l2'],
        'tol':[1e-1, 1e-7],
        'C':[0.001, 0.5, 0.9],
        'max_iter':[200, 2000, 5000] 
    }
]
lr_grid_search = GridSearchCV(
    logistic_regression,
    lr_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
lr_grid_search.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.01, 0.1, 0.7], 'max_iter': [500, 1000, 10000],
                          'penalty': ['l1', 'l2'], 'solver': ['liblinear'],
                          'tol': [0.01, 1e-06]},
                         {'C': [0.001, 0.5, 0.9], 'max_iter': [200, 2000, 5000],
                          'penalty': ['l2'], 'solver': ['lbfgs'],

In [82]:
lr_grid_search.best_params_

{'C': 0.7,
 'max_iter': 500,
 'penalty': 'l1',
 'solver': 'liblinear',
 'tol': 0.01}

In [83]:
lr_grid_search.best_estimator_

LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.01, verbose=0,
                   warm_start=False)

In [84]:
best_lr = LogisticRegression(C=0.7, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.01, verbose=0,
                   warm_start=False)
scores = cross_val_score(best_lr, X,y, scoring='accuracy', cv=10)
scores

array([1.        , 0.69230769, 0.69230769, 0.53846154, 0.61538462,
       0.76923077, 0.69230769, 0.61538462, 0.83333333, 0.66666667])

In [85]:
scores.mean()

0.7115384615384616

In [58]:
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [59]:
knn_param_grid = [
    {
        'n_neighbors':[2,5,10],
        'weights':['uniform', 'distance'],
        'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p':[1,2]
    }
]
knn_grid_search = GridSearchCV(
    knn,
    knn_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
knn_grid_search.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid=[{'algorithm': ['auto', 'ball_tree', 'kd_tree',
                                        'brute'],
                          'n_neighbors': [2, 5, 10], 'p': [1, 2],
                          'weights': ['uniform', 'distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [60]:
knn_grid_search.best_params_

{'algorithm': 'auto', 'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}

In [61]:
knn_grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=1,
                     weights='uniform')

In [62]:
best_knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=1,
                     weights='uniform')
scores = cross_val_score(best_knn, X,y, scoring='accuracy', cv=10)
scores

array([0.76923077, 0.76923077, 0.69230769, 0.61538462, 0.53846154,
       0.53846154, 0.92307692, 0.61538462, 0.75      , 0.83333333])

In [63]:
scores.mean()

0.7044871794871794

In [64]:
linear_svc

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [75]:
linear_svc_param_grid = [
    {
        'penalty':['l2'],
        'loss':['hinge', 'squared_hinge'],
        'tol':[1e-2, 1e-4, 1e-6],
        'C':[0.0001, 0.01, 0.1,0.5, 0.9],
        'max_iter':[500,1000, 5000, 10000]
    }
]
linear_svc_grid_search = GridSearchCV(
    linear_svc,
    linear_svc_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
linear_svc_grid_search.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=1000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=None, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.0001, 0.01, 0.1, 0.5, 0.9],
                          'loss': ['hinge', 'squared_hinge'],
                          'max_iter': [500, 1000, 5000, 10000],
                          'penalty': ['l2'], 'tol': [0.01, 0.0001, 1e-06]}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='accuracy', verbose=0)

In [76]:
linear_svc_grid_search.best_params_

{'C': 0.0001, 'loss': 'hinge', 'max_iter': 500, 'penalty': 'l2', 'tol': 0.01}

In [77]:
linear_svc_grid_search.best_estimator_

LinearSVC(C=0.0001, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=500, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.01, verbose=0)

In [104]:
best_linear_svc = LinearSVC(C=0.0001, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='hinge', max_iter=500, multi_class='ovr',
          penalty='l2', random_state=None, tol=0.01, verbose=0)
scores = cross_val_score(best_linear_svc, X,y, scoring='accuracy', cv=10)
scores

array([0.61538462, 0.76923077, 0.53846154, 0.61538462, 0.61538462,
       0.61538462, 0.61538462, 0.53846154, 0.83333333, 0.66666667])

In [105]:
scores.mean()

0.6423076923076924

In [86]:
svc

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [88]:
svc_param_grid = [
    {
        'kernel':['linear'],
        'C':[0.0001, 0.1, 0.5, 0.9],
        'tol':[1e-1, 1e-3, 1e-5]
    },
    {
        'kernel':['poly'],
        'degree':[1,3,6,9],
        'gamma':['scale', 'auto'],
        'tol':[1e-1, 1e-4, 1e-6]
    },
    {
        'kernel':['rbf'],
        'gamma':['scale', 'auto'],
        'tol':[1e-1, 1e-7, 1e-5]
    },
    {
        'kernel':['sigmoid'],
        'gamma':['scale', 'auto'],
        'tol':[1e-1, 1e-3, 1e-9]
    }
]
svc_grid_search = GridSearchCV(
    svc,
    svc_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
svc_grid_search.fit(X,y)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.0001, 0.1, 0.5, 0.9], 'kernel': ['linear'],
                          'tol': [0.1, 0.001, 1e-05]},
                         {'degree': [1, 3, 6, 9], 'gamma': ['scale', 'auto'],
                          'kernel': ['poly'], 'tol': [0.1, 0.0001, 1e-06]},
                         {'gamma': ['scale', 'auto'], 'kernel': ['rbf'],
                          'tol': [0.1, 1e-07, 1e-05]},
                         {'gamma': ['scale', 'auto'], 'kernel': ['sigmoid'],
                          'tol'

In [89]:
svc_grid_search.best_params_

{'gamma': 'auto', 'kernel': 'rbf', 'tol': 0.1}

In [90]:
svc_grid_search.best_estimator_

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.1,
    verbose=False)

In [91]:
svc_best = SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.1,
    verbose=False)
scores = cross_val_score(svc_best, X,y, scoring='accuracy', cv=10)
scores

array([0.69230769, 0.61538462, 0.69230769, 0.61538462, 0.46153846,
       0.61538462, 0.76923077, 0.69230769, 0.83333333, 0.66666667])

In [92]:
scores.mean()

0.6653846153846155

In [94]:
X2, y2 = extract_features2(algorithm=2)
X2.shape, y2.shape

((128, 4096), (128,))

In [None]:
#HOG

In [None]:
rf_param_grid = [
    {
        'n_estimators':[10,30,60,100,200],
        'criterion':['gini', 'entropy'],
        'max_features':[100, 500, 1000, 2000, 4000]
    }
    ,
    {
        'bootstrap':[False],
        'n_estimators':[10, 50, 300],
        'criterion':['gini', 'entropy'],
        'max_features':[100, 100, 4000]
    }
]
rf_grid_search = GridSearchCV(
    random_forest,
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
rf_grid_search.fit(X2,y2)

In [None]:
rf_grid_search.best_params_

In [None]:
rf_grid_search.best_estimator_

In [97]:
lr_param_grid = [
    {
        'solver':['liblinear'],
        'penalty':['l1', 'l2'],
        'tol':[1e-2, 1e-6],
        'C':[0.01, 0.1, 0.7],
        'max_iter':[500,1000,10000]
    },
    {
        'solver':['lbfgs'],
        'penalty':['l2'],
        'tol':[1e-1, 1e-7],
        'C':[0.001, 0.5, 0.9],
        'max_iter':[200, 2000, 5000] 
    }
]
lr_grid_search = GridSearchCV(
    logistic_regression,
    lr_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
lr_grid_search.fit(X2,y2)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid=[{'C': [0.01, 0.1, 0.7], 'max_iter': [500, 1000, 10000],
                          'penalty': ['l1', 'l2'], 'solver': ['liblinear'],
                          'tol': [0.01, 1e-06]},
                         {'C': [0.001, 0.5, 0.9], 'max_iter': [200, 2000, 5000],
                          'penalty': ['l2'], 'solver': ['lbfgs'],

In [98]:
lr_grid_search.best_params_

{'C': 0.5, 'max_iter': 200, 'penalty': 'l2', 'solver': 'lbfgs', 'tol': 0.1}

In [99]:
lr_grid_search.best_estimator_

LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.1, verbose=0,
                   warm_start=False)

In [106]:
best_lr = LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.1, verbose=0,
                   warm_start=False)
scores = cross_val_score(best_lr, X2,y2, scoring='accuracy', cv=10)
scores

array([0.92307692, 0.61538462, 0.92307692, 0.84615385, 0.84615385,
       0.61538462, 0.61538462, 0.69230769, 0.66666667, 0.75      ])

In [107]:
scores.mean()

0.7493589743589744

In [None]:
knn_param_grid = [
    {
        'n_neighbors':[2,5,10],
        'weights':['uniform', 'distance'],
        'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p':[1,2]
    }
]
knn_grid_search = GridSearchCV(
    knn,
    knn_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
knn_grid_search.fit(X2,y2)

In [None]:
linear_svc_param_grid = [
    {
        'penalty':['l2'],
        'loss':['hinge', 'squared_hinge'],
        'tol':[1e-2, 1e-4, 1e-6],
        'C':[0.0001, 0.01, 0.1,0.5, 0.9],
        'max_iter':[500,1000, 5000, 10000]
    }
]
linear_svc_grid_search = GridSearchCV(
    linear_svc,
    linear_svc_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
linear_svc_grid_search.fit(X2,y2)

In [None]:
svc_param_grid = [
    {
        'kernel':['linear'],
        'C':[0.0001, 0.1, 0.5, 0.9],
        'tol':[1e-1, 1e-3, 1e-5]
    },
    {
        'kernel':['poly'],
        'degree':[1,3,6,9],
        'gamma':['scale', 'auto'],
        'tol':[1e-1, 1e-4, 1e-6]
    },
    {
        'kernel':['rbf'],
        'gamma':['scale', 'auto'],
        'tol':[1e-1, 1e-7, 1e-5]
    },
    {
        'kernel':['sigmoid'],
        'gamma':['scale', 'auto'],
        'tol':[1e-1, 1e-3, 1e-9]
    }
]
svc_grid_search = GridSearchCV(
    svc,
    svc_param_grid,
    cv=5,
    scoring='accuracy',
    return_train_score=True
)
svc_grid_search.fit(X2,y2)