In [7]:
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2

from keras.models import Model, Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    BatchNormalization, SeparableConv2D, MaxPooling2D, Activation, Flatten, Dropout, Dense
)
from tensorflow.keras import backend as K
import os
import seaborn as sns
from keras.applications.vgg16 import VGG16

# Importing Data

In [8]:
# Root folder of the dataset; each immediate sub-folder is treated as one class.
data_dir = "../input/leafdataset/field"

# Only the first os.walk() entry (the top level) is needed. The empty-triple default
# avoids a StopIteration when the walk yields nothing.
root, dirs, files = next(iter(os.walk(data_dir)), ([], [], []))
dirs = sorted(dirs)  # alphabetical, so the class order is stable
print(len(dirs))

# Resizing the images

In [9]:
def resize(fl, img_height, img_width):
    """Load an image from disk and resize it to ``img_height`` x ``img_width`` pixels.

    Args:
        fl: Path of the image file, handed to ``cv2.imread``.
        img_height: Target height in pixels (number of rows).
        img_width: Target width in pixels (number of columns).

    Returns:
        The resized image as returned by ``cv2.resize``.

    Raises:
        ValueError: If ``cv2.imread`` returned ``None`` for ``fl`` (it does so
            instead of raising when a file cannot be read).
    """
    img = cv2.imread(fl)
    if img is None:
        raise ValueError("Could not read image file: %r" % (fl,))
    # cv2.resize expects dsize as (width, height), not (height, width).
    return cv2.resize(img, (img_width, img_height))

# Making Data

In [10]:
def get_data(base_dir=None, class_dirs=None, max_classes=30, img_size=120):
    """Load the ``*.jpg`` images of each class folder and label them by folder.

    Args:
        base_dir: Folder that contains the class sub-folders. Defaults to the
            notebook-level ``data_dir``.
        class_dirs: Names of the class sub-folders to read. Defaults to the
            notebook-level ``dirs``.
        max_classes: Only the first ``max_classes`` entries of ``class_dirs``
            are read (``None`` reads all of them).
        img_size: Edge length passed to ``resize`` as both height and width.

    Returns:
        A tuple ``(X, y, classes, dir_names)``:
        ``X`` is the list of resized images, ``y`` the class name of each
        image (same order as ``X``), ``classes`` the class names (folder names
        with underscores removed) and ``dir_names`` the raw folder names.
    """
    if base_dir is None:
        base_dir = data_dir
    if class_dirs is None:
        class_dirs = dirs

    X = []
    y = []
    classes = []
    dir_names = []

    for dir_name in class_dirs[:max_classes]:
        class_name = dir_name.replace('_', '')
        classes.append(class_name)
        dir_names.append(dir_name)

        pattern = os.path.join(base_dir, dir_name, '*.jpg')
        # glob returns files in arbitrary order; sorting keeps the sample order
        # (and therefore any seeded train/test split) reproducible.
        for fl in sorted(glob.glob(pattern)):
            X.append(resize(fl, img_size, img_size))
            y.append(class_name)

    return X, y, classes, dir_names

In [11]:
# Load every image with its class name, plus the lists of class names and folder names.
images, labels, classes,dir_names = get_data()

In [12]:
# Turn the Python lists returned by get_data() into NumPy arrays.
images, labels = np.array(images), np.array(labels)

# Train Test Split

In [13]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score

# Map the string class names to integer ids; `le` is kept so ids can be translated back later.
le = preprocessing.LabelEncoder()
train_labels_encoded = le.fit_transform(labels)  # despite the name, this covers ALL samples

# Hold out 30% of the samples for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    images,
    train_labels_encoded,
    test_size=0.3,
    random_state=0,
)
###################################################################

# Downloading VGG 16

In [14]:
SIZE = 120  # edge length of the square network input, in pixels

# VGG16 with ImageNet weights and without its top (classifier) layers.
VGG_model = VGG16(include_top=False, weights='imagenet', input_shape=(SIZE, SIZE, 3))

# Freeze every loaded layer: the network is only used with its pre-trained weights.
for layer in VGG_model.layers:
    layer.trainable = False

VGG_model.summary()  # trainable parameters will be 0

# Extracting Training Features

In [15]:
# Run the training images through the frozen VGG16 model. Despite its name,
# `feature_extractor` holds the predicted features, not a model.
feature_extractor=VGG_model.predict(x_train)

# Flattening the Training Features

In [16]:
# Flatten each training sample's VGG16 output into a single row vector.
features = feature_extractor.reshape(len(feature_extractor), -1)

# The flattened training features are the X input of the Random Forest.
X_for_RF = features

# Hyperparameter Tuning

In [17]:
# Search for the best Random Forest hyperparameters with GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

n_estimators = [200, 900, 800, 300]
max_depth = [2, 3, 6, 7, 10, 12, 14, 15]
# Defined here but NOT part of the grid that is searched below.
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

forest = RandomForestClassifier(random_state=42)

# Only the number of trees and the maximum depth are searched.
hyperF = {"n_estimators": n_estimators, "max_depth": max_depth}

# 10-fold cross-validation for every grid point, with n_jobs=-1.
gridF = GridSearchCV(forest, hyperF, cv=10, verbose=1, n_jobs=-1)
bestF = gridF.fit(X_for_RF, y_train)

In [18]:
# Keep and show the hyperparameter combination selected by the grid search.
best_parameters = gridF.best_params_
print(best_parameters)

# Training the Random Forest with the best hyperparameters

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Build the final forest from the grid-search result instead of re-typing the values by
# hand, so this cell cannot drift from what the search actually selected.
RF_model = RandomForestClassifier(random_state=42, **best_parameters)

# Train on the flattened training features (sklearn takes integer labels, no one-hot encoding).
RF_model.fit(X_for_RF, y_train)

# Send the test images through the same feature extraction and flattening.
X_test_feature = VGG_model.predict(x_test)
X_test_features = X_test_feature.reshape(X_test_feature.shape[0], -1)

In [20]:
#Now predict using the trained RF model. 
prediction_RF = RF_model.predict(X_test_features)
#Inverse le transform to get original label back. 
#prediction_RF = le.inverse_transform(prediction_RF)

#Print overall accuracy
from sklearn import metrics
print ("Accuracy = ", metrics.accuracy_score(y_test, prediction_RF))

# Plotting the confusion Matrix

In [21]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, auc
from matplotlib.colors import ListedColormap
import seaborn as sns

# Confusion matrix of the test labels against the Random Forest predictions.
cm = confusion_matrix(y_test, prediction_RF)

# Draw it as an annotated heatmap with integer cell counts.
fig, ax1 = plt.subplots(1, 1, figsize=(10, 7))
sns.heatmap(cm, ax=ax1, annot=True, fmt="d", cmap="Blues", linewidths=.5, cbar=False)
ax1.set(title="Confusion Matrix", xlabel="Predicted class", ylabel="Actual class")
fig.tight_layout()

# Plotting ROC with the model

In [24]:
import pandas as pd
def plot_multiclass_roc(clf, X_test, y_test, n_classes, figsize=(17, 6),
                        save_path='resnet_knn_roc.png'):
    """Plot one-vs-rest ROC curves for a fitted multi-class classifier.

    Args:
        clf: Fitted classifier exposing ``predict_proba``. If it has a
            ``classes_`` attribute, that is used to match probability columns
            to labels; otherwise the sorted unique values of ``y_test`` are used.
        X_test: Feature matrix passed to ``clf.predict_proba``.
        y_test: True labels of ``X_test``, in the same encoding as the
            classifier's classes.
        n_classes: Maximum number of classes (probability columns) to plot.
        figsize: Figure size forwarded to ``plt.subplots``.
        save_path: File the figure is written to; a falsy value skips saving.
            The default is the file name this function has always used.

    Returns:
        None. The figure is saved (optionally) and shown.
    """
    y_score = clf.predict_proba(X_test)
    y_true = np.asarray(y_test)

    # Column i of y_score must be compared against the i-th class of the classifier.
    # Building the indicators from y_test alone shifts (or drops) columns whenever a
    # class does not occur in y_test.
    class_ids = getattr(clf, "classes_", None)
    if class_ids is None:
        class_ids = np.unique(y_true)
    class_ids = np.asarray(class_ids)
    n_curves = min(n_classes, len(class_ids), y_score.shape[1])

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_curves):
        is_positive = (y_true == class_ids[i])
        # A ROC curve needs both positive and negative samples; skip classes without.
        if is_positive.all() or not is_positive.any():
            continue
        fpr[i], tpr[i], _ = roc_curve(is_positive.astype(int), y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # roc for each class
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    for i in roc_auc:
        ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i))
    ax.legend(loc="best")
    ax.grid(alpha=.4)
    sns.despine()

    # Save through the figure object and BEFORE show(): once the figure has been shown,
    # a pyplot-level savefig can end up writing an empty figure.
    if save_path:
        fig.savefig(save_path)
    plt.show()
# Plot as many curves as the fitted forest has classes (one per predict_proba column)
# instead of a hard-coded 30, which is wrong whenever fewer class folders were loaded.
plot_multiclass_roc(RF_model, X_test_features, y_test, n_classes=len(RF_model.classes_), figsize=(10, 10))

# Inverting the numbers to label names

In [25]:
# Translate the integer class ids back to class names.
# NOTE: y_test and prediction_RF are rebound here, so re-running this cell (or any
# earlier cell that expects integer ids) would then operate on names instead of ids.
y_test, prediction_RF = (
    le.inverse_transform(y_test),
    le.inverse_transform(prediction_RF),
)

# Classification Report

In [26]:
# Print the classification report for the test set, with 3 decimal digits.
print(metrics.classification_report(y_test, prediction_RF, digits=3))