In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import glob
import cv2

from keras.models import Model, Sequential
from keras.layers import Dense, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    BatchNormalization, SeparableConv2D, MaxPooling2D, Activation, Flatten, Dropout, Dense
)
from tensorflow.keras import backend as K
import os
import seaborn as sns
from keras.applications.vgg16 import VGG16

# Importing data

In [None]:
# Folder whose immediate sub-directories are used as the class folders below.
data_dir = "../input/leafsnap-vgg16-2/field"

# Only the first os.walk() entry (the top level of data_dir) is needed; the
# default keeps the unpacking valid when the directory cannot be walked.
first_level = next(os.walk(data_dir), ([], [], []))
root, dirs, files = first_level
dirs = sorted(dirs)
print(len(dirs))

# Resizing the image

In [None]:
def resize(fl, img_height, img_width):
    """Load the image at path ``fl`` and resize it to ``img_height`` x ``img_width``.

    Parameters
    ----------
    fl : str
        Path of the image file, read with ``cv2.imread``.
    img_height, img_width : int
        Target number of rows and columns of the returned image.

    Returns
    -------
    The array produced by ``cv2.resize``.

    Raises
    ------
    ValueError
        If ``cv2.imread`` returns ``None`` for ``fl`` (file missing or unreadable).
    """
    img = cv2.imread(fl)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising;
        # fail here with the offending path instead of inside cv2.resize.
        raise ValueError(f"Could not read image: {fl}")
    # cv2.resize expects dsize as (width, height), not (height, width).
    return cv2.resize(img, (img_width, img_height))

# Making Data

In [None]:
def get_data(max_classes=30, img_height=120, img_width=120):
    """Load the resized images of the first ``max_classes`` class folders.

    Iterates over ``dirs[:max_classes]`` (sub-directories of ``data_dir``) and
    reads every ``*.jpg`` file in each folder through ``resize``.

    Parameters
    ----------
    max_classes : int, optional
        Number of leading entries of ``dirs`` to use (default 30).
    img_height, img_width : int, optional
        Size passed to ``resize`` for every image (default 120 x 120).

    Returns
    -------
    X : list
        Resized images, one entry per file.
    y : list of str
        Class name of each image, aligned with ``X``.
    classes : list of str
        Class names, i.e. the folder names with underscores removed.
    dir_names : list of str
        The original folder names, aligned with ``classes``.
    """
    X, y, classes, dir_names = [], [], [], []

    for dir_name in dirs[:max_classes]:
        class_name = dir_name.replace('_', '')
        classes.append(class_name)
        dir_names.append(dir_name)

        pattern = os.path.join(data_dir, dir_name, '*.jpg')
        # glob returns matches in arbitrary order; sorting makes the sample
        # order (and therefore any seeded split of it) reproducible.
        for fl in sorted(glob.glob(pattern)):
            X.append(resize(fl, img_height, img_width))
            y.append(class_name)

    return X, y, classes, dir_names

In [None]:
# Load the images and their class names; get_data() returns plain Python lists.
images, labels, classes,dir_names = get_data()

In [None]:
# Turn the Python lists into NumPy arrays
images, labels = np.array(images), np.array(labels)

# Transforming labels to numerical using label encoder

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score

# Map the string class names to integer codes; `le` is kept so the codes can
# be decoded again later.
le = preprocessing.LabelEncoder()
train_labels_encoded = le.fit(labels).transform(labels)

# Hold out 30% of the samples as the test set (fixed seed for a repeatable split).
x_train, x_test, y_train, y_test = train_test_split(
    images,
    train_labels_encoded,
    test_size=0.3,
    random_state=0,
)


# VGG

In [None]:
SIZE = 120  # side length of the square input images

# ImageNet-pretrained VGG16 built with include_top=False.
VGG_model = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(SIZE, SIZE, 3),
)

# Freeze every loaded layer: the pre-trained weights are used as-is.
for layer in VGG_model.layers:
    layer.trainable = False

VGG_model.summary()  # should now report 0 trainable parameters

In [None]:
# Run the frozen VGG16 on the training images. Despite its name,
# `feature_extractor` holds the model output for x_train, not a model.
# NOTE(review): x_train is passed as loaded by cv2, without
# keras.applications.vgg16.preprocess_input — confirm this is intended. If it
# is changed, the test images must receive exactly the same treatment.
feature_extractor=VGG_model.predict(x_train)


In [None]:
# Flatten each sample's VGG16 output into one feature vector per row.
n_train_samples = feature_extractor.shape[0]
features = feature_extractor.reshape(n_train_samples, -1)

X_for_model = features  # input matrix for the logistic-regression models below

# LogReg with features extracted using VGG

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, zero_one_loss
from sklearn.model_selection import train_test_split, cross_val_score, KFold


def mse(y, ypr):
    """Return the mean squared difference between ``y`` and ``ypr``.

    Kept for backward compatibility. It is no longer used for scoring: the
    targets are integer class codes, so a squared difference between codes
    depends on the arbitrary label ordering and is not a classification loss.
    """
    return np.mean((y - ypr) ** 2)


# 5-fold cross-validation over consecutive folds (shuffle=False). The training
# rows come out of train_test_split, which shuffles by default.
kf = KFold(n_splits=5, shuffle=False)

# CV loss = misclassification rate (fraction of wrongly predicted samples).
# Lower is better. make_scorer leaves the sign unchanged by default, so
# cross_val_score returns the loss itself.
scr = make_scorer(zero_one_loss)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Three candidate classifiers, each wrapped in a single-step Pipeline:
# unpenalised, L2-penalised and L1-penalised logistic regression.

# NOTE(review): newer scikit-learn releases deprecate the string 'none' in
# favour of penalty=None — check against the installed version.
lr_mod1 = Pipeline(steps=[
    (
        'mod1',
        LogisticRegression(penalty='none', max_iter=10000, solver='lbfgs'),
    ),
])

lr_mod2 = Pipeline(steps=[
    (
        'mod2',
        LogisticRegression(penalty="l2", C=1, solver="sag"),
    ),
])

lr_mod3 = Pipeline(steps=[
    (
        'mod3',
        LogisticRegression(penalty='l1', solver='liblinear', C=0.01),
    ),
])

In [None]:
# Mean cross-validated loss of each candidate, using the same folds (`kf`)
# and the same scorer (`scr`) for all three.
cv_options = dict(cv=kf, scoring=scr)

CV1 = cross_val_score(lr_mod1, X_for_model, y_train, **cv_options).mean()
CV3 = cross_val_score(lr_mod3, X_for_model, y_train, **cv_options).mean()
CV2 = cross_val_score(lr_mod2, X_for_model, y_train, **cv_options).mean()


print(f"CV loss (L2 penalised Model): {CV2}")
print(f"CV loss (NonPenalised LR Model): {CV1}")
print(f"CV loss (L1 Penalised Model): {CV3}")

##### The L2 penalty shows the least CV loss. Hence we go with the `L2` penalised model

#### We also need to find the best `C` parameter

In [None]:
# Candidate values of C (the inverse regularisation strength), spaced
# logarithmically between e^-1 and e^1. The name `lam` is kept for
# compatibility even though the values are C, not lambda.
lam = np.exp(np.linspace(-1, 1, 10))
errs = np.zeros(len(lam))

for i, c_value in enumerate(lam):
    lr_mod2.set_params(mod2=LogisticRegression(C=c_value, penalty='l2'))
    cvsc = cross_val_score(lr_mod2, X_for_model, y_train, cv=10, scoring=scr)
    errs[i] = cvsc.mean()
    # The value reported is the mean CV loss from `scr`, not a negative log-likelihood.
    print(f"C-value (1/lam): {c_value} | CV loss: {errs[i]}")

a = np.array([lam, errs])  # row 0: C values, row 1: mean CV losses
# Pick the C with the smallest mean CV loss.
best_C = lam[np.argmin(errs)]
print(f"Optimal C: {best_C}")

# Leave the pipeline configured with the selected C.
lr_mod2.set_params(mod2=LogisticRegression(C=best_C, penalty='l2'))

##### Compute the model's predictions on the test set

In [None]:
# Pass the test images through the same VGG16 model, then flatten each
# sample's output to one row, exactly as was done for the training features.
X_test_feature = VGG_model.predict(x_test)
n_test_samples = X_test_feature.shape[0]
X_test_features = X_test_feature.reshape(n_test_samples, -1)

In [None]:
# Final classifier: L2-penalised logistic regression with the selected C.
# The 'sag' solver is stochastic, so the seed is fixed to make the fitted
# model (and every metric computed from it) repeatable.
final_modt = LogisticRegression(penalty="l2", C=best_C, solver="sag", random_state=0)
final_modt.fit(X_for_model, y_train)

# Predicted (integer-encoded) classes for the held-out test features.
ypred = final_modt.predict(X_test_features)
ypred

##### Compute the accuracy, recall & precision scores

In [None]:
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
# from matplotlib.colors import ListedColormapim
import pandas as pd

In [None]:
# Overall accuracy on the held-out test set
acc = accuracy_score(y_test, ypred)
print("Accuracy :", acc)

# Recall and precision, both macro-averaged
macro_avg = dict(average='macro')
rec = recall_score(y_test, ypred, **macro_avg)
prec = precision_score(y_test, ypred, **macro_avg)
print(f"Recall: {rec}, \nPrecision: {prec}")

#### Finding the ROC performance of the model

In [None]:
def plot_multiclass_roc(clf, xtest, ytest, n_classes, figsize=(17, 6)):
    """Plot one-vs-rest ROC curves for a fitted classifier and save the figure.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict_proba`` and ``classes_``.
    xtest
        Feature matrix passed to ``clf.predict_proba``.
    ytest
        True labels for ``xtest``, in the same encoding as ``clf.classes_``.
    n_classes : int
        Number of leading classes (columns of ``predict_proba``) to plot;
        capped at ``len(clf.classes_)``.
    figsize : tuple, optional
        Figure size passed to ``plt.subplots``.

    The figure is written to ``VGG16_LR_roc.png`` and then shown.
    """
    y_score = clf.predict_proba(xtest)
    y_true = np.asarray(ytest)

    # Column i of predict_proba belongs to clf.classes_[i]. Building the
    # indicator from classes_ keeps truth and scores aligned even when some
    # class never occurs in ytest.
    class_labels = clf.classes_
    n_classes = min(n_classes, len(class_labels))

    fpr, tpr, roc_auc = {}, {}, {}
    for i in range(n_classes):
        is_class_i = (y_true == class_labels[i]).astype(int)
        n_positive = is_class_i.sum()
        if n_positive == 0 or n_positive == is_class_i.size:
            # A ROC curve needs both positives and negatives; skip this class.
            continue
        fpr[i], tpr[i], _ = roc_curve(is_class_i, y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    # One curve per class, plus the chance diagonal
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver operating characteristic example')
    for i in roc_auc:
        ax.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for label %i' % (roc_auc[i], i))
    ax.legend(loc="best")
    ax.grid(alpha=.4)
    sns.despine()
    # Save through the figure handle BEFORE showing: calling plt.savefig after
    # plt.show() can write an empty figure.
    fig.savefig('VGG16_LR_roc.png')
    plt.show()


# Plot every class the fitted model knows about instead of a hard-coded count.
plot_multiclass_roc(final_modt, X_test_features, y_test, n_classes=len(final_modt.classes_), figsize=(10,9))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of true vs. predicted (integer-encoded) classes, drawn as an
# annotated heatmap and saved to disk.
fig, ax1 = plt.subplots(1,1, figsize=(10,7))
conf = confusion_matrix(y_test,ypred)
sns.heatmap(conf, annot=True, cbar=False, fmt="d", linewidths=0.5, cmap="plasma_r", ax=ax1)
ax1.set_title("Confusion Label")
ax1.set_xlabel("Predicted Class")
ax1.set_ylabel("Original Class")
# tight_layout must be called; the bare attribute access did nothing.
fig.tight_layout()
fig.savefig('VGG16_LR_ConfMatrix.png')

In [None]:
# Decode the integer codes back to class names for a readable report.
# The decoded truth goes into a new variable: overwriting y_test would make
# this cell fail on a second run and break re-running the metric cells above.
y_pred = le.inverse_transform(ypred)
y_test_names = le.inverse_transform(y_test)

print(classification_report(y_test_names, y_pred))