In [30]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import matplotlib.pyplot as plt
import os
import cv2
from sklearn.model_selection import train_test_split
from tqdm import tqdm  # import tqdm for progress bar 
from tensorflow.keras.preprocessing.image import load_img, img_to_array

In [31]:
# Class folders to load. The position of a name in this list is the integer
# class id used for the labels and for decoding predictions later on.
target_labels = [
    'Potato___Early_blight',
    'Potato___healthy',
    'Potato___Late_blight',
    'Strawberry___healthy',
    'Strawberry___Leaf_scorch',
]

In [32]:
# Root folder of the training images, relative to the working directory.
# It is joined with each class name below, so it must contain one
# sub-directory per entry in target_labels.
main_path = 'data_penyakit/dataset/train'

In [33]:
# Load every image of each target class into memory.
#   filename -> image file names (kept so predictions can be traced back)
#   X        -> one array per image, resized to 128x128 by load_img
#   y        -> integer class id = position of the label in target_labels
filename = []
X = []
y = []
for class_index, label in enumerate(target_labels):
    label_dir = os.path.join(main_path, label)
    # os.listdir() returns entries in arbitrary, filesystem-dependent order.
    # Sorting fixes the sample order so that the seeded train/test split
    # further down selects the same images on every machine.
    for img_file in sorted(os.listdir(label_dir)):
        img_path = os.path.join(label_dir, img_file)
        if not os.path.isfile(img_path):
            # Skip stray sub-directories (e.g. notebook checkpoint folders);
            # load_img can only open regular files.
            continue
        img = load_img(img_path, target_size=(128, 128))
        img_array = img_to_array(img)
        filename.append(img_file)
        X.append(img_array)
        y.append(class_index)

In [34]:
from tensorflow.keras.utils import to_categorical

In [35]:
# Turn the Python lists into arrays the model can consume: X becomes a single
# stacked array, y becomes a one-hot matrix with one column per target label.
# NOTE: X and y are rebound in place, so re-running this cell without
# re-running the loading cell feeds the already-encoded y back into
# to_categorical.
X = np.array(X)
y = to_categorical(
    y,
    num_classes=len(target_labels),
)

In [36]:
# Sanity check: one entry per loaded image; images were resized to 128x128.
X.shape

(9300, 128, 128, 3)

In [37]:
# Sanity check: one one-hot row per image, one column per target label.
y.shape

(9300, 5)

In [38]:
# Hold out 20% of the samples as a test set. The file names are split along
# with the images and labels so each prediction can be traced back to its file.
# NOTE(review): file names such as "..._180deg.JPG" / "..._flipTB.JPG" look
# like augmented copies of a shared source image. A purely random split may
# put variants of one source image on both sides -- confirm, and consider a
# grouped split if so.
X_train, X_test, y_train, y_test, filename_train, filename_test = train_test_split(
    X,
    y,
    filename,
    test_size=0.2,
    random_state=42,
)

In [39]:
# Training images: the part of X left after holding out test_size=0.2.
X_train.shape

(7440, 128, 128, 3)

In [40]:
# Held-out images: the test_size=0.2 share of X.
X_test.shape

(1860, 128, 128, 3)

In [41]:
# One-hot labels for the training images; row count should match X_train.
y_train.shape

(7440, 5)

In [42]:
# One-hot labels for the held-out images; row count should match X_test.
y_test.shape

(1860, 5)

In [43]:
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import BatchNormalization
from keras.models import Sequential

In [44]:
# Number of classes; this is also the width of the model's softmax output.
len(target_labels)

5

In [45]:
# CNN classifier: five Conv2D -> MaxPooling2D -> BatchNormalization stages
# (32, 64, 64, 96, 32 filters) followed by a small dense head that ends in a
# softmax over the target labels.
cnn_model = Sequential([
    # Stage 1 (also declares the 128x128 RGB input)
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(128, 128, 3)),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Stage 2
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Stage 3
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Stage 4
    Conv2D(96, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Stage 5
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    BatchNormalization(),
    # Classification head
    Dropout(0.2),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(len(target_labels), activation='softmax'),
])

# One-hot targets -> categorical cross-entropy; accuracy is tracked as a metric.
cnn_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [46]:
# Fit the CNN for 10 epochs. The held-out test split doubles as the
# validation data, so the per-epoch validation metrics are computed on the
# same samples that are used for the final evaluation.
cnn_history = cnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [47]:
# Score the trained CNN on the held-out split. evaluate() returns the loss
# followed by the compiled metrics, so index 1 is the accuracy.
cnn_scores = cnn_model.evaluate(
    X_test,
    y_test,
    verbose=0,
)
print("CNN Model Accuracy: %.2f%%" % (100 * cnn_scores[1]))

CNN Model Accuracy: 97.53%


In [48]:
# Raw evaluate() result: [loss, accuracy] for the compiled loss and metric.
cnn_scores

[0.07827834039926529, 0.9752688407897949]

In [49]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Class probabilities predicted by the CNN for every held-out image.
y_pred_prob = cnn_model.predict(X_test)

# Collapse probabilities / one-hot rows to integer class ids.
y_pred = y_pred_prob.argmax(axis=1)
y_test_classes = y_test.argmax(axis=1)

# Overall accuracy plus support-weighted precision, recall and F1.
accuracy = accuracy_score(y_test_classes, y_pred)
precision = precision_score(y_test_classes, y_pred, average='weighted')
recall = recall_score(y_test_classes, y_pred, average='weighted')
f1 = f1_score(y_test_classes, y_pred, average='weighted')

# Report every metric as a percentage.
print("Accuracy: %.2f%%" % (100 * accuracy))
print("Precision: %.2f%%" % (100 * precision))
print("Recall: %.2f%%" % (100 * recall))
print("F1 score: %.2f%%" % (100 * f1))

Accuracy: 97.53%
Precision: 97.58%
Recall: 97.53%
F1 score: 97.52%


In [50]:
from tabulate import tabulate

# Per-image comparison of the CNN output: file name, true class name and
# predicted class name (integer class ids are decoded through target_labels).
df = pd.DataFrame({
    'Filename': filename_test,
    'True Labels': [target_labels[class_id] for class_id in y_test_classes],
    'Predicted Labels': [target_labels[class_id] for class_id in y_pred],
})

# Render as a psql-style text table and print it.
table = tabulate(df, headers='keys', tablefmt='psql')
print(table)

# Persist the same rows to CSV (without the index column).
df.to_csv('labels_predictions-cnn-kentang+strawberry.csv', index=False)

+------+-------------------------------------------------------------------------+--------------------------+--------------------------+
|      | Filename                                                                | True Labels              | Predicted Labels         |
|------+-------------------------------------------------------------------------+--------------------------+--------------------------|
|    0 | f5f41985-23db-4902-96a1-db4d7f95a26a___RS_LB 3177_180deg.JPG            | Potato___Late_blight     | Potato___Late_blight     |
|    1 | 357426c8-5b7b-4d56-9cb0-13cfaecc219f___RS_Early.B 7326_flipTB.JPG       | Potato___Early_blight    | Potato___Early_blight    |
|    2 | f3b01730-9365-4b94-a477-9b95d959ccd8___RS_HL 4555_90deg.JPG             | Strawberry___healthy     | Strawberry___healthy     |
|    3 | 02c8ff21-4e0a-4326-ba8f-089e5cb45b74___RS_LB 4089.JPG                   | Potato___Late_blight     | Potato___Late_blight     |
|    4 | 5f6e61a5-c917-43f7-978c-e1c7febd

In [51]:
# Keep only the rows the CNN got wrong (true label differs from prediction).
false_predictions = df.loc[df['True Labels'].ne(df['Predicted Labels'])]

# Render the misclassified rows as a psql-style text table.
table = tabulate(false_predictions, headers='keys', tablefmt='psql')

print("False Predictions:")
print(table)

# Persist the misclassified rows to CSV (without the index column).
false_predictions.to_csv('false_predictions-cnn-kentang+strawberry.csv', index=False)

False Predictions:
+------+----------------------------------------------------------------------+-----------------------+--------------------------+
|      | Filename                                                             | True Labels           | Predicted Labels         |
|------+----------------------------------------------------------------------+-----------------------+--------------------------|
|   30 | 675d6618-29f2-41e7-8b0b-d7facdf9cd67___RS_HL 2105_new30degFlipLR.JPG | Strawberry___healthy  | Potato___healthy         |
|  120 | 65ce255d-cc5b-43e3-b44a-5b65b8230e07___RS_HL 1784_270deg.JPG         | Strawberry___healthy  | Potato___healthy         |
|  122 | 889b0741-2110-47a9-a2d2-90194e5d7676___RS_LB 4787_flipTB.JPG         | Potato___Late_blight  | Potato___Early_blight    |
|  166 | fdf53f6c-db92-48b9-9e06-0358e4114be6___RS_LB 2656_180deg.JPG         | Potato___Late_blight  | Potato___healthy         |
|  219 | 6cab0a2b-a2d9-4798-8043-f6ef416ed745___RS_LB 4993.JPG  

In [52]:
# Run the trained CNN on the training images to build the Random Forest input.
# NOTE: cnn_model ends in a softmax Dense layer with len(target_labels) units,
# so predict() returns one class-probability vector per image. These are the
# CNN's final outputs, not activations of an intermediate layer. The Random
# Forest below is therefore trained on the CNN's predicted probabilities.
cnn_features = cnn_model.predict(X_train)



# Random Forest classifier on the CNN outputs

In [53]:
import time

In [54]:
# import the Random Forest classifier
from RF_Manual import RandomForest

In [55]:
# Train a Random Forest classifier on the CNN outputs and time the training.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# durations. time.time() follows the wall clock and can jump if the system
# time is adjusted during the run.
start_time = time.perf_counter()
rf_classifier = RandomForest(n_trees=100, max_depth=42)
# The forest expects integer class ids, so the one-hot rows are collapsed.
rf_classifier.fit(cnn_features, np.argmax(y_train, axis=1))

# Elapsed training time in seconds
end_time = time.perf_counter()
training_time = end_time - start_time
print(f"Training time: {training_time} seconds")

Training time: 67.5129919052124 seconds


In [56]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the Random Forest on the held-out split. Its inputs are the CNN
# outputs for the test images, flattened to one row per sample.
rf_features = cnn_model.predict(X_test)
rf_features = rf_features.reshape(rf_features.shape[0], -1)
rf_predictions = rf_classifier.predict(rf_features)

# Integer class ids of the held-out samples, computed once and reused below.
rf_true_classes = np.argmax(y_test, axis=1)

# Accuracy = share of samples whose predicted class id matches the true one.
rf_accuracy = np.mean(rf_predictions == rf_true_classes)
print("Random Forest Classifier Accuracy: %.2f%%" % (100 * rf_accuracy))

# Support-weighted precision
rf_precision = precision_score(rf_true_classes, rf_predictions, average='weighted')
print("Random Forest Classifier Precision: %.2f" % rf_precision)

# Support-weighted recall
rf_recall = recall_score(rf_true_classes, rf_predictions, average='weighted')
print("Random Forest Classifier Recall: %.2f" % rf_recall)

# Support-weighted F1 score
rf_f1_score = f1_score(rf_true_classes, rf_predictions, average='weighted')
print("Random Forest Classifier F1 Score: %.2f" % rf_f1_score)

Random Forest Classifier Accuracy: 98.28%
Random Forest Classifier Precision: 0.98
Random Forest Classifier Recall: 0.98
Random Forest Classifier F1 Score: 0.98


In [57]:
# Per-image comparison of the Random Forest output: file name, true class
# name and predicted class name (class ids are decoded through target_labels).
df_rf = pd.DataFrame({
    'Filename': filename_test,
    'True Labels': [target_labels[class_id] for class_id in y_test_classes],
    'Predicted Labels': [target_labels[class_id] for class_id in rf_predictions],
})

# Render as a psql-style text table and print it.
table_rf = tabulate(df_rf, headers='keys', tablefmt='psql')
print(table_rf)

# Persist the same rows to CSV (without the index column).
df_rf.to_csv('labels_predictions-rf-kentang+strawberry.csv', index=False)

+------+-------------------------------------------------------------------------+--------------------------+--------------------------+
|      | Filename                                                                | True Labels              | Predicted Labels         |
|------+-------------------------------------------------------------------------+--------------------------+--------------------------|
|    0 | f5f41985-23db-4902-96a1-db4d7f95a26a___RS_LB 3177_180deg.JPG            | Potato___Late_blight     | Potato___Late_blight     |
|    1 | 357426c8-5b7b-4d56-9cb0-13cfaecc219f___RS_Early.B 7326_flipTB.JPG       | Potato___Early_blight    | Potato___Early_blight    |
|    2 | f3b01730-9365-4b94-a477-9b95d959ccd8___RS_HL 4555_90deg.JPG             | Strawberry___healthy     | Strawberry___healthy     |
|    3 | 02c8ff21-4e0a-4326-ba8f-089e5cb45b74___RS_LB 4089.JPG                   | Potato___Late_blight     | Potato___Late_blight     |
|    4 | 5f6e61a5-c917-43f7-978c-e1c7febd

In [58]:
# Keep only the rows the Random Forest got wrong (true label != prediction).
false_predictions_rf = df_rf.loc[df_rf['True Labels'].ne(df_rf['Predicted Labels'])]

# Render the misclassified rows as a psql-style text table.
table_rf = tabulate(false_predictions_rf, headers='keys', tablefmt='psql')

print("False Predictions:")
print(table_rf)

# Persist the misclassified rows to CSV (without the index column).
false_predictions_rf.to_csv('false_predictions-rf-kentang+strawberry.csv', index=False)

False Predictions:
+------+----------------------------------------------------------------------+-----------------------+--------------------------+
|      | Filename                                                             | True Labels           | Predicted Labels         |
|------+----------------------------------------------------------------------+-----------------------+--------------------------|
|  143 | f686133a-e89a-4242-a52d-02f32ffd5275___RS_Early.B 8295.JPG           | Potato___Early_blight | Potato___Late_blight     |
|  219 | 6cab0a2b-a2d9-4798-8043-f6ef416ed745___RS_LB 4993.JPG                | Potato___Late_blight  | Potato___Early_blight    |
|  264 | 8829e413-5a7a-4680-b873-e71dfa9dbfe4___RS_LB 3974.JPG                | Potato___Late_blight  | Potato___healthy         |
|  268 | 541b768f-ec74-4f60-8e5a-778a5eb359da___RS_Early.B 8647_180deg.JPG    | Potato___Early_blight | Strawberry___Leaf_scorch |
|  331 | 76ebfd7a-b3d8-4cde-b6b6-7e5132ec269b___RS_LB 4935.JPG  