In [5]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import warnings
warnings.filterwarnings('ignore')

# Enter path to the data_fusion_guest_lecture file
image_folder_path = "data"  

# Loads labels
df = pd.read_csv(os.path.join(image_folder_path, "seedling_labels.csv"))

# Creates path to top & side view
df["color_cam_path"] = image_folder_path + "/" + df["color_cam_path"]
df["side_cam_path"] = image_folder_path + "/" + df["side_cam_path"]

# Gives average expert label as a starting point
df["average_expert"] = (df["Expert 1"] + df["Expert 2"]  + df["Expert 3"] + df["Expert 4"]) / 4
print('Each row is an image pair of one color and one side view of one specific plant, together with the expert opinion (1,2 = normal, 3,4 = abnormal).')

df

Each row is an image pair of one color and one side view of one specific plant, together with the expert opinion (1,2 = normal, 3,4 = abnormal).


Unnamed: 0,Expert 1,Expert 2,Expert 3,Expert 4,color_cam_path,side_cam_path,Rfid,Pos,average_expert
0,4,4,4,4,data/A1/00387 Plant 0000 Plant 0000/18-02-2019...,data/A1/00387 Plant 0000 Plant 0000/18-02-2019...,A1,Plant 0000,4.00
1,1,1,1,1,data/A1/00388 Plant 0001 Plant 0001/18-02-2019...,data/A1/00388 Plant 0001 Plant 0001/18-02-2019...,A1,Plant 0001,1.00
2,1,1,1,1,data/A1/00389 Plant 0002 Plant 0002/18-02-2019...,data/A1/00389 Plant 0002 Plant 0002/18-02-2019...,A1,Plant 0002,1.00
3,4,4,3,3,data/A1/00390 Plant 0003 Plant 0003/18-02-2019...,data/A1/00390 Plant 0003 Plant 0003/18-02-2019...,A1,Plant 0003,3.50
4,3,1,1,1,data/A1/00391 Plant 0004 Plant 0004/18-02-2019...,data/A1/00391 Plant 0004 Plant 0004/18-02-2019...,A1,Plant 0004,1.50
...,...,...,...,...,...,...,...,...,...
989,1,1,1,1,data/B4/01019 Plant 0122 Plant 0122/18-02-2019...,data/B4/01019 Plant 0122 Plant 0122/18-02-2019...,B4,Plant 0122,1.00
990,1,1,1,1,data/B4/01020 Plant 0123 Plant 0123/18-02-2019...,data/B4/01020 Plant 0123 Plant 0123/18-02-2019...,B4,Plant 0123,1.00
991,1,1,1,1,data/B4/01021 Plant 0124 Plant 0124/18-02-2019...,data/B4/01021 Plant 0124 Plant 0124/18-02-2019...,B4,Plant 0124,1.00
992,2,3,3,3,data/B4/01022 Plant 0125 Plant 0125/18-02-2019...,data/B4/01022 Plant 0125 Plant 0125/18-02-2019...,B4,Plant 0125,2.75


## Generate a score using Cohen Kappa for each expert

In [6]:
from sklearn.metrics import cohen_kappa_score
import numpy as np
from collections import Counter
experts = ["Expert 1", "Expert 2", "Expert 3", "Expert 4" ]
expert_Kappas=[[],[],[],[]]
weights = []
labels = []
i=0 
j=0

for i in range(len(experts)):
    for j in range(len(experts)):
        if i!=j:
            ratings = [df[experts[i]].values.tolist(),df[experts[j]].values.tolist()]
            kappa = cohen_kappa_score(ratings[0],ratings[1])
            expert_Kappas[i].append(kappa)

for kappa in expert_Kappas:
    weights.append(np.sum(kappa)/3)
votes = df[["Expert 1", "Expert 2","Expert 3", "Expert 4"]].values.tolist()    
for vote in votes:
    weighted_votes = Counter()
    for i, vot in enumerate(vote):
        weighted_votes[vot] += weights[i]

    # get the winner(s) of the vote
    winners = [vote for vote, count in weighted_votes.items() if count == max(weighted_votes.values())]
    labels.append(winners[0])
    
df["labels"]  = labels
# Round so we can replace
df['average_expert_rounded'] = df['average_expert'].round(0).astype(np.int64)

In [7]:
df

Unnamed: 0,Expert 1,Expert 2,Expert 3,Expert 4,color_cam_path,side_cam_path,Rfid,Pos,average_expert,labels,average_expert_rounded
0,4,4,4,4,data/A1/00387 Plant 0000 Plant 0000/18-02-2019...,data/A1/00387 Plant 0000 Plant 0000/18-02-2019...,A1,Plant 0000,4.00,4,4
1,1,1,1,1,data/A1/00388 Plant 0001 Plant 0001/18-02-2019...,data/A1/00388 Plant 0001 Plant 0001/18-02-2019...,A1,Plant 0001,1.00,1,1
2,1,1,1,1,data/A1/00389 Plant 0002 Plant 0002/18-02-2019...,data/A1/00389 Plant 0002 Plant 0002/18-02-2019...,A1,Plant 0002,1.00,1,1
3,4,4,3,3,data/A1/00390 Plant 0003 Plant 0003/18-02-2019...,data/A1/00390 Plant 0003 Plant 0003/18-02-2019...,A1,Plant 0003,3.50,3,4
4,3,1,1,1,data/A1/00391 Plant 0004 Plant 0004/18-02-2019...,data/A1/00391 Plant 0004 Plant 0004/18-02-2019...,A1,Plant 0004,1.50,1,2
...,...,...,...,...,...,...,...,...,...,...,...
989,1,1,1,1,data/B4/01019 Plant 0122 Plant 0122/18-02-2019...,data/B4/01019 Plant 0122 Plant 0122/18-02-2019...,B4,Plant 0122,1.00,1,1
990,1,1,1,1,data/B4/01020 Plant 0123 Plant 0123/18-02-2019...,data/B4/01020 Plant 0123 Plant 0123/18-02-2019...,B4,Plant 0123,1.00,1,1
991,1,1,1,1,data/B4/01021 Plant 0124 Plant 0124/18-02-2019...,data/B4/01021 Plant 0124 Plant 0124/18-02-2019...,B4,Plant 0124,1.00,1,1
992,2,3,3,3,data/B4/01022 Plant 0125 Plant 0125/18-02-2019...,data/B4/01022 Plant 0125 Plant 0125/18-02-2019...,B4,Plant 0125,2.75,3,3


In [8]:
# create a Boolean DataFrame indicating where the two columns are equal
equal_df = df["labels"].eq(df["average_expert_rounded"])

# calculate the percentage of differences
diff_percentage = 100 - (equal_df.sum() / equal_df.count() * 100)

print(f"The percentage of differences between the two columns is: {diff_percentage:.2f}%")

diff_count = (~equal_df).sum()

print(f"The number of differences between the two columns is: {diff_count}")

The percentage of differences between the two columns is: 7.55%
The number of differences between the two columns is: 75


## Convert (1,2 = normal, 3,4 = abnormal) because it will be binary classification

In [9]:
# 0 = normal
# 1 = abnormal

df[['Expert 1', 'Expert 2', 'Expert 3', 'Expert 4', 'average_expert', 'labels']] = df[['Expert 1', 'Expert 2', 'Expert 3', 'Expert 4', 'average_expert', 'labels']].replace({1: 0, 2: 0})
df[['Expert 1', 'Expert 2', 'Expert 3', 'Expert 4', 'average_expert', 'labels']] = df[['Expert 1', 'Expert 2', 'Expert 3', 'Expert 4', 'average_expert', 'labels']].replace([3,4], 1)

df

Unnamed: 0,Expert 1,Expert 2,Expert 3,Expert 4,color_cam_path,side_cam_path,Rfid,Pos,average_expert,labels,average_expert_rounded
0,1,1,1,1,data/A1/00387 Plant 0000 Plant 0000/18-02-2019...,data/A1/00387 Plant 0000 Plant 0000/18-02-2019...,A1,Plant 0000,1.00,1,4
1,0,0,0,0,data/A1/00388 Plant 0001 Plant 0001/18-02-2019...,data/A1/00388 Plant 0001 Plant 0001/18-02-2019...,A1,Plant 0001,0.00,0,1
2,0,0,0,0,data/A1/00389 Plant 0002 Plant 0002/18-02-2019...,data/A1/00389 Plant 0002 Plant 0002/18-02-2019...,A1,Plant 0002,0.00,0,1
3,1,1,1,1,data/A1/00390 Plant 0003 Plant 0003/18-02-2019...,data/A1/00390 Plant 0003 Plant 0003/18-02-2019...,A1,Plant 0003,3.50,1,4
4,1,0,0,0,data/A1/00391 Plant 0004 Plant 0004/18-02-2019...,data/A1/00391 Plant 0004 Plant 0004/18-02-2019...,A1,Plant 0004,1.50,0,2
...,...,...,...,...,...,...,...,...,...,...,...
989,0,0,0,0,data/B4/01019 Plant 0122 Plant 0122/18-02-2019...,data/B4/01019 Plant 0122 Plant 0122/18-02-2019...,B4,Plant 0122,0.00,0,1
990,0,0,0,0,data/B4/01020 Plant 0123 Plant 0123/18-02-2019...,data/B4/01020 Plant 0123 Plant 0123/18-02-2019...,B4,Plant 0123,0.00,0,1
991,0,0,0,0,data/B4/01021 Plant 0124 Plant 0124/18-02-2019...,data/B4/01021 Plant 0124 Plant 0124/18-02-2019...,B4,Plant 0124,0.00,0,1
992,0,1,1,1,data/B4/01022 Plant 0125 Plant 0125/18-02-2019...,data/B4/01022 Plant 0125 Plant 0125/18-02-2019...,B4,Plant 0125,2.75,1,3


### Model Training

In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

## CNN using transfer learning with MobileNetV2 as the base model
The MobileNetV2 model is used as a feature extractor and then the extracted features are flattened and passed through a few dense layers with dropout before the final classification layer. The model is then trained on the input images using the ```ImageDataGenerator``` to generate batches of augmented images and passed through the model.

###### COLOR CAM, MODEL 1

In [11]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.keras.optimizers import Adam

# Set up a TensorFlow session to use the GPU if available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
    except RuntimeError as e:
        print(e)

train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1./255)

train_generator1 = train_datagen.flow_from_dataframe(
    train_df,
    x_col='color_cam_path',
    y_col='labels',
    target_size=(224, 224),
    batch_size=8,
    class_mode='raw')

test_generator1 = test_datagen.flow_from_dataframe(
    test_df,
    x_col='color_cam_path',
    y_col='labels',
    target_size=(224, 224),
    batch_size=8,
    class_mode='raw')

Found 795 validated image filenames.
Found 199 validated image filenames.


In [84]:
# load the MobileNetV2 model
mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# unfreeze the last few layers for fine-tuning
for layer in mobilenet_model.layers[:-4]:
    layer.trainable = False

# build the model
model = tf.keras.models.Sequential([
    mobilenet_model,
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# compile the model
model.compile(optimizer=Adam(lr=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# train the model
model.fit(train_generator, epochs=5)

Found 795 validated image filenames.
Found 199 validated image filenames.
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


###### Save model so it can be used later again without training again

In [92]:
# save the model to a file
model.save('model1_color_cam.h5') # Accurary was 0.909547746181488

###### SIDE CAM, MODEL 2

In [47]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.applications.mobilenet_v2 import MobileNetV2
from tensorflow.keras.optimizers import Adam

# Set up a TensorFlow session to use the GPU if available
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_virtual_device_configuration(gpus[0],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
    except RuntimeError as e:
        print(e)

# Define the data generators
train_datagen = ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1)

test_datagen = ImageDataGenerator(rescale=1./255)

train_generator2 = train_datagen.flow_from_dataframe(
    train_df,
    x_col='side_cam_path',
    y_col='labels',
    target_size=(224, 224),
    batch_size=10,
    class_mode='raw')

test_generator2 = test_datagen.flow_from_dataframe(
    test_df,
    x_col='side_cam_path',
    y_col='labels',
    target_size=(224, 224),
    batch_size=10,
    class_mode='raw')

Found 795 validated image filenames.
Found 199 validated image filenames.


In [49]:
# load the MobileNetV2 model
mobilenet_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# unfreeze the last few layers for fine-tuning
for layer in mobilenet_model.layers[:-4]:
    layer.trainable = False

# build the model
model2 = tf.keras.models.Sequential([
    mobilenet_model,
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# compile the model
model2.compile(optimizer=Adam(lr=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# train the model
model2.fit(train_generator2, epochs=5)



Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x173b5c6aac0>

###### Save model so it can be used later again without training again

In [96]:
# save the model to a file
model2.save('model2_side_cam.h5') # Accurary was 0.8994975090026855

### Load already trained models and predict

In [54]:
from tensorflow.keras.models import load_model

model1 = load_model('model1_color_cam.h5')
model2 = load_model('model2_side_cam.h5')

In [61]:
def predict(model, test_generator):
    # predict the labels for test data
    test_generator.reset()
    pred = model.predict(test_generator)

    # convert the predictions to binary labels
    pred_labels = [1 if p >= 0.5 else 0 for p in pred]
    
    return pred_labels, pred

def accuracy(model, test_generator, model_number):
    loss, accuracy = model.evaluate(test_generator)
    print('Accuracy on test set for model number', model_number, ': ', accuracy)
    
    
pred_labels1, pred1 = predict(model1, test_generator1)
print('Prediction done, Model 1')
pred_labels2, pred2 = predict(model2, test_generator2)
print('Prediction done, Model 2')

accuracy(model1, test_generator1, '1')
accuracy(model2, test_generator2, '2')

Prediction done, Model 1
Prediction done, Model 2
Accuracy on test set for model number 1 :  0.909547746181488
Accuracy on test set for model number 2 :  0.8994975090026855


In [64]:
# The probability of each class, rounded
pred1 = np.round(pred1, 3)
pred2 = np.round(pred2, 3)

pred1_df = pd.DataFrame(pred1, columns=['pred1'])
pred2_df = pd.DataFrame(pred2, columns=['pred2'])

In [65]:
true_labels_df = pd.DataFrame({'true_labels': test_generator1.labels})
pred_labels_df1 = pd.DataFrame({'pred_labels1_color': pred_labels1})
pred_labels_df2 = pd.DataFrame({'pred_labels2_side': pred_labels2})

result_df = pd.concat([true_labels_df, pred1_df, pred_labels_df1, pred2_df, pred_labels_df2], axis=1)

result_df

Unnamed: 0,true_labels,pred1,pred_labels1_color,pred2,pred_labels2_side
0,0,0.003,0,0.000,0
1,0,0.046,0,0.002,0
2,0,0.000,0,0.000,0
3,0,0.998,1,0.001,0
4,0,0.009,0,0.998,1
...,...,...,...,...,...
194,0,0.005,0,0.000,0
195,0,0.203,0,0.000,0
196,0,0.108,0,0.000,0
197,1,0.012,0,0.000,0


### Weighted Voting for the two models

In [76]:
def weighted_vote(row):
    # higher weight for the model with higher accuracy
    w1 = 0.4
    w2 = 0.6

    vote = w1 * row['pred1'] + w2 * row['pred2']

    return round(vote, 3)

result_df['weighted_vote'] = result_df.apply(weighted_vote, axis=1)
result_df['weighted_vote_binary'] = result_df['weighted_vote'].apply(lambda x: 1 if x >= 0.5 else 0)
result_df

Unnamed: 0,true_labels,pred1,pred_labels1_color,pred2,pred_labels2_side,weighted_vote,weighted_vote_binary
0,0,0.003,0,0.000,0,0.001,0
1,0,0.046,0,0.002,0,0.020,0
2,0,0.000,0,0.000,0,0.000,0
3,0,0.998,1,0.001,0,0.400,0
4,0,0.009,0,0.998,1,0.602,1
...,...,...,...,...,...,...,...
194,0,0.005,0,0.000,0,0.002,0
195,0,0.203,0,0.000,0,0.081,0
196,0,0.108,0,0.000,0,0.043,0
197,1,0.012,0,0.000,0,0.005,0


In [77]:
# Calculate the accuracy given scikit
acc = accuracy_score(result_df['true_labels'], result_df['weighted_vote_binary'])

acc

0.6532663316582915