# Load the dataset
Mount Google Drive and make the dataset files accesible from the notebook.

In [1]:
# adjust the paths accordingly
# comment the following out if you are in local mode
from google.colab import drive
import sys
drive.mount('/content/drive')
sys.path.append("/content/drive/MyDrive/Colab Notebooks/project2-test")
drive_path = "/content/drive/MyDrive/Colab Notebooks/project2-test"

# -- uncomment the following if you are in local mode
# drive_path = "."
# -- make sure to have the "dataset2" folder in your working directory as well the my-photos folder


Mounted at /content/drive


### Import the necessary libraries
Python libraries used below in this notebook.

In [2]:
import utils
import os
import shutil
import os
import PIL
import PIL.Image
import tensorflow as tf
import numpy as np
from sklearn import metrics


### Save dataset pictures with proper structure.

In [3]:
# Setting up file paths for the dataset
data_path = drive_path + "/" + "data"
sunrise_path = drive_path + "/" + "data/sunrise"
shine_path = drive_path + "/" + "data/shine"
cloudy_path = drive_path + "/" + "data/cloudy"
rain_path = drive_path + "/" + "data/rain"
images_dict = utils.get_dict_with_files_per_class(drive_path + "/" "dataset2")

# the path for my custom photos
myphotos_path = drive_path + "/" + "my-photos"

def store_in_keras_structure():

  """
    Function to organize images into a structure suitable 
    for training a Keras model.
    It creates necessary directories and 
    moves images to their respective class directories.
  """
    
  # Create necessary directories if they don't exist
  if not(os.path.isdir(data_path)):
    os.mkdir(data_path)
    os.mkdir(shine_path)
    os.mkdir(rain_path)
    os.mkdir(sunrise_path)
    os.mkdir(cloudy_path)

    # Move and rename images into their respective class directories

    # Handling sunrise class
    i = 1
    for image in images_dict['sunrise']:
      new_filename = "sunrise_image_{0}.jpg".format(i)
      shutil.move(image, sunrise_path + "/" + new_filename)
      i += 1
    
    # Handling cloudy class
    i = 1
    for image in images_dict['cloudy']:
        new_filename = "cloudy_image_{0}.jpg".format(i)
        shutil.move(image, cloudy_path + "/" + new_filename)
        i += 1
    
    # Handling rain class
    i = 1
    for image in images_dict['rain']:
        new_filename = "rain_image_{0}.jpg".format(i)
        shutil.move(image, rain_path + "/" + new_filename)
        i += 1
    
    # Handling shine class
    i = 1
    for image in images_dict['shine']:
        new_filename = "shine_image_{0}.jpg".format(i)
        shutil.move(image, shine_path + "/" + new_filename)
        i += 1
        
store_in_keras_structure()

### Create train, validation and test subsets.

Using image_dataset_from_directory from Keras to load the data. We create a 60% train subset, a 20% validation subset and a 20% test subset.

In [4]:
BATCH_SIZE = 64
# Select a fixed size (256, 256) for the loaded images
IMG_HEIGHT = 256
IMG_WIDTH = 256

# Load the training dataset
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_path,
    validation_split=0.4,
    labels="inferred",
    label_mode="categorical",
    subset="training",
    seed=1,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
)

# Load the test dataset
test_ds = tf.keras.utils.image_dataset_from_directory(
    data_path,
    validation_split=0.4,
    labels="inferred",
    label_mode="categorical",
    subset="validation",
    seed=1,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
)

# Determine the number of validation batches
val_batches = tf.data.experimental.cardinality(test_ds)
print(val_batches)

# Create a validation dataset by taking the first half of the test dataset
val_ds = test_ds.take((1*val_batches) // 2)
# Update the test dataset to skip the first half and keep the second half
test_ds = test_ds.skip((1*val_batches) // 2)

# Get the number of classes
NUM_CLASSES = len(train_ds.class_names)
print(NUM_CLASSES)

Found 1125 files belonging to 4 classes.
Using 675 files for training.
Found 1125 files belonging to 4 classes.
Using 450 files for validation.
tf.Tensor(8, shape=(), dtype=int64)
4


# Create a metrics function

The ***confusion_matrix*** function calculates the confusion matrix for the requested subset.

In [5]:
def confusion_matrix(model, subset):

    # Set the appropriate dataset based on the subset
    if subset=="test":
        ds = test_ds
    elif subset=="val":
        ds = val_ds
    elif subset=="train":
        ds = train_ds
    elif subset=="myphotos":
       ds = myphotos_ds

    y_true = []
    y_pred = []
    
    # x: for every iteration, x contains a batch of 64 images (256,256), RGB 
    # y: for every iteration, y contains the corresponding labels (one_hot) for each batch
    for x, y in ds:
        # argmax and axis = 1 go through every one-hot combination 
        # and return the index with the highest value   
        y = tf.argmax(y, axis=1)
        # append the result to the array with the true labels
        y_true.append(y)
        # do the same for the predicted labels        
        pred = model.predict(x)
        y_pred.append(tf.argmax(pred, axis=1))
    
    # we concatenate the batches into one 
    y_pred = tf.concat(y_pred, axis=0)
    y_true = tf.concat(y_true, axis=0)

    # compute and return the confusion matrix
    return metrics.confusion_matrix(y_true, y_pred)

The ***calculate_metrics*** function takes as input the confusion matrix and then calculates and finally prints both the overall and per class evaluation metrics (accuracy, precision, recall and f1-score).

In [6]:
def calculate_metrics(cm):
  # Confusion Matrix
  print("-----------------------------")
  print(cm)
  print("-----------------------------")
  # Overall Accuracy
  total_samples = sum(sum(row) for row in cm)
  accuracy = (cm[0][0] + cm[1][1] + cm[2][2] + cm[3][3]) / total_samples
  print("Overall Accuracy:", accuracy)

  num_classes = len(cm)
  precision = []
  recall = []
  f1_score = []
  class_accuracy = []

  # Calculate accuracy, precision, recall, and F1-score for each class
  for i in range(num_classes):
    class_total = sum(cm[i])  # Total samples for the class
    class_correct = cm[i][i]  # Correctly classified samples for the class
    class_accuracy.append(class_correct / class_total)

    overall_tp = sum(cm[i][i] for i in range(num_classes))
    overall_fp = sum(sum(cm[j][i] for j in range(num_classes) if j != i) for i in range(num_classes))
    overall_fn = sum(sum(cm[i][j] for j in range(num_classes) if j != i) for i in range(num_classes))
    
    # True positives for class i
    tp = cm[i][i]  
    # False positives for class i
    fp = sum(cm[j][i] for j in range(num_classes) if j != i)
    # False negatives for class i  
    fn = sum(cm[i][j] for j in range(num_classes) if j != i)

    # Precision for class i
    precision.append(tp / (tp + fp) if tp + fp > 0 else 0)
    # Recall for class i  
    recall.append(tp / (tp + fn) if tp + fn > 0 else 0)  
    # F1-score for class i
    f1_score.append((2 * precision[i] * recall[i]) / (precision[i] + recall[i]) if (precision[i] + recall[i]) > 0 else 0)

  # Calculate overall precision, recall, and F1-score
  overall_precision = overall_tp / (overall_tp + overall_fp) if (overall_tp + overall_fp) > 0 else 0
  overall_recall = overall_tp / (overall_tp + overall_fn) if (overall_tp + overall_fn) > 0 else 0
  overall_f1_score = (2 * overall_precision * overall_recall) / (overall_precision + overall_recall) if (overall_precision + overall_recall) > 0 else 0

  # Print overall precision, recall, and F1-score
  print("Overall Precision:", round(overall_precision, 5))
  print("Overall Recall:", round(overall_recall, 5))
  print("Overall F1-score:", round(overall_f1_score, 5))
  print("-----------------------------")
  # Print accuracy, precision, recall, and F1-score per class
  print("Accuracy per class:", [f"{val:.5f}" for val in class_accuracy])
  print("Precision per class:", [f"{val:.5f}" for val in precision])
  print("Recall per class:", [f"{val:.5f}" for val in recall])
  print("F1 score per class:", [f"{val:.5f}" for val in f1_score])
  print("-----------------------------")

The ***model_custom_eval*** function takes as input a given model and then calculates the metrics for all the model's subsets (train, validation and test).

In [7]:
def model_custom_eval(model): 
  # Calculate confusion matrices for train, val, and test sets
  cm_train = confusion_matrix(model, "train")
  cm_val = confusion_matrix(model, "val")
  cm_test = confusion_matrix(model, "test")
  
  # Print evaluation results
  print("<========== Train ==========>")
  calculate_metrics(cm_train)
  print("<========== Val ============>")
  calculate_metrics(cm_val)
  print("<========== Test ===========>")
  calculate_metrics(cm_test)
  print("<===========================>")

# Create a simple CNN model

### Architecture
We implement a simple CNN model architecture that follows the project guidelines.

In [None]:
def cnn_simple(num_classes):
    return tf.keras.Sequential(
        [
            # Rescale pixel values to [0, 1]
            tf.keras.layers.Rescaling(1.0 / 255),
            
            tf.keras.layers.Conv2D(8, 3, padding="same",activation="relu"),
            tf.keras.layers.MaxPooling2D(strides=(2,2)),
            tf.keras.layers.Conv2D(16, 3, padding="same",activation="relu"),
            tf.keras.layers.MaxPooling2D(strides=(2,2)),
            # Flatten the feature maps
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(32, activation="relu"),
            tf.keras.layers.Dense(num_classes, activation="softmax"),
        ]
    )

### Training
We compile the model using, again, the project guidelines for the various parameters.

In [None]:
cnn_simple_model = cnn_simple(NUM_CLASSES)

cnn_simple_model.compile(
  optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99),
  loss='categorical_crossentropy',
  metrics=['accuracy'])



We train the model and use early stopping if there is no further decrease of validation loss after 5 epochs.

In [None]:
early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)

cnn_simple_model.fit(
  train_ds,
  validation_data=val_ds,
  epochs=20,
  callbacks=[early_stop_cb]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20


<keras.callbacks.History at 0x7f0a99936ec0>

### Evaluate
We use our custom evaluation function to evaluate the model and display the necessary metrics.

In [None]:
# Evaluate the model
model_custom_eval(cnn_simple_model)

-----------------------------
[[175   0  13   0]
 [  7 122   0   0]
 [ 11   0 129   0]
 [  1   0   2 215]]
-----------------------------
Overall Accuracy: 0.9496296296296296
Overall Precision: 0.94963
Overall Recall: 0.94963
Overall F1-score: 0.94963
-----------------------------
Accuracy per class: ['0.93085', '0.94574', '0.92143', '0.98624']
Precision per class: ['0.90206', '1.00000', '0.89583', '1.00000']
Recall per class: ['0.93085', '0.94574', '0.92143', '0.98624']
F1 score per class: ['0.91623', '0.97211', '0.90845', '0.99307']
-----------------------------
-----------------------------
[[57  1  1  1]
 [ 8 42  1  0]
 [ 7  3 63  0]
 [ 0  0  1 71]]
-----------------------------
Overall Accuracy: 0.91015625
Overall Precision: 0.91016
Overall Recall: 0.91016
Overall F1-score: 0.91016
-----------------------------
Accuracy per class: ['0.95000', '0.82353', '0.86301', '0.98611']
Precision per class: ['0.79167', '0.91304', '0.95455', '0.98611']
Recall per class: ['0.95000', '0.82353', '

### Results

The CNN model achieved excellent accuracy on the training set (**94.96%**) as well as on the validation and test subsets (**~91%**).

Class specific analysis revealed slightly lower accuracy for the "rain" class on the validation subset (0.82) and for the "shine" class on the test subset (0.80) 


  # Create a CNN model of bigger depth

### Architecture
We implement the more complex CNN model architecture following the project guidelines.

In [None]:
def cnn_complex(num_classes):
    return tf.keras.Sequential(
        [
            tf.keras.layers.Rescaling(1.0 / 255),
            tf.keras.layers.Conv2D(32, 3, padding="same", activation="relu"),
            tf.keras.layers.Conv2D(32, 3, padding="same", activation="relu"),
            tf.keras.layers.Conv2D(32, 3, padding="same", activation="relu"),
            tf.keras.layers.MaxPooling2D(strides=(4, 4)),
            tf.keras.layers.Conv2D(64, 3, padding="same", activation="relu"),
            tf.keras.layers.Conv2D(64, 3, padding="same", activation="relu"),
            tf.keras.layers.Conv2D(64, 3, padding="same",activation="relu"),
            tf.keras.layers.MaxPooling2D(strides=(2, 2)),
            tf.keras.layers.Conv2D(128, 3, padding="same", activation="relu"),
            tf.keras.layers.Conv2D(128, 3, padding="same", activation="relu"),
            tf.keras.layers.Conv2D(128, 3, padding="same",activation="relu"),
            tf.keras.layers.MaxPooling2D(strides=(2, 2)),
            tf.keras.layers.Flatten(),
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(num_classes, activation="softmax"),
        ]
    )

### Training
We compile the model following the project instructions and use the same parameters as before.

In [None]:
cnn_complex_model = cnn_complex(NUM_CLASSES)

cnn_complex_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.99),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

We train the model using early stoppping after 5 epochs. 

In [None]:
early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)

cnn_complex_model.fit(train_ds, validation_data=val_ds, epochs=20, callbacks=[early_stop_cb])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


<keras.callbacks.History at 0x7f0a8050d3f0>

### Evaluate
We use our custom evaluation function to evaluate the complex model and display the evaluation metrics.

In [None]:
# Evaluate the model
model_custom_eval(cnn_complex_model)

-----------------------------
[[170   6  12   0]
 [ 58  69   2   0]
 [ 19   1 120   0]
 [  4   0   4 210]]
-----------------------------
Overall Accuracy: 0.8429629629629629
Overall Precision: 0.84296
Overall Recall: 0.84296
Overall F1-score: 0.84296
-----------------------------
Accuracy per class: ['0.90426', '0.53488', '0.85714', '0.96330']
Precision per class: ['0.67729', '0.90789', '0.86957', '1.00000']
Recall per class: ['0.90426', '0.53488', '0.85714', '0.96330']
F1 score per class: ['0.77449', '0.67317', '0.86331', '0.98131']
-----------------------------
-----------------------------
[[57  4  2  0]
 [29 23  0  1]
 [12  1 51  0]
 [ 3  0  2 71]]
-----------------------------
Overall Accuracy: 0.7890625
Overall Precision: 0.78906
Overall Recall: 0.78906
Overall F1-score: 0.78906
-----------------------------
Accuracy per class: ['0.90476', '0.43396', '0.79688', '0.93421']
Precision per class: ['0.56436', '0.82143', '0.92727', '0.98611']
Recall per class: ['0.90476', '0.43396', '0

### Results
The complex model, despite having a more sophisticated architecture, does not perform as well as the simple model.

The overall accuracy (84% on train, 78% on validation and test) is not as good as the simple model.

Observing the per-class results, the "rain" class performs badly and achieves the lowest accuracy, precision and recall compared to the others.

### Complex vs Simple Architecture

For our dataset of 1152 images, the complex cnn model seems to have a larger capacity than necessary and it has difficulty capturing the essential features and patterns from the images.

Another reason should be the training data quality, because the complex model seems to be more sensitive to incosistencies and misalabeled samples, especially considering the images belonging to the "rain" class or those that closely resemble "rain" images but actually belong to the other classes. 

# Utilize a pre-trained neural network 
For this task we select the InceptionV3 architecture.

### Preprocessing
Documentation from Keras API on https://keras.io/api/applications/inceptionv3 indicates that we need to scale input pixels between -1 and 1 before we feed them to InceptionV3. This is done by using **inception_v3.preprocess_input**. 

In [8]:
from tensorflow.keras.applications.inception_v3 import InceptionV3

# Function to preprocess images and labels
def preprocess_func(images, labels):
    return tf.keras.applications.inception_v3.preprocess_input(images), labels

# Map the preprocessing function to the subsets
train_ds = train_ds.map(preprocess_func)
val_ds = val_ds.map(preprocess_func)
test_ds = test_ds.map(preprocess_func)

### Architecture

We will utilize the InceptionV3 architecture in 4 ways.

The following function provides the flexibility to toggle the "freezing" of the pretrained model and the inclusion of an additional dense layer with dropout.

With the help of this function we can train and evaluate the model using 4 combinations and check the results.

In [9]:
def pretrained_inception_v3(num_classes, is_trainable, extra_layers_enable):
  
  # Load the pre-trained InceptionV3 model with weights from ImageNet
  pretrained_model = InceptionV3(
    input_shape=(299, 299, 3), include_top=False, weights="imagenet"
)

  # Set the trainability of the pre-trained model based on the input parameter
  pretrained_model.trainable = is_trainable 

  # Define the input layer for the model
  input_layer = tf.keras.layers.Input(shape=(256, 256, 3))

  # Resize the input images to match the input size of InceptionV3 (299x299)
  # without aspect ratio distortion.
  # https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/smart_resize
  resizing_layer = (
    lambda image: tf.keras.preprocessing.image.smart_resize(image, (299, 299))
  )(input_layer)

  # Pass the resized images through the pre-trained InceptionV3 layers
  inception_layers = pretrained_model(resizing_layer)

  # Add a global spatial average pooling layer
  x = tf.keras.layers.GlobalAveragePooling2D()(inception_layers)

  # Add extra fully-connected layers if enabled
  if extra_layers_enable == True:
    # add a fully-connected layer
    layer_i = tf.keras.layers.Dense(1024, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.3)(layer_i)

  out_layer = tf.keras.layers.Dense(4, activation="softmax")(x)

  # Define the model with input and output layers
  model = tf.keras.Model(inputs=input_layer, outputs=out_layer)
  return model

def train_inception_v3(num_classes, is_trainable, extra_layers_enable):
  model = pretrained_inception_v3(num_classes, is_trainable=is_trainable, extra_layers_enable=extra_layers_enable)

  model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.99),
    loss="categorical_crossentropy",
    metrics=["accuracy"],
  )

  early_stop_cb = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)

  model.fit(
    train_ds, validation_data=val_ds, epochs=20, callbacks=[early_stop_cb]
  )
  return model

### Training

Try freezing the inception V3 and use the two extra layers (one Dense Layer with 1024 neurons(relu) and one Dropout Layer) 

In [12]:
model1 = train_inception_v3(NUM_CLASSES, False, True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20


Freezing the inception V3 but not using the two extra layers.

In [None]:
model2 = train_inception_v3(NUM_CLASSES, False, False)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Enable training of the inception V3 but not using the two extra layers.

In [None]:
model3 = train_inception_v3(NUM_CLASSES, True, False)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_v3/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Enable training of the inception V3 but use the two extra layers.

In [None]:
model4 = train_inception_v3(NUM_CLASSES, True, True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


### Evaluate

Evaluating the 4 combinations of the models

In [None]:
# Evaluate the model
model_custom_eval(model1)

(array([0.99470899, 1.        , 1.        , 1.        ]), array([1.        , 0.99224806, 1.        , 1.        ]), array([0.99734748, 0.99610895, 1.        , 1.        ]), array([188, 129, 140, 218]))
(array([0.91803279, 1.        , 0.98507463, 0.96153846]), array([0.94915254, 0.98039216, 0.94285714, 0.98684211]), array([0.93333333, 0.99009901, 0.96350365, 0.97402597]), array([59, 51, 70, 76]))
(array([1.        , 1.        , 0.91666667, 0.96825397]), array([0.90909091, 1.        , 1.        , 0.98387097]), array([0.95238095, 1.        , 0.95652174, 0.976     ]), array([55, 33, 44, 62]))
-----------------------------
[[188   0   0   0]
 [  1 128   0   0]
 [  0   0 140   0]
 [  0   0   0 218]]
-----------------------------
Overall Accuracy: 0.9985185185185185
Overall Precision: 0.99852
Overall Recall: 0.99852
Overall F1-score: 0.99852
-----------------------------
Accuracy per class: ['1.00000', '0.99225', '1.00000', '1.00000']
Precision per class: ['0.99471', '1.00000', '1.00000', '1.0

In [None]:
# Evaluate the model
model_custom_eval(model2)

(array([0.87234043, 0.96183206, 0.92622951, 0.8974359 ]), array([0.87234043, 0.97674419, 0.80714286, 0.96330275]), array([0.87234043, 0.96923077, 0.86259542, 0.92920354]), array([188, 129, 140, 218]))
(array([0.84313725, 0.96      , 0.93939394, 0.84269663]), array([0.74137931, 1.        , 0.86111111, 0.96153846]), array([0.78899083, 0.97959184, 0.89855072, 0.89820359]), array([58, 48, 72, 78]))
(array([0.88      , 1.        , 0.94736842, 0.86153846]), array([0.84615385, 1.        , 0.81818182, 0.98245614]), array([0.8627451 , 1.        , 0.87804878, 0.91803279]), array([52, 41, 44, 57]))
-----------------------------
[[164   3   6  15]
 [  1 126   1   1]
 [ 17   2 113   8]
 [  6   0   2 210]]
-----------------------------
Overall Accuracy: 0.9081481481481481
Overall Precision: 0.90815
Overall Recall: 0.90815
Overall F1-score: 0.90815
-----------------------------
Accuracy per class: ['0.87234', '0.97674', '0.80714', '0.96330']
Precision per class: ['0.87234', '0.96183', '0.92623', '0.8

In [None]:
# Evaluate the model
model_custom_eval(model3)

-----------------------------
[[188   0   0   0]
 [  0 129   0   0]
 [  0   0 140   0]
 [  0   0   0 218]]
-----------------------------
Overall Accuracy: 1.0
Overall Precision: 1.0
Overall Recall: 1.0
Overall F1-score: 1.0
-----------------------------
Accuracy per class: ['1.00000', '1.00000', '1.00000', '1.00000']
Precision per class: ['1.00000', '1.00000', '1.00000', '1.00000']
Recall per class: ['1.00000', '1.00000', '1.00000', '1.00000']
F1 score per class: ['1.00000', '1.00000', '1.00000', '1.00000']
-----------------------------
-----------------------------
[[53  0  0  0]
 [ 0 56  0  0]
 [ 4  0 65  0]
 [ 0  0  0 78]]
-----------------------------
Overall Accuracy: 0.984375
Overall Precision: 0.98438
Overall Recall: 0.98438
Overall F1-score: 0.98438
-----------------------------
Accuracy per class: ['1.00000', '1.00000', '0.94203', '1.00000']
Precision per class: ['0.92982', '1.00000', '1.00000', '1.00000']
Recall per class: ['1.00000', '1.00000', '0.94203', '1.00000']
F1 score

In [None]:
# Evaluate the model
model_custom_eval(model4)

(array([1., 1., 1., 1.]), array([1., 1., 1., 1.]), array([1., 1., 1., 1.]), array([188, 129, 140, 218]))
(array([0.97222222, 1.        , 0.92727273, 1.        ]), array([0.95890411, 0.98076923, 0.98076923, 0.98734177]), array([0.96551724, 0.99029126, 0.95327103, 0.99363057]), array([73, 52, 52, 79]))
(array([0.93333333, 1.        , 0.97619048, 1.        ]), array([0.97674419, 1.        , 0.93181818, 1.        ]), array([0.95454545, 1.        , 0.95348837, 1.        ]), array([43, 35, 44, 72]))
-----------------------------
[[188   0   0   0]
 [  0 129   0   0]
 [  0   0 140   0]
 [  0   0   0 218]]
-----------------------------
Overall Accuracy: 1.0
Overall Precision: 1.0
Overall Recall: 1.0
Overall F1-score: 1.0
-----------------------------
Accuracy per class: ['1.00000', '1.00000', '1.00000', '1.00000']
Precision per class: ['1.00000', '1.00000', '1.00000', '1.00000']
Recall per class: ['1.00000', '1.00000', '1.00000', '1.00000']
F1 score per class: ['1.00000', '1.00000', '1.00000',

### Results
After carefully analyzing the results, it becomes evident that each combination yields impressive overall accuracies.

For the training subset, all combinations, except for model1 (freezing enabled, no extra Dense Layer), achieve near-perfect accuracies, hovering around 1.0. Model1 achieves a commendable accuracy of approximately 99%.

Shifting our focus to the test set, model1, model3, and model4 all achieve accuracies of around 97%. These results indicate that whether we make the model trainable with no extra dense layers or vice versa, we end up with the same overall accuracy of 97%. However, enabling both the trainability of the model and the utilization of the extra dense layer results in a slight 1% increase in accuracy (model4).

On the other hand, the combination that does not enable either of the two options (model2) exhibits the poorest performance on the test set, with an accuracy of approximately 91%.

The same goes for the validation set. Apart from model2, which once again delivers the lowest accuracy, the remaining combinations achieve an overall accuracy of approximately 97%, with minor variations of around ±1% among the models.



# Testing the models with external images 

In the final phase, we put our models to the test using a brand new dataset consisting of photos captured from our phone.

We handpicked a total of 12 photos, with 3 images representing each class of our classification task.

Our objective is to assess the performance of three models: the simple CNN model, the more complex one, and the pretrained InceptionV3 model (with trainability enabled and no extra layers added).

In [None]:
BATCH_SIZE = 64
IMG_HEIGHT = 256
IMG_WIDTH = 256

myphotos_ds = tf.keras.utils.image_dataset_from_directory(
    myphotos_path,
    labels="inferred",
    label_mode="categorical",
    seed=1,
    image_size=(IMG_HEIGHT, IMG_WIDTH),
    batch_size=BATCH_SIZE,
)


val_batches = tf.data.experimental.cardinality(myphotos_ds)
print(val_batches)

myphotos_ds.class_names
NUM_CLASSES = len(myphotos_ds.class_names)
print(myphotos_ds.class_names
)

Found 12 files belonging to 4 classes.
tf.Tensor(1, shape=(), dtype=int64)
['cloudy', 'rain', 'shine', 'sunrise']


###  Evaluating the simple model

In [None]:
cm_myphotos = confusion_matrix(cnn_simple_model, "myphotos")
print("<========== My Photos ==========>")
calculate_metrics(cm_myphotos)

-----------------------------
[[3 0 0 0]
 [1 2 0 0]
 [0 1 2 0]
 [2 0 0 1]]
-----------------------------
Overall Accuracy: 0.6666666666666666
Overall Precision: 0.66667
Overall Recall: 0.66667
Overall F1-score: 0.66667
-----------------------------
Accuracy per class: ['1.00000', '0.66667', '0.66667', '0.33333']
Precision per class: ['0.50000', '0.66667', '1.00000', '1.00000']
Recall per class: ['1.00000', '0.66667', '0.66667', '0.33333']
F1 score per class: ['0.66667', '0.66667', '0.80000', '0.50000']
-----------------------------


### Evaluating the complex model

In [None]:
cm_myphotos = confusion_matrix(cnn_complex_model, "myphotos")
print("<========== My Photos ==========>")
calculate_metrics(cm_myphotos)

-----------------------------
[[2 0 0 1]
 [0 3 0 0]
 [0 2 1 0]
 [0 0 0 3]]
-----------------------------
Overall Accuracy: 0.75
Overall Precision: 0.75
Overall Recall: 0.75
Overall F1-score: 0.75
-----------------------------
Accuracy per class: ['0.66667', '1.00000', '0.33333', '1.00000']
Precision per class: ['1.00000', '0.60000', '1.00000', '0.75000']
Recall per class: ['0.66667', '1.00000', '0.33333', '1.00000']
F1 score per class: ['0.80000', '0.75000', '0.50000', '0.85714']
-----------------------------


### Evaluating the pretrained InceptionV3 model

In [None]:
myphotos_ds = myphotos_ds.map(preprocess_func)
cm_myphotos = confusion_matrix(model3, "myphotos")
print("<========== My Photos ==========>")
calculate_metrics(cm_myphotos)

-----------------------------
[[2 0 1 0]
 [0 2 1 0]
 [0 0 3 0]
 [0 0 0 3]]
-----------------------------
Overall Accuracy: 0.8333333333333334
Overall Precision: 0.83333
Overall Recall: 0.83333
Overall F1-score: 0.83333
-----------------------------
Accuracy per class: ['0.66667', '0.66667', '1.00000', '1.00000']
Precision per class: ['1.00000', '1.00000', '0.60000', '1.00000']
Recall per class: ['0.66667', '0.66667', '1.00000', '1.00000']
F1 score per class: ['0.80000', '0.80000', '0.75000', '1.00000']
-----------------------------


### Results
Upon analyzing the results, it becomes apparent that the performance of the models varies across the different architectures.

The simple model exhibits the lowest accuracy (66%), which is considerably lower compared to its previous evaluation on the test and validation sets. The major challenge lies in differentiating between the sunrise and cloudy classes, as a significant number of sunrise images were mistakenly classified as cloudy.

The complex model performs slightly better (75%) than the simple architecture, showing an improvement compared to their previous comparison. However, it struggles with classifying images in the "shine" category, with a substantial number of images being misclassified as "rain."

In contrast, the pretrained model achieves the best accuracy (83%) and emerges as a reliable performer in this scenario.

### Comments
Considering the custom labeling we applied to our own photos, it is evident that both the simple and complex models are sensitive to the quality of the training data. This sensitivity stems primarily from the limited size of our training dataset. Even slight variations in the images can have a significant impact on their accuracies, revealing their limitations in making precise predictions.

In contrast, the pretrained model, having been pretrained on the ImageNet dataset, showcases promising performance in our classification problem. It demonstrates robustness and appears to be less influenced by the quality of our training data.

In conclusion, for real-world applications, it would be advisable to place trust in the pretrained model while disregarding the other architectures. However, the ideal scenario would involve training the model on a more extensive dataset than the one we utilized, allowing for improved generalization and better performance across various situations.