In [6]:
# Basic
import os
from os import makedirs
from os import listdir
from shutil import copyfile
from random import seed
from random import random
import numpy as np
import pandas as pd
import zipfile

# visuals
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.image import imread
from PIL import Image

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay

# Tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense,MaxPooling2D,Dropout,Flatten,BatchNormalization,Conv2D
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping

datasetzipfile = "datasets.zip"

# Check if the zip file exists
if os.path.exists(datasetzipfile):
    # Open the zip file in read mode
    with zipfile.ZipFile(datasetzipfile, 'r') as zip_ref:
        # Iterate through each file in the zip archive
        for file_info in zip_ref.infolist():
            # Check if the file already exists in the current directory
            if not os.path.exists(file_info.filename):
                zip_ref.extract(file_info)
else:
    print(f"{datasetzipfile} does not exist.")

train_path_dog = "datasets/train/dog"
train_path_cat = "datasets/train/cat"
valid_path_dog = "datasets/val/dog"
valid_path_cat = "datasets/val/cat"

test_path = "datasets/test"

#Count the Data Provided
def count_files_in_directory(path):
    try:
        # List all entries in the specified directory
        all_entries = os.listdir(path)
        
        # Count only the files (exclude directories)
        total_files = sum(1 for entry in all_entries if os.path.isfile(os.path.join(path, entry)))
        
        return total_files
    except Exception as e:
        return str(e)

# Specify the directory path
print("Total number of dog images in training data :", count_files_in_directory(train_path_dog))
print("Total number of cat images in training data :", count_files_in_directory(train_path_cat))
print()
print("Total number of dog images in validation data :", count_files_in_directory(valid_path_dog))
print("Total number of cat images in validation data :", count_files_in_directory(valid_path_cat))
print()
print("Total number of unknown images in test data :", count_files_in_directory(test_path))

Total number of dog images in training data : 10000
Total number of cat images in training data : 10000

Total number of dog images in validation data : 2500
Total number of cat images in validation data : 2500

Total number of unknown images in test data : 500


In [7]:
# Define your image size and batch size
image_size = 128  # or the size your model expects
image_channel = 3
bat_size = 64 # batch size of your choice

# Import the ImageDataGenerator class
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Create an ImageDataGenerator object for the validation set
validation_datagen = ImageDataGenerator(rescale=1. / 255)
# Create an ImageDataGenerator object for the test set
test_datagen = ImageDataGenerator(rescale=1. / 255)
# Create an ImageDataGenerator object for the training set

train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    rotation_range=15,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

train_generator = train_datagen.flow_from_directory("datasets/train", 
                                                    target_size = (image_size, image_size),
                                                    classes=['Cat' , 'Dog'],
                                                    class_mode='binary',
                                                    batch_size=bat_size,
                                                    #save_to_dir=aug_data_path,
                                                    #save_prefix='aug_',
                                                    #save_format="jpg",
                                                    seed = 1
                                                    
                                                    )
print(train_generator.class_indices)
print(train_generator.num_classes)
print(train_generator.samples)

Found 20000 images belonging to 2 classes.
{'Cat': 0, 'Dog': 1}
2
20000


In [8]:
validation_generator = validation_datagen.flow_from_directory("datasets/val", 
                                                    target_size = (image_size, image_size),
                                                    classes=['Cat' , 'Dog'],
                                                    class_mode='binary',
                                                    batch_size=bat_size,
                                                    seed = 1 )
print(validation_generator.class_indices)
print(validation_generator.num_classes)
print(validation_generator.samples)

Found 5000 images belonging to 2 classes.
{'Cat': 0, 'Dog': 1}
2
5000


In [9]:
# Get a list of all image filenames in the test directory
filenames = os.listdir(test_path)
filepaths = [os.path.join(test_path, fname) for fname in filenames]

# Check for invalid files by comparing filenames
valid_filenames = os.listdir(test_path)

# Create a DataFrame with filenames (no need for labels)
df_test = pd.DataFrame({
    'filename': filenames
})

# Filter out filenames that don't exist in the directory
invalid_filenames = df_test[~df_test['filename'].isin(valid_filenames)]

# Print invalid filenames
if len(invalid_filenames) > 0:
    print("Invalid filenames:")
    print(invalid_filenames['filename'])
else:
    print("All filenames are valid.")

test_generator = test_datagen.flow_from_dataframe(
    dataframe=df_test,
    directory=test_path,          # Path to the directory with images
    x_col='filename',             # Column containing the filenames
    y_col=None,                   # No labels for prediction
    target_size= (image_size, image_size),         # Resize the images
    batch_size=bat_size,
    class_mode=None,              # No labels, since we're predicting
    shuffle=False,                # Keep the order of files
    seed=1
)

All filenames are valid.
Found 500 validated image filenames.


In [10]:
# Example architecture adjustment
model = Sequential()

# Add convolutional layers (your current structure)
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

# Flatten the output
model.add(Flatten())

# Adjust this layer's input size based on the flattened output
model.add(Dense(9216, activation='relu'))  # Match this number to the output of the flatten layer

# Add more dense layers if needed
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # For binary classification (cats vs dogs)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [16]:
learning_rate_reduction = ReduceLROnPlateau(monitor = 'val_accuracy',
                                            patience=2,
                                            factor=0.5,
                                            min_lr = 0.00001,
                                            verbose = 1)

early_stoping = EarlyStopping(monitor='val_loss',patience= 3,restore_best_weights=True,verbose=0)

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

cat_dog = model.fit(train_generator,
                    validation_data = validation_generator, 
                    callbacks=[early_stoping,learning_rate_reduction],
                    epochs = 15,
                    #steps_per_epoch = len(train_generator),
                    #validation_steps = len(validation_generator),
                    steps_per_epoch = 10, #312
                    validation_steps = 10, #78
                   )

  self._warn_if_super_not_called()


Epoch 1/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 14s/step - accuracy: 0.4856 - loss: 2.7461 - val_accuracy: 0.4938 - val_loss: 0.6934 - learning_rate: 0.0010
Epoch 2/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 18s/step - accuracy: 0.5283 - loss: 0.6940 - val_accuracy: 0.4766 - val_loss: 0.6934 - learning_rate: 0.0010
Epoch 3/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 13s/step - accuracy: 0.4934 - loss: 0.6937 - val_accuracy: 0.5219 - val_loss: 0.6930 - learning_rate: 0.0010
Epoch 4/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 13s/step - accuracy: 0.4801 - loss: 0.6932 - val_accuracy: 0.5109 - val_loss: 0.6931 - learning_rate: 0.0010
Epoch 5/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - accuracy: 0.4856 - loss: 0.6931 
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m167s