In [1]:
import os
import cv2

# Path to the folder containing the training JPG files
# (a relative path, so it is resolved against the notebook's working directory)
folder_path = 'BTTAIxNYBG-train/BTTAIxNYBG-train'

In [2]:
import pandas as pd

# Load the training metadata CSV; per the preview below, each row holds a uniqueID,
# a class label and its numeric classID, a source code, and the image file name
df = pd.read_csv('BTTAIxNYBG-train.csv')

# Display the first few rows of the DataFrame to verify that it was loaded correctly
print(df.head())


   uniqueID           classLabel  classID source             imageFile
0         2   occluded-specimens        8      L  a1a8b48e8cb142b3.jpg
1         3    microscope-slides        6      L  79599db2ac9092b6.jpg
2         4  illustrations-color        2    BHL  c449696f2f0d0d92.jpg
3         5  illustrations-color        2      P  80a8f4a393b4e08c.jpg
4         6     animal-specimens        0     AK  041a1c6e73313638.jpg


# Data Understanding

In [None]:
# Structural overview: dimensions and per-column count of missing entries.
print("Shape of DataFrame:", df.shape)
missing_per_column = df.isnull().sum()
print("Missing values:\n", missing_per_column)

# Rows that exactly repeat an earlier row of the DataFrame.
duplicate_rows = df.loc[df.duplicated()]
if not duplicate_rows.empty:
    print("Duplicate rows:\n", duplicate_rows)
else:
    print("No duplicate rows.")

print("Summary statistics for numerical columns:\n", df.describe())

# List the distinct values of each categorical column.
categorical_columns = ['classLabel', 'source']
for column in categorical_columns:
    distinct_values = df[column].unique()
    print("Unique values in", column, ":", distinct_values)

In [None]:
import matplotlib.pyplot as plt

# Categorical columns whose class balance we want to visualise
categorical_columns = ['classLabel', 'source']

# One pie chart per categorical column
for column in categorical_columns:
    # Frequency of each category, most frequent first
    category_counts = df[column].value_counts()

    # Draw on an explicit figure/axes pair
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(
        category_counts,
        labels=category_counts.index,
        autopct='%1.1f%%',
        startangle=140,
    )
    ax.set_title("Pie chart of " + column)
    ax.axis('equal')  # Equal aspect ratio keeps the pie circular.
    plt.show()


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import random
import numpy as np

# Check if TensorFlow can see a GPU.
# tf.config.list_physical_devices is the stable name for the older
# tf.config.experimental.list_physical_devices alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Report the name of the GPU device TensorFlow would use (empty string if none is available)
print("Device:", tf.test.gpu_device_name())


Num GPUs Available:  1
Device: /device:GPU:0


# Data Preprocessing

In [None]:
# Build the training arrays X (images) and y (one-hot labels) from a random subset of df.
# Requires df (training metadata) and folder_path (training image folder) from the cells above.
training_proportion = 0.01
num_images_for_training = int(len(df) * training_proportion)

# Sample whole rows rather than bare file names so every image keeps the label from its
# own row (no per-file scan of the DataFrame), and fix the seed so the subset is
# reproducible after a kernel restart.
training_rows = df.sample(n=num_images_for_training, random_state=42)
training_filenames = training_rows['imageFile'].tolist()
y_data = training_rows['classID'].tolist()
print("Number of files for training: " + str(len(training_filenames)))

# Step 1: Data Preprocessing
# Load each image resized to 224x224 (the model's input size) and convert it to an array
image_data = []
progress_interval = 100  # report progress periodically instead of once per image
for i, filename in enumerate(training_filenames):
    image = load_img(os.path.join(folder_path, filename), target_size=(224, 224))
    image = img_to_array(image)
    image_data.append(image)
    if (i + 1) % progress_interval == 0 or (i + 1) == len(training_filenames):
        print("Processed {} out of {} images for training.".format(i+1, len(training_filenames)))
X = np.array(image_data)
X = X / 255.0  # Normalization: scale pixel values to [0, 1]

# Convert class labels to one-hot vectors.
# num_classes is pinned to the width of the model's softmax layer (Dense(10)); without it
# the width would be inferred from the largest classID that happens to be in the sample.
y = to_categorical(y_data, num_classes=10)

print("Image preprocessing completed\n")

In [None]:
# Sanity check: print the number of label rows, then the number of images.
for array in (y, X):
    print(len(array))

In [None]:
# Step 2: Splitting Data
# Hold out 20% of the samples; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print("The data has been split\n")

# Model Selection

In [5]:
# Step 3: Choosing a Model
# Small CNN built layer by layer: two convolution/pooling stages, then a dense classifier.
model = Sequential()
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)))
model.add(MaxPooling2D((2, 2)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(10, activation='softmax'))  # Assuming 10 classes

# List every layer that was added, in order.
for current_layer in model.layers:
    print(f"Layer name: {current_layer.name}, Layer: {current_layer}")
print("Model architecture chosen\n")

Layer name: conv2d, Layer: <keras.layers.convolutional.conv2d.Conv2D object at 0x00000215BAEDBD00>
Layer name: max_pooling2d, Layer: <keras.layers.pooling.max_pooling2d.MaxPooling2D object at 0x00000215BAEDB850>
Layer name: conv2d_1, Layer: <keras.layers.convolutional.conv2d.Conv2D object at 0x000002158610D370>
Layer name: max_pooling2d_1, Layer: <keras.layers.pooling.max_pooling2d.MaxPooling2D object at 0x00000215E4253880>
Layer name: flatten, Layer: <keras.layers.reshaping.flatten.Flatten object at 0x000002158610D190>
Layer name: dense, Layer: <keras.layers.core.dense.Dense object at 0x000002158610DB20>
Layer name: dense_1, Layer: <keras.layers.core.dense.Dense object at 0x000002158610DBE0>
Model architecture chosen



In [12]:
# Print each layer's type, output shape and parameter count
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_2 (Conv2D)           (None, 222, 222, 32)      896       
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 111, 111, 32)     0         
 2D)                                                             
                                                                 
 conv2d_3 (Conv2D)           (None, 109, 109, 64)      18496     
                                                                 
 max_pooling2d_3 (MaxPooling  (None, 54, 54, 64)       0         
 2D)                                                             
                                                                 
 flatten_1 (Flatten)         (None, 186624)            0         
                                                                 
 dense_2 (Dense)             (None, 64)               

# Model Training

Only run this section if the TensorFlow model needs to be trained and cannot be loaded from a checkpoint.

In [None]:
# Step 4: Model Training
checkpoint_path = "training_1/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that saves only the model weights to checkpoint_path while fitting.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

# One-hot labels, so categorical cross-entropy is the matching loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=2,
    validation_data=(X_test, y_test),
    callbacks=[cp_callback],
)

# Also persist the complete trained model under "my_model".
model.save("my_model")


In [None]:
# Step 5: Model Evaluation
# evaluate() yields the loss followed by the compiled metric (accuracy).
evaluation_results = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation_results
print("Test Accuracy:", test_accuracy)

# Load Model from Checkpoint

In [6]:
# Same checkpoint location the training cell writes to; redefined here so the weights
# can be loaded without running the training cell first.
checkpoint_path = "training_1/cp.ckpt"
# Directory part of the path ("training_1"), used to look up the latest checkpoint.
checkpoint_dir = os.path.dirname(checkpoint_path)

In [7]:
# Find the most recent checkpoint saved in checkpoint_dir.
# NOTE(review): tf.train.latest_checkpoint is documented to return None when no checkpoint is found.
latest = tf.train.latest_checkpoint(checkpoint_dir)

In [8]:
# Load the previously saved weights into the model defined above.
# `latest` is None when no checkpoint was found; fail with an actionable message here
# instead of handing None to load_weights.
if latest is None:
    raise FileNotFoundError(
        f"No checkpoint found in '{checkpoint_dir}'. Run the Model Training section first."
    )
model.load_weights(latest)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x215bb1026d0>

# Final Test Set Evaluation and Writing to File

Generate predictions for the test set and write them to CSV files so they can be uploaded to Kaggle.

In [9]:
import csv

# Read the test metadata: the image file names to predict and the uniqueID of each row.
df_2 = pd.read_csv('BTTAIxNYBG-test.csv')
testing_filenames = df_2['imageFile'].tolist()
unique_IDs = df_2['uniqueID'].tolist()
total_samples = len(testing_filenames)

# The test set is processed in `subset_proportion` consecutive chunks so that only one
# chunk of decoded images is held in memory at a time. Each chunk gets its own CSV file.
subset_proportion = 15
subset_size = total_samples // subset_proportion
print(len(testing_filenames))
batch_size = 8
my_file_endings ="abcdefghijklmnop"  # one letter per chunk, used in the output file name

for i in range(0, subset_proportion):
    start_idx = i * subset_size
    # The last chunk runs to the end of the list so the remainder left by the integer
    # division above is predicted too instead of being silently dropped.
    end_idx = total_samples if i == subset_proportion - 1 else (i + 1) * subset_size
    data_subset = testing_filenames[start_idx:end_idx]
    print("Temp shape of data_subset:" , len(data_subset))
    print(f"Processed subset {i+1}: {len(data_subset)}")
    if not data_subset:
        # Happens only when there are fewer images than chunks; nothing to predict.
        continue

    # Load every image of the chunk resized to the model's 224x224 input size.
    image_data = []
    for j, filename in enumerate(data_subset):
        image = load_img('BTTAIxNYBG-test/BTTAIxNYBG-test/' + filename, target_size=(224, 224))
        image = img_to_array(image)
        image_data.append(image)
    print("Length of image data: ", len(image_data))

    # Scale pixels by 1/255 exactly as the training images were scaled; the model was fit
    # on normalized inputs, so raw 0-255 values would give meaningless predictions.
    image_data = np.array(image_data) / 255.0

    test_dataset = tf.data.Dataset.from_tensor_slices(image_data)
    test_dataset = test_dataset.batch(batch_size)
    print("Size of test_dataset:", tf.data.experimental.cardinality(test_dataset).numpy())

    # Predicted class = index of the largest softmax output for each image.
    predictions = model.predict(test_dataset)
    class_labels = tf.argmax(predictions, axis=1).numpy().astype(int)
    combined_data = zip(unique_IDs[start_idx:end_idx], class_labels)

    # Write (uniqueID, predicted class) rows for this chunk, without a header row.
    with open(f"data_{my_file_endings[i]}.csv", 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerows(combined_data)
print("Final shape of X_test:" , len(testing_filenames))

30690
Temp shape of data_subset: 2046
Processed subset 1: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 2: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 3: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 4: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 5: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 6: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 7: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 8: 2046
Length of image data:  2046
Size of test_dataset: 256
Temp shape of data_subset: 2046
Processed subset 9: 2046
Length of image data:  2046
Size of test_dataset:

In [None]:
# # Initialize counters
# grayscale_count = 0
# colorful_count = 0

# # List all files in the folder
# files = os.listdir(folder_path)

# # Iterate over each file
# for file in files:
#     # Check if the file is a JPG file
#     if file.endswith('.jpg'):
#         # Construct the full path to the image file
#         image_path = os.path.join(folder_path, file)
        
#         # Read the image using OpenCV
#         image = cv2.imread(image_path)
        
#         # Check if the image was successfully read
#         if image is not None:
#             # Check if the image is grayscale
#             if len(image.shape) < 3:
#                 grayscale_count += 1
#             else:
#                 colorful_count += 1
                
#         else:
#             print(f"Error reading image '{file}'")

# # Output the results
# print("Analysis Results:")
# print(f"Total images: {len(files)}")
# print(f"Grayscale images: {grayscale_count}")
# print(f"Colorful images: {colorful_count}")