# Import Libraries

In [None]:
# Essential Libraries
import os  # File management
import zipfile  # Handling compressed files
from datetime import datetime  # Timestamping

# Data Processing
import pandas as pd
import numpy as np
import random

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Deep Learning (TensorFlow/Keras)
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import RandomFlip, RandomRotation, RandomZoom

# Serialisation
import pickle # Saving/loading Python objects
import joblib

# Seed every random number generator used in this notebook (NumPy, TensorFlow
# and Python's built-in `random`) with one shared value for reproducibility.
SEED = 42
for _seed_rng in (np.random.seed, tf.random.set_seed, random.seed):
    _seed_rng(SEED)

# Mount Google Drive

In [None]:
from google.colab import drive

# The dataset archive, the metadata CSV and every saved artefact in this
# notebook are addressed through paths below this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

# Extract Images

In [None]:
# Zipped training images on the mounted Drive, and the local directory the
# archive is unpacked into.
zip_file_path = '/content/drive/MyDrive/packed_fruits/augmented_images_train.zip'
extract_to_temp_dir = '/content/augmented_images_train/'

# Create the target directory up front; exist_ok keeps re-runs from failing.
os.makedirs(extract_to_temp_dir, exist_ok=True)

# Unpack the whole archive; the context manager closes the zip file afterwards.
with zipfile.ZipFile(zip_file_path) as archive:
    archive.extractall(path=extract_to_temp_dir)

print(f"Files extracted temporarily to: {extract_to_temp_dir}")

# Import Dataframe

In [None]:
def _resolve_image_path(row):
    """Return the extracted-image path for one metadata row.

    Rows whose ``subset`` equals ``'test'`` point into the test image
    directory; every other row points into the training image directory.
    The path is ``<directory>/<variety>/<file_name>``.
    """
    if row['subset'] == 'test':
        return f"/content/augmented_images_test/{row['variety']}/{row['file_name']}"
    return f"/content/augmented_images_train/{row['variety']}/{row['file_name']}"


# Load the image metadata table from Drive.
df = pd.read_csv('/content/drive/MyDrive/packed_fruits/20250301_df_19_clean_filtered.csv')

# Attach the on-disk location of each image.
df['file_path'] = df.apply(_resolve_image_path, axis=1)

# These two columns are not used further on.
df = df.drop(['simp_amount', 'rank'], axis=1)

## Extract train and validation sets

In [None]:
# Keep only the columns used downstream and expose the adjusted weight under
# the shorter name "weight".
model_columns = ['file_path', 'label', 'packed', 'crowd', 'amount', 'adjusted_weight', 'subset']
df = df[model_columns].rename(columns={'adjusted_weight': 'weight'})

# Split on the predefined "subset" column, which is dropped once it has
# served its purpose.
is_train = df['subset'] == 'train'
is_validation = df['subset'] == 'validation'
df_train = df[is_train].drop(columns='subset')
df_val = df[is_validation].drop(columns='subset')

# Independent working copies, so later steps do not touch df_train / df_val.
df_train_2 = df_train.copy()
df_val_2 = df_val.copy()

df_train.head(2)


# Select a Random Sample of the Data

In [None]:
# Before scaling, optionally sub-sample the data to reduce processing time.
# A fraction of 1 keeps every row, so sampling only shuffles the row order.
sample_fraction = 1

# Pin random_state to the notebook-wide SEED. Without it the draw depends on
# how far the global NumPy RNG has advanced, so re-running this cell (or
# running cells in a different order) would give a different sample/order.
df_train_2_sampled = df_train_2.sample(frac=sample_fraction, random_state=SEED)
df_val_2_sampled = df_val_2.sample(frac=sample_fraction, random_state=SEED)

# Working frames used by the scaling and dataset-building steps.
train_df = df_train_2_sampled.copy()
val_df = df_val_2_sampled.copy()

# Scaling the Numerical Data

In [None]:
# Mean training weight on the original (unscaled) scale. It is stored so the
# model can be tested *without* structured data (image only): StandardScaler
# centres the data, so feeding mean_weight is equivalent to inputting 0.
mean_weight = train_df["weight"].mean()

# Directory on Drive that receives the mean weight and the fitted scalers.
save_path = "/content/drive/MyDrive/packed_fruits/models_keras/scalars/"
mean_weight_filename = "mean_weight.pkl"
full_save_path = os.path.join(save_path, mean_weight_filename)
os.makedirs(save_path, exist_ok=True)

# Persist the mean weight for deployment.
with open(full_save_path, "wb") as mean_weight_file:
    pickle.dump(mean_weight, mean_weight_file)

print(f"Mean weight saved at: {full_save_path}")
print(f"Mean weight value: {mean_weight}")

# Fit one scaler per numeric column on the training data only, so no
# validation statistics leak into the scaling parameters.
weight_scaler = StandardScaler().fit(train_df[["weight"]])
amount_scaler = StandardScaler().fit(train_df[["amount"]])

# Apply the training-set statistics to both splits.
train_weights_scaled = weight_scaler.transform(train_df[["weight"]])
val_weights_scaled = weight_scaler.transform(val_df[["weight"]])

train_amounts_scaled = amount_scaler.transform(train_df[["amount"]])
val_amounts_scaled = amount_scaler.transform(val_df[["amount"]])

# Persist the fitted scalers next to the mean weight for later use (deployment).
weight_scaler_filename = os.path.join(save_path, "weight_scaler.pkl")
amount_scaler_filename = os.path.join(save_path, "amount_scaler.pkl")

joblib.dump(weight_scaler, weight_scaler_filename)
joblib.dump(amount_scaler, amount_scaler_filename)

print(f"Weight scaler saved at: {weight_scaler_filename}")
print(f"Amount scaler saved at: {amount_scaler_filename}")

# Create TensorFlow Dataset

In [None]:
def _column_values(frame, column, dtype):
    """Return ``frame[column]`` as a NumPy array cast to ``dtype``."""
    return frame[column].values.astype(dtype)


# Numeric inputs/targets are cast explicitly: float32 for everything the model
# treats as a float, int64 for the sparse class labels.

# Image file paths
train_image_paths = train_df["file_path"].values
val_image_paths = val_df["file_path"].values

# Multi-class labels
train_labels = _column_values(train_df, "label", "int64")
val_labels = _column_values(val_df, "label", "int64")

# Regression target (fruit count), unscaled
train_amounts = _column_values(train_df, "amount", "float32")
val_amounts = _column_values(val_df, "amount", "float32")

# Binary classification targets
train_packed = _column_values(train_df, "packed", "float32")
val_packed = _column_values(val_df, "packed", "float32")

train_crowd = _column_values(train_df, "crowd", "float32")
val_crowd = _column_values(val_df, "crowd", "float32")

# Function to Load and Preprocess Images


In [None]:
def load_image(image_path):
    """Read a JPEG file and return it as a float32 tensor scaled to [0, 1].

    The file is decoded with three colour channels. No resizing happens here;
    insert ``tf.image.resize(pixels, (224, 224))`` before the cast if the
    files are not already 224x224.
    """
    encoded = tf.io.read_file(image_path)
    pixels = tf.image.decode_jpeg(encoded, channels=3)
    # uint8 values 0..255 -> float32 values 0..1
    return tf.cast(pixels, tf.float32) / 255.0

def load_data(image_path, weight, label, amount, packed, crowd):
    """Turn one dataset row into the ``(inputs, targets)`` pair the model expects.

    Returns two dicts. The first holds the inputs ``image_input`` (the image
    loaded via ``load_image``) and ``structured_input`` (``weight`` as
    float32). The second holds the targets ``fruit_class`` (int64, as needed
    for sparse categorical cross-entropy), ``fruit_count``, ``bagged`` and
    ``crowded`` (all float32).
    """
    features = {
        "image_input": load_image(image_path),
        "structured_input": tf.convert_to_tensor(weight, dtype=tf.float32),
    }
    targets = {
        "fruit_class": tf.convert_to_tensor(label, dtype=tf.int64),
        "fruit_count": tf.convert_to_tensor(amount, dtype=tf.float32),
        "bagged": tf.convert_to_tensor(packed, dtype=tf.float32),
        "crowded": tf.convert_to_tensor(crowd, dtype=tf.float32),
    }
    return features, targets

# Number of examples per batch, shared by both pipelines.
BATCH_SIZE = 32

# Create train dataset.
# Each element starts as a cheap tuple of (path, scaled weight, label, scaled
# amount, packed flag, crowd flag); the image is only read inside load_data.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (train_image_paths, train_weights_scaled.astype(np.float32), train_labels, train_amounts_scaled.astype(np.float32), train_packed, train_crowd)
)
train_dataset = (
    train_dataset
    # Shuffle BEFORE decoding: the buffer then holds paths and scalars rather
    # than decoded images, so it can span the whole training set (a uniform
    # shuffle every epoch) at negligible memory cost. max(..., 1) keeps the
    # buffer size valid even if the training set is empty.
    .shuffle(max(len(train_image_paths), 1))
    # Read and decode several images concurrently instead of one at a time.
    .map(load_data, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

# Create validation dataset (no shuffling; the parallel map keeps element order).
val_dataset = tf.data.Dataset.from_tensor_slices(
    (val_image_paths, val_weights_scaled.astype(np.float32), val_labels, val_amounts_scaled.astype(np.float32), val_packed, val_crowd)
)
val_dataset = (
    val_dataset
    .map(load_data, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

print("TensorFlow Datasets Created Successfully!")

# Model

In [None]:
# Shape of the images fed to the network and number of fruit classes.
IMAGE_SHAPE = (224, 224, 3)
NUM_FRUIT_CLASSES = 19

# Data Augmentation Layer (these layers are only active during training)
data_augmentation = keras.Sequential([
    RandomFlip("horizontal"),  # Randomly mirror images left/right
    RandomRotation(0.2),       # Factor is a fraction of a full turn: up to +/-0.2 * 360 = 72 degrees
    RandomZoom(0.1)            # Zoom in/out by up to 10%
], name="data_augmentation")

# Image Input (pixel values in [0, 1], as produced by load_image)
image_input = keras.Input(shape=IMAGE_SHAPE, name="image_input")
x = data_augmentation(image_input)  # Augment images before passing to CNN

# Keras' EfficientNet does its own input rescaling/normalisation and expects
# pixel values in [0, 255]; undo the [0, 1] scaling applied when loading.
x = layers.Rescaling(255.0, name="restore_pixel_range")(x)

# Pretrained CNN Backbone (EfficientNetB0).
# The backbone is *called on* the augmented tensor. Building it directly on
# image_input and reading base_model.output (as was done before) silently
# bypassed the augmentation layers.
base_model = keras.applications.EfficientNetB0(include_top=False, input_shape=IMAGE_SHAPE, weights='imagenet')
x = base_model(x)

# Feature Extraction
x = layers.GlobalAveragePooling2D()(x)
x = layers.BatchNormalization()(x)  # Normalize activations
x = layers.Dense(256, activation="relu")(x)
x = layers.Dropout(0.3)(x)  # Reduce overfitting
x = layers.Dense(128, activation="relu")(x)

# Structured Input (scaled weight, one value per example)
structured_input = keras.Input(shape=(1,), name="structured_input")
y = layers.Dense(16, activation="relu")(structured_input)
y = layers.Dense(8, activation="relu")(y)

# Merge Image and Structured Features
merged = layers.Concatenate()([x, y])

# Output Layers (one head per task; names match the target keys of the dataset)
outputs = {
    "fruit_class": layers.Dense(NUM_FRUIT_CLASSES, activation="softmax", name="fruit_class")(merged),  # Multi-class classification
    "fruit_count": layers.Dense(1, activation="linear", name="fruit_count")(merged),  # Regression
    "bagged": layers.Dense(1, activation="sigmoid", name="bagged")(merged),  # Binary classification
    "crowded": layers.Dense(1, activation="sigmoid", name="crowded")(merged),  # Binary classification
}

# Define Model
model = keras.Model(inputs=[image_input, structured_input],
                    outputs=outputs)

# Compile Model
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss={
        "fruit_class": "sparse_categorical_crossentropy",  # Classification
        "fruit_count": tf.keras.losses.Huber(delta=1.0),  # Huber Loss for regression
        "bagged": "binary_crossentropy",  # Binary classification
        "crowded": "binary_crossentropy",  # Binary classification
    },
    metrics={
        "fruit_class": "accuracy",
        "fruit_count": "mae",
        "bagged": "accuracy",
        "crowded": "accuracy"
    }
)

# Model Summary
model.summary()

In [None]:
# Timestamp that makes the checkpoint filename unique per run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Checkpoints are written to this Drive directory (created if missing).
save_path = "/content/drive/MyDrive/packed_fruits/"
os.makedirs(save_path, exist_ok=True)
checkpoint_file = os.path.join(save_path, f"best_model_{timestamp}.keras")

# All three callbacks watch the same quantity.
monitored_metric = "val_loss"

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor=monitored_metric,
    mode="min",
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.1 after 5 stagnant epochs, never going below 1e-6.
reduce_lr = ReduceLROnPlateau(
    monitor=monitored_metric,
    factor=0.1,
    patience=5,
    min_lr=1e-6,
    verbose=1,
)

# Write a checkpoint only when the monitored value improves.
model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_file,
    monitor=monitored_metric,
    save_best_only=True,
    verbose=1,
)

# Callback list handed to model.fit
callbacks = [early_stopping, reduce_lr, model_checkpoint]

In [None]:
# Upper bound on the number of epochs; the early-stopping callback may end
# training sooner.
max_epochs = 70

# Fit on the training pipeline, evaluating on the validation pipeline after
# every epoch. The returned History object is kept for later saving.
history = model.fit(
    train_dataset,
    epochs=max_epochs,
    validation_data=val_dataset,
    callbacks=callbacks,
    verbose=1,
)

print("Model training complete!")

In [None]:
# Fresh timestamp so the final model and its history get unique filenames.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Destination of the trained model.
model_path = f"/content/drive/MyDrive/packed_fruits/model_{timestamp}.keras"

# Destination of the pickled training history (directory created if missing).
history_dir = "/content/drive/MyDrive/packed_fruits/training_history/"
history_filename = f"training_history_{timestamp}.pkl"
history_path = os.path.join(history_dir, history_filename)
os.makedirs(history_dir, exist_ok=True)

# Save the trained model.
model.save(model_path)
print(f"Model saved to: {model_path}")

# Only the plain history dict is pickled, not the History callback object.
with open(history_path, "wb") as history_file:
    pickle.dump(history.history, history_file)

print(f"Training history saved to: {history_path}")