In [6]:
import tensorflow as tf
print("GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

GPUs Available:  1


In [1]:
import tensorflow as tf
# from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Conv2D
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras import mixed_precision
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, average_precision_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
# from tensorflow.keras.mixed_precision import mixed_precision  
from tensorflow.keras.backend import clear_session 

# Enable mixed precision training
mixed_precision.set_global_policy('mixed_float16')
clear_session()

physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

# Dataset path
dataset_path = '/app/data/Datasets/Trashnet-resized'
dataset_name = os.path.basename(dataset_path[18:])
# Parameters
img_shape = (160, 160, 3)
batch_size = 2
num_classes = 6
epochz = 30

# Data generators
datagen = ImageDataGenerator(rescale=1./255, validation_split=0.2)

train_generator = datagen.flow_from_directory(
    dataset_path,
    target_size=img_shape[:2],
    batch_size=batch_size,
    class_mode='categorical',
    subset='training'
)

validation_generator = datagen.flow_from_directory(
    dataset_path,
    target_size=img_shape[:2],
    batch_size=batch_size,
    class_mode='categorical',
    subset='validation'
)

test_generator = ImageDataGenerator(rescale=1./255).flow_from_directory(
    dataset_path,
    target_size=img_shape[:2],
    batch_size=batch_size,
    class_mode='categorical'
)

# Model
def create_model():
    base_model = tf.keras.applications.MobileNetV2(input_shape=img_shape, include_top=False, weights='imagenet')
    base_model.trainable = False
    model_name = base_model.name

    model = Sequential([
        base_model,
        # Conv2D(32, (3, 3), activation='relu', input_shape=img_shape),
        GlobalAveragePooling2D(),
        Dense(64, activation="relu"),
        Dense(num_classes, activation="softmax")
    ])
    return model, model_name

model, model_name = create_model()
# Compile model
optimizer = Adam(learning_rate=0.00001)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Callbacks
lr_reduction = ReduceLROnPlateau(monitor='val_loss', patience=2, factor=0.5, verbose=1, min_lr=0.00001)
early_stopping = EarlyStopping(patience=5, restore_best_weights=True)

# Train model
try:
    history = model.fit(
        train_generator,
        steps_per_epoch=train_generator.samples // batch_size,
        validation_data=validation_generator,
        validation_steps=validation_generator.samples // batch_size,
        epochs=epochz,
        callbacks=[lr_reduction, early_stopping],
        verbose=1
    )   
except Exception as e:
    print(f"An error occurred during training: {str(e)}")
    # If an error occurs, try recreating and recompiling the model
    tf.keras.backend.clear_session()
    model = create_model()
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    
    # Try training again
    history = model.fit(
        train_generator,
        steps_per_epoch=train_generator.samples // batch_size,
        validation_data=validation_generator,
        validation_steps=validation_generator.samples // batch_size,
        epochs=epochz,
        callbacks=[lr_reduction, early_stopping],
        verbose=1
    )

# Evaluate model
loss, accuracy = model.evaluate(test_generator, verbose=False)
y_pred = model.predict(test_generator)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = test_generator.classes

# Metrics
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred_classes, average='weighted')
mAP = average_precision_score(tf.keras.utils.to_categorical(y_true), y_pred, average='weighted')

# Confusion matrix
cm = confusion_matrix(y_true, y_pred_classes)
cm_percent = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] * 100

# Plot results
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.savefig(f'/app/data/graphs/multi2_{dataset_name}_{epochz}epoch_batch{batch_size}_{model_name}_training_history.png')
plt.close()

plt.figure(figsize=(10, 8))
sns.heatmap(cm_percent, annot=True, fmt='.2f', cmap='Blues')
plt.title('Confusion Matrix (%)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig(f'/app/data/graphs/multi2_{dataset_name}_{epochz}epoch_batch{batch_size}_{model_name}_confusion_matrix.png')
plt.close()

# Save results
with open('/app/data/results.txt', 'w') as f:
    f.write(f"Test Accuracy: {accuracy:.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall: {recall:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write(f"mAP: {mAP:.4f}\n")

# Save model
model.save("/app/data/trashnet_model.keras")

print("Training and evaluation complete!")

2024-10-10 20:39:48.705207: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-10 20:39:48.715338: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-10 20:39:48.726282: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-10 20:39:48.729663: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-10 20:39:48.738243: I tensorflow/core/platform/cpu_feature_guar

Found 2024 images belonging to 6 classes.
Found 503 images belonging to 6 classes.
Found 2527 images belonging to 6 classes.


I0000 00:00:1728592790.432417   21585 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1728592790.436339   21585 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1728592790.436465   21585 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1728592790.500264   21585 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Epoch 1/30


  self._warn_if_super_not_called()
I0000 00:00:1728592793.358570   21693 service.cc:146] XLA service 0x7a6b44010a60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728592793.358632   21693 service.cc:154]   StreamExecutor device (0): NVIDIA GeForce GTX 1050 Ti, Compute Capability 6.1
2024-10-10 20:39:53.434905: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-10-10 20:39:53.841458: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8906
2024-10-10 20:39:53.964208: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 29.00MiB (30408704 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-10-10 20:39:53.964267: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 26.10MiB (27367936 bytes) from device: CUDA_ERROR_OUT

[1m  13/1012[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8s[0m 9ms/step - accuracy: 0.1855 - loss: 2.0607      

I0000 00:00:1728592795.554861   21693 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m1012/1012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 7ms/step - accuracy: 0.2471 - loss: 1.8137 - val_accuracy: 0.3805 - val_loss: 1.5471 - learning_rate: 1.0000e-05
Epoch 2/30


2024-10-10 20:40:02.272231: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-10-10 20:40:02.272311: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
  self.gen.throw(typ, value, traceback)
2024-10-10 20:40:02.566984: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 25.63MiB (26873600 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-10-10 20:40:02.567030: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 23.07MiB (24186368 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: out of memory
2024-10-10 20:40:02.567053: I external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:1578] failed to allocate 20.76MiB (21767936 bytes) from device: CUDA_ERROR_OUT_OF_MEMORY: ou

An error occurred during training: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/lib/python3.11/runpy.py", line 198, in _run_module_as_main

  File "/usr/lib/python3.11/runpy.py", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/ipykernel_launcher.py", line 18, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 739, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 604, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 80, in _run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  Fil

AttributeError: 'tuple' object has no attribute 'compile'