In [None]:
!pip install tensorflow
!pip install numpy
!pip install matplotlib



In [None]:
# Mount Google Drive at /content/drive. Later cells read the dataset archive
# from Drive and write every intermediate artifact back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Mount Google Drive and Extract Dataset

In [None]:
import os
import zipfile

# Define paths
drive_path = "/content/drive/My Drive"  # Adjust to your Google Drive path
zip_path = os.path.join(drive_path, "OCT2017.zip")  # Adjust if your file is named differently
extract_to = "/content"  # Path where data will be extracted

# The archive unpacks into an "OCT2017" folder (the generator cell reads
# <extract_to>/OCT2017/train), so that directory -- not <extract_to>/train --
# is the marker that the data is already on disk. Checking the wrong folder
# would re-extract the whole archive on every run.
extracted_train_dir = os.path.join(extract_to, "OCT2017", "train")

# Extract the dataset if not already extracted
if not os.path.isdir(extracted_train_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print("Dataset extracted!")
else:
    print("Dataset already extracted.")


Mounted at /content/drive
Dataset extracted!


Import Necessary Libraries

In [None]:
# Import required libraries
import numpy as np
import os
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.utils import shuffle

Define Image Preprocessing and Data Generators

In [None]:
# Preprocessing settings shared by every split
img_height, img_width = 224, 224  # Every image is resized to height x width
batch_size = 32  # Images per batch; lower it if memory is tight

# One generator configuration serves train, val and test. Its only
# transformation is multiplying pixel values by 1/255.
datagen = ImageDataGenerator(rescale=1.0/255.0)

# Split directories inside the extracted archive
train_path = os.path.join(extract_to, "OCT2017/train")
val_path = os.path.join(extract_to, "OCT2017/val")
test_path = os.path.join(extract_to, "OCT2017/test")

# Arguments common to the three flow_from_directory calls below
_flow_kwargs = dict(
    target_size=(img_height, img_width),
    batch_size=batch_size,
    class_mode='categorical',
)

# Only the training split is shuffled; val and test keep directory order.
train_generator = datagen.flow_from_directory(train_path, shuffle=True, **_flow_kwargs)

val_generator = datagen.flow_from_directory(val_path, shuffle=False, **_flow_kwargs)

test_generator = datagen.flow_from_directory(test_path, shuffle=False, **_flow_kwargs)

Found 83484 images belonging to 4 classes.
Found 32 images belonging to 4 classes.
Found 968 images belonging to 4 classes.


Save Data in .npz Format

In [None]:
import numpy as np
import os

# Function to save batches to .npz without loading all batches into memory
def save_batches_to_npz(generator, save_path, dataset_name):
    """Write every batch of ``generator`` to its own compressed ``.npz`` file.

    Batch ``i`` is stored as ``<save_path>/<dataset_name>_batch_<i>.npz`` with the
    images under key ``X`` and the labels under key ``y``. Batches are fetched
    and written one at a time, so only a single batch is held in memory.

    Args:
        generator: Indexable batch source supporting ``len(generator)`` and
            ``generator[i] -> (images, labels)``.
        save_path: Directory that receives the files; created if missing.
        dataset_name: Prefix used in the file names (e.g. ``"train"``).
    """
    # No aggregate "<dataset_name>_data.npz" is opened here: nothing was ever
    # written to it, so it only left an empty, unreadable file behind.
    os.makedirs(save_path, exist_ok=True)

    num_batches = len(generator)  # evaluated once instead of on every iteration
    for i in range(num_batches):
        # Get a batch of data
        images, labels = generator[i]

        # Store the current batch in its own file
        np.savez_compressed(os.path.join(save_path, f"{dataset_name}_batch_{i}.npz"), X=images, y=labels)

        # Print progress
        print(f"Saved batch {i + 1}/{num_batches}")

    print(f"Saved {dataset_name} data successfully")

# Destination folder on Drive for the per-batch .npz files
save_path = "/content/drive/My Drive/preprocessed_data"
os.makedirs(save_path, exist_ok=True)

# Export each split in turn: train, then val, then test
for split_name, split_generator in (
    ("train", train_generator),
    ("val", val_generator),
    ("test", test_generator),
):
    save_batches_to_npz(split_generator, save_path, split_name)

Saved batch 1/2609
Saved batch 2/2609
Saved batch 3/2609
Saved batch 4/2609
Saved batch 5/2609
Saved batch 6/2609
Saved batch 7/2609
Saved batch 8/2609
Saved batch 9/2609
Saved batch 10/2609
Saved batch 11/2609
Saved batch 12/2609
Saved batch 13/2609
Saved batch 14/2609
Saved batch 15/2609
Saved batch 16/2609
Saved batch 17/2609
Saved batch 18/2609
Saved batch 19/2609
Saved batch 20/2609
Saved batch 21/2609
Saved batch 22/2609
Saved batch 23/2609
Saved batch 24/2609
Saved batch 25/2609
Saved batch 26/2609
Saved batch 27/2609
Saved batch 28/2609
Saved batch 29/2609
Saved batch 30/2609
Saved batch 31/2609
Saved batch 32/2609
Saved batch 33/2609
Saved batch 34/2609
Saved batch 35/2609
Saved batch 36/2609
Saved batch 37/2609
Saved batch 38/2609
Saved batch 39/2609
Saved batch 40/2609
Saved batch 41/2609
Saved batch 42/2609
Saved batch 43/2609
Saved batch 44/2609
Saved batch 45/2609
Saved batch 46/2609
Saved batch 47/2609
Saved batch 48/2609
Saved batch 49/2609
Saved batch 50/2609
Saved bat

Concatenate Saved Batches into Chunks

In [None]:
import numpy as np
import os

# Function to load data in chunks and save the concatenated data
def load_and_save_data_in_chunks(save_path, dataset_name, chunk_size=100):
    """Merge consecutive per-batch ``.npz`` files into larger chunk files.

    Reads ``<save_path>/<dataset_name>_batch_<i>.npz`` for i = 0, 1, 2, ... until
    the next file is missing. Every ``chunk_size`` batches are concatenated along
    axis 0 and written to ``<dataset_name>_chunk_<k>.npz``; leftover batches go to
    ``<dataset_name>_final_chunk.npz``. Output files live in
    ``<parent of save_path>/<dataset_name>_concatenated_data`` and hold the
    arrays under keys ``X`` and ``y``.

    Args:
        save_path: Directory containing the per-batch files.
        dataset_name: File-name prefix of the split (e.g. ``"train"``).
        chunk_size: Number of batch files merged into one chunk file.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        # A zero/negative size would otherwise fail later with a ZeroDivisionError
        # or produce oddly numbered files.
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    # Create a directory to save the concatenated data outside the preprocessed_data folder.
    # normpath drops a trailing separator so dirname really yields the parent directory.
    concatenated_data_path = os.path.join(
        os.path.dirname(os.path.normpath(save_path)), f"{dataset_name}_concatenated_data"
    )
    os.makedirs(concatenated_data_path, exist_ok=True)

    pending_images, pending_labels = [], []
    batch_index = 0
    chunk_index = 0

    while True:
        batch_file = os.path.join(save_path, f"{dataset_name}_batch_{batch_index}.npz")
        if not os.path.exists(batch_file):
            break

        # The context manager closes the archive's file handle right after the
        # arrays have been read into memory.
        with np.load(batch_file) as batch_data:
            pending_images.append(batch_data["X"])
            pending_labels.append(batch_data["y"])

        # Process the chunk once it's accumulated enough batches
        if len(pending_images) >= chunk_size:
            X_chunk = np.concatenate(pending_images, axis=0)
            y_chunk = np.concatenate(pending_labels, axis=0)
            print(f"Loaded {dataset_name} chunk: Images shape = {X_chunk.shape}, Labels shape = {y_chunk.shape}")

            # Save the concatenated chunk data into a .npz file
            chunk_filename = os.path.join(concatenated_data_path, f"{dataset_name}_chunk_{chunk_index}.npz")
            np.savez_compressed(chunk_filename, X=X_chunk, y=y_chunk)
            print(f"Saved chunk: {chunk_filename}")

            # Clear the chunk data to save memory
            pending_images, pending_labels = [], []
            chunk_index += 1

        batch_index += 1

    # Process the last chunk if any remains
    if pending_images:
        X_chunk = np.concatenate(pending_images, axis=0)
        y_chunk = np.concatenate(pending_labels, axis=0)
        print(f"Final chunk loaded: Images shape = {X_chunk.shape}, Labels shape = {y_chunk.shape}")

        # Save the final chunk
        final_filename = os.path.join(concatenated_data_path, f"{dataset_name}_final_chunk.npz")
        np.savez_compressed(final_filename, X=X_chunk, y=y_chunk)
        print(f"Saved final chunk: {final_filename}")

    print(f"All chunks for {dataset_name} have been saved to {concatenated_data_path}")

# Merge the per-batch files of every split into chunk files, 100 batches per chunk
save_path = "/content/drive/My Drive/preprocessed_data"
for split_name in ("train", "val", "test"):
    load_and_save_data_in_chunks(save_path, split_name, chunk_size=100)


Loaded train chunk: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk: /content/drive/My Drive/train_concatenated_data/train_chunk_0.npz
Loaded train chunk: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk: /content/drive/My Drive/train_concatenated_data/train_chunk_1.npz
Loaded train chunk: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk: /content/drive/My Drive/train_concatenated_data/train_chunk_2.npz
Loaded train chunk: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk: /content/drive/My Drive/train_concatenated_data/train_chunk_3.npz
Loaded train chunk: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk: /content/drive/My Drive/train_concatenated_data/train_chunk_4.npz
Loaded train chunk: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk: /content/drive/My Drive/train_concatenated_data/train_chunk_5.npz
Loaded train chunk: Images shape = (3200

Conversion to TFRecord

In [None]:
import tensorflow as tf
import numpy as np
import os

# Function to create a tf.train.Example from the data
def create_example(X, y):
    """Create a tf.train.Example holding one JPEG-encoded image and its label.

    Args:
        X: Single image array of shape (height, width, channels). Floating-point
            input is assumed to be scaled to [0, 1] -- as produced by the
            ``rescale=1/255`` generator in this notebook -- and is mapped back to
            0-255 before encoding. Integer input is used as-is.
        y: Label vector (e.g. one-hot); stored as a list of int64 values.

    Returns:
        tf.train.Example with a bytes feature ``'image'`` (JPEG) and an int64
        feature ``'label'``.
    """
    image = np.asarray(X)
    if np.issubdtype(image.dtype, np.floating):
        # Casting [0, 1] floats straight to uint8 truncates almost every pixel
        # to 0 and yields a black image, so restore the 0-255 range first.
        image = np.clip(np.rint(image * 255.0), 0, 255)
    image = image.astype(np.uint8)

    # encode_jpeg requires a uint8 tensor
    X_tensor = tf.convert_to_tensor(image, dtype=tf.uint8)
    image_bytes = tf.io.encode_jpeg(X_tensor).numpy()  # Convert image to bytes

    # Flatten the label into plain Python ints so scalar and vector labels both work
    label_values = np.asarray(y).astype(np.int64).ravel().tolist()

    feature = {
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_bytes])),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=label_values)),
    }

    example = tf.train.Example(features=tf.train.Features(feature=feature))
    return example

# Function to convert .npz data to TFRecord
def npz_to_tfrecord(save_path, dataset_name, chunk_size=100):
    """Convert the chunked ``.npz`` files of one split into TFRecord files.

    Input files are read from ``<save_path>/<dataset_name>_concatenated_data``:
    first ``<dataset_name>_chunk_0.npz``, ``_chunk_1.npz``, ... until the next
    number is missing, then ``<dataset_name>_final_chunk.npz`` if present. The
    final chunk holds the leftover batches, and for splits smaller than one
    chunk it is the only file, so skipping it would drop data.

    Each input becomes ``<save_path>/tfrecord/<dataset_name>/<dataset_name>_chunk_<k>.tfrecord``
    where ``k`` counts the inputs in the order above (the final chunk gets the
    next free index).

    Args:
        save_path: Directory containing the ``*_concatenated_data`` folders.
        dataset_name: Split name / file-name prefix (e.g. ``"train"``).
        chunk_size: Unused; kept so existing calls remain valid.
    """
    # Create a "tfrecord" directory and subdirectories for train, val, and test
    dataset_path = os.path.join(save_path, "tfrecord", dataset_name)
    os.makedirs(dataset_path, exist_ok=True)

    chunk_dir = os.path.join(save_path, f"{dataset_name}_concatenated_data")

    # Collect the numbered chunks, then the trailing partial chunk.
    npz_files = []
    while True:
        candidate = os.path.join(chunk_dir, f"{dataset_name}_chunk_{len(npz_files)}.npz")
        if not os.path.exists(candidate):
            break
        npz_files.append(candidate)
    final_chunk = os.path.join(chunk_dir, f"{dataset_name}_final_chunk.npz")
    if os.path.exists(final_chunk):
        npz_files.append(final_chunk)

    for batch_index, npz_file in enumerate(npz_files):
        # Load the .npz file; the context manager releases the file handle
        # once both arrays are in memory.
        with np.load(npz_file) as batch_data:
            X_all = batch_data["X"]
            y_all = batch_data["y"]

        print(f"Loaded chunk {batch_index}: Images shape = {X_all.shape}, Labels shape = {y_all.shape}")

        # Create a TFRecord file for this chunk inside the appropriate folder
        tfrecord_filename = os.path.join(dataset_path, f"{dataset_name}_chunk_{batch_index}.tfrecord")
        with tf.io.TFRecordWriter(tfrecord_filename) as writer:
            for i in range(len(X_all)):
                example = create_example(X_all[i], y_all[i])
                writer.write(example.SerializeToString())  # One serialized Example per record

        print(f"Saved chunk {batch_index} to TFRecord: {tfrecord_filename}")

    print(f"All chunks for {dataset_name} have been converted to TFRecord")

# Convert the chunk files of every split to TFRecord format
save_path = "/content/drive/My Drive"  # Update with your actual path
for split_name in ("train", "val", "test"):
    npz_to_tfrecord(save_path, split_name, chunk_size=100)


Loaded chunk 0: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk 0 to TFRecord: /content/drive/My Drive/tfrecord/train/train_chunk_0.tfrecord
Loaded chunk 1: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk 1 to TFRecord: /content/drive/My Drive/tfrecord/train/train_chunk_1.tfrecord
Loaded chunk 2: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk 2 to TFRecord: /content/drive/My Drive/tfrecord/train/train_chunk_2.tfrecord
Loaded chunk 3: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk 3 to TFRecord: /content/drive/My Drive/tfrecord/train/train_chunk_3.tfrecord
Loaded chunk 4: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk 4 to TFRecord: /content/drive/My Drive/tfrecord/train/train_chunk_4.tfrecord
Loaded chunk 5: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Saved chunk 5 to TFRecord: /content/drive/My Drive/tfrecord/train/train_chunk_5.tfrecord
Load

In [None]:
import tensorflow as tf
import numpy as np
import os

# Function to parse a TFRecord example
def _parse_function(proto):
    """Turn one serialized tf.train.Example into an ``(image, label)`` pair.

    The record must carry a scalar bytes feature ``'image'`` (JPEG data) and a
    length-4 int64 feature ``'label'``.
    """
    schema = {
        'image': tf.io.FixedLenFeature([], tf.string),  # JPEG bytes
        'label': tf.io.FixedLenFeature([4], tf.int64),  # four-element label vector
    }
    record = tf.io.parse_single_example(proto, schema)

    # Decode the JPEG bytes; the label is passed through unchanged.
    return tf.io.decode_jpeg(record['image']), record['label']

# Function to load data from TFRecord file
def load_tfrecord_data(tfrecord_path):
    """Return a dataset of parsed ``(image, label)`` pairs read from ``tfrecord_path``."""
    return tf.data.TFRecordDataset(tfrecord_path).map(_parse_function)

# Function to compare npz and tfrecord data (only first, middle, and last)
def verify_data(npz_path, tfrecord_path, max_mean_abs_diff=3.0):
    """Spot-check a TFRecord file against the ``.npz`` chunk it was built from.

    The first, middle and last records are compared. Images are compared on the
    0-255 uint8 scale with a tolerance, because the TFRecord stores JPEG (lossy)
    data: an exact comparison against the original array can never succeed.
    Floating-point ``.npz`` images are assumed to be scaled to [0, 1], as
    produced by the ``rescale=1/255`` generator in this notebook. Labels must
    match exactly. Results are printed; nothing is returned.

    Args:
        npz_path: Path to the ``.npz`` chunk with arrays ``X`` and ``y``.
        tfrecord_path: Path to the corresponding TFRecord file.
        max_mean_abs_diff: Largest accepted mean absolute pixel difference
            (0-255 scale) between original and decoded image. The default is a
            heuristic allowance for JPEG compression loss.
    """
    # Load .npz data; the context manager closes the archive afterwards
    with np.load(npz_path) as batch_data:
        X_npz = batch_data["X"]
        y_npz = batch_data["y"]

    print(f"Loaded .npz: Images shape = {X_npz.shape}, Labels shape = {y_npz.shape}")

    if len(X_npz) == 0:
        # Nothing to index into; avoid an IndexError on an empty archive.
        print("No records in .npz file; nothing to verify.")
        print("Verification completed.")
        return

    # Load TFRecord data
    dataset = load_tfrecord_data(tfrecord_path)

    # First, middle, and last index (deduplicated for very small archives)
    indices_to_check = sorted({0, len(X_npz) // 2, len(X_npz) - 1})
    remaining = set(indices_to_check)

    # Single pass over the records up to the last index of interest, instead
    # of rescanning the whole file once per index.
    for i, (image_tfrecord, label_tfrecord) in enumerate(dataset.take(indices_to_check[-1] + 1)):
        if i not in remaining:
            continue
        remaining.discard(i)

        # Convert tensors to numpy arrays
        image_tfrecord = image_tfrecord.numpy()
        label_tfrecord = label_tfrecord.numpy()

        # Bring the original image onto the same 0-255 uint8 scale
        image_npz = X_npz[i]
        if np.issubdtype(image_npz.dtype, np.floating):
            image_npz = np.clip(np.rint(image_npz * 255.0), 0, 255)
        image_npz = image_npz.astype(np.uint8)

        # Compare the images; print summary numbers rather than whole arrays
        if image_npz.shape != image_tfrecord.shape:
            print(f"Image mismatch at index {i}")
            print(f"Shape from .npz: {image_npz.shape}, shape from TFRecord: {image_tfrecord.shape}")
        else:
            # int16 avoids uint8 wrap-around when subtracting
            abs_diff = np.abs(image_npz.astype(np.int16) - image_tfrecord.astype(np.int16))
            mean_abs_diff = float(abs_diff.mean())
            if mean_abs_diff > max_mean_abs_diff:
                print(f"Image mismatch at index {i}")
                print(f"Mean absolute pixel difference: {mean_abs_diff:.3f} (max {int(abs_diff.max())})")

        # Compare the labels
        if not np.array_equal(y_npz[i], label_tfrecord):
            print(f"Label mismatch at index {i}")
            print(f"Label from .npz: {y_npz[i]}")
            print(f"Label from TFRecord: {label_tfrecord}")

    # Indices never reached mean the TFRecord holds fewer records than the .npz
    for missing_index in sorted(remaining):
        print(f"No TFRecord example found at index {missing_index}")

    print("Verification completed.")

# Example: spot-check the first, middle, and last record of train chunk 0
# against the TFRecord file generated from it
npz_path = "/content/drive/My Drive/train_concatenated_data/train_chunk_0.npz"
tfrecord_path = "/content/drive/My Drive/tfrecord/train/train_chunk_0.tfrecord"
verify_data(npz_path, tfrecord_path)

Loaded .npz: Images shape = (3200, 224, 224, 3), Labels shape = (3200, 4)
Image mismatch at index 0
Image from .npz: [[[0.14117648 0.14117648 0.14117648]
  [0.20000002 0.20000002 0.20000002]
  [0.13725491 0.13725491 0.13725491]
  ...
  [1.         1.         1.        ]
  [1.         1.         1.        ]
  [0.9921569  0.9921569  0.9921569 ]]

 [[0.16862746 0.16862746 0.16862746]
  [0.20784315 0.20784315 0.20784315]
  [0.03529412 0.03529412 0.03529412]
  ...
  [1.         1.         1.        ]
  [1.         1.         1.        ]
  [0.9960785  0.9960785  0.9960785 ]]

 [[0.19215688 0.19215688 0.19215688]
  [0.15294118 0.15294118 0.15294118]
  [0.1254902  0.1254902  0.1254902 ]
  ...
  [0.9960785  0.9960785  0.9960785 ]
  [0.9960785  0.9960785  0.9960785 ]
  [1.         1.         1.        ]]

 ...

 [[1.         1.         1.        ]
  [1.         1.         1.        ]
  [1.         1.         1.        ]
  ...
  [0.03921569 0.03921569 0.03921569]
  [0.01960784 0.01960784 0.019607

In [None]:
import tensorflow as tf

# Path to the TFRecord file whose labels are inspected below.
# NOTE(review): the Drive folder is spelled "MyDrive" here while the other cells
# use "My Drive" -- confirm both resolve to the same location.
tfrecord_file_path = "/content/drive/MyDrive/tfrecord/test/test_chunk_0.tfrecord"  # Replace with the actual path

# Function to parse TFRecord
def parse_tfrecord_fn(example):
    """Parse one serialized example into a decoded 3-channel image and its label.

    Expects a scalar bytes feature ``"image"`` (JPEG data) and a length-4 int64
    feature ``"label"``.
    """
    fields = tf.io.parse_single_example(
        example,
        {
            "image": tf.io.FixedLenFeature([], tf.string),
            "label": tf.io.FixedLenFeature([4], tf.int64),  # Assuming 4 classes from the npz structure
        },
    )

    # Force three output channels when decoding the JPEG bytes
    decoded = tf.image.decode_jpeg(fields["image"], channels=3)
    return decoded, fields["label"]

# Read the TFRecord file and parse every record into an (image, label) pair
raw_dataset = tf.data.TFRecordDataset(tfrecord_file_path)
parsed_dataset = raw_dataset.map(parse_tfrecord_fn)

# Print the label vector of the first five records as a quick sanity check
for image, label in parsed_dataset.take(5):  # Only the first 5 examples are read
    print("Label:", label.numpy())

Label: [1 0 0 0]
Label: [1 0 0 0]
Label: [1 0 0 0]
Label: [1 0 0 0]
Label: [1 0 0 0]
