In [1]:
import os
import numpy as np
import h5py
from collections import Counter
from sklearn.model_selection import train_test_split

# Load the leading rows of every dataset, reading them chunk by chunk
output_hdf5_path = 'final_datasetv3AllRecords.h5'
chunk_size = 10000  # Rows fetched from the file per read
max_chunks = 10  # Limit the number of chunks to process


def _read_leading_rows(dataset, row_limit, step):
    """Copy the first rows of `dataset` into one array, `step` rows per read.

    Writing each chunk straight into a preallocated array avoids keeping a
    list of chunks alive next to the concatenated result, which would hold
    two full copies of the data in memory at the peak.

    Args:
        dataset: Object exposing `shape`, `dtype` and slice indexing along
            the first axis (here: an open h5py dataset).
        row_limit: Maximum number of leading rows to copy.
        step: Number of rows fetched per read.

    Returns:
        A NumPy array holding the first `min(row_limit, dataset.shape[0])`
        rows, with the dataset's dtype and trailing dimensions.
    """
    n_rows = min(row_limit, dataset.shape[0])
    out = np.empty((n_rows,) + tuple(dataset.shape[1:]), dtype=dataset.dtype)
    for start in range(0, n_rows, step):
        stop = min(start + step, n_rows)
        out[start:stop] = dataset[start:stop]
    return out


# Open the HDF5 file and read at most `max_chunks` chunks of each dataset
with h5py.File(output_hdf5_path, 'r') as hdf5_file:
    total_samples = hdf5_file['segments'].shape[0]
    rows_to_load = min(total_samples, chunk_size * max_chunks)
    if rows_to_load <= 0:
        # Fail here with a clear message instead of producing empty arrays
        # that only break later in the pipeline.
        raise ValueError(
            "Nothing to load: 'segments' is empty or chunk_size/max_chunks is not positive"
        )

    X_segments = _read_leading_rows(hdf5_file['segments'], rows_to_load, chunk_size)
    y_labels = _read_leading_rows(hdf5_file['labels'], rows_to_load, chunk_size)
    X_rp_images = _read_leading_rows(hdf5_file['rp_images'], rows_to_load, chunk_size)
    X_gaf_images = _read_leading_rows(hdf5_file['gaf_images'], rows_to_load, chunk_size)
    X_mtf_images = _read_leading_rows(hdf5_file['mtf_images'], rows_to_load, chunk_size)

# Drop every sample that belongs to the VEB class (label 4)
exclude_class = 4
keep_mask = y_labels != exclude_class
included_indices = np.nonzero(keep_mask)[0]

# Apply the same row selection to the labels and to every feature array so
# that they stay aligned.  Each name is rebound to its filtered copy one at a
# time, which lets the unfiltered array be released before the next is copied.
y_labels = y_labels[included_indices]
X_segments = X_segments[included_indices]
X_mtf_images = X_mtf_images[included_indices]
X_gaf_images = X_gaf_images[included_indices]
X_rp_images = X_rp_images[included_indices]

# Count how many samples each remaining class has
class_counts = Counter(y_labels)
total_counts = sum(class_counts.values())

# Share of the remaining samples that belongs to each class
class_ratios = {cls: count / total_counts for cls, count in class_counts.items()}

# Base count from which the per-class targets are derived
min_samples_per_class = 100  # Adjust this number if needed

# Per-class target = base count divided by the class share, so the rarer a
# class is, the larger its target.  When drawing, the target is capped at the
# number of samples the class actually has.
# NOTE(review): the target is inversely proportional to the class share, not
# proportional to it -- confirm this is the intended weighting.
balanced_counts = {cls: int(min_samples_per_class / class_ratios[cls]) for cls in class_ratios}

# Draw the subset with a seeded generator so that re-running the cell selects
# the same samples; an unseeded draw would make the saved split irreproducible.
# The seed matches the `random_state` passed to `train_test_split`.
subset_rng = np.random.default_rng(42)
balanced_indices = []
for class_label, count in balanced_counts.items():
    class_indices = np.where(y_labels == class_label)[0]
    n_to_draw = min(len(class_indices), count)
    selected_indices = subset_rng.choice(class_indices, n_to_draw, replace=False)
    balanced_indices.extend(selected_indices)

# Gather the chosen rows from the image arrays and the labels
X_rp_subset = X_rp_images[balanced_indices]
X_gaf_subset = X_gaf_images[balanced_indices]
X_mtf_subset = X_mtf_images[balanced_indices]
y_subset = y_labels[balanced_indices]

# Stage 1: keep 70% of the subset for training and hold out 30%, stratified by label
(
    X_rp_train, X_rp_temp,
    X_gaf_train, X_gaf_temp,
    X_mtf_train, X_mtf_temp,
    y_train, y_temp,
) = train_test_split(
    X_rp_subset,
    X_gaf_subset,
    X_mtf_subset,
    y_subset,
    test_size=0.3,
    stratify=y_subset,
    random_state=42,
)

# Stage 2: divide the held-out part evenly into validation and test sets
(
    X_rp_val, X_rp_test,
    X_gaf_val, X_gaf_test,
    X_mtf_val, X_mtf_test,
    y_val, y_test,
) = train_test_split(
    X_rp_temp,
    X_gaf_temp,
    X_mtf_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Write every split to its own .npy file inside the output directory
save_dir = './split_datav4'
os.makedirs(save_dir, exist_ok=True)

# (file name, array) pairs, listed in the order in which they are written
arrays_to_save = (
    ('X_rp_train.npy', X_rp_train),
    ('X_gaf_train.npy', X_gaf_train),
    ('X_mtf_train.npy', X_mtf_train),
    ('y_train.npy', y_train),
    ('X_rp_val.npy', X_rp_val),
    ('X_gaf_val.npy', X_gaf_val),
    ('X_mtf_val.npy', X_mtf_val),
    ('y_val.npy', y_val),
    ('X_rp_test.npy', X_rp_test),
    ('X_gaf_test.npy', X_gaf_test),
    ('X_mtf_test.npy', X_mtf_test),
    ('y_test.npy', y_test),
)
for file_name, array in arrays_to_save:
    np.save(os.path.join(save_dir, file_name), array)

print(f"Data successfully split and saved in directory: {save_dir}")

# Check the number of samples per class in the training, validation, and test sets
def count_samples_per_class(y):
    """Print how many samples each class label has in `y`, one line per class.

    Classes are listed in ascending label order so that the reports printed
    for different splits line up and can be compared directly; iterating a
    ``Counter`` as-is follows first-seen order, which varies from split to
    split.

    Args:
        y: Iterable of hashable, mutually orderable class labels.

    Returns:
        None. The report is written to stdout.
    """
    class_counts = Counter(y)
    for cls in sorted(class_counts):
        print(f"Class {cls}: {class_counts[cls]} samples")

# Print the per-class sample counts of each split, one headed section per split
for heading, split_labels in (
    ("Number of samples per class in the training set:", y_train),
    ("\nNumber of samples per class in the validation set:", y_val),
    ("\nNumber of samples per class in the test set:", y_test),
):
    print(heading)
    count_samples_per_class(split_labels)


: 

In [2]:
import os
import numpy as np
import h5py
from collections import Counter
from sklearn.model_selection import train_test_split

# Load the leading rows of every dataset, reading them chunk by chunk
output_hdf5_path = 'final_datasetv3AllRecords.h5'
chunk_size = 10000  # Rows fetched from the file per read
max_chunks = 10  # Limit the number of chunks to process


def _read_leading_rows(dataset, row_limit, step):
    """Copy the first rows of `dataset` into one array, `step` rows per read.

    Writing each chunk straight into a preallocated array avoids keeping a
    list of chunks alive next to the concatenated result, which would hold
    two full copies of the data in memory at the peak.

    Args:
        dataset: Object exposing `shape`, `dtype` and slice indexing along
            the first axis (here: an open h5py dataset).
        row_limit: Maximum number of leading rows to copy.
        step: Number of rows fetched per read.

    Returns:
        A NumPy array holding the first `min(row_limit, dataset.shape[0])`
        rows, with the dataset's dtype and trailing dimensions.
    """
    n_rows = min(row_limit, dataset.shape[0])
    out = np.empty((n_rows,) + tuple(dataset.shape[1:]), dtype=dataset.dtype)
    for start in range(0, n_rows, step):
        stop = min(start + step, n_rows)
        out[start:stop] = dataset[start:stop]
    return out


# Open the HDF5 file and read at most `max_chunks` chunks of each dataset
with h5py.File(output_hdf5_path, 'r') as hdf5_file:
    total_samples = hdf5_file['segments'].shape[0]
    rows_to_load = min(total_samples, chunk_size * max_chunks)
    if rows_to_load <= 0:
        # Fail here with a clear message instead of producing empty arrays
        # that only break later in the pipeline.
        raise ValueError(
            "Nothing to load: 'segments' is empty or chunk_size/max_chunks is not positive"
        )

    X_segments = _read_leading_rows(hdf5_file['segments'], rows_to_load, chunk_size)
    y_labels = _read_leading_rows(hdf5_file['labels'], rows_to_load, chunk_size)
    X_rp_images = _read_leading_rows(hdf5_file['rp_images'], rows_to_load, chunk_size)
    X_gaf_images = _read_leading_rows(hdf5_file['gaf_images'], rows_to_load, chunk_size)
    X_mtf_images = _read_leading_rows(hdf5_file['mtf_images'], rows_to_load, chunk_size)

# Drop every sample that belongs to the VEB class (label 4)
exclude_class = 4
keep_mask = y_labels != exclude_class
included_indices = np.nonzero(keep_mask)[0]

# Apply the same row selection to the labels and to every feature array so
# that they stay aligned.  Each name is rebound to its filtered copy one at a
# time, which lets the unfiltered array be released before the next is copied.
y_labels = y_labels[included_indices]
X_segments = X_segments[included_indices]
X_mtf_images = X_mtf_images[included_indices]
X_gaf_images = X_gaf_images[included_indices]
X_rp_images = X_rp_images[included_indices]

# Count how many samples each remaining class has
class_counts = Counter(y_labels)
total_counts = sum(class_counts.values())

# Share of the remaining samples that belongs to each class
class_ratios = {cls: count / total_counts for cls, count in class_counts.items()}

# Base count from which the per-class targets are derived
min_samples_per_class = 300  # Increase this number to ensure more samples per class

# Per-class target = base count divided by the class share, so the rarer a
# class is, the larger its target.  When drawing, the target is capped at the
# number of samples the class actually has.
# NOTE(review): the target is inversely proportional to the class share, not
# proportional to it -- confirm this is the intended weighting.
balanced_counts = {cls: int(min_samples_per_class / class_ratios[cls]) for cls in class_ratios}

# Draw the subset with a seeded generator so that re-running the cell selects
# the same samples; an unseeded draw would make the saved split irreproducible.
# The seed matches the `random_state` passed to `train_test_split`.
subset_rng = np.random.default_rng(42)
balanced_indices = []
for class_label, count in balanced_counts.items():
    class_indices = np.where(y_labels == class_label)[0]
    n_to_draw = min(len(class_indices), count)
    selected_indices = subset_rng.choice(class_indices, n_to_draw, replace=False)
    balanced_indices.extend(selected_indices)

# Gather the chosen rows from the image arrays and the labels
X_rp_subset = X_rp_images[balanced_indices]
X_gaf_subset = X_gaf_images[balanced_indices]
X_mtf_subset = X_mtf_images[balanced_indices]
y_subset = y_labels[balanced_indices]

# Stage 1: keep 70% of the subset for training and hold out 30%, stratified by label
(
    X_rp_train, X_rp_temp,
    X_gaf_train, X_gaf_temp,
    X_mtf_train, X_mtf_temp,
    y_train, y_temp,
) = train_test_split(
    X_rp_subset,
    X_gaf_subset,
    X_mtf_subset,
    y_subset,
    test_size=0.3,
    stratify=y_subset,
    random_state=42,
)

# Stage 2: divide the held-out part evenly into validation and test sets
(
    X_rp_val, X_rp_test,
    X_gaf_val, X_gaf_test,
    X_mtf_val, X_mtf_test,
    y_val, y_test,
) = train_test_split(
    X_rp_temp,
    X_gaf_temp,
    X_mtf_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Write every split to its own .npy file inside the output directory
save_dir = './split_datav4'
os.makedirs(save_dir, exist_ok=True)

# (file name, array) pairs, listed in the order in which they are written
arrays_to_save = (
    ('X_rp_train.npy', X_rp_train),
    ('X_gaf_train.npy', X_gaf_train),
    ('X_mtf_train.npy', X_mtf_train),
    ('y_train.npy', y_train),
    ('X_rp_val.npy', X_rp_val),
    ('X_gaf_val.npy', X_gaf_val),
    ('X_mtf_val.npy', X_mtf_val),
    ('y_val.npy', y_val),
    ('X_rp_test.npy', X_rp_test),
    ('X_gaf_test.npy', X_gaf_test),
    ('X_mtf_test.npy', X_mtf_test),
    ('y_test.npy', y_test),
)
for file_name, array in arrays_to_save:
    np.save(os.path.join(save_dir, file_name), array)

print(f"Data successfully split and saved in directory: {save_dir}")

# Check the number of samples per class in the training, validation, and test sets
def count_samples_per_class(y):
    """Print how many samples each class label has in `y`, one line per class.

    Classes are listed in ascending label order so that the reports printed
    for different splits line up and can be compared directly; iterating a
    ``Counter`` as-is follows first-seen order, which varies from split to
    split.

    Args:
        y: Iterable of hashable, mutually orderable class labels.

    Returns:
        None. The report is written to stdout.
    """
    class_counts = Counter(y)
    for cls in sorted(class_counts):
        print(f"Class {cls}: {class_counts[cls]} samples")

# Print the per-class sample counts of each split, one headed section per split
for heading, split_labels in (
    ("Number of samples per class in the training set:", y_train),
    ("\nNumber of samples per class in the validation set:", y_val),
    ("\nNumber of samples per class in the test set:", y_test),
):
    print(heading)
    count_samples_per_class(split_labels)


Data successfully split and saved in directory: ./split_datav4
Number of samples per class in the training set:
Class 1: 2996 samples
Class 0: 1947 samples
Class 3: 234 samples
Class 2: 561 samples

Number of samples per class in the validation set:
Class 1: 642 samples
Class 0: 417 samples
Class 2: 120 samples
Class 3: 51 samples

Number of samples per class in the test set:
Class 1: 642 samples
Class 0: 417 samples
Class 2: 121 samples
Class 3: 50 samples
