## Pickling Dataset

### Import Libraries

In [None]:
import os
from PIL import Image
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import tensorflow as tf
import pickle
import psutil
import resource
import tracemalloc

In [None]:

# Report the installed TensorFlow release
print("TensorFlow Version:", tf.__version__)

# Enumerate every device TensorFlow can see
physical_devices = tf.config.list_physical_devices()
print("All Physical Devices:", physical_devices)

# Narrow the listing down to GPU devices only
gpus = tf.config.list_physical_devices('GPU')
print("GPUs:", gpus)

# Ask TensorFlow for the name of the GPU device and compare it with the
# name expected for the first GPU
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('GPU found at {}. Using the L4 GPU hardware accelerator in Google Colab Pro.'.format(device_name))
else:
    print("GPU device not found. Please ensure that your machine has a compatible GPU and that TensorFlow is set up to use it.")

# Enter a GPU device scope and report whether doing so raised an error.
# NOTE(review): no TensorFlow op is executed inside the scope, so this only
# shows that the scope itself can be entered.
try:
    with tf.device('/device:GPU:0'):
        print("TensorFlow can access the L4 GPU hardware accelerator in Google Colab Pro")
except RuntimeError as err:
    print("Error accessing the L4 GPU hardware accelerator in Google Colab Pro with TensorFlow:", err)


TensorFlow Version: 2.15.0
All Physical Devices: [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
GPU found at /device:GPU:0. Using the L4 GPU hardware accelerator in Google Colab Pro.
TensorFlow can access the L4 GPU hardware accelerator in Google Colab Pro


In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# dataset folders used by this notebook are reachable as ordinary paths.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Switch the working directory to the dataset folder on the mounted drive;
# the relative pickle filenames opened later in this notebook resolve against it.
%cd /content/gdrive/My Drive/Masters Project/Final Dataset
# List the folder contents as a quick sanity check.
!ls

/content/gdrive/My Drive/Masters Project/Final Dataset
Testing_Output	Training_Output


In [None]:
# Dataset locations on the mounted Google Drive. Both folders are derived from
# a single root with os.path.join, so the paths contain no doubled separators
# (a leading "//" is implementation-defined on POSIX systems).
DATASET_DIR = "/content/gdrive/My Drive/Masters Project/Final Dataset"
TEST_DIR = os.path.join(DATASET_DIR, "Testing_Output")  # test data folder
TRAIN_DIR = os.path.join(DATASET_DIR, "Training_Output")  # train data folder

IMG_SIZE = 224  # images are resized to IMG_SIZE x IMG_SIZE pixels

# Class sub-folder names; a category's position in this list is its integer label.
CATEGORIES = ["no_tumor_Output","glioma_tumor_Output","meningioma_tumor_Output","pituitary_tumor_Output"]

### Training Set

In [None]:
%%time

training_data = []

def create_training_data():
    """Load every readable training image into the global ``training_data`` list.

    For each class folder of ``TRAIN_DIR`` named in ``CATEGORIES``, every file is
    read in colour with OpenCV, resized to ``IMG_SIZE`` x ``IMG_SIZE`` and stored
    as an ``[image_array, class_index]`` pair, where ``class_index`` is the
    category's position in ``CATEGORIES``. The list is shuffled in place once
    all categories have been read.

    Files that OpenCV cannot decode are skipped, and the number skipped is
    printed per category so missing samples do not go unnoticed. The list is
    emptied first, so calling the function again does not append duplicates.
    """
    training_data.clear()  # keep repeated calls idempotent

    # enumerate yields the label directly instead of searching the list per category
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(TRAIN_DIR, category)  # folder holding this class
        skipped = 0

        for img in tqdm(os.listdir(path)):
            img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_COLOR)
            if img_array is None:
                # cv2.imread returns None when the file cannot be decoded
                skipped += 1
                continue
            new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))  # uniform image size
            training_data.append([new_array, class_num])

        if skipped:
            print(f"Skipped {skipped} unreadable file(s) in {path}")

    random.shuffle(training_data)

create_training_data()
print(f"Total training samples: {len(training_data)}")

# Separate features and labels
X_train = np.array([image for image, _ in training_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
Y_train = np.array([label for _, label in training_data])
assert len(X_train) == len(Y_train), "features and labels must have the same length"

# Save the arrays using pickle.
# The highest available protocol is requested explicitly: protocol 5 (PEP 574)
# is designed for large buffers and avoids extra in-memory copies of the array
# data while writing. pickle.load detects the protocol automatically.
with open("X_train.pickle", "wb") as pickle_out:
    pickle.dump(X_train, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

with open("Y_train.pickle", "wb") as pickle_out:
    pickle.dump(Y_train, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

print("Data saved successfully.")

# Functions to print resource usage
# NOTE(review): tracing is switched on here, after the loading and pickling
# code above has already run. tracemalloc only accounts for allocations made
# while tracing is active, so on a fresh kernel the figures printed below would
# not include the dataset built above. Start tracing before the workload if
# the intent is to measure it.
tracemalloc.start()

def print_memory_usage():
    """Print the RSS and VMS memory figures of the current process in MB (1024**2 bytes)."""
    bytes_per_mb = 1024 ** 2
    memory = psutil.Process(os.getpid()).memory_info()
    print(f"RSS: {memory.rss / bytes_per_mb:.2f} MB")
    print(f"VMS: {memory.vms / bytes_per_mb:.2f} MB")

def print_cpu_usage():
    """Print the CPU percentage returned by psutil.cpu_percent over a one-second interval."""
    percent = psutil.cpu_percent(interval=1)
    print(f"CPU Usage: {percent}%")

def print_disk_usage():
    """Print total space, used space (both in GB, 1024**3 bytes) and percent used for '/'."""
    bytes_per_gb = 1024 ** 3
    root_fs = psutil.disk_usage('/')
    total_gb = root_fs.total / bytes_per_gb
    used_gb = root_fs.used / bytes_per_gb
    print(f"Total Disk Space: {total_gb:.2f} GB")
    print(f"Used Disk Space: {used_gb:.2f} GB")
    print(f"Disk Usage: {root_fs.percent}%")

def print_resource_usage():
    """Print the user and system CPU time reported by getrusage for this process."""
    cpu_times = resource.getrusage(resource.RUSAGE_SELF)
    user_seconds = cpu_times.ru_utime
    system_seconds = cpu_times.ru_stime
    print(f"User CPU time: {user_seconds:.2f} seconds")
    print(f"System CPU time: {system_seconds:.2f} seconds")

def print_tracemalloc_usage():
    """Print the current and peak traced memory reported by tracemalloc, in MB (1024**2 bytes)."""
    bytes_per_mb = 1024 ** 2
    traced_now, traced_peak = tracemalloc.get_traced_memory()
    print(f"Current Memory Usage: {traced_now / bytes_per_mb:.2f} MB")
    print(f"Peak Memory Usage: {traced_peak / bytes_per_mb:.2f} MB")

# Run each resource report in turn: memory, CPU, disk, CPU time, traced memory
for report in (
    print_memory_usage,
    print_cpu_usage,
    print_disk_usage,
    print_resource_usage,
    print_tracemalloc_usage,
):
    report()


100%|██████████| 11099/11099 [04:26<00:00, 41.72it/s] 
100%|██████████| 11584/11584 [02:45<00:00, 69.94it/s] 
100%|██████████| 11527/11527 [02:42<00:00, 71.10it/s] 
100%|██████████| 11601/11601 [02:48<00:00, 68.77it/s] 


Total training samples: 45811
Data saved successfully.
RSS: 15610.82 MB
VMS: 31129.63 MB
CPU Usage: 5.0%
Total Disk Space: 201.23 GB
Used Disk Space: 34.48 GB
Disk Usage: 17.1%
User CPU time: 303.96 seconds
System CPU time: 61.20 seconds
Current Memory Usage: 13527.39 MB
Peak Memory Usage: 25087.20 MB
CPU times: user 45.9 s, sys: 17.5 s, total: 1min 3s
Wall time: 13min 11s


### Testing Set

In [None]:
%%time

testing_data = []

def create_testing_data():
    """Load every readable test image into the global ``testing_data`` list.

    For each class folder of ``TEST_DIR`` named in ``CATEGORIES``, every file is
    read in colour with OpenCV, resized to ``IMG_SIZE`` x ``IMG_SIZE`` and stored
    as an ``[image_array, class_index]`` pair, where ``class_index`` is the
    category's position in ``CATEGORIES``. The list is shuffled in place once
    all categories have been read.

    Files that OpenCV cannot decode are skipped, and the number skipped is
    printed per category so missing samples do not go unnoticed. The list is
    emptied first, so calling the function again does not append duplicates.
    """
    testing_data.clear()  # keep repeated calls idempotent

    # enumerate yields the label directly instead of searching the list per category
    for class_num, category in enumerate(CATEGORIES):
        path = os.path.join(TEST_DIR, category)  # folder holding this class
        skipped = 0

        for img in tqdm(os.listdir(path)):
            img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_COLOR)
            if img_array is None:
                # cv2.imread returns None when the file cannot be decoded
                skipped += 1
                continue
            new_array = cv2.resize(img_array, (IMG_SIZE, IMG_SIZE))  # uniform image size
            testing_data.append([new_array, class_num])

        if skipped:
            print(f"Skipped {skipped} unreadable file(s) in {path}")

    random.shuffle(testing_data)

create_testing_data()
print(f"Total testing samples: {len(testing_data)}")

# Separate features and labels
X_test = np.array([image for image, _ in testing_data]).reshape(-1, IMG_SIZE, IMG_SIZE, 3)
Y_test = np.array([label for _, label in testing_data])
assert len(X_test) == len(Y_test), "features and labels must have the same length"

# Save the arrays using pickle.
# The highest available protocol is requested explicitly: protocol 5 (PEP 574)
# is designed for large buffers and avoids extra in-memory copies of the array
# data while writing. pickle.load detects the protocol automatically.
with open("X_test.pickle", "wb") as pickle_out:
    pickle.dump(X_test, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

with open("Y_test.pickle", "wb") as pickle_out:
    pickle.dump(Y_test, pickle_out, protocol=pickle.HIGHEST_PROTOCOL)

print("Data saved successfully.")

# Functions to print resource usage
# NOTE(review): the helper functions defined below repeat, under the same
# names, the definitions already present in the Training Set cell.
# NOTE(review): tracing is requested here after the test images above were
# loaded and pickled; the Training Set cell also calls tracemalloc.start(),
# so this repeat call is presumably redundant when that cell ran first — confirm.
tracemalloc.start()

def print_memory_usage():
    """Print the RSS and VMS memory figures of the current process in MB (1024**2 bytes)."""
    bytes_per_mb = 1024 ** 2
    memory = psutil.Process(os.getpid()).memory_info()
    print(f"RSS: {memory.rss / bytes_per_mb:.2f} MB")
    print(f"VMS: {memory.vms / bytes_per_mb:.2f} MB")

def print_cpu_usage():
    """Print the CPU percentage returned by psutil.cpu_percent over a one-second interval."""
    percent = psutil.cpu_percent(interval=1)
    print(f"CPU Usage: {percent}%")

def print_disk_usage():
    """Print total space, used space (both in GB, 1024**3 bytes) and percent used for '/'."""
    bytes_per_gb = 1024 ** 3
    root_fs = psutil.disk_usage('/')
    total_gb = root_fs.total / bytes_per_gb
    used_gb = root_fs.used / bytes_per_gb
    print(f"Total Disk Space: {total_gb:.2f} GB")
    print(f"Used Disk Space: {used_gb:.2f} GB")
    print(f"Disk Usage: {root_fs.percent}%")

def print_resource_usage():
    """Print the user and system CPU time reported by getrusage for this process."""
    cpu_times = resource.getrusage(resource.RUSAGE_SELF)
    user_seconds = cpu_times.ru_utime
    system_seconds = cpu_times.ru_stime
    print(f"User CPU time: {user_seconds:.2f} seconds")
    print(f"System CPU time: {system_seconds:.2f} seconds")

def print_tracemalloc_usage():
    """Print the current and peak traced memory reported by tracemalloc, in MB (1024**2 bytes)."""
    bytes_per_mb = 1024 ** 2
    traced_now, traced_peak = tracemalloc.get_traced_memory()
    print(f"Current Memory Usage: {traced_now / bytes_per_mb:.2f} MB")
    print(f"Peak Memory Usage: {traced_peak / bytes_per_mb:.2f} MB")

# Run each resource report in turn: memory, CPU, disk, CPU time, traced memory
for report in (
    print_memory_usage,
    print_cpu_usage,
    print_disk_usage,
    print_resource_usage,
    print_tracemalloc_usage,
):
    report()


100%|██████████| 1496/1496 [00:36<00:00, 41.42it/s] 
100%|██████████| 1427/1427 [00:27<00:00, 51.80it/s] 
100%|██████████| 1637/1637 [00:40<00:00, 40.68it/s] 
100%|██████████| 1064/1064 [00:20<00:00, 51.37it/s] 


Total testing samples: 5624
Data saved successfully.
RSS: 17224.85 MB
VMS: 32743.30 MB
CPU Usage: 7.4%
Total Disk Space: 201.23 GB
Used Disk Space: 35.33 GB
Disk Usage: 17.6%
User CPU time: 310.54 seconds
System CPU time: 63.90 seconds
Current Memory Usage: 15143.38 MB
Peak Memory Usage: 25087.20 MB
CPU times: user 6.56 s, sys: 2.7 s, total: 9.26 s
Wall time: 2min 47s


In [None]:
# Path to the files in your Google Drive.
# Drive is mounted at /content/gdrive in this notebook, so the path has to
# start from that mount point; a path under /content/drive only exists if the
# drive was separately mounted there.
pathp = "/content/gdrive/My Drive/Masters Project/Final Dataset/"

# List the contents of the directory to verify the files
print("Contents of the directory:")
print(os.listdir(pathp))

Contents of the directory:
['Training_Output', 'Testing_Output', 'X_train.pickle', 'Y_train.pickle', 'X_test.pickle', 'Y_test.pickle']
