In [8]:
import tensorflow as tf
import tensorflow.keras.layers as tfl
import numpy as np
import pandas as pd

In [9]:
#Loading in the data
# Single place to repoint the notebook when the data lives somewhere else.
DATA_DIR = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS Dataset/data/created_data'

# One parquet file per split; the later cells read the 'red_channel',
# 'green_channel', 'blue_channel' and 'class' columns from each frame.
train_df = pd.read_parquet(f'{DATA_DIR}/train_df.parquet')
val_df = pd.read_parquet(f'{DATA_DIR}/val_df.parquet')
test_df = pd.read_parquet(f'{DATA_DIR}/test_df.parquet')

In [10]:
#Finding out the min and max values for normalization purposes
# The preprocessing step divides pixel values by 255.0, so every channel of
# every split is checked to span exactly 0..255 before that scaling is applied.
for df_name, df in [("Train", train_df), ("Validation", val_df), ("Test", test_df)]:
    print(f"\n{df_name} Dataset:")

    for channel in ['red_channel', 'green_channel', 'blue_channel']:
        # Join the per-row arrays of this channel into one flat array so a
        # single min()/max() covers the whole split.
        channel_data = np.concatenate(df[channel].values)

        min_val = channel_data.min()
        max_val = channel_data.max()

        # The messages name the split, channel and offending value so a failed
        # check can be diagnosed without re-running the cell by hand.
        assert min_val == 0, f"{df_name} / {channel}: expected min of 0, got {min_val}"
        assert max_val == 255, f"{df_name} / {channel}: expected max of 255, got {max_val}"

        print('Expected min of 0 and max of 255')


Train Dataset:
Expected min of 0 and max of 255
Expected min of 0 and max of 255
Expected min of 0 and max of 255

Validation Dataset:
Expected min of 0 and max of 255
Expected min of 0 and max of 255
Expected min of 0 and max of 255

Test Dataset:
Expected min of 0 and max of 255
Expected min of 0 and max of 255
Expected min of 0 and max of 255


In [11]:
#Takes the color channels, combines them into an image, reshapes it to 256x256x3, normalizes it, and one-hot encodes the label
def preprocess_image(red, green, blue, label):
    """Turn three separate color channels and a class index into a training pair.

    Args:
        red: Red channel values of one image.
        green: Green channel values of one image.
        blue: Blue channel values of one image.
        label: Integer class index of the image.

    Returns:
        A tuple ``(image, label)``: ``image`` is a float16 tensor of shape
        (256, 256, 3) holding the channel values divided by 255.0, and
        ``label`` is the one-hot encoding of the class index with depth 6.

    Note:
        Presumably each channel arrives as a flat vector of 256*256 values;
        the reshape below only succeeds when the three channels together
        hold 256*256*3 elements. TODO confirm against the parquet schema.
    """
    # Put the channels side by side on a new trailing axis, then fold the
    # result into height x width x channels.
    stacked_channels = tf.stack([red, green, blue], axis=-1)
    pixels = tf.reshape(stacked_channels, (256, 256, 3))

    # Cast first, then scale by the maximum 8-bit pixel value.
    image = tf.cast(pixels, tf.float16) / 255.0

    # Six classes -> one-hot vector of length 6.
    return image, tf.one_hot(label, depth=6)

In [12]:
#Creates a TensorFlow dataset from a dataframe and applies the preprocessing function to every row.
#The result is split into batches that can be used for training the model later on.

def create_dataset(df, batch_size = 32, shuffle = True, seed = None):
    """Build a batched ``tf.data.Dataset`` of preprocessed examples from a dataframe.

    Args:
        df: DataFrame providing the 'red_channel', 'green_channel',
            'blue_channel' and 'class' columns, one image per row.
        batch_size: Number of examples per batch. The final batch may hold
            fewer examples because incomplete batches are not dropped.
        shuffle: When True, the examples are shuffled before batching using
            a buffer as large as the dataframe.
        seed: Optional integer forwarded to ``Dataset.shuffle`` so the shuffle
            order can be reproduced between runs. The default ``None`` keeps
            the previous behaviour (a different order on every run).

    Returns:
        A ``tf.data.Dataset`` yielding batches of whatever
        ``preprocess_image`` returns for each row.
    """
    dataset = tf.data.Dataset.from_tensor_slices((
        df['red_channel'].tolist(),
        df['green_channel'].tolist(),
        df['blue_channel'].tolist(),
        df['class'].tolist()
    ))

    # Rows are converted in parallel; TensorFlow picks the parallelism level.
    dataset = dataset.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)

    if shuffle:
        # A buffer covering every row gives a full shuffle of the split rather
        # than a windowed one; the seed makes that order repeatable on demand.
        dataset = dataset.shuffle(buffer_size=len(df), seed=seed)

    return dataset.batch(batch_size)

In [13]:
#Creating the datasets with the functions defined above
# Only the training split is shuffled; validation and test keep the row order
# of their dataframes.
train_dataset = create_dataset(train_df, shuffle=True)
val_dataset, test_dataset = (
    create_dataset(split_df, shuffle=False) for split_df in (val_df, test_df)
)

In [14]:
#Saving the data for later usage
# Single place to change where the prepared datasets are written.
TRAINING_DATA_DIR = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS Dataset/data/created_data/for_training'

# Each split is written to its own sub-directory (train / val / test) so it
# can be loaded independently later on.
for split_name, split_dataset in (
    ('train', train_dataset),
    ('val', val_dataset),
    ('test', test_dataset),
):
    split_dataset.save(f'{TRAINING_DATA_DIR}/{split_name}')