In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
class_mapping = str({'Cloud': 0, 'Dust': 1, 'Haze': 2, 'Land': 3, 'Seaside': 4, 'Smoke': 5})

df = pd.read_parquet('/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/created_data/smoke_data/smoke_data.parquet')
display(df.head())
print(f'\n--- CLASS MAPPING --- \n{class_mapping}\n\n')
print(f'--- DF SHAPE BEFORE AUGMENTATION --- \n{df.shape}')

Unnamed: 0,class,red_channel,green_channel,blue_channel
0,3,"[108, 107, 114, 126, 115, 110, 117, 125, 127, ...","[92, 90, 97, 109, 99, 97, 102, 109, 111, 98, 9...","[72, 72, 76, 85, 77, 75, 80, 86, 88, 79, 80, 8..."
1,3,"[44, 50, 50, 53, 43, 37, 39, 40, 43, 49, 53, 5...","[52, 59, 61, 66, 55, 48, 47, 41, 43, 49, 57, 5...","[44, 50, 48, 51, 43, 40, 43, 48, 54, 61, 60, 4..."
2,3,"[40, 41, 40, 46, 51, 48, 47, 48, 39, 30, 38, 4...","[32, 33, 34, 40, 44, 42, 42, 42, 33, 25, 32, 3...","[20, 21, 23, 28, 32, 31, 30, 30, 23, 15, 19, 2..."
3,3,"[139, 144, 154, 154, 155, 154, 151, 146, 145, ...","[105, 107, 115, 116, 117, 117, 115, 113, 113, ...","[86, 86, 92, 93, 94, 94, 94, 93, 92, 89, 87, 8..."
4,3,"[40, 32, 57, 52, 37, 38, 43, 33, 19, 24, 20, 1...","[39, 33, 56, 53, 37, 38, 42, 33, 19, 21, 18, 1...","[37, 30, 52, 50, 35, 36, 40, 31, 17, 19, 15, 9..."



--- CLASS MAPPING --- 
{'Cloud': 0, 'Dust': 1, 'Haze': 2, 'Land': 3, 'Seaside': 4, 'Smoke': 5}


--- DF SHAPE BEFORE AUGMENTATION --- 
(6225, 4)


In [2]:
def augment_data(df):
    augmented_data = []
    for _, row in df.iterrows():
        original_image = np.stack([row['red_channel'], row['green_channel'], row['blue_channel']], axis=-1)
        original_image = original_image.reshape((256, 256, 3))
        
        # Original image
        augmented_data.append({
            'red_channel': original_image[:,:,0].flatten(),
            'green_channel': original_image[:,:,1].flatten(),
            'blue_channel': original_image[:,:,2].flatten(),
            'class': row['class']
        })
        
        # Mirrored image
        mirrored = np.fliplr(original_image)
        augmented_data.append({
            'red_channel': mirrored[:,:,0].flatten(),
            'green_channel': mirrored[:,:,1].flatten(),
            'blue_channel': mirrored[:,:,2].flatten(),
            'class': row['class']
        })
        
        # Rotations of mirrored image (90, 180, 270 degrees)
        for k in [1, 2, 3]:  # 90, 180, 270 degrees
            rotated = np.rot90(mirrored, k=k)
            augmented_data.append({
                'red_channel': rotated[:,:,0].flatten(),
                'green_channel': rotated[:,:,1].flatten(),
                'blue_channel': rotated[:,:,2].flatten(),
                'class': row['class']
            })
    
    return pd.DataFrame(augmented_data)

In [3]:
augmented_df = augment_data(df)
print(f'\n--- DF SHAPE AFTER AUGMENTATION --- \n{augmented_df.shape}')


--- DF SHAPE AFTER AUGMENTATION --- 
(31125, 4)


In [4]:
from sklearn.model_selection import train_test_split
import os
total_size = augmented_df.shape[0]
train_size = int(total_size * 0.8)
val_size = (total_size - train_size) // 2
test_size = total_size - train_size - val_size

# Verify that all samples are accounted for 
assert train_size + val_size + test_size == total_size
print('Full augmented dataframe size utilized.') 

X = augmented_df[['red_channel','green_channel','blue_channel']]
y = augmented_df['class']

# First split: Creating X_train
X_train, X_val_test, y_train, y_val_test = train_test_split(X, y, train_size=train_size, 
                                                            shuffle=True, stratify=y, random_state=1)

# Second split: create X_val and X_test
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, train_size=val_size, 
                                                shuffle=True, stratify=y_val_test, random_state=1)

train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Assert that the DF's shapes add up
assert train_df.shape[0] + val_df.shape[0] + test_df.shape[0] == augmented_df.shape[0]
print('Dataframes\' sizes add up to total augmented df size.')

# Assert no overlap between train, val and test set
train_idx = set(train_df.index)
val_idx = set(val_df.index)
test_idx = set(test_df.index)

assert len(train_idx.intersection(val_idx)) == 0
assert len(train_idx.intersection(test_idx)) == 0
assert len(val_idx.intersection(test_idx)) == 0

# Assert that all augmented indices are accounted for
all_split_indices = train_idx.union(val_idx).union(test_idx)
assert len(all_split_indices) == augmented_df.shape[0]

print("No index overlap detected.")
print()
print('--- TRAIN - VAL - TEST SIZES ---')
print(f'Train size: {train_df.shape[0]}')
print(f'Validation size: {val_df.shape[0]}')
print(f'Test size: {test_df.shape[0]}')


path = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/created_data/train_val_test_split'

train_df.to_parquet(os.path.join(path, 'train.parquet'))
val_df.to_parquet(os.path.join(path, 'val.parquet'))
test_df.to_parquet(os.path.join(path, 'test.parquet'))

Full augmented dataframe size utilized.
Dataframes' sizes add up to total augmented df size.
No index overlap detected.

--- TRAIN - VAL - TEST SIZES ---
Train size: 24900
Validation size: 3112
Test size: 3113
