# **DATA PREP**

This notebook takes the 300 images collected from JMARS, resizes them to a uniform size, and then applies aggressive augmentation to expand the set roughly 10–15×.


HUMAN

In [1]:
# Mount Google Drive at /content/drive; every dataset path used in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

**RESIZE**

To 128x128 pixels

In [None]:
import os
from PIL import Image
from tqdm import tqdm

# Input root (source PNGs) and output root (resized copies); both contain
# one sub-folder per class.
base_input_path = '/content/drive/MyDrive/Dissertation/DATA'
base_output_path = '/content/drive/MyDrive/Dissertation/Resized2'
classes = ['Impact', 'Volcanic']

# Every image is resized to exactly this (width, height); the original
# aspect ratio is not preserved.
target_size = (128, 128)

for cls in classes:
    input_dir = os.path.join(base_input_path, cls)
    output_dir = os.path.join(base_output_path, cls)
    os.makedirs(output_dir, exist_ok=True)

    # The progress bar counts every directory entry; non-PNG entries are
    # skipped inside the loop.
    for filename in tqdm(os.listdir(input_dir), desc=f"Resizing {cls}"):
        if not filename.lower().endswith('.png'):
            continue
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)

        # Open inside a context manager so the source file handle is released
        # as soon as the pixels have been copied out; convert() returns an
        # independent 3-channel image that stays valid after the file closes.
        with Image.open(input_path) as source:
            img = source.convert('RGB')

        img_resized = img.resize(target_size, Image.BILINEAR)
        img_resized.save(output_path)


**AUGMENTATION**

Expands the 300 resized images to 3,000 by writing 10 augmented variants of each image.

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array
import random
from PIL import ImageFilter

# Augmentation settings: the random geometric and photometric transforms that
# Keras applies each time a batch is drawn from `datagen.flow(...)`.
# (Parameter meanings below follow the Keras ImageDataGenerator reference.)
datagen = ImageDataGenerator(
    rotation_range=360,            # random rotation, in degrees
    width_shift_range=0.1,         # horizontal shift, as a fraction of the width
    height_shift_range=0.1,        # vertical shift, as a fraction of the height
    zoom_range=0.2,                # random zoom factor in [0.8, 1.2]
    brightness_range=[0.8, 1.2],   # random brightness factor drawn from this range
    horizontal_flip=True,          # random left-right mirroring
    vertical_flip=True,            # random top-bottom mirroring
    fill_mode='nearest'            # pixels exposed by a transform copy the nearest edge value
)

# Optional: add slight noise
def add_noise(image_array, noise_level=10):
    """Return ``image_array`` perturbed by zero-mean Gaussian noise.

    Args:
        image_array: Array of pixel values; the result is clipped to the
            0-255 range, so the input is expected to use that scale.
        noise_level: Standard deviation of the noise, in pixel-value units.

    Returns:
        A new ``uint8`` array with the same shape as ``image_array``.
    """
    jitter = np.random.normal(loc=0, scale=noise_level, size=image_array.shape)
    # Do the arithmetic in float32 so negative or >255 sums can be clipped
    # instead of wrapping around in uint8.
    perturbed = image_array.astype(np.float32) + jitter.astype(np.float32)
    clipped = np.clip(perturbed, 0, 255)
    return clipped.astype(np.uint8)

# Optional: add blur
def apply_blur(pil_image, radius=1):
    """Return a Gaussian-blurred version of ``pil_image``.

    Args:
        pil_image: Image object exposing a PIL-style ``filter`` method.
        radius: Radius handed to ``ImageFilter.GaussianBlur``.

    Returns:
        Whatever ``pil_image.filter`` returns for the blur filter.
    """
    gaussian = ImageFilter.GaussianBlur(radius)
    return pil_image.filter(gaussian)

# Paths: read the resized originals and write the augmented variants, with
# one sub-folder per class in both trees.
resize_input_path = '/content/drive/MyDrive/Dissertation/Resized2'
augment_output_path = '/content/drive/MyDrive/Dissertation/Augmented2'
n_aug_per_image = 10  # number of augmented variants written per source image

for cls in classes:
    input_dir = os.path.join(resize_input_path, cls)
    output_dir = os.path.join(augment_output_path, cls)
    os.makedirs(output_dir, exist_ok=True)

    for file in tqdm(os.listdir(input_dir), desc=f"Augmenting {cls}"):
        if not file.lower().endswith('.png'):
            continue

        img_path = os.path.join(input_dir, file)
        # Release the source file handle as soon as the pixels are copied out.
        with Image.open(img_path) as source:
            img = source.convert('RGB')

        # img_to_array yields (height, width, channels). Add the batch axis
        # directly rather than reshaping with img.size, which is
        # (width, height) and would scramble any non-square image.
        x = np.expand_dims(img_to_array(img), axis=0)

        stem = os.path.splitext(file)[0]

        # datagen.flow never terminates on its own; zipping it with a range
        # draws exactly n_aug_per_image batches (none at all when it is 0).
        batches = datagen.flow(x, batch_size=1)
        for i, batch in zip(range(n_aug_per_image), batches):
            # Clip before the cast so an out-of-range value cannot wrap
            # around when narrowed to uint8.
            aug = np.clip(batch[0], 0, 255).astype('uint8')

            # Optional: apply noise and blur, each with 50% probability.
            if random.random() < 0.5:
                aug = add_noise(aug)
            aug_img = Image.fromarray(aug)

            if random.random() < 0.5:
                aug_img = apply_blur(aug_img, radius=1)

            # The "<original stem>_aug_<index>" naming lets later steps
            # recover which source image a variant came from.
            aug_filename = f"{stem}_aug_{i}.png"
            aug_img.save(os.path.join(output_dir, aug_filename))


In [None]:
# Sort the augmented images into train/val/test folders on Google Drive

import os
import shutil
import random

# Where the augmented images are read from and where the split dataset goes.
source_dir = '/content/drive/MyDrive/Dissertation/Augmented2'
base_output = '/content/drive/MyDrive/Dissertation/FinalDataset2'

# Share of each class assigned to every split.
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1

# Output layout is <base_output>/<split>/<class>.
splits = ['train', 'val', 'test']
classes = ['Impact', 'Volcanic']

# Make sure every split/class folder exists before any file is copied.
for split in splits:
    for cls in classes:
        os.makedirs(os.path.join(base_output, split, cls), exist_ok=True)

# Split at the level of the ORIGINAL image, not the individual augmented file.
# Files named "<stem>_aug_<n>.png" are variants of the same source picture;
# shuffling them independently puts near-copies of one picture into train,
# val and test at once, which inflates validation/test scores (data leakage).
#
# NOTE: files already present under base_output from an earlier run are not
# removed here; start from an empty destination so stale copies cannot
# reintroduce leakage.
split_rng = random.Random(42)  # fixed seed so the split is reproducible

for cls in classes:
    class_path = os.path.join(source_dir, cls)
    images = [f for f in os.listdir(class_path) if f.lower().endswith('.png')]

    # Group every file under the stem of the image it was derived from.
    groups = {}
    for img in images:
        stem = os.path.splitext(img)[0]
        original, sep, index = stem.rpartition('_aug_')
        key = original if sep and index.isdigit() else stem
        groups.setdefault(key, []).append(img)

    # os.listdir order is arbitrary; sort before shuffling so the seeded
    # shuffle yields the same assignment on every run.
    group_keys = sorted(groups)
    split_rng.shuffle(group_keys)

    total = len(group_keys)
    train_cut = int(train_ratio * total)
    val_cut = int(val_ratio * total)

    # The test split takes whatever remains after train and val.
    split_groups = {
        'train': group_keys[:train_cut],
        'val': group_keys[train_cut:train_cut + val_cut],
        'test': group_keys[train_cut + val_cut:]
    }

    # Flatten each split's groups back into a list of file names.
    split_data = {
        split: [img for key in split_groups[split] for img in groups[key]]
        for split in splits
    }

    for split in splits:
        for img in split_data[split]:
            src = os.path.join(class_path, img)
            dst = os.path.join(base_output, split, cls, img)
            shutil.copyfile(src, dst)


In [None]:
import os

base_dir = '/content/drive/MyDrive/Dissertation/FinalDataset2'

# Print, for every split, how many .png/.jpg files each class folder holds.
for split in ['train', 'val', 'test']:
    print(f"\n{split.upper()}:")
    for cls in ['Impact', 'Volcanic']:
        class_dir = os.path.join(base_dir, split, cls)
        n_images = sum(
            1 for entry in os.listdir(class_dir)
            if entry.endswith(('.png', '.jpg'))
        )
        print(f"  {cls}: {n_images} images")


In [None]:
# Count the PNG files at each pipeline stage (raw, resized, augmented, final splits).
# NOTE(review): these paths point at Resized/, Augmented/ and FinalDataset/, whereas
# the cells above write to Resized2/, Augmented2/ and FinalDataset2/ - confirm which
# dataset these counts are meant to describe.
!find '/content/drive/MyDrive/Dissertation/DATA/Impact' -name '*.png' | wc -l
!find '/content/drive/MyDrive/Dissertation/DATA/Volcanic' -name '*.png' | wc -l

!find '/content/drive/MyDrive/Dissertation/Resized/Impact' -name '*.png' | wc -l
!find '/content/drive/MyDrive/Dissertation/Resized/Volcanic' -name '*.png' | wc -l


!find '/content/drive/MyDrive/Dissertation/Augmented/Impact' -name '*.png' | wc -l
!find '/content/drive/MyDrive/Dissertation/Augmented/Volcanic' -name '*.png' | wc -l

!find '/content/drive/MyDrive/Dissertation/FinalDataset/train/Volcanic' -name '*.png' | wc -l
!find '/content/drive/MyDrive/Dissertation/FinalDataset/train/Impact' -name '*.png' | wc -l


!find '/content/drive/MyDrive/Dissertation/FinalDataset/val/Volcanic' -name '*.png' | wc -l
!find '/content/drive/MyDrive/Dissertation/FinalDataset/val/Impact' -name '*.png' | wc -l


!find '/content/drive/MyDrive/Dissertation/FinalDataset/test/Volcanic' -name '*.png' | wc -l
!find '/content/drive/MyDrive/Dissertation/FinalDataset/test/Impact' -name '*.png' | wc -l




In [None]:
from PIL import Image
import numpy as np, glob, os

def avg_hash(img, hash_size=8):
    """Compute a simple average hash of an image.

    The image is shrunk to ``hash_size`` x ``hash_size``, converted to
    greyscale, and each pixel is compared with the mean of the thumbnail.

    Args:
        img: Image object exposing PIL-style ``resize`` and ``convert``.
        hash_size: Side length of the thumbnail; the hash has
            ``hash_size ** 2`` entries.

    Returns:
        A tuple of 0/1 integers, one per thumbnail pixel in row-major order
        (1 where the pixel is brighter than the mean).
    """
    thumbnail = img.resize((hash_size, hash_size))
    grey = thumbnail.convert('L')
    pixels = np.array(grey)
    above_mean = pixels > pixels.mean()
    return tuple(above_mean.astype(int).flatten())

root = '/content/drive/MyDrive/Dissertation/FinalDataset2'  # or your dataset path

# Collect every file (any extension) from each split/class folder.
files = []
for split in ['train','val','test']:
    for cls in ['Impact','Volcanic']:
        path = os.path.join(root, split, cls)
        files += glob.glob(os.path.join(path, '*.*'))

# Bucket files by average hash; files sharing a bucket look (near-)identical.
hash_map = {}
unreadable = []
for f in files:
    try:
        # Context manager: close each file once it has been hashed.
        with Image.open(f) as im:
            h = avg_hash(im)
    except Exception as e:
        # Still best-effort, but record the failure instead of hiding it:
        # a silently skipped file would make the duplicate count look cleaner
        # than the dataset really is.
        unreadable.append((f, e))
        continue
    hash_map.setdefault(h, []).append(f)

if unreadable:
    print(f"Skipped {len(unreadable)} unreadable file(s):")
    for bad_file, err in unreadable[:10]:
        print("  ", bad_file, "->", repr(err))

collisions = [lst for lst in hash_map.values() if len(lst) > 1]
print("Groups of visually identical/near-duplicate images:", len(collisions))
for grp in collisions[:10]:
    print(grp)


In [None]:
import os, glob, re
from collections import defaultdict

# Root of the split dataset to audit; expected layout is <root>/<split>/<class>/.
root = '/content/drive/MyDrive/Dissertation/FinalDataset2'  # replace with your actual FinalDataset path (FinalDataset, FinalDataset2, etc.)

# Split and class folder names that the scan below walks through.
splits = ['train','val','test']
classes = ['Impact','Volcanic']

# prefix_splits:   original-image prefix -> set of split names it was seen in
# prefix_examples: original-image prefix -> up to five (split, file path) samples
prefix_splits = defaultdict(set)
prefix_examples = defaultdict(list)

def get_prefix(fn):
    """Return the stem of the original image that file ``fn`` was derived from.

    Augmented files are expected to be named ``<stem>_aug_<index>.<ext>``;
    only that trailing ``_aug_<digits>`` marker is stripped. A name that
    merely contains ``_aug_`` somewhere else is left intact, so distinct
    originals are never collapsed into one prefix.

    Args:
        fn: File name or path.

    Returns:
        The original stem for an augmented file, otherwise the file's own
        name without its extension.
    """
    base = os.path.basename(fn)
    name = os.path.splitext(base)[0]
    # Split at the LAST "_aug_" and require a numeric index after it.
    stem, sep, index = name.rpartition('_aug_')
    if sep and index.isdigit():
        return stem
    # if you used other separators, add here
    return name

# Walk every split/class folder and record which splits each original image
# (identified by its prefix) contributes files to.
for split in splits:
    for cls in classes:
        folder = os.path.join(root, split, cls)
        if not os.path.exists(folder):
            continue
        for f in glob.glob(os.path.join(folder, '*.*')):
            # Qualify the prefix with its class: the same stem appearing in
            # both class folders is two different source images and must not
            # be reported as leakage.
            p = f"{cls}/{get_prefix(f)}"
            prefix_splits[p].add(split)
            if len(prefix_examples[p]) < 5:
                prefix_examples[p].append((split, f))

# find prefixes that appear in multiple splits
multi_split_prefixes = [p for p, sset in prefix_splits.items() if len(sset) > 1]
print("Total unique original prefixes found:", len(prefix_splits))
print("Prefixes appearing in more than one split:", len(multi_split_prefixes))
if multi_split_prefixes:
    print("Examples (prefix -> splits -> sample files):")
    for p in multi_split_prefixes[:20]:
        print(p, "->", prefix_splits[p])
        for ex in prefix_examples[p]:
            print("   ", ex)
else:
    print("No original-prefix appears in multiple splits. Good — no grouped-sibling leakage detected.")


In [None]:
from collections import Counter
# Per-split tally of the files found in each class folder.
for split in ['train','val','test']:
    counts = Counter({
        cls: len(glob.glob(os.path.join(root, split, cls, '*.*')))
        for cls in ['Impact','Volcanic']
    })
    print(split, counts)
