Imports

In [None]:
# imports
import glob
import re
from pathlib import Path

# import cv2
import pandas as pd
# import numpy as np
# import matplotlib.pyplot as plt
# from scipy.spatial import distance
# from sklearn.model_selection import train_test_split
from skimage import measure, filters
from skimage.filters import gaussian
import tensorflow as tf
import random
from PIL import Image
# import os
import torchvision.transforms as transforms
from tensorflow.keras.preprocessing.image import ImageDataGenerator

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Augment data

In [None]:
# Data files are expected in the directory layout specified in the repository.
# gz2_hart16 table downloaded from https://data.galaxyzoo.org/#section-7
df_hart16 = pd.read_csv('../data/gz2_hart16.csv')

# Images and filename mapping downloaded from
# https://www.kaggle.com/datasets/jaimetrickz/galaxy-zoo-2-images/data
df_mappings = pd.read_csv('../data/gz2_filename_mapping.csv')

# Keep the four leading identifier columns plus every column whose name
# ends in '_debiased'.
debiased_columns = [name for name in df_hart16.columns if name.endswith('_debiased')]
columns_to_keep = ["dr7objid", "ra", "dec", "gz2_class"] + debiased_columns
filtered_hart16 = df_hart16[columns_to_keep]

# Join the filename mapping to the filtered table on the object id
# (default inner join, so only ids present in both tables survive).
df_merged = df_mappings.merge(filtered_hart16, left_on='objid', right_on='dr7objid')

# Flag rows that contain at least one missing value, count them, then
# drop them from the merged table.
null_rows = df_merged.isna().any(axis=1)
num_null_rows = null_rows.sum()
df_merged.dropna(inplace=True)

print(df_merged.shape)

# Every JPEG in the image folder.
image_files = glob.glob('../data/images_gz2/images/*.jpg')
print("Image count:", len(image_files))

# Matches paths whose file name is purely numeric, capturing the number.
regex = re.compile(r".*[/\\](\d+)\.jpg")

# The numeric image name corresponds to asset_id in the data table.
# Paths that do not match the pattern are skipped; the result is sorted.
name_matches = (regex.search(path) for path in image_files)
image_names = sorted(int(found.group(1)) for found in name_matches if found)

print(df_merged.head())

# Number of distinct gz2_class labels left after the merge and null drop.
class_counts = df_merged['gz2_class'].value_counts()
print("Number of classes:", len(class_counts))

Augment images

In [None]:
# Keras helpers for loading an image and converting to and from arrays.
from tensorflow.keras.preprocessing.image import array_to_img
from tensorflow.keras.utils import img_to_array, load_img

# Random augmentation pipeline used to generate the preview samples below.
datagen = ImageDataGenerator(
    # Scale pixel values by 1/255.
    rescale=1.0 / 255,
    # Randomly mirror images left-right.
    horizontal_flip=True,
    # NOTE(review): Keras appears to draw the angle from
    # [-rotation_range, +rotation_range], i.e. -90..90 degrees rather
    # than 0..90 - confirm against the Keras documentation.
    rotation_range=90,
    # NOTE(review): in Keras, zoom factors above 1 reportedly zoom OUT
    # (the subject gets smaller); values below 1 zoom in - confirm that
    # (1.1, 1.3) matches the intended effect.
    zoom_range=(1.1, 1.3),
    # Very mild random brightness change.
    brightness_range=(0.98, 1.02),
    # Random shear transformation.
    shear_range=0.2,
)

# Load one sample image, resized to 256x256, as an array with a leading
# batch axis, which is the input shape passed to datagen.flow below.
image_path = "../data/images_gz2/images/223272.jpg"
img = load_img(image_path, target_size=(256, 256))
img_array = img_to_array(img)[None, ...]

# Draw exactly five augmented samples and display each one.
augmented_images = datagen.flow(img_array, batch_size=1)
for _, batch in zip(range(5), augmented_images):
    augmented_img = array_to_img(batch[0])  # back to a PIL image
    augmented_img.show()
