In [1]:
from astropy.table import Table
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import os
import sys
# True when the `google.colab` module is already loaded in this interpreter;
# the next cell uses this flag to choose between the Colab and the local root path.
IN_COLAB = 'google.colab' in sys.modules

In [2]:
# Resolve the project root depending on where the notebook is running.
if IN_COLAB:
    # NOTE(review): `iqa_root` is not defined in any cell shown in this notebook;
    # it must be set before this cell runs on Colab, otherwise this raises NameError.
    root_path = "/content/"+iqa_root
else:
    root_path = "/home/mike/git/computational_astro/astro_iqa"

data_path = "data/processed"
catalog_name = "som_objects_catalog.hdf5"

# Read the catalog with astropy (the pandas HDF5 reader below is kept only for reference).
filename = os.path.join(root_path, data_path, catalog_name)
print(filename)
# Only referenced by the commented-out pandas reader; the astropy read loads every column.
columns = ["OBJECT_ID", "FITS_ID", "CCD_ID", "ISO0", "BACKGROUND", "ELLIPTICITY", "ELONGATION", "CLASS_STAR", "FLAGS", "EXPTIME"]
# catalog = pd.read_hdf(filename, columns=columns)
catalog = Table.read(filename, path="som_catalog", format="hdf5")

print("Overview of the catalog:")
# `Table.info()` writes the summary to stdout itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
catalog.info()

/home/mike/git/computational_astro/astro_iqa/data/processed/som_objects_catalog.hdf5
Overview of the catalog:
<Table length=2104028>
    name     dtype 
----------- -------
    FITS_ID bytes12
     CCD_ID   uint8
  OBJECT_ID bytes32
       ISO0 float32
 BACKGROUND float32
ELLIPTICITY float32
 ELONGATION float32
 CLASS_STAR float32
      FLAGS   int16
    EXPTIME float32
    X_IMAGE float32
    Y_IMAGE float32
None


In [3]:
# Select the columns for the definition of fits/objects mapping.
# The selection is done on the astropy table *before* converting, so only three
# columns are materialised in pandas and `catalog_df` is a standalone frame
# (assigning into a column subset of another DataFrame triggers SettingWithCopyWarning).
mapping_names = ["OBJECT_ID", "FITS_ID", "CCD_ID"]
catalog_df = catalog[mapping_names].to_pandas()

# The identifier columns are stored as bytes in the table; decode them to str
# with the vectorised string accessor instead of a Python-level lambda per row.
catalog_df["OBJECT_ID"] = catalog_df["OBJECT_ID"].str.decode("utf-8")
catalog_df["FITS_ID"] = catalog_df["FITS_ID"].str.decode("utf-8")

# Quick sanity check: first rows, plus a slice from the middle of the catalog.
print(catalog_df.head())
print(catalog_df[500000:500005])

                          OBJECT_ID   FITS_ID  CCD_ID
0  057970e69b654b1ea6066ff6e2ceb89a  1013974p       1
1  fe42a43758b8437eb89fb3f4c384f708  1013974p       1
2  e0ff049eb93645f68b640c94e865a079  1013974p       1
3  fdc5c1ff1e9048b09c97c55942e9a446  1013974p       1
4  a360c4f1d508437b9ae022fb1334d1cb  1013974p       1
                               OBJECT_ID   FITS_ID  CCD_ID
500000  c6c3b983afa9452f9b3cab955b41cbcc  1625583p       8
500001  328fb32cc7bd417685c7822af50b7278  1625583p       8
500002  f5f936fab1e44b088dffdc09be0dc449  1625583p       8
500003  0d10fd417e7044228f1b8be0561c4336  1625583p       8
500004  e9efd5dfd0524187bbdecb7e8118d2c2  1625583p       8


In [11]:
# Persist the OBJECT_ID / FITS_ID / CCD_ID mapping as a gzip-compressed parquet file
# under <root_path>/data/for_modeling.
modeling_dir = os.path.join(root_path, "data/for_modeling")
mapping_filename = os.path.join(modeling_dir, "fits_objects_mapping.parquet.gz")
catalog_df.to_parquet(
    mapping_filename,
    engine="auto",
    compression="gzip",
)

## Adding the ground truth annotations to the mapping table

In [4]:
# File names of the two artefacts used in this section.
filename_mapping = "fits_objects_mapping.parquet.gz"
filename_labels = "map_images_labels.json"

# Derive the data directory from `root_path` (set in the path-configuration cell)
# instead of repeating the absolute local path, so the Colab and local runs read
# from the same tree the mapping was written to. The trailing "" keeps the
# trailing separator the previous hardcoded value had.
data_path = os.path.join(root_path, "data", "")
file_path = os.path.join(data_path, "for_modeling")

In [5]:
# Reading fits / ground truth labels
# Build the path once so the log line and the actual read cannot diverge.
labels_path = os.path.join(file_path, filename_labels)
print(f"Reading ground truth labels from {labels_path}")
# JSON text is UTF-8; pass the encoding explicitly rather than relying on the
# platform's default locale encoding.
with open(labels_path, "r", encoding="utf-8") as f:
    labels_gt = json.load(f)



Reading ground truth labels from /home/mike/git/computational_astro/astro_iqa/data/for_modeling/map_images_labels.json


In [None]:
# Reading fits / object mapping (disabled: the cells below work on the in-memory `catalog_df` built above)
# mapping_fits_obj = pd.read_parquet(os.path.join(file_path, filename_mapping), engine='auto')

In [6]:
# Transform into a dictionary in order to get columns
annotations_dict = {
    "Image_id": list(labels_gt["annotations"].keys()),
    "Label": list(labels_gt["annotations"].values())
}

annotations = pd.DataFrame(annotations_dict)

# A few corrections
# split multiple labels into separate columns
annotations[['Label1', 'Label2']] = annotations['Label'].str.split(', ', expand=True)
# Delete the original column
annotations = annotations.drop(columns=['Label'])
# Add a p to the Image_id column
annotations['Image_id'] = annotations['Image_id'].astype(str) + 'p'


In [8]:
# We add the ground truth labels to the mapping
def add_ground_truth_labels(row, annotations):
    """Return the ground truth labels of the image a catalog row belongs to.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row["FITS_ID"]`` (e.g. a DataFrame row).
    annotations : pandas.DataFrame
        Frame with the columns ``Image_id``, ``Label1`` and ``Label2``.

    Returns
    -------
    list
        ``[label1, label2]`` taken from the first annotation whose
        ``Image_id`` equals the row's ``FITS_ID``.

    Raises
    ------
    IndexError
        If no annotation matches the row's ``FITS_ID``.
    """
    image_id = row["FITS_ID"]
    # Evaluate the match once and pull both label columns from it, instead of
    # scanning the whole annotations frame separately for each label.
    matches = annotations.loc[
        annotations["Image_id"] == image_id, ["Label1", "Label2"]
    ].to_numpy()
    if len(matches) == 0:
        # Same exception type as indexing an empty result, but naming the culprit.
        raise IndexError(f"No ground truth annotation found for FITS_ID {image_id!r}")
    label1, label2 = matches[0]
    return [label1, label2]

# Attach the labels with an indexed lookup instead of a row-wise apply: the
# apply re-scanned the whole annotations frame for every catalog row.
# Keeping the first annotation per Image_id gives the lookup a unique index.
gt_lookup = annotations.drop_duplicates(subset="Image_id").set_index("Image_id")

# Fail loudly (and before modifying catalog_df) when an image has no annotation.
unlabelled = ~catalog_df["FITS_ID"].isin(gt_lookup.index)
if unlabelled.any():
    missing_ids = catalog_df.loc[unlabelled, "FITS_ID"].unique()[:10].tolist()
    raise IndexError(f"No ground truth annotation found for FITS_ID(s): {missing_ids}")

catalog_df["gt_label1"] = catalog_df["FITS_ID"].map(gt_lookup["Label1"])
catalog_df["gt_label2"] = catalog_df["FITS_ID"].map(gt_lookup["Label2"])


In [None]:
# Write catalog_df to <file_path>/<filename_mapping> as gzip-compressed parquet.
labelled_mapping_path = os.path.join(file_path, filename_mapping)
catalog_df.to_parquet(
    labelled_mapping_path,
    engine="auto",
    compression="gzip",
)