# Evaluating the power of classification directly from the SOM clustering

In [1]:
import os
import json
import pandas as pd

In [10]:
# Names of the input files used by this notebook
filename_clusters = "som_iqa_loaded_results_5_clusters.parquet"
filename_mapping = "fits_objects_mapping.parquet.gz"
filename_labels = "map_images_labels.json"

# Root data directory. The default is the original author's local checkout;
# set the ASTRO_IQA_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
data_path = os.environ.get(
    "ASTRO_IQA_DATA_PATH",
    "/home/mike/git/computational_astro/astro_iqa/data/",
)
# Sub-folder of the data directory that holds the modeling inputs
file_path = os.path.join(data_path, "for_modeling")

In [14]:
# Reading clusters from the parquet file.
# The clusters file sits in the "Results" sub-folder; the path is built once so
# that the logged path is exactly the one that is read.
clusters_path = os.path.join(file_path, "Results", filename_clusters)
print(f"Reading clusters from {clusters_path}")
clusters = pd.read_parquet(clusters_path)

# Reading fits / ground truth labels
labels_path = os.path.join(file_path, filename_labels)
print(f"Reading ground truth labels from {labels_path}")
# JSON is UTF-8 by specification; be explicit instead of relying on the locale default
with open(labels_path, 'r', encoding="utf-8") as f:
    labels_gt = json.load(f)



Reading clusters from /home/mike/git/computational_astro/astro_iqa/data/for_modeling/som_iqa_loaded_results_5_clusters.parquet
Reading ground truth labels from /home/mike/git/computational_astro/astro_iqa/data/for_modeling/map_images_labels.json


In [17]:
# Load the FITS / object mapping table from its parquet file
mapping_path = os.path.join(file_path, filename_mapping)
mapping_fits_obj = pd.read_parquet(mapping_path, engine="auto")

In [23]:
# Turn the {image id: label string} mapping into a two-column DataFrame
annotations_map = labels_gt["annotations"]
annotations = pd.DataFrame({
    "Image_id": list(annotations_map.keys()),
    "Label": list(annotations_map.values()),
})

# A few corrections
# Split multiple labels into separate columns.
# `n=1` caps the split at two parts and `reindex` guarantees that both columns
# exist, so this works whether or not any entry actually carries two labels.
# If an entry has more than two labels, everything after the first stays in Label2.
label_parts = (
    annotations["Label"]
    .str.split(", ", n=1, expand=True)
    .reindex(columns=[0, 1])
)
annotations["Label1"] = label_parts[0]
annotations["Label2"] = label_parts[1]
# Delete the original column
annotations = annotations.drop(columns=["Label"])
# Add a p to the Image_id column (these ids are later compared with the mapping's
# FITS_ID values, which presumably carry this suffix -- TODO confirm)
annotations["Image_id"] = annotations["Image_id"].astype(str) + "p"


In [27]:
# We add the ground truth labels to the mapping
def add_ground_truth_labels(row, annotations):
    """Return the ground-truth labels of the image referenced by one mapping row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a "FITS_ID" entry identifying the image.
    annotations : pandas.DataFrame
        Table with the columns "Image_id", "Label1" and "Label2".

    Returns
    -------
    list
        ``[label1, label2]`` taken from the first annotation row whose
        "Image_id" equals the row's "FITS_ID". ``[None, None]`` is returned
        when the image has no annotation, so one unlabeled image does not
        abort a whole ``DataFrame.apply`` pass.
    """
    image_id = row["FITS_ID"]
    # Filter the annotation table once and read both labels from the same match
    matches = annotations.loc[annotations["Image_id"] == image_id, ["Label1", "Label2"]]
    if matches.empty:
        return [None, None]
    first_match = matches.iloc[0]
    return [first_match["Label1"], first_match["Label2"]]

# Look up both labels for every mapping row (row-wise, hence axis=1) and
# expand the returned pair into two new columns of the mapping table
gt_labels = mapping_fits_obj.apply(
    add_ground_truth_labels,
    axis=1,
    result_type="expand",
    args=(annotations,),
)
mapping_fits_obj[["gt_label1", "gt_label2"]] = gt_labels


In [None]:
# Write the mapping table (now carrying the ground-truth label columns) as a
# gzip-compressed parquet file. NOTE: this is the same path the mapping was
# read from, so the input file is overwritten.
mapping_out_path = os.path.join(file_path, filename_mapping)
mapping_fits_obj.to_parquet(mapping_out_path, engine="auto", compression="gzip")