# Evaluating the power of classification directly from the SOM clustering

In [4]:
import os
import json
import pandas as pd

from sklearn.preprocessing import LabelEncoder

In [2]:
filename_clusters = "som_iqa_loaded_results_5_clusters.parquet"
filename_mapping = "fits_objects_mapping.parquet.gz"
filename_labels = "map_images_labels.json"

data_path = "/home/mike/git/computational_astro/astro_iqa/data/"
file_path = os.path.join(data_path, "for_modeling")

In [3]:
# Reading clusters from the parquet file
print(f"Reading clusters from {os.path.join(file_path, filename_clusters)}")
clusters = pd.read_parquet(os.path.join(file_path, "Results", filename_clusters))

# Reading fits / ground truth labels
print(f"Reading ground truth labels from {os.path.join(file_path, filename_labels)}")
with open(os.path.join(file_path, filename_labels), 'r') as f:
    labels_gt = json.load(f)

print(clusters)

Reading clusters from /home/mike/git/computational_astro/astro_iqa/data/for_modeling/som_iqa_loaded_results_5_clusters.parquet
Reading ground truth labels from /home/mike/git/computational_astro/astro_iqa/data/for_modeling/map_images_labels.json
                                  BMU     Udist  Ret_x  Ret_y  Cub_x  Cub_y  \
057970e69b654b1ea6066ff6e2ceb89a  240  0.041096     14     15      7    -22   
fe42a43758b8437eb89fb3f4c384f708  240  0.041096     14     15      7    -22   
e0ff049eb93645f68b640c94e865a079   15  0.323520     14      0     14    -14   
fdc5c1ff1e9048b09c97c55942e9a446   15  0.323520     14      0     14    -14   
a360c4f1d508437b9ae022fb1334d1cb  240  0.041096     14     15      7    -22   
...                               ...       ...    ...    ...    ...    ...   
a441caa248d948efae5087b15d8fa1d4   46  0.360268      0      3     -1     -2   
ba8fcf38be464484ba3b736bbcfa645a  156  0.183080      5     10      0    -10   
0f5406ec5eb84d8a9516d799e848c522  186  0.29

In [6]:
# Reading fits / object mapping
mapping_fits_obj = pd.read_parquet(os.path.join(file_path, filename_mapping), engine='auto')

In [10]:
# Transform into a dictionary in order to get columns
annotations_dict = {
    "Image_id": list(labels_gt["annotations"].keys()),
    "Label": list(labels_gt["annotations"].values())
}

annotations = pd.DataFrame(annotations_dict)

# A few corrections
# split multiple labels into separate columns
annotations[['Label1', 'Label2']] = annotations['Label'].str.split(', ', expand=True)
# Delete the original column
annotations = annotations.drop(columns=['Label'])
# Add a p to the Image_id column
annotations['Image_id'] = annotations['Image_id'].astype(str) + 'p'

print(labels_gt["categories"])

['GOOD', 'B_SEEING', 'BGP', 'BT', 'RBT']


In [12]:
# Encode ground truth labels
label_encoder = LabelEncoder()
label_encoder.fit(labels_gt["categories"])
print(label_encoder.classes_)
print(label_encoder.transform(annotations[1:5]['Label1']))

['BGP' 'BT' 'B_SEEING' 'GOOD' 'RBT']
[2 2 3 2]


In [15]:
for index, row in mapping_fits_obj.iterrows():
    try:
        row["cluster_id"] = clusters[row["OBJECT_ID"]]["5_clusters"]
    except KeyError:
        continue

In [17]:
print(clusters[mapping_fits_obj[1:5]["OBJECT_ID"]])

KeyError: "None of [Index(['fe42a43758b8437eb89fb3f4c384f708', 'e0ff049eb93645f68b640c94e865a079',\n       'fdc5c1ff1e9048b09c97c55942e9a446', 'a360c4f1d508437b9ae022fb1334d1cb'],\n      dtype='object')] are in the [columns]"