In [2]:
import os
import sys

# Only the TensorFlow backend supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"

sys.path.append(os.path.abspath("../../src/"))
from data_acquisition_understanding.dnn_datasets_preparation import read_and_concat_catalogs, clean_and_split_catalog

## Loading and preparing the datasets

Reading and merging catalog and mapping files

In [3]:
columns = ["OBJECT_ID", "FITS_ID", "CCD_ID", "ISO0", "BACKGROUND", "ELLIPTICITY", "ELONGATION", "CLASS_STAR", "FLAGS", "EXPTIME"]
data_path = "../../data/"
proc_path = os.path.join(data_path, "processed")
fm_path = os.path.join(data_path, "for_modeling")


In [4]:
catalog_paths = [
        os.path.join(fm_path, "objects_catalog_cadc2.parquet.gz"),
        os.path.join(fm_path, "objects_catalog_ngc0869.parquet.gz"),
        os.path.join(fm_path, "objects_catalog_ngc0896.parquet.gz"),
        os.path.join(fm_path, "objects_catalog_ngc7000.parquet.gz")
    ]

catalog = read_and_concat_catalogs(catalog_paths) 

In [8]:
print(catalog.info())
print(catalog.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2118974 entries, 0 to 2118973
Data columns (total 12 columns):
 #   Column       Dtype  
---  ------       -----  
 0   OBJECT_ID    object 
 1   FITS_ID      object 
 2   CCD_ID       uint8  
 3   ISO0         float32
 4   BACKGROUND   float32
 5   ELLIPTICITY  float32
 6   ELONGATION   float32
 7   CLASS_STAR   float32
 8   FLAGS        int16  
 9   EXPTIME      float32
 10  gt_label1    object 
 11  gt_label2    object 
dtypes: float32(6), int16(1), object(4), uint8(1)
memory usage: 119.2+ MB
None
             CCD_ID          ISO0    BACKGROUND   ELLIPTICITY    ELONGATION  \
count  2.118974e+06  2.118974e+06  2.118974e+06  2.118974e+06  2.118974e+06   
mean   1.899900e+01  1.737785e+00  1.851034e+00  4.418779e-01  2.837024e+00   
std    1.183598e+01  5.842272e-01  6.844337e-01  2.466583e-01  3.821320e+01   
min    1.000000e+00  4.771213e-01 -2.000000e+00  1.000000e-05  1.000000e+00   
25%    9.000000e+00  1.301030e+00  2.000000e+00  

In [10]:
for col in catalog.columns:
    print(col, catalog[col].value_counts())

OBJECT_ID OBJECT_ID
a297516c3e50409b977becdf59005576    1
057970e69b654b1ea6066ff6e2ceb89a    1
fe42a43758b8437eb89fb3f4c384f708    1
e0ff049eb93645f68b640c94e865a079    1
fdc5c1ff1e9048b09c97c55942e9a446    1
                                   ..
471e392ecdd341dbbe5fdffbc3f2aa55    1
6d50600e12534934843c8b43f27cba4c    1
353bbb9417e94c6f92e4df8bc88d23f4    1
473fd6881a2d4734809e3b429f845742    1
a882a1bbaeb448a5a9a8accefc2b18b3    1
Name: count, Length: 2118974, dtype: int64
FITS_ID FITS_ID
1625632p                    215728
1625633p                    210245
1635753p                    180874
2120820p                    120101
1110042p                    119992
                             ...  
ngc7000_080624_raw_00085        28
ngc7000_080624_raw_00081        25
ngc0869_030524_raw_00016        25
ngc7000_080624_raw_00076        25
ngc0869_030524_raw_00014        22
Name: count, Length: 200, dtype: int64
CCD_ID CCD_ID
1     74484
2     64062
8     63655
20    61925
9     61533
31   

In [13]:
# looking for images that would not be annotated
print(catalog[catalog["gt_label1"].isna()]["FITS_ID"].value_counts())

FITS_ID
1625589p    107977
1625588p    103320
Name: count, dtype: int64


In [14]:
# correcting the labels
catalog.loc[catalog["FITS_ID"] == "1625589p", "gt_label1"] = "GOOD"
catalog.loc[catalog["FITS_ID"] == "1625588p", "gt_label1"] = "B_SEEING"

In [15]:
# Checking the correction
print(catalog[catalog["gt_label1"].isna()]["FITS_ID"].value_counts())

Series([], Name: count, dtype: int64)


In [16]:
catalog_cleaned = catalog.dropna(subset=["gt_label1"])
catalog_cleaned = catalog_cleaned.drop(columns=["gt_label2", "OBJECT_ID"])

In [6]:
train_df, val_df, test_df, class_weights = clean_and_split_catalog(
    catalog,
    "gt_label1",
    ["gt_label2", "OBJECT_ID"],
    0.6,
    0.5,
    42
)

Class weights:
{'GOOD': 0.5751560667765035, 'BT': 0.206570084977698, 'RBT': 0.1958864105401491, 'B_SEEING': 0.01747255955803839, 'BGP': 0.004914878147610943}
-----------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df[col] = train_df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df[col] = val_df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df[col] = test_df[col].astype(str)


In [7]:
# verifying all the classes are present in the datasets
class_weights_train = train_df["gt_label1"].value_counts(normalize=True)
class_weights_train = class_weights_train.to_dict()
print("Class weights train:")
print(class_weights_train)
print("-----------------")
class_weights_val = val_df["gt_label1"].value_counts(normalize=True)
class_weights_val = class_weights_val.to_dict()
print("Class weights val:")
print(class_weights_val)
print("-----------------")
class_weights_test = test_df["gt_label1"].value_counts(normalize=True)
class_weights_test = class_weights_test.to_dict()
print("Class weights test:")
print(class_weights_test)
print("-----------------")

Class weights train:
{'GOOD': 0.5747855150868505, 'BT': 0.20662847004545526, 'RBT': 0.1960496008912933, 'B_SEEING': 0.017632903810523338, 'BGP': 0.0049035101658776465}
-----------------
Class weights val:
{'GOOD': 0.5757791204380233, 'BT': 0.206528896782775, 'RBT': 0.1952844816051757, 'B_SEEING': 0.017396574205007256, 'BGP': 0.005010926969018829}
-----------------
Class weights test:
{'GOOD': 0.5756463662882206, 'BT': 0.20643585654026148, 'RBT': 0.1959979853518289, 'B_SEEING': 0.017066798178422278, 'BGP': 0.00485299364126671}
-----------------


In [None]:
import pandas as pd

# assuming your datasets are DataFrames named training_df, validation_df, testing_df
for df, name in [(train_df, "Training"), (val_df, "Validation"), (test_df, "Testing")]:
    for feature in ["FITS_ID", "CCD_ID", "FLAGS"]:
        unique_values = df[feature].nunique()
        print(f"{name} - {feature}: {unique_values} unique values")

        # Optionally, to view the unique values:
        # print(f"Unique values: {df[feature].unique()}")

IndentationError: unexpected indent (3032459191.py, line 4)