In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder


In [2]:
# Root folder of the raw challenge data: it holds train_log.csv / test_log.csv
# and one split_XX sub-folder per lightcurve split.
DATA_DIR = "D:/MALLORN-Astronomical-Classification-Challenge/data/raw"

In [3]:
# Metadata tables, one row per object. The "English Translation" column is
# dropped immediately after loading.
train_log = pd.read_csv(os.path.join(DATA_DIR, "train_log.csv")).drop(columns=["English Translation"])
test_log = pd.read_csv(os.path.join(DATA_DIR, "test_log.csv")).drop(columns=["English Translation"])

# Lightcurves are stored in split_01 ... split_20; position i of each list
# holds the frame read from split_{i+1:02d}.
n_splits = 20
lc_tr = []
lc_te = []
for i in range(1, n_splits + 1):
    split_dir = os.path.join(DATA_DIR, f"split_{i:02d}")
    lc_tr.append(pd.read_csv(os.path.join(split_dir, "train_full_lightcurves.csv")))
    lc_te.append(pd.read_csv(os.path.join(split_dir, "test_full_lightcurves.csv")))

In [None]:
# Integer column index assigned to each value of the "Filter" column.
_FILTER_INDEX = {"u": 0, "g": 1, "r": 2, "i": 3, "z": 4, "y": 5}


def _add_log_columns(lc):
    """Encode ``Filter`` and add ``Flux_Log`` / ``Err_Log`` to ``lc`` in place; return ``lc``."""
    # Mapping an already encoded column again would turn every value into NaN,
    # so only map while the column still holds the filter letters. This keeps
    # the function safe to re-run on the same frames.
    if not pd.api.types.is_numeric_dtype(lc["Filter"]):
        lc["Filter"] = lc["Filter"].map(_FILTER_INDEX)

    flux = lc["Flux"].to_numpy()
    # Signed log: compresses the magnitude while keeping the sign of the flux.
    lc["Flux_Log"] = np.sign(flux) * np.log1p(np.abs(flux))
    lc["Err_Log"] = np.log1p(lc["Flux_err"].to_numpy())
    return lc


def _object_matrix(obj_df):
    """Build one object's array: rows are rounded relative times, columns are 6 flux then 6 error values."""
    rel_time = (obj_df["Time"] - obj_df["Time"].min()).round(0)
    # assign() returns a new frame, so the groupby slice is never written to
    # (avoids SettingWithCopyWarning).
    timed = obj_df.assign(Time=rel_time)

    halves = []
    for value_col in ("Flux_Log", "Err_Log"):
        # pivot_table averages rows that share the same (Time, Filter) cell.
        table = timed.pivot_table(index="Time", columns="Filter", values=value_col)
        # Force all six filter columns to exist, in a fixed order.
        halves.append(table.reindex(columns=range(len(_FILTER_INDEX))))

    # Fill after concatenating so rows present in only one half are zeroed too.
    return pd.concat(halves, axis=1).fillna(0).to_numpy()


def _export_split(lc, out_dir):
    """Write one ``<object_id>.npy`` file into ``out_dir`` for every object in ``lc``."""
    for obj_id, obj_df in lc.groupby("object_id"):
        np.save(os.path.join(out_dir, f"{obj_id}.npy"), _object_matrix(obj_df))


def preprocess_and_save(processed_dir="D:/MALLORN-Astronomical-Classification-Challenge/data/processed"):
    """Encode the log labels and export one matrix file per lightcurve object.

    Reads the notebook-level ``train_log``, ``test_log``, ``lc_tr``, ``lc_te``
    and ``n_splits``.

    Parameters
    ----------
    processed_dir : str
        Folder that receives the ``train`` and ``test`` sub-folders. Both are
        created if missing.

    Side effects
    ------------
    * ``train_log["SpecType"]`` and ``test_log["SpecType"]`` are replaced by
      integer codes from a ``LabelEncoder`` fitted on the train log.
    * Every frame in ``lc_tr`` / ``lc_te`` gets its ``Filter`` column mapped
      to integers and gains ``Flux_Log`` and ``Err_Log`` columns.
    * One ``<object_id>.npy`` file is written per object and split.

    Returns
    -------
    None
    """
    # Encode SpecType in the train and test logs with a shared encoder.
    # NOTE(review): LabelEncoder.transform raises ValueError if test_log holds
    # a SpecType value that never appears in train_log - confirm the test log
    # really carries usable labels.
    le = LabelEncoder()
    train_log["SpecType"] = le.fit_transform(train_log["SpecType"])
    test_log["SpecType"] = le.transform(test_log["SpecType"])

    train_dir = os.path.join(processed_dir, "train")
    test_dir = os.path.join(processed_dir, "test")
    # np.save does not create missing folders.
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    # Each split is transformed and exported exactly once.
    for i in tqdm(range(n_splits), desc="Processing splits"):
        _export_split(_add_log_columns(lc_tr[i]), train_dir)
        _export_split(_add_log_columns(lc_te[i]), test_dir)
    

In [5]:
# Report the (rows, columns) of both log tables, one line each.
for label, frame in (("train_log", train_log), ("test_log", test_log)):
    print(f"{label} shape:", frame.shape)
# print("Train Full Lightcurves shape:")
# for i in range(n_splits):
#     print(f" Split {i+1:02d}: {lc_tr[i].shape}")
# print("Test Full Lightcurves shape:")
# for i in range(n_splits):
#     print(f" Split {i+1:02d}: {lc_te[i].shape}")

train_log shape: (3043, 7)
test_log shape: (7135, 6)
