In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Sanity check of the data layout: recursively list every file under the
# local raw-data directory and print its full path.

import os
for dirname, _, filenames in os.walk('D:/MALLORN-Astronomical-Classification-Challenge/data/raw/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_recall_curve
import warnings
from tqdm import tqdm

D:/MALLORN-Astronomical-Classification-Challenge/data/raw/test_log.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/train_log.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_01\test_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_01\train_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_02\test_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_02\train_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_03\test_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_03\train_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_04\test_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_04\train_full_lightcurves.csv
D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_05\test_full_lightcurves.csv
D:/MALLORN-Astro

In [16]:
# System configuration
# Silence all warnings to keep the cell output clean
warnings.filterwarnings('ignore')

# Data location (adjust to match the actual environment)
PATH_TO_DATA = 'D:/MALLORN-Astronomical-Classification-Challenge/data/raw/' 
N_SPLITS = 20        # Number of split_XX folders in the MALLORN data layout
FOLDS = 5            # Number of folds for stratified cross-validation
SEED = 42            # Random seed for reproducible results

In [17]:
# Per-filter coefficient R_lambda; the absorption is A_lambda = R_lambda * EBV.
R_LAMBDA = {
    'u': 4.239, 'g': 3.303, 'r': 2.285, 'i': 1.698, 'z': 1.263, 'y': 1.086
}

def de_extinct_flux(df):
    """
    Add a ``Flux_Corr`` column holding the flux corrected for Galactic extinction.

    The correction is vectorized over the whole frame:
    ``Flux_Corr = Flux * 10 ** (0.4 * R_lambda * EBV)``, where ``R_lambda`` is
    looked up in ``R_LAMBDA`` from the row's ``Filter`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Filter``, ``EBV`` and ``Flux``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``Flux_Corr`` added (or
        overwritten). Rows whose ``Filter`` is not a key of ``R_LAMBDA`` get NaN.
    """
    # Intermediate values are kept as local Series rather than scratch columns,
    # so existing 'R_lambda' / 'A_lambda' columns on the frame are not clobbered.
    r_lambda = df['Filter'].map(R_LAMBDA)

    # Absorption in each row's band: A_lambda = R_lambda * EBV
    a_lambda = r_lambda * df['EBV']

    # Intrinsic flux: F_corr = F_obs * 10^(0.4 * A_lambda)
    df['Flux_Corr'] = df['Flux'] * np.power(10, 0.4 * a_lambda)
    return df

In [18]:
def process_lightcurve_chunk(df_lc, df_meta):
    """
    Build one row of summary features per object from a chunk of light curves.

    Parameters
    ----------
    df_lc : pandas.DataFrame
        Light-curve rows. Columns read here: ``object_id``, ``Filter``,
        ``Flux``, ``Flux_err`` and ``Time (MJD)``.
    df_meta : pandas.DataFrame
        Per-object metadata, left-joined on ``object_id``. Must provide
        ``Z`` (upper-case) and ``EBV``.

    Returns
    -------
    pandas.DataFrame
        One row per ``object_id`` with:

        * ``z``, ``EBV`` - static metadata (first value found for the object);
        * ``global_max_flux``, ``global_std_flux`` - over all corrected fluxes;
        * ``total_detections`` - number of light-curve rows for the object
          (not SNR-filtered);
        * per band ``{band}_max/_mean/_std/_skew`` of the corrected flux
          (max/mean/std are NaN and skew is 0 when the band has no rows;
          skew is also 0 with fewer than three points);
        * ``{band}_n_det`` - number of points with SNR > 3;
        * ``{band}_rise_time`` - (time of maximum flux - earliest SNR > 3 time)
          divided by ``1 + z``; ``-1`` when the band has no SNR > 3 point;
        * ``{b1}_{b2}_ratio`` - ratio of band maxima for adjacent bands,
          ``-999`` when either maximum is missing or the denominator is 0.
    """
    # Attach per-object metadata (Z, EBV) to every light-curve row.
    df_merged = df_lc.merge(df_meta, on='object_id', how='left')

    # De-extinction: adds the 'Flux_Corr' column.
    df_merged = de_extinct_flux(df_merged)

    # Loop-invariant definitions, hoisted out of the per-object loop.
    bands = ['u', 'g', 'r', 'i', 'z', 'y']
    pairs = [('u','g'), ('g','r'), ('r','i'), ('i','z'), ('z','y')]

    features = []
    for obj_id, group in df_merged.groupby('object_id'):
        obj_feats = {'object_id': obj_id}

        # Static features
        obj_feats['z'] = group['Z'].iloc[0]
        obj_feats['EBV'] = group['EBV'].iloc[0]

        # Global statistics over all bands
        all_fluxes = group['Flux_Corr'].values
        obj_feats['global_max_flux'] = np.max(all_fluxes)
        obj_feats['global_std_flux'] = np.std(all_fluxes)
        obj_feats['total_detections'] = len(group)

        # Per-band statistics
        band_maxes = {}
        for band in bands:
            band_data = group[group['Filter'] == band]

            if len(band_data) == 0:
                # No observations in this band: fill with placeholders.
                band_maxes[band] = np.nan
                obj_feats[f'{band}_max'] = np.nan
                obj_feats[f'{band}_mean'] = np.nan
                obj_feats[f'{band}_std'] = np.nan
                obj_feats[f'{band}_n_det'] = 0
                obj_feats[f'{band}_skew'] = 0
                obj_feats[f'{band}_rise_time'] = -1
                continue

            fluxes = band_data['Flux_Corr'].values
            raw_fluxes = band_data['Flux'].values
            errs = band_data['Flux_err'].values
            mjd = band_data['Time (MJD)'].values

            max_f = np.max(fluxes)
            band_maxes[band] = max_f

            obj_feats[f'{band}_max'] = max_f
            obj_feats[f'{band}_mean'] = np.mean(fluxes)
            obj_feats[f'{band}_std'] = np.std(fluxes)

            # SNR > 3. Use the observed flux: 'Flux_err' is not de-extincted,
            # so dividing the corrected flux by it would inflate the SNR by
            # 10^(0.4 * A_lambda) and make the cut depend on band and EBV.
            # The small epsilon guards against division by zero.
            snr = raw_fluxes / (errs + 1e-9)
            detected = snr > 3
            n_det = np.sum(detected)
            obj_feats[f'{band}_n_det'] = n_det

            # Skewness (needs at least three points)
            if len(fluxes) > 2:
                obj_feats[f'{band}_skew'] = pd.Series(fluxes).skew()
            else:
                obj_feats[f'{band}_skew'] = 0

            # Rise-time proxy: time from the first SNR > 3 point to the
            # brightest point, divided by (1 + z).
            if n_det > 0:
                peak_mjd = mjd[np.argmax(fluxes)]
                first_det = np.min(mjd[detected])
                obj_feats[f'{band}_rise_time'] = (peak_mjd - first_det) / (1 + obj_feats['z'])
            else:
                obj_feats[f'{band}_rise_time'] = -1

        # Flux ratios between adjacent bands
        for b1, b2 in pairs:
            val1 = band_maxes[b1]
            val2 = band_maxes[b2]
            feat_name = f'{b1}_{b2}_ratio'

            if not np.isnan(val1) and not np.isnan(val2) and val2 != 0:
                obj_feats[feat_name] = val1 / val2
            else:
                obj_feats[feat_name] = -999

        features.append(obj_feats)

    return pd.DataFrame(features)

In [19]:
#DATA PIPELINE (FOLDER SCANNING)
def load_data_and_extract_features(mode='train'):
    """
    Read the metadata log and every split's light curves, and build features.

    For each ``split_XX`` folder (``XX`` from 01 to ``N_SPLITS``) the file
    ``{mode}_full_lightcurves.csv`` is looked up directly in the folder, then
    in a nested folder of the same name, then anywhere below the split folder.
    Splits whose file cannot be found are reported and skipped.

    Parameters
    ----------
    mode : str
        File-name prefix, ``'train'`` or ``'test'``. In ``'train'`` mode the
        ``target`` column from the log is attached to the returned features.

    Returns
    -------
    pandas.DataFrame or None
        Concatenated per-object features of all splits that were found, or
        ``None`` when the log file or all light-curve files are missing.
    """
    print(f"=== Bắt đầu xử lý tập {mode.upper()} ===")

    # 1. Read metadata: look in the data root first, then search recursively.
    log_filename = f"{mode}_log.csv"
    log_path = os.path.join(PATH_TO_DATA, log_filename)

    if not os.path.exists(log_path):
        for root, dirs, files in os.walk(PATH_TO_DATA):
            if log_filename in files:
                log_path = os.path.join(root, log_filename)
                break

    if not os.path.exists(log_path):
        print(f"LỖI: Không tìm thấy file {log_filename}")
        return None

    print(f"-> Đọc metadata từ: {log_path}")
    meta_cols = ['object_id', 'Z', 'EBV']  # note the upper-case 'Z'
    cols_to_use = list(meta_cols)
    if mode == 'train':
        cols_to_use.append('target')

    df_log = pd.read_csv(log_path, usecols=cols_to_use)

    all_features_dfs = []
    csv_filename = f"{mode}_full_lightcurves.csv"

    # 2. Loop over the split folders (01 -> N_SPLITS)
    for i in range(1, N_SPLITS + 1):
        folder_name = f"split_{i:02d}"
        split_root = os.path.join(PATH_TO_DATA, folder_name)

        # Case 1: split_01/train_full_lightcurves.csv
        # Case 2: split_01/split_01/train_full_lightcurves.csv (nested folder)
        candidates = [
            os.path.join(split_root, csv_filename),
            os.path.join(split_root, folder_name, csv_filename),
        ]
        file_path = next((p for p in candidates if os.path.exists(p)), None)

        # Case 3: anywhere below the split folder.
        if file_path is None and os.path.exists(split_root):
            for root, dirs, files in os.walk(split_root):
                if csv_filename in files:
                    file_path = os.path.join(root, csv_filename)
                    break

        if file_path is None:
            # Report the skip so a partially loaded dataset is visible in the log.
            print(f"CẢNH BÁO: Không tìm thấy {csv_filename} trong {folder_name} - bỏ qua split này.")
            continue

        print(f"Đang xử lý: {file_path}")

        # Load the light curves of this split
        df_lc = pd.read_csv(file_path)

        # Restrict the metadata to the objects present in this split
        unique_ids = df_lc['object_id'].unique()
        df_log_subset = df_log[df_log['object_id'].isin(unique_ids)]

        # Only Z and EBV are needed for feature extraction; leaving 'target'
        # out avoids copying it onto every light-curve row during the merge.
        df_feats = process_lightcurve_chunk(df_lc, df_log_subset[meta_cols])

        if mode == 'train':
            df_feats = df_feats.merge(df_log_subset[['object_id', 'target']], on='object_id', how='left')

        all_features_dfs.append(df_feats)

        # Release the large per-split frames before loading the next one
        del df_lc, df_feats, df_log_subset
        gc.collect()

    if not all_features_dfs:
        print("LỖI: Không đọc được dữ liệu nào! Hãy kiểm tra lại PATH_TO_DATA.")
        return None

    full_df = pd.concat(all_features_dfs, ignore_index=True)
    print(f"Hoàn tất {mode}. Shape: {full_df.shape}")
    return full_df

In [20]:
def train_and_submit():
    """
    Train a LightGBM ensemble with stratified K-fold CV and write ``submission.csv``.

    Steps:
      1. Build training features; every column except ``object_id`` and
         ``target`` is used as a model input.
      2. Train one model per fold with early stopping and collect
         out-of-fold (OOF) predictions.
      3. Choose the probability threshold that maximizes F1 on the OOF
         predictions.
      4. Average the fold models' predictions on the test features, apply the
         threshold, align with ``sample_submission.csv`` when it can be found
         (objects without a prediction get 0) and save ``submission.csv``.

    Returns ``None``. Stops early if the training data cannot be loaded, and
    skips inference if the test data cannot be loaded.
    """
    # 1. Prepare data
    df_train = load_data_and_extract_features(mode='train')
    if df_train is None:
        return

    feature_cols = [c for c in df_train.columns if c not in ['object_id', 'target']]
    X = df_train[feature_cols]
    y = df_train['target']

    print(f"\nBắt đầu huấn luyện trên {X.shape} mẫu...")

    # 2. K-Fold
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    models = []

    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'is_unbalance': True,
        'verbosity': -1,
        'n_jobs': -1,
        'seed': SEED
    }

    # 3. Training loop
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"--> Fold {fold+1}/{FOLDS}")
        X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        dtrain = lgb.Dataset(X_tr, label=y_tr)
        dval = lgb.Dataset(X_val, label=y_val, reference=dtrain)

        model = lgb.train(
            params, dtrain,
            valid_sets=[dtrain, dval],
            valid_names=['train', 'valid'],
            num_boost_round=1000,
            callbacks=[
                lgb.early_stopping(stopping_rounds=50),
                lgb.log_evaluation(period=0)  # disable per-iteration logging
            ]
        )

        models.append(model)
        val_preds = model.predict(X_val, num_iteration=model.best_iteration)
        oof_preds[val_idx] = val_preds

    # 4. Threshold optimization
    precision, recall, thresholds = precision_recall_curve(y, oof_preds)
    f1_scores = 2 * recall * precision / (recall + precision + 1e-10)
    # precision/recall carry one trailing element with no matching threshold;
    # drop it so the argmax is always a valid index into `thresholds`.
    f1_scores = f1_scores[:-1]
    best_idx = np.argmax(f1_scores)
    best_thresh = thresholds[best_idx]
    print(f"\nBest Threshold: {best_thresh:.4f}")
    print(f"Best CV F1: {f1_scores[best_idx]:.4f}")

    # 5. Inference
    df_test = load_data_and_extract_features(mode='test')
    if df_test is not None:
        X_test = df_test[feature_cols]
        test_preds = np.zeros(len(X_test))

        # Average over the models actually trained rather than the FOLDS constant.
        for model in models:
            test_preds += model.predict(X_test, num_iteration=model.best_iteration) / len(models)

        final_predictions = (test_preds >= best_thresh).astype(int)

        submission = pd.DataFrame({
            'object_id': df_test['object_id'],
            'prediction': final_predictions
        })

        # Align with sample_submission: look in the data root, then recursively.
        sample_filename = "sample_submission.csv"
        sample_path = os.path.join(PATH_TO_DATA, sample_filename)
        if not os.path.exists(sample_path):
            for root, dirs, files in os.walk(PATH_TO_DATA):
                if sample_filename in files:
                    sample_path = os.path.join(root, sample_filename)
                    break

        if os.path.exists(sample_path):
            sample = pd.read_csv(sample_path)
            submission = sample[['object_id']].merge(submission, on='object_id', how='left')
            # Objects without a prediction default to class 0; report how many
            # so that missing test data does not go unnoticed.
            n_missing = int(submission['prediction'].isna().sum())
            if n_missing > 0:
                print(f"CẢNH BÁO: {n_missing} object_id không có dự đoán, gán mặc định 0.")
            submission['prediction'] = submission['prediction'].fillna(0).astype(int)

        submission.to_csv('submission.csv', index=False)
        print("\n=== HOÀN TẤT! Đã lưu file submission.csv ===")

In [21]:
# Entry point: run the full pipeline (feature extraction, cross-validated
# training, threshold selection, inference and submission file writing).
if __name__ == '__main__':
    train_and_submit()

=== Bắt đầu xử lý tập TRAIN ===
-> Đọc metadata từ: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/train_log.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_01\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_02\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_03\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_04\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_05\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_06\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_07\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astronomical-Classification-Challenge/data/raw/split_08\train_full_lightcurves.csv
Đang xử lý: D:/MALLORN-Astro