In [1]:
!pip install PyWavelets statsmodels



In [57]:
import pandas as pd
import numpy as np
import os
import gc
import warnings
from scipy.optimize import curve_fit
from scipy.stats import skew, kurtosis, linregress, median_abs_deviation
from joblib import Parallel, delayed

# Import các thư viện ML cổ điển
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import ConstantKernel, Matern
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

# Import Boosting Models
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Silence every warning for the whole kernel to keep cell output short.
# NOTE(review): this also hides numerical warnings raised by the libraries
# used in the cells below — consider narrowing the filter while debugging.
warnings.filterwarnings("ignore")

# ==============================================================================
# 1. LIGHTCURVE PROCESSOR (GIỮ NGUYÊN - ĐÃ TỐI ƯU)
# ==============================================================================
class LightCurveProcessor:
    """Load light-curve CSV files and apply a per-band extinction correction.

    The metadata CSV is read once at construction time; when it has an
    ``object_id`` column that column becomes the index so ``EBV`` values can
    be joined onto light-curve rows.
    """

    # Per-band coefficient multiplied by EBV to obtain ``A_lambda``.
    EXTINCTION_COEFFS = {'u': 4.239, 'g': 3.303, 'r': 2.285, 'i': 1.698, 'z': 1.263, 'y': 1.086}

    def __init__(self, metadata_path):
        """Read the metadata table from ``metadata_path``."""
        self.metadata = pd.read_csv(metadata_path)
        if 'object_id' in self.metadata.columns:
            self.metadata.set_index('object_id', inplace=True)

    def correct_extinction(self, df):
        """Return a new frame with ``Flux_corr`` and ``Flux_err_corr`` columns.

        Args:
            df: light-curve rows with ``Filter``, ``Flux`` and ``Flux_err``
                columns, plus either ``EBV`` or an ``object_id`` column that
                can be joined against the metadata index.

        Returns:
            A new DataFrame; the caller's frame is never modified. Rows whose
            corrected flux is NaN (e.g. unknown filter or missing EBV) are
            dropped, and the helper/``EBV`` columns are removed.

        Raises:
            KeyError: if a required column is missing.
        """
        if 'EBV' not in df.columns:
            # merge() already returns a fresh frame.
            df = df.merge(self.metadata[['EBV']], left_on='object_id', right_index=True, how='left')
        else:
            # Work on a copy so the caller's frame is left untouched.
            df = df.copy()

        df['R_lambda'] = df['Filter'].map(self.EXTINCTION_COEFFS)
        df['A_lambda'] = df['R_lambda'] * df['EBV']
        # Multiplicative flux correction: 10 ** (0.4 * A_lambda).
        correction_factor = np.power(10, 0.4 * df['A_lambda'])
        df['Flux_corr'] = df['Flux'] * correction_factor
        df['Flux_err_corr'] = df['Flux_err'] * correction_factor
        df = df.dropna(subset=['Flux_corr'])
        return df.drop(columns=['R_lambda', 'A_lambda', 'EBV'])

    def load_and_process(self, lc_path):
        """Read a light-curve CSV and try to apply the extinction correction.

        This is deliberately best-effort: if the correction fails, the raw
        (uncorrected) frame is returned and the reason is printed instead of
        being silently discarded.
        """
        raw_df = pd.read_csv(lc_path)
        try:
            return self.correct_extinction(raw_df)
        except Exception as exc:  # keep best-effort behaviour, but not for KeyboardInterrupt/SystemExit
            print(f"[LightCurveProcessor] Extinction correction failed for {lc_path!r}: {exc!r}; "
                  "returning uncorrected data.")
            return raw_df

In [58]:
# ==============================================================================
# 2. PHYSICS FEATURE EXTRACTOR (GIỮ NGUYÊN BẢN V5 CỦA BẠN)
# ==============================================================================
import numpy as np
from scipy.optimize import curve_fit
from scipy.stats import skew, kurtosis, linregress, median_abs_deviation, anderson
from joblib import Parallel, delayed

class PhysicsFeatureExtractor:
    """Turn one object's multi-band light curve into a fixed-length feature vector.

    Layout of the vector returned by ``extract_features``:

    * for each band in ``self.bands``: ``n_feat_per_band`` values, i.e.
      ``n_feat_per_band - 6`` statistics from ``calculate_stats`` (currently
      41 computed values followed by zero padding) plus 6 Bazin-fit values;
    * for each adjacent band pair: 5 cross-band values;
    * ``meta_z`` and ``meta_ebv``.
    """

    def __init__(self):
        self.bands = ['u', 'g', 'r', 'i', 'z', 'y']
        # Fixed per-band width; calculate_stats pads with zeros up to this size.
        self.n_feat_per_band = 58
        # Set by extract_features to the length of the last vector produced.
        self.total_features = 0

    @staticmethod
    def bazin(t, A, B, t0, t_fall, t_rise):
        """Evaluate ``A * exp(-(t-t0)/t_fall) / (1 + exp(-(t-t0)/t_rise)) + B``.

        The time constants are clipped to ``[1e-3, 500]`` / ``[1e-3, 200]`` and
        any non-finite result is mapped through ``np.nan_to_num`` so the model
        is always evaluable during optimisation.
        """
        with np.errstate(over='ignore', invalid='ignore'):
            exp_fall = np.exp(-(t - t0) / np.clip(t_fall, 1e-3, 500))
            exp_rise = np.exp(-(t - t0) / np.clip(t_rise, 1e-3, 200))
            val = A * (exp_fall / (1 + exp_rise)) + B
        return np.nan_to_num(val)

    @staticmethod
    def _linear_trend(times, fluxes):
        """Return ``(slope, intercept)`` of a linear fit, or ``(0, 0)`` if the fit fails."""
        try:
            slope, intercept, _, _, _ = linregress(times, fluxes)
            return slope, intercept
        except Exception:
            return 0, 0

    def fit_bazin(self, times, fluxes):
        """Fit the Bazin model and return ``[A, B, t0, t_fall, t_rise, 0]``.

        Returns six zeros when there are five or fewer points or when the
        optimiser fails. The sixth slot is always 0.
        """
        if len(fluxes) <= 5:
            return [0] * 6
        try:
            t_max_idx = np.argmax(fluxes)
            t_max = times[t_max_idx]
            f_max = fluxes[t_max_idx]
            f_min = np.min(fluxes)
            # Initial guess: amplitude, baseline, peak time, fall time, rise time.
            p0 = [f_max - f_min, f_min, t_max, 50, 20]
            bounds = ([0, -np.inf, times.min() - 50, 0.1, 0.1],
                      [np.inf, np.inf, times.max() + 50, 500, 200])
            popt, _ = curve_fit(self.bazin, times, fluxes, p0=p0, bounds=bounds, maxfev=1000)
            return list(popt) + [0]
        except Exception:
            return [0] * 6

    def calculate_fft_features(self, times, fluxes):
        """Return ``[freq1, power1, freq2, power2, freq3, power3]`` for the three strongest FFT bins.

        The series is sorted by time and linearly interpolated onto an evenly
        spaced grid with the same number of points before the real FFT; the
        zero-frequency bin is excluded. Returns six zeros for fewer than five
        points or on any failure.
        """
        if len(fluxes) < 5:
            return [0] * 6
        try:
            idx = np.argsort(times)
            t_sorted, f_sorted = times[idx], fluxes[idx]
            t_uniform = np.linspace(t_sorted[0], t_sorted[-1], len(t_sorted))
            f_uniform = np.interp(t_uniform, t_sorted, f_sorted)
            fft_vals = np.fft.rfft(f_uniform)
            fft_freq = np.fft.rfftfreq(len(f_uniform))
            power = np.abs(fft_vals[1:])
            freqs = fft_freq[1:]
            if len(power) == 0:
                return [0] * 6
            top_indices = np.argsort(power)[-3:][::-1]
            res = []
            for i in range(3):
                if i < len(top_indices):
                    res.extend([freqs[top_indices[i]], power[top_indices[i]]])
                else:
                    res.extend([0, 0])
            return res
        except Exception:
            return [0] * 6

    def calculate_stats(self, times, fluxes, errors, z_factor, trend=None):
        """Compute the per-band statistics block (``n_feat_per_band - 6`` values).

        Args:
            times, fluxes, errors: 1-D arrays of equal length for one band.
            z_factor: multiplier applied to the maximum flux for the
                luminosity-proxy feature.
            trend: optional precomputed ``(slope, intercept)``; when omitted the
                linear fit is computed here.

        Returns:
            A list of length ``n_feat_per_band - 6``. The slope and intercept
            sit at indices 30 and 31. All zeros for empty input.
        """
        n_stats = self.n_feat_per_band - 6  # the remaining 6 slots hold the Bazin fit
        if len(fluxes) == 0:
            return [0] * n_stats

        f_std = np.std(fluxes); f_mean = np.mean(fluxes)
        f_max = np.max(fluxes); f_min = np.min(fluxes); f_median = np.median(fluxes)

        # --- GROUP 1: basic statistics (8) ---
        res = [f_max, f_min, f_mean, f_std,
               skew(fluxes) if len(fluxes) > 2 else 0,
               kurtosis(fluxes) if len(fluxes) > 2 else 0,
               (f_max - f_min) / 2, f_median]

        # --- GROUP 2: percentiles and robust ranges (10) ---
        ps = np.percentile(fluxes, [5, 10, 25, 30, 70, 75, 90, 95])
        res.extend(ps)
        res.append(ps[5] - ps[2])  # inter-quartile range (75 - 25)
        res.append(ps[7] - ps[0])  # robust range (95 - 5)

        # --- GROUP 3: ratios and counts (5) ---
        res.append(np.mean(fluxes**2))
        res.append(np.mean(np.divide(fluxes, errors + 1e-6)))
        res.append(np.sum(fluxes > f_mean) / len(fluxes))
        res.append(np.sum(np.abs(fluxes - f_mean) > f_std) / len(fluxes))
        res.append(len(fluxes))

        # --- GROUP 4: variability indices (4) ---
        delta = np.divide(fluxes - f_mean, errors + 1e-9)
        # Stetson-K-like statistic: mean |delta| scaled by 1/sqrt(N).
        res.append(np.sum(np.abs(delta)) / len(fluxes) * np.sqrt(1.0 / len(fluxes)))

        if len(fluxes) > 1:
            # Von Neumann ratio: mean squared successive difference over variance.
            eta = np.sum(np.diff(fluxes)**2) / (len(fluxes) - 1) / (f_std**2 + 1e-9)
            res.append(eta)
            chi2 = np.sum(((fluxes - f_mean) / (errors + 1e-9))**2) / (len(fluxes) - 1)
            res.append(chi2)
            res.append(median_abs_deviation(fluxes, scale='normal'))
        else:
            res.extend([0, 0, 0])

        # --- GROUP 5: time-domain dynamics (3) ---
        if len(fluxes) > 2:
            acf_1 = np.corrcoef(fluxes[:-1], fluxes[1:])[0, 1]  # lag-1 autocorrelation
            res.append(acf_1 if not np.isnan(acf_1) else 0)
        else:
            res.append(0)

        if len(fluxes) > 1:
            diffs = np.diff(fluxes)
            res.extend([np.mean(diffs), np.std(diffs)])
        else:
            res.extend([0, 0])

        # --- GROUP 6: linear trend (2) ---
        slope, intercept = trend if trend is not None else self._linear_trend(times, fluxes)
        res.extend([slope, intercept])

        # --- GROUP 7: CUSUM range and Anderson-Darling statistic (2) ---
        if len(fluxes) > 3:
            # Range of the cumulative sum of deviations from the mean.
            cusum = np.cumsum(fluxes - f_mean)
            res.append(np.max(cusum) - np.min(cusum))

            # Anderson-Darling statistic against a normal distribution.
            try:
                ad_stat = anderson(fluxes, dist='norm').statistic
                res.append(ad_stat)
            except Exception:
                res.append(0)
        else:
            res.extend([0, 0])

        # Luminosity proxy.
        res.append(f_max * z_factor)

        # --- GROUP 8: FFT (6) ---
        res.extend(self.calculate_fft_features(times, fluxes))

        # Zero-pad so every band contributes exactly n_stats values.
        padding_len = n_stats - len(res)
        if padding_len > 0:
            res.extend([0] * padding_len)

        return res

    def extract_features(self, df_object, meta_z=0, meta_ebv=0):
        """Build the full feature vector for one object.

        Args:
            df_object: rows for a single object with ``Filter``, ``Time``,
                ``Flux_corr`` and ``Flux_err_corr`` columns.
            meta_z, meta_ebv: metadata values appended to the vector; ``meta_z``
                also scales the colour and luminosity-proxy features.

        Returns:
            1-D ``np.ndarray``. Bands with fewer than three rows contribute
            zeros. ``self.total_features`` is updated to the vector length.
        """
        features = []; band_max_flux = {}; band_std = {}; band_slopes = {}
        z_factor = meta_z**2 if meta_z > 0.001 else 1.0

        for band in self.bands:
            band_data = df_object[df_object['Filter'] == band]
            if len(band_data) < 3:
                features.extend([0] * self.n_feat_per_band)
                continue

            times = band_data['Time'].values
            fluxes = band_data['Flux_corr'].values
            errors = band_data['Flux_err_corr'].values

            band_max_flux[band] = np.max(fluxes)
            band_std[band] = np.std(fluxes)

            # Fit the linear trend once and share it with calculate_stats.
            trend = self._linear_trend(times, fluxes)
            band_slopes[band] = trend[0]
            stats = self.calculate_stats(times, fluxes, errors, z_factor, trend=trend)

            # The Bazin fit is only attempted for the g, r and i bands.
            bazin = self.fit_bazin(times, fluxes) if band in ['g', 'r', 'i'] else [0] * 6
            features.extend(stats + bazin)

        # --- Cross-band features for adjacent band pairs ---
        color_pairs = [('u', 'g'), ('g', 'r'), ('r', 'i'), ('i', 'z'), ('z', 'y')]

        for b1, b2 in color_pairs:
            # 1. Colour from the ratio of maximum fluxes, and colour * meta_z.
            val1 = band_max_flux.get(b1, 0); val2 = band_max_flux.get(b2, 0)
            if val1 > 0 and val2 > 0:
                c = -2.5 * np.log10(val1 / val2)
                features.extend([c, c * meta_z])
            else:
                features.extend([0, 0])

            # 2. Max-flux and standard-deviation ratios.
            amp1 = band_max_flux.get(b1, 1e-9); amp2 = band_max_flux.get(b2, 1e-9)
            std1 = band_std.get(b1, 1e-9); std2 = band_std.get(b2, 1e-9)
            features.append(amp1 / (amp2 + 1e-9))
            features.append(std1 / (std2 + 1e-9))

            # 3. Difference of linear slopes between the two bands.
            s1 = band_slopes.get(b1, 0)
            s2 = band_slopes.get(b2, 0)
            features.append(s1 - s2)

        # Metadata
        features.extend([meta_z, meta_ebv])

        self.total_features = len(features)
        return np.array(features)

In [59]:
# ==============================================================================
# 3. SMART FEATURE SELECTOR V3 (CHỌN LỌC TINH HOA)
# ==============================================================================
from lightgbm import LGBMClassifier
from sklearn.feature_selection import SelectFromModel

class SmartFeatureSelector:
    """Three-stage feature filter: variance, pairwise correlation, LightGBM importance.

    Args:
        variance_thresh: threshold passed to ``VarianceThreshold``.
        correlation_thresh: a column is dropped when its absolute correlation
            with any earlier column exceeds this value.
        max_features: upper bound on the number of features kept by the
            importance stage.
        random_state: seed for the row sub-sample used by the correlation
            stage, so the selected feature set is reproducible.
        importance_threshold: forwarded to ``SelectFromModel(threshold=...)``.
            With the default ``None`` scikit-learn applies its own default
            importance threshold in addition to ``max_features``, so fewer than
            ``max_features`` columns may be kept. Pass ``-np.inf`` to keep
            exactly the top ``max_features`` columns.
    """

    # Maximum number of rows used to estimate the correlation matrix.
    CORR_SAMPLE_ROWS = 10000

    def __init__(self, variance_thresh=0.0, correlation_thresh=0.99, max_features=300,
                 random_state=42, importance_threshold=None):
        self.var_thresh = VarianceThreshold(threshold=variance_thresh)
        self.corr_thresh = correlation_thresh
        self.max_features = max_features
        self.random_state = random_state
        self.importance_threshold = importance_threshold
        self.drop_cols_corr = []
        self.selector_model = None

    def _apply_corr_mask(self, X_v):
        """Drop the columns recorded in ``drop_cols_corr`` from a variance-filtered array."""
        mask_corr = np.ones(X_v.shape[1], dtype=bool)
        mask_corr[self.drop_cols_corr] = False
        return X_v[:, mask_corr]

    def fit(self, X, y):
        """Learn which columns to keep.

        Args:
            X: 2-D feature array.
            y: class labels, used only by the LightGBM importance stage.

        Returns:
            self
        """
        # --- Step 1: remove low-variance (constant) columns ---
        print("-> [Selector] 1. Lọc Variance thấp...")
        self.var_thresh.fit(X)
        X_v = self.var_thresh.transform(X)
        print(f"   Giữ lại {X_v.shape[1]} features sau bước Variance.")

        # --- Step 2: remove near-duplicate columns ---
        print(f"-> [Selector] 2. Lọc Correlation cao (> {self.corr_thresh})...")
        # Estimate correlations on a seeded row sub-sample when the data is large.
        if X_v.shape[0] > self.CORR_SAMPLE_ROWS:
            rng = np.random.RandomState(self.random_state)
            idx = rng.choice(X_v.shape[0], self.CORR_SAMPLE_ROWS, replace=False)
            X_sample = X_v[idx]
        else:
            X_sample = X_v

        corr_matrix = pd.DataFrame(X_sample).corr().abs()
        # Keep only the strict upper triangle so each pair is examined once and
        # the later column of a correlated pair is the one dropped.
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        self.drop_cols_corr = [column for column in upper.columns
                               if (upper[column] > self.corr_thresh).any()]
        print(f"   Loại bỏ {len(self.drop_cols_corr)} features trùng lặp.")

        X_c = self._apply_corr_mask(X_v)

        # --- Step 3: rank by LightGBM feature importance ---
        print(f"-> [Selector] 3. Dùng LightGBM chọn Top {self.max_features} features...")

        lgb = LGBMClassifier(n_estimators=100, learning_rate=0.05,
                             random_state=42, n_jobs=-1, verbose=-1)
        lgb.fit(X_c, y)

        # max_features is an upper bound; see the class docstring for how the
        # importance threshold interacts with it.
        self.selector_model = SelectFromModel(lgb, threshold=self.importance_threshold,
                                              max_features=self.max_features, prefit=True)

        selected_count = np.sum(self.selector_model.get_support())
        print(f"   => Đã chọn lọc được {selected_count} features tinh túy nhất từ {X_c.shape[1]}.")

        return self

    def transform(self, X):
        """Apply the three fitted filters, in order, to ``X``.

        Raises:
            RuntimeError: if ``fit`` has not been called.
        """
        if self.selector_model is None:
            raise RuntimeError("SmartFeatureSelector.transform called before fit")

        X_v = self.var_thresh.transform(X)   # 1. variance
        X_c = self._apply_corr_mask(X_v)     # 2. correlation
        return self.selector_model.transform(X_c)  # 3. importance

In [60]:
# ==============================================================================
# 4. HYBRID ENSEMBLE (GIỮ NGUYÊN - ĐÃ CẤU HÌNH TỐT)
# ==============================================================================
class HybridEnsemble:
    """Stacked ensemble: XGBoost, LightGBM and CatBoost feeding a logistic-regression meta learner.

    Args:
        scale_pos_weight: positive-class weight forwarded to all three base models.
    """

    def __init__(self, scale_pos_weight=1.0):
        # colsample_bytree and reg_alpha are set to limit the effect of noisy features.
        self.xgb = XGBClassifier(
            n_estimators=600, learning_rate=0.03, max_depth=6,
            scale_pos_weight=scale_pos_weight,
            colsample_bytree=0.6,  # sub-sample features per tree
            reg_alpha=0.5,
            eval_metric='logloss', use_label_encoder=False,
            n_jobs=-1
        )
        self.lgbm = LGBMClassifier(
            n_estimators=600, learning_rate=0.03,
            scale_pos_weight=scale_pos_weight,
            colsample_bytree=0.6,
            reg_alpha=0.5,
            verbose=-1
        )
        self.cat = CatBoostClassifier(
            iterations=600, learning_rate=0.03,
            scale_pos_weight=scale_pos_weight,
            verbose=0, allow_writing_files=False,
            task_type="CPU"
        )
        self.meta = LogisticRegression()
        self.scaler = StandardScaler()

    def fit(self, X, y, groups=None):
        """Train the base models and the meta learner.

        Out-of-fold probabilities from a 5-fold split are used to train the
        meta learner; the base models are then refit on all rows.

        Args:
            X: 2-D feature array.
            y: binary labels; converted with ``np.asarray`` so positional
                indexing works for pandas Series as well.
            groups: optional per-row group ids. When given, a
                ``StratifiedGroupKFold`` is used so rows sharing a group (for
                example augmented copies of one object) never straddle the
                train/validation boundary. When omitted, a plain
                ``StratifiedKFold`` is used.

        Returns:
            self
        """
        y = np.asarray(y)

        print("-> [Ensemble] Chuẩn hóa dữ liệu...")
        X_scaled = self.scaler.fit_transform(X)

        if groups is None:
            skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            folds = skf.split(X_scaled, y)
        else:
            from sklearn.model_selection import StratifiedGroupKFold
            sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
            folds = sgkf.split(X_scaled, y, groups=np.asarray(groups))

        # One column of out-of-fold positive-class probability per base model.
        meta_features = np.zeros((len(y), 3))

        print("-> [Ensemble] Training Level 0 (CV Stacking)...")
        for train_idx, val_idx in folds:
            X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
            y_train = y[train_idx]

            self.xgb.fit(X_train, y_train)
            self.lgbm.fit(X_train, y_train)
            self.cat.fit(X_train, y_train)

            meta_features[val_idx, 0] = self.xgb.predict_proba(X_val)[:, 1]
            meta_features[val_idx, 1] = self.lgbm.predict_proba(X_val)[:, 1]
            meta_features[val_idx, 2] = self.cat.predict_proba(X_val)[:, 1]

        print("-> [Ensemble] Retraining Level 0 on Full Data...")
        self.xgb.fit(X_scaled, y)
        self.lgbm.fit(X_scaled, y)
        self.cat.fit(X_scaled, y)

        print("-> [Ensemble] Training Level 1 (Meta Learner)...")
        self.meta.fit(meta_features, y)
        return self

    def predict_proba(self, X):
        """Return the meta learner's positive-class probability for each row of ``X`` (1-D array)."""
        X_scaled = self.scaler.transform(X)
        p1 = self.xgb.predict_proba(X_scaled)[:, 1]
        p2 = self.lgbm.predict_proba(X_scaled)[:, 1]
        p3 = self.cat.predict_proba(X_scaled)[:, 1]
        meta_features = np.column_stack([p1, p2, p3])
        return self.meta.predict_proba(meta_features)[:, 1]

In [61]:
# ==============================================================================
# 5. PROCESSING FUNCTION (BẢN FIX WARNING)
# ==============================================================================
def process_single_object(obj_id, obj_data, target, n_augment, phys_extractor, meta_z, meta_ebv):
    """Build feature vectors for one object (designed to run inside a joblib worker).

    Each vector is the physics features from ``phys_extractor`` followed by a
    per-band Gaussian-process interpolation of the light curve on a fixed
    50-point grid spanning -50..150 relative to the time of the global flux
    maximum.

    Args:
        obj_id: identifier copied into every result dict.
        obj_data: DataFrame with this object's rows (``Filter``, ``Time`` and
            flux / flux-error columns).
        target: label copied into every result dict.
        n_augment: number of extra vectors whose grid part is drawn with
            ``sample_y`` from the fitted GP instead of using its mean.
        phys_extractor: object exposing ``extract_features(df, z, ebv)``.
        meta_z, meta_ebv: metadata forwarded to ``phys_extractor``.

    Returns:
        A list of ``n_augment + 1`` dicts with keys ``features``, ``label`` and
        ``obj_id``, or ``None`` when the object is empty or processing fails.
    """
    # Filters must be installed inside the worker process as well.
    import warnings
    from sklearn.exceptions import ConvergenceWarning
    warnings.filterwarnings("ignore", category=ConvergenceWarning)
    warnings.filterwarnings("ignore", category=UserWarning)

    GRID_POINTS = 50
    BANDS = ['u', 'g', 'r', 'i', 'z', 'y']

    try:
        if obj_data.empty:
            return None

        # 1. Physics features
        phys_feats = phys_extractor.extract_features(obj_data, meta_z, meta_ebv)

        # Prefer the corrected columns; otherwise fall back to raw column names.
        flux_col = next((c for c in ('Flux_corr', 'Flux', 'flux') if c in obj_data.columns), None)
        err_col = next((c for c in ('Flux_err_corr', 'Flux_err', 'flux_err') if c in obj_data.columns), None)
        if flux_col is None or err_col is None:
            return None

        # 2. Gaussian process -> raw grid features
        time_grid = np.linspace(-50, 150, GRID_POINTS).reshape(-1, 1)
        t_peak_global = obj_data.loc[obj_data[flux_col].idxmax(), 'Time']

        # The wide lower bound on the constant kernel lets the amplitude shrink
        # towards zero without hitting the bound.
        kernel = ConstantKernel(1.0, constant_value_bounds=(1e-10, 1000.0)) * \
                 Matern(length_scale=20, length_scale_bounds=(1, 500), nu=1.5)

        per_band = []
        for band in BANDS:
            band_dat = obj_data[obj_data['Filter'] == band]
            band_dat = band_dat.dropna(subset=['Time', flux_col, err_col])
            # Average measurements that share a timestamp.
            if band_dat.duplicated(subset=['Time']).any():
                band_dat = band_dat.groupby('Time', as_index=False).agg({flux_col: 'mean', err_col: 'mean'})

            interp = np.zeros((n_augment + 1, GRID_POINTS))
            if len(band_dat) >= 3:
                X = (band_dat['Time'].values - t_peak_global).reshape(-1, 1)
                y = band_dat[flux_col].values

                # normalize_y=True rescales the targets to unit variance, and
                # alpha is added to the kernel diagonal in that rescaled space,
                # so the measurement variance must be divided by var(y).
                y_var = np.var(y)
                if not y_var > 0:
                    y_var = 1.0
                alpha = np.maximum(band_dat[err_col].values ** 2 / y_var, 1e-6)

                try:
                    gpr = GaussianProcessRegressor(kernel=kernel, alpha=alpha,
                                                   optimizer='fmin_l_bfgs_b',
                                                   n_restarts_optimizer=0,
                                                   normalize_y=True)
                    gpr.fit(X, y)

                    curves = [gpr.predict(time_grid)]
                    if n_augment > 0:
                        samples = gpr.sample_y(time_grid, n_samples=n_augment)
                        curves.extend(samples[:, k] for k in range(n_augment))
                    interp = np.array(curves)
                except Exception:
                    # A failed fit contributes an all-zero grid for this band.
                    interp = np.zeros((n_augment + 1, GRID_POINTS))

            per_band.append(interp)

        final_res = []
        for i in range(n_augment + 1):
            # Row i of every band, concatenated in band order.
            grid_vec = np.concatenate([per_band[b][i] for b in range(len(BANDS))])
            final_res.append({
                'features': np.concatenate([phys_feats, grid_vec]),
                'label': target,
                'obj_id': obj_id
            })

        return final_res
    except Exception:
        # Best-effort: objects that cannot be processed are skipped by the caller.
        return None

In [62]:
# ==============================================================================
# 6. MAIN PIPELINE
# ==============================================================================
def _load_lightcurves(processor, path):
    """Load one light-curve CSV through ``processor`` and rename time-like columns to ``Time``."""
    lc_df = processor.load_and_process(path)
    col_map = {c: 'Time' for c in lc_df.columns if 'time' in c.lower()}
    return lc_df.rename(columns=col_map)


def main():
    """Run the end-to-end pipeline: feature extraction, selection, training and submission.

    Reads ``train_log.csv`` / ``test_log.csv`` and the ``split_*`` directories
    under the data directory, trains the stacked ensemble and writes
    ``submission.csv``.

    Raises:
        RuntimeError: if no training sample could be produced.
    """
    print("=== BẮT ĐẦU PIPELINE: FEATURE EXPLOSION & SMART SELECTION ===")

    BASE_DIR = '/kaggle/input/data-for-black-hole'
    if not os.path.exists(BASE_DIR):
        BASE_DIR = '/content/drive/MyDrive/data-for-black-hole'  # fallback

    # Write next to the Kaggle working directory when it exists, otherwise
    # into the current directory.
    OUTPUT_DIR = '/kaggle/working' if os.path.isdir('/kaggle/working') else os.getcwd()
    N_AUGMENT_POSITIVE = 5  # extra GP samples generated for each positive object

    TRAIN_META = os.path.join(BASE_DIR, 'train_log.csv')
    TEST_META = os.path.join(BASE_DIR, 'test_log.csv')
    has_test_meta = os.path.exists(TEST_META)

    train_proc = LightCurveProcessor(TRAIN_META)
    test_proc = LightCurveProcessor(TEST_META) if has_test_meta else None

    train_meta_dict = pd.read_csv(TRAIN_META).set_index('object_id')[['Z', 'EBV', 'target']].to_dict('index')
    test_meta_dict = {}
    if has_test_meta:
        test_meta_dict = pd.read_csv(TEST_META).set_index('object_id')[['Z', 'EBV']].to_dict('index')

    phys_extractor = PhysicsFeatureExtractor()
    split_dirs = sorted([d for d in os.listdir(BASE_DIR) if d.startswith('split_')])

    ALL_X, ALL_Y = [], []

    # --- PROCESS TRAIN ---
    print("\n[1] XỬ LÝ DỮ LIỆU TRAIN & TẠO FEATURES KHỔNG LỒ...")
    for split in split_dirs:
        path = os.path.join(BASE_DIR, split, 'train_full_lightcurves.csv')
        if not os.path.exists(path):
            continue
        print(f"  >> Processing {split}...")

        lc_df = _load_lightcurves(train_proc, path)

        tasks = []
        # One pass over the frame instead of a boolean mask per object.
        for oid, obj_df in lc_df.groupby('object_id', sort=False):
            info = train_meta_dict.get(oid)
            if info is None:
                continue
            # Only positive objects are augmented.
            aug = N_AUGMENT_POSITIVE if info['target'] == 1 else 0
            tasks.append((
                oid, obj_df.copy(),
                info['target'], aug,
                phys_extractor, info['Z'], info['EBV']
            ))

        results = Parallel(n_jobs=-1)(delayed(process_single_object)(*t) for t in tasks)
        for r in results:
            if r:
                for item in r:
                    ALL_X.append(item['features'])
                    ALL_Y.append(item['label'])

        del lc_df, tasks; gc.collect()

    if not ALL_X:
        raise RuntimeError(
            f"No training samples were produced from {BASE_DIR!r}; "
            "check the split_* directories and the light-curve files."
        )

    # NOTE(review): augmented rows derived from the same object are treated as
    # independent samples from here on, so any row-wise cross-validation can
    # place copies of one object on both sides of a fold.
    X_arr = np.array(ALL_X)
    y_arr = np.array(ALL_Y)
    print(f"-> Tổng số mẫu Train: {X_arr.shape[0]}")
    print(f"-> Số features ban đầu (Explosion): {X_arr.shape[1]}")

    # --- SMART FEATURE SELECTION ---
    print("\n[2] SMART FEATURE SELECTION (LỌC THÔNG MINH)...")
    selector = SmartFeatureSelector(variance_thresh=0.0, correlation_thresh=0.98)
    # The importance stage is supervised, so the labels must be supplied.
    selector.fit(X_arr, y_arr)
    X_clean = selector.transform(X_arr)

    print(f"-> Số features sau khi lọc: {X_clean.shape[1]}")

    # --- TRAINING ---
    print("\n[3] HUẤN LUYỆN MODEL ENSEMBLE (STACKING)...")
    pos = np.sum(y_arr == 1); neg = np.sum(y_arr == 0)
    scale = neg / (pos + 1e-6)
    print(f"-> Scale Weight: {scale:.2f}")

    ensemble = HybridEnsemble(scale_pos_weight=scale)
    ensemble.fit(X_clean, y_arr)

    # --- TEST & SUBMIT ---
    print("\n[4] DỰ ĐOÁN TEST & SUBMISSION...")
    TEST_IDS, TEST_X_RAW = [], []

    # Without test metadata there is no processor; skip straight to the
    # "no test data" message instead of dereferencing None.
    test_splits = split_dirs if test_proc is not None else []
    for split in test_splits:
        path = os.path.join(BASE_DIR, split, 'test_full_lightcurves.csv')
        if not os.path.exists(path):
            continue
        print(f"  >> Processing Test {split}...")

        lc_df = _load_lightcurves(test_proc, path)

        tasks = []
        for oid, obj_df in lc_df.groupby('object_id', sort=False):
            meta = test_meta_dict.get(oid, {})
            tasks.append((
                oid, obj_df.copy(),
                0, 0, phys_extractor, meta.get('Z', 0), meta.get('EBV', 0)
            ))

        results = Parallel(n_jobs=-1)(delayed(process_single_object)(*t) for t in tasks)
        for r in results:
            if r:
                # No augmentation for test objects, so r holds exactly one item.
                TEST_X_RAW.append(r[0]['features'])
                TEST_IDS.append(r[0]['obj_id'])

        del lc_df, tasks; gc.collect()

    if len(TEST_IDS) > 0:
        X_test_arr = np.array(TEST_X_RAW)
        # Apply the selector fitted on the training data.
        X_test_clean = selector.transform(X_test_arr)

        probs = ensemble.predict_proba(X_test_clean)

        # Keep the output inside [0, 1].
        probs = np.clip(probs, 0.0, 1.0)

        sub = pd.DataFrame({'object_id': TEST_IDS, 'target': probs})

        # Align with the sample submission so every expected id is present;
        # ids without a prediction get 0.
        SAMPLE = os.path.join(BASE_DIR, 'sample_submission.csv')
        if os.path.exists(SAMPLE):
            sample_df = pd.read_csv(SAMPLE)
            if 'target' in sample_df.columns:
                del sample_df['target']
            sub = sample_df.merge(sub, on='object_id', how='left').fillna(0)

        sub.to_csv(os.path.join(OUTPUT_DIR, 'submission.csv'), index=False)
        print(f"✅ DONE! File saved with {len(sub)} rows.")
    else:
        print("⚠️ No test data found.")

if __name__ == "__main__":
    main()

=== BẮT ĐẦU PIPELINE: FEATURE EXPLOSION & SMART SELECTION ===

[1] XỬ LÝ DỮ LIỆU TRAIN & TẠO FEATURES KHỔNG LỒ...
  >> Processing split_01...
  >> Processing split_02...
  >> Processing split_03...
  >> Processing split_04...
  >> Processing split_05...


  ratio = actual_reduction / predicted_reduction


  >> Processing split_06...
  >> Processing split_07...
  >> Processing split_08...
  >> Processing split_09...
  >> Processing split_10...
  >> Processing split_11...
  >> Processing split_12...
  >> Processing split_13...
  >> Processing split_14...
  >> Processing split_15...
  >> Processing split_16...
  >> Processing split_17...
  >> Processing split_18...
  >> Processing split_19...
  >> Processing split_20...
-> Tổng số mẫu Train: 3783
-> Số features ban đầu (Explosion): 975

[2] SMART FEATURE SELECTION (LỌC TINH HOA)...
-> [Selector] 1. Lọc Variance thấp...
   Giữ lại 888 features sau bước Variance.
-> [Selector] 2. Lọc Correlation cao (> 0.995)...
   Loại bỏ 292 features trùng lặp.
-> [Selector] 3. Dùng LightGBM chọn Top 300 features...
   => Đã chọn lọc được 159 features tinh túy nhất từ 596.
-> Số features sau khi lọc: 159

[3] HUẤN LUYỆN MODEL ENSEMBLE (STACKING)...
-> Scale Weight: 3.26
-> [Ensemble] Chuẩn hóa dữ liệu...
-> [Ensemble] Training Level 0 (CV Stacking)...
-> [

In [64]:
import pandas as pd

# Probability cut-off: rows with target > THRESHOLD are labelled 1, all others 0.
THRESHOLD = 0.1

# 1. Read the probability file written by the pipeline.
file_path = '/kaggle/working/submission.csv'
df = pd.read_csv(file_path)

# 2. Binarise the 'target' probability column with the cut-off above.
df['prediction'] = (df['target'] > THRESHOLD).astype(int)

# 3. Keep only the two columns needed for the submission.
final_submission = df[['object_id', 'prediction']]

# 4. Save the binarised submission under a new name.
final_submission.to_csv('submission_lightgbm++_01.csv', index=False)

print("✅ Đã xử lý xong!")
print("Số lượng TDE (nhãn 1) dự đoán được:", final_submission['prediction'].sum())
print(final_submission.head())

✅ Đã xử lý xong!
Số lượng TDE (nhãn 1) dự đoán được: 374
                      object_id  prediction
0      Eluwaith_Mithrim_nothrim           0
1            Eru_heledir_archam           0
2             Gonhir_anann_fuin           0
3  Gwathuirim_haradrim_tegilbor           0
4              achas_minai_maen           0


In [46]:
import pandas as pd

# Probability cut-off: rows with target > THRESHOLD are labelled 1, all others 0.
THRESHOLD = 0.0008
OUTPUT_PATH = 'submission_final_0008.csv'

# 1. Read the file containing the predicted probabilities.
df = pd.read_csv('submission.csv')

# 2. Binarise the 'target' probability column with the cut-off above.
df['prediction'] = (df['target'] > THRESHOLD).astype(int)

# 3. Keep only the two columns needed for the submission.
final_df = df[['object_id', 'prediction']]

# 4. Save the final file.
final_df.to_csv(OUTPUT_PATH, index=False)

# Report the name of the file that was actually written.
print(f"Đã lưu file {OUTPUT_PATH}")
print(final_df.head())
print("\nThống kê số lượng:")
print(final_df['prediction'].value_counts())

Đã lưu file submission_final.csv
                      object_id  prediction
0      Eluwaith_Mithrim_nothrim           0
1            Eru_heledir_archam           0
2             Gonhir_anann_fuin           0
3  Gwathuirim_haradrim_tegilbor           0
4              achas_minai_maen           0

Thống kê số lượng:
prediction
0    6706
1     429
Name: count, dtype: int64
