In [1]:
# ==========================================
# NOTEBOOK 2: TRAINING - 3 SEEDS BAGGING (95% Data)
# ==========================================

import joblib
from xgboost import XGBClassifier
from catboost import CatBoostClassifier, Pool
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import GroupShuffleSplit
from sklearn.utils.class_weight import compute_sample_weight
import pandas as pd
import numpy as np
import gc
import os
import glob
import shutil
import warnings
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')

# --- AUTO-DETECT PATH ---
def find_feature_path():
    """Locate the directory that holds the feature parquet files.

    A local ``./processed_data`` folder wins when it contains at least one
    ``.parquet`` file. Otherwise ``/kaggle/input`` is walked and the first
    directory containing a ``.parquet`` file with ``train`` in its name is
    returned.

    Returns:
        The directory path as a string, or ``None`` when nothing matches.
    """
    local_dir = './processed_data'
    if os.path.exists(local_dir) and glob.glob('./processed_data/*.parquet'):
        return local_dir

    # Fall back to scanning the Kaggle input mount for a training parquet.
    for current_dir, _subdirs, filenames in os.walk('/kaggle/input'):
        if any(name.endswith('.parquet') and 'train' in name for name in filenames):
            return current_dir

    return None

# Directory with the feature parquet files (None when auto-detection fails).
INPUT_FEATURE_PATH = find_feature_path()

# Every run starts from an empty model directory: any previous ./models
# folder is removed before it is recreated.
MODEL_SAVE_PATH = './models'
if os.path.exists(MODEL_SAVE_PATH):
    shutil.rmtree(MODEL_SAVE_PATH)
os.makedirs(MODEL_SAVE_PATH, exist_ok=True)

# Feature columns fed to the models. The order matters: it defines the column
# order of the matrices built with ``df[FEATURE_COLS].values``.
FEATURE_COLS = [
    'distance',
    'velocity_m1',
    'velocity_m2',
    'accel_m1',
    'accel_m2',
    'curvature_m1',
    'curvature_m2',
    'turn_rate_m1',
    'turn_rate_m2',
    'speed_ratio_m1',
    'nose1_to_tail2',
    'nose1_to_nose2',
    'facing_angle_m1',
    'elongation_m1',
    'elongation_m2',
    'spine_alignment',
    'vel_m1_std_10',
    'vel_m2_std_10',
    'grooming_score_m1',
    'dist_mean_10',
    'vel_m1_mean_10',
    'vel_m2_mean_10',
    'facing_mean_10',
    'curvature_m1_mean_10',
]

def load_data_smart_split(feature_path, meta_path, val_ratio=0.1, train_downsample_rate=0.1):
    """Load the feature parquet files and split them into train/validation by video.

    Videos listed in ``{meta_path}/train.csv`` (excluding rows whose ``lab_id``
    starts with ``MABe22_``) are split once with ``GroupShuffleSplit`` so that a
    video never appears on both sides. Rows belonging to validation videos are
    kept untouched. Every other row is treated as training data; among those,
    rows labelled ``'other'`` are randomly downsampled per file while all
    remaining rows are kept.

    Args:
        feature_path: Directory containing the ``*.parquet`` feature files. Each
            file must provide ``video_id``, ``label`` and the ``FEATURE_COLS``
            columns.
        meta_path: Directory containing ``train.csv`` with ``lab_id`` and
            ``video_id`` columns.
        val_ratio: Fraction of videos held out for validation.
        train_downsample_rate: Fraction of ``'other'`` rows kept in the
            training part of each file.

    Returns:
        A ``(train_df, val_df)`` tuple of DataFrames with fresh integer
        indexes. In both, the ``FEATURE_COLS`` columns have NaNs replaced by 0
        and are cast to ``float32``.

    Raises:
        FileNotFoundError: If ``feature_path`` contains no ``.parquet`` file
            (this includes ``feature_path`` being ``None``).
        ValueError: If no training rows or no validation rows could be loaded.
    """
    print("üîÑ ƒêang th·ª±c hi·ªán Smart Loading (Gi·ªØ Val chu·∫©n, Gi·∫£m Train)...")

    # Fail fast with an explicit message instead of a late "No objects to
    # concatenate" error. Sorting makes the row order of the result independent
    # of the filesystem's listing order.
    files = sorted(glob.glob(f'{feature_path}/*.parquet'))
    if not files:
        raise FileNotFoundError(
            f"No .parquet feature files found in feature_path={feature_path!r}"
        )

    # Step 1: collect the video ids so the split is decided before any
    # feature file is read.
    meta_df = pd.read_csv(f'{meta_path}/train.csv')
    # Drop the MABe22 labs.
    meta_df = meta_df[~meta_df['lab_id'].str.startswith('MABe22_')]
    unique_videos = meta_df['video_id'].unique()

    # Split on video id; the ids double as the groups, so each video is
    # assigned entirely to one side.
    from sklearn.model_selection import GroupShuffleSplit
    gss = GroupShuffleSplit(n_splits=1, test_size=val_ratio, random_state=42)
    train_vids_idx, val_vids_idx = next(gss.split(unique_videos, groups=unique_videos))

    val_video_ids = set(unique_videos[val_vids_idx])
    print(f"üìã Split Info: {len(train_vids_idx)} Train Videos | {len(val_video_ids)} Validation Videos")

    # Step 2: read the files one at a time and filter immediately so that only
    # the reduced rows stay in memory.
    train_chunks = []
    val_chunks = []

    for f in tqdm(files, desc="Smart Loading Parquets"):
        try:
            df_chunk = pd.read_parquet(f)
            mask_val = df_chunk['video_id'].isin(val_video_ids)

            # Validation rows are kept in full (real class distribution).
            df_val_part = df_chunk[mask_val]
            if not df_val_part.empty:
                val_chunks.append(df_val_part)

            # Training rows: keep every non-'other' row, subsample 'other'.
            df_train_part = df_chunk[~mask_val]
            if not df_train_part.empty:
                is_other = df_train_part['label'] == 'other'
                df_actions = df_train_part[~is_other]
                df_others = df_train_part[is_other]
                if not df_others.empty:
                    df_others = df_others.sample(frac=train_downsample_rate, random_state=42)
                train_chunks.append(pd.concat([df_actions, df_others], axis=0))

            # Release the raw chunk right away to keep peak memory down.
            del df_chunk, df_val_part, df_train_part
            gc.collect()

        except Exception as e:
            # Deliberate best-effort: a file that cannot be processed is
            # reported and skipped rather than aborting the whole load.
            print(f"‚ö†Ô∏è L·ªói ƒë·ªçc file {f}: {e}")

    if not train_chunks or not val_chunks:
        raise ValueError(
            "Smart loading produced no rows for "
            f"{'train' if not train_chunks else 'validation'}; "
            "check the parquet files and the video ids in train.csv"
        )

    # Step 3: assemble the final frames.
    print("üß© Concatenating Dataframes...")
    X_train_df = pd.concat(train_chunks, axis=0, ignore_index=True)
    X_val_df = pd.concat(val_chunks, axis=0, ignore_index=True)

    # Replace missing feature values with 0 and store features as float32.
    X_train_df[FEATURE_COLS] = X_train_df[FEATURE_COLS].fillna(0).astype(np.float32)
    X_val_df[FEATURE_COLS] = X_val_df[FEATURE_COLS].fillna(0).astype(np.float32)

    return X_train_df, X_val_df

# --- OPTIMIZATION UTILS ---
def optimize_thresholds_fast(y_true, y_probs, num_classes, random_state=42):
    """Pick, per class, the probability threshold that maximises one-vs-rest F1.

    For every class index the candidate thresholds ``np.arange(0.2, 0.9, 0.05)``
    are scanned and the first one reaching the highest binary F1 is kept.
    Classes with no positive sample in the evaluated data get the default
    threshold 0.5.

    Args:
        y_true: 1-D array-like of integer class indices.
        y_probs: 2-D array-like of predicted probabilities, indexed as
            ``[sample, class]``.
        num_classes: Number of class indices to optimise (``0..num_classes-1``).
        random_state: Seed for the subsample drawn when more than 500,000
            samples are given. A local generator is used, so results are
            reproducible and the global NumPy random state is left untouched.

    Returns:
        Dict mapping each class index to its selected threshold.
    """
    # Accept lists / Series as well and make sure indexing is positional.
    y_true = np.asarray(y_true)
    y_probs = np.asarray(y_probs)

    # Subsample very large inputs so the scan stays fast.
    max_samples = 500_000
    if len(y_true) > max_samples:
        rng = np.random.default_rng(random_state)
        idx = rng.choice(len(y_true), max_samples, replace=False)
        y_t, y_p = y_true[idx], y_probs[idx]
    else:
        y_t, y_p = y_true, y_probs

    print("   ‚öñÔ∏è ƒêang t√¨m Threshold t·ªëi ∆∞u tr√™n t·∫≠p Validation th·ª±c t·∫ø...")

    # The grid is identical for every class, so build it once.
    candidate_thresholds = np.arange(0.2, 0.9, 0.05)

    best_thresholds = {}
    for i in range(num_classes):
        y_true_i = (y_t == i).astype(int)

        # Class absent from the evaluated data: nothing to optimise.
        if np.sum(y_true_i) == 0:
            best_thresholds[i] = 0.5
            continue

        best_f1, best_th = -1, 0.5
        probs_i = y_p[:, i]

        for th in candidate_thresholds:
            score = f1_score(y_true_i, (probs_i >= th).astype(int))
            # Strict '>' keeps the lowest threshold among ties.
            if score > best_f1:
                best_f1, best_th = score, th

        best_thresholds[i] = best_th
        # Debug output, only for classes that reach a non-zero F1.
        if best_f1 > 0:
            print(f"      Class {i}: Best Threshold={best_th:.2f}, F1={best_f1:.4f}")

    return best_thresholds

# --- MAIN TRAIN FUNCTION ---
def train_grandmaster_bagging():
    """Train the seed-bagged XGBoost + CatBoost ensemble and save all artifacts.

    Steps:
      1. Load the train/validation frames via ``load_data_smart_split``.
      2. Encode labels with a ``LabelEncoder`` fitted on train and validation
         labels together.
      3. For each seed, train one XGBoost and one CatBoost model on the GPU,
         using a fixed subset of at most 100,000 validation rows for early
         stopping, then predict on the full validation set.
      4. Blend both models' probabilities 50/50 and optimise per-class
         thresholds for that seed.
      5. Average the thresholds over the seeds; if an ``'other'`` class
         exists, its threshold is forced to 0.99.

    Side effects:
        Writes ``xgb_seed{seed}.pkl``, ``cat_seed{seed}.cbm``,
        ``label_encoder.pkl`` and ``best_thresholds.pkl`` into
        ``MODEL_SAVE_PATH``.
    """
    train_df, val_df = load_data_smart_split(
        INPUT_FEATURE_PATH,
        '/kaggle/input/MABe-mouse-behavior-detection',
        val_ratio=0.1,
        train_downsample_rate=0.1,
    )

    print(f"‚úÖ Final Train Size: {len(train_df)} (Balanced)")
    print(f"‚úÖ Final Val Size:   {len(val_df)} (Real Distribution)")

    # Fit the encoder on train + validation labels so every class that occurs
    # anywhere gets an index.
    le = LabelEncoder()
    all_labels = pd.concat([train_df['label'], val_df['label']]).unique()
    le.fit(all_labels)

    y_train = le.transform(train_df['label'])
    X_train = train_df[FEATURE_COLS].values

    y_val = le.transform(val_df['label'])
    X_val = val_df[FEATURE_COLS].values

    # Balanced per-sample weights for the training set (passed to XGBoost;
    # CatBoost uses its own auto_class_weights setting below).
    w_train = compute_sample_weight('balanced', y_train)

    num_classes = len(le.classes_)
    print(f"üéØ Num Classes: {num_classes}")

    # Early stopping is checked on a fixed-size validation subset to keep each
    # evaluation cheap. A seeded local generator makes the subset reproducible.
    eval_rng = np.random.default_rng(42)
    eval_idx = eval_rng.choice(len(X_val), size=min(len(X_val), 100_000), replace=False)
    X_val_eval = X_val[eval_idx]
    y_val_eval = y_val[eval_idx]
    print(f"‚ö° Optimization: Using subset of {len(X_val_eval)} val samples for Early Stopping check.")

    # Index of the 'other' class, or -1 when the data has no such label.
    other_idx = int(le.transform(['other'])[0]) if 'other' in le.classes_ else -1

    # Free the DataFrames before training; only the numpy arrays are needed.
    del train_df, val_df
    gc.collect()

    # Seeds for bagging (one XGBoost + one CatBoost model per seed).
    SEEDS = [42, 2023, 9999]

    all_thresholds = []

    for i, seed in enumerate(SEEDS):
        print(f"\n>>> üöÄ TRAINING SEED {seed} ({i+1}/{len(SEEDS)}) ")

        # --- XGBoost ---
        print(f"   ‚ö° XGBoost (Seed {seed})...")
        model_xgb = XGBClassifier(
            n_estimators=1500,
            learning_rate=0.07,
            max_depth=7,
            min_child_weight=10,
            subsample=0.8,
            colsample_bytree=0.8,
            # With the `device` parameter the GPU is selected via device='cuda'
            # and the tree method is plain 'hist'; 'gpu_hist' is the deprecated
            # spelling of the same thing.
            tree_method='hist',
            device='cuda',
            objective='multi:softprob',
            num_class=num_classes,
            # Early stopping is configured on the estimator; passing it to
            # fit() is deprecated and no longer accepted by newer releases.
            early_stopping_rounds=50,
            random_state=seed,
            n_jobs=-1,
        )

        model_xgb.fit(
            X_train, y_train,
            sample_weight=w_train,
            eval_set=[(X_val_eval, y_val_eval)],
            verbose=100,  # log the eval metric every 100 rounds
        )
        joblib.dump(model_xgb, f'{MODEL_SAVE_PATH}/xgb_seed{seed}.pkl')

        print("      -> Predicting on full validation set...")
        p1 = model_xgb.predict_proba(X_val)
        del model_xgb
        gc.collect()

        # --- CatBoost ---
        print(f"   üê± CatBoost (Seed {seed})...")
        model_cat = CatBoostClassifier(
            iterations=1500,
            learning_rate=0.05,
            depth=7,
            task_type='GPU',
            devices='0',
            loss_function='MultiClass',
            auto_class_weights='SqrtBalanced',
            random_seed=seed,
            verbose=False,
            allow_writing_files=False,
            metric_period=50,
        )
        train_pool = Pool(X_train, y_train)
        val_pool_small = Pool(X_val_eval, y_val_eval)

        model_cat.fit(
            train_pool,
            eval_set=val_pool_small,
            early_stopping_rounds=50,
        )
        model_cat.save_model(f'{MODEL_SAVE_PATH}/cat_seed{seed}.cbm')
        p2 = model_cat.predict_proba(X_val)
        del model_cat, train_pool, val_pool_small
        gc.collect()

        # --- Per-seed threshold optimisation on the 50/50 blend ---
        ens_probs = 0.5 * p1 + 0.5 * p2
        best_ths = optimize_thresholds_fast(y_val, ens_probs, num_classes)
        all_thresholds.append(best_ths)

        print(f"   ‚úÖ Seed {seed} Completed.")

    # Average each class threshold across the seeds.
    final_thresholds = {}
    if all_thresholds:
        final_thresholds = {
            k: np.mean([t[k] for t in all_thresholds]) for k in all_thresholds[0]
        }

    # Override the optimised value for 'other' with a fixed 0.99 threshold.
    if other_idx != -1:
        final_thresholds[other_idx] = 0.99

    # Persist the encoder and thresholds next to the models.
    joblib.dump(le, f'{MODEL_SAVE_PATH}/label_encoder.pkl')
    joblib.dump(final_thresholds, f'{MODEL_SAVE_PATH}/best_thresholds.pkl')
    print(f"\nüéâ TRAINING DONE. Models & Configs saved to {MODEL_SAVE_PATH}")

# Entry point: run the full training pipeline only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    train_grandmaster_bagging()

üîÑ ƒêang th·ª±c hi·ªán Smart Loading (Gi·ªØ Val chu·∫©n, Gi·∫£m Train)...
üìã Split Info: 776 Train Videos | 87 Validation Videos


Smart Loading Parquets:   0%|          | 0/9 [00:00<?, ?it/s]

üß© Concatenating Dataframes...
‚úÖ Final Train Size: 8329663 (Balanced)
‚úÖ Final Val Size:   3428516 (Real Distribution)
üéØ Num Classes: 27
‚ö° Optimization: Using subset of 100000 val samples for Early Stopping check.

>>> üöÄ TRAINING SEED 42 (1/3) 
   ‚ö° XGBoost (Seed 42)...
[0]	validation_0-mlogloss:3.18867
[100]	validation_0-mlogloss:1.94403
[200]	validation_0-mlogloss:1.78497
[300]	validation_0-mlogloss:1.68954
[400]	validation_0-mlogloss:1.62109
[500]	validation_0-mlogloss:1.56541
[600]	validation_0-mlogloss:1.51954
[700]	validation_0-mlogloss:1.47985
[800]	validation_0-mlogloss:1.44477
[900]	validation_0-mlogloss:1.41400
[1000]	validation_0-mlogloss:1.38612
[1100]	validation_0-mlogloss:1.36127
[1200]	validation_0-mlogloss:1.33877
[1300]	validation_0-mlogloss:1.31850
[1400]	validation_0-mlogloss:1.29907
[1499]	validation_0-mlogloss:1.28101
      -> Predicting on full validation set...
   üê± CatBoost (Seed 42)...
   ‚öñÔ∏è ƒêang t√¨m Threshold t·ªëi ∆∞u tr√™n t·∫≠p Valid