In [1]:
# ==========================================
# CELL 3: GLOBAL INFERENCE & SUBMISSION (FINAL PLATINUM)
# ==========================================
import joblib
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import pandas as pd
import numpy as np
import polars as pl
import os
import glob
import gc
import shutil
from tqdm.auto import tqdm
import warnings

warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Directory that is searched for the trained models (*.pkl / *.cbm) and for
# the label-encoder / threshold pickles.
MODEL_PATH = '/kaggle/input/model-testdata/models'
if not os.path.exists(MODEL_PATH):
    print(f"‚ö†Ô∏è ƒê∆∞·ªùng d·∫´n {MODEL_PATH} kh√¥ng t·ªìn t·∫°i!")
    print("C√°c th∆∞ m·ª•c trong /kaggle/input/model-testdata l√†:")
    try:
        # Best-effort diagnostic: show what the parent directory contains.
        print(os.listdir('/kaggle/input/model-testdata'))
    except OSError:
        # Only filesystem errors are expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt / SystemExit.
        print("Kh√¥ng ƒë·ªçc ƒë∆∞·ª£c th∆∞ m·ª•c cha.")
else:
    print(f"‚úÖ ƒê∆∞·ªùng d·∫´n model OK: {MODEL_PATH}")
    print(f"   T√¨m th·∫•y {len(glob.glob(f'{MODEL_PATH}/*.pkl'))} file .pkl")

# Root of the competition data (test.csv and test_tracking/ are read from here).
TEST_DATA_PATH = '/kaggle/input/MABe-mouse-behavior-detection'
# Output CSV, written relative to the current working directory.
SUBMISSION_FILE = 'submission.csv'

# --- FEATURE COLUMNS (PLATINUM) ---
# Columns selected, in this order, from the output of
# calculate_features_polars() and passed to every model.
# Author's note: must match Cell 1 & 2 (training) 100%.
FEATURE_COLS = [
    'distance', 'velocity_m1', 'velocity_m2', 'accel_m1', 'accel_m2',
    'curvature_m1', 'curvature_m2', 'turn_rate_m1', 'turn_rate_m2', 'speed_ratio_m1',
    'nose1_to_tail2', 'nose1_to_nose2', 'facing_angle_m1',
    'elongation_m1', 'elongation_m2', 'spine_alignment',
    'vel_m1_std_10', 'vel_m2_std_10', 'grooming_score_m1',
    'dist_mean_10', 'vel_m1_mean_10', 'vel_m2_mean_10',
    'facing_mean_10', 'curvature_m1_mean_10'
]

# --- DATA PROCESSING FUNCTIONS (embedded directly so this cell is independent of Cell 1) ---
def calculate_features_polars(df_pl):
    """Derive pairwise motion and posture features for one (mouse1, mouse2) pair.

    The input is sorted by ``frame`` and then extended in four passes; each
    pass only refers to columns that already exist after the previous one:

    1. centre-to-centre distance, per-mouse speed, nose-to-tail and
       nose-to-nose distances;
    2. helper vectors (velocity, spine, nose relative to body centre) and
       the acceleration vector obtained by differencing the velocity;
    3. derived features (acceleration, spine alignment, elongation,
       grooming score, facing angle, curvature, turn rate, speed ratio);
    4. 10-row rolling means and standard deviations.

    Args:
        df_pl: polars DataFrame with a ``frame`` column plus ``_x`` / ``_y``
            columns for ``mouse{1,2}_body_center``, ``mouse{1,2}_nose`` and
            ``mouse{1,2}_tail_base``.

    Returns:
        The sorted DataFrame with all helper and feature columns appended.
    """
    def xy(name):
        # Expression pair for the "<name>_x" / "<name>_y" columns.
        return pl.col(f"{name}_x"), pl.col(f"{name}_y")

    def length(dx, dy):
        # Euclidean length of the vector (dx, dy).
        return (dx**2 + dy**2).sqrt()

    def delta(expr):
        # Row-to-row difference; the first row (no predecessor) becomes 0.
        return expr.diff().fill_null(0)

    center1_x, center1_y = xy("mouse1_body_center")
    center2_x, center2_y = xy("mouse2_body_center")
    nose1_x, nose1_y = xy("mouse1_nose")
    nose2_x, nose2_y = xy("mouse2_nose")
    tail1_x, tail1_y = xy("mouse1_tail_base")
    tail2_x, tail2_y = xy("mouse2_tail_base")

    frame_sorted = df_pl.sort("frame")

    # --- STAGE 1: BASIC QUANTITIES ---
    stage1 = frame_sorted.with_columns([
        # Distance: body centre to body centre
        length(center1_x - center2_x, center1_y - center2_y).fill_null(0).alias("distance"),
        # Velocity magnitude (speed) of each body centre
        length(delta(center1_x), delta(center1_y)).alias("velocity_m1"),
        length(delta(center2_x), delta(center2_y)).alias("velocity_m2"),
        # Social distances
        length(nose1_x - tail2_x, nose1_y - tail2_y).fill_null(0).alias("nose1_to_tail2"),
        length(nose1_x - nose2_x, nose1_y - nose2_y).fill_null(0).alias("nose1_to_nose2"),
    ])

    # --- STAGE 2: VECTOR & HELPER COLUMNS ---
    # Materialised as columns so the later, more complex expressions can
    # refer to them by name.
    stage2 = stage1.with_columns([
        # Velocity vector
        delta(center1_x).alias("vx1"),
        delta(center1_y).alias("vy1"),
        delta(center2_x).alias("vx2"),
        delta(center2_y).alias("vy2"),
        # Spine vector: nose -> tail base (author's note: used instead of
        # the neck to be safe against missing data)
        (tail1_x - nose1_x).alias("spine1_x"),
        (tail1_y - nose1_y).alias("spine1_y"),
        (tail2_x - nose2_x).alias("spine2_x"),
        (tail2_y - nose2_y).alias("spine2_y"),
        # Relative nose: nose with respect to the body centre (for grooming)
        (nose1_x - center1_x).alias("nose1_rel_x"),
        (nose1_y - center1_y).alias("nose1_rel_y"),
    ])

    # Acceleration vector = difference of the velocity vector
    # (order: ax1, ay1, ax2, ay2).
    stage2 = stage2.with_columns([
        delta(pl.col(f"v{axis}{mouse}")).alias(f"a{axis}{mouse}")
        for mouse in (1, 2)
        for axis in ("x", "y")
    ])

    # --- STAGE 3: PLATINUM FEATURES ---
    # 1. Spine alignment & elongation
    spine1_x, spine1_y = xy("spine1")
    spine2_x, spine2_y = xy("spine2")
    spine1_len = length(spine1_x, spine1_y)
    spine2_len = length(spine2_x, spine2_y)
    spine_dot = spine1_x * spine2_x + spine1_y * spine2_y

    # 2. Micro-motion (grooming indicator): speed of the nose relative to
    #    the body centre
    rel_nose_x, rel_nose_y = xy("nose1_rel")
    nose_wobble = length(delta(rel_nose_x), delta(rel_nose_y))

    # 3. Facing angle: cosine between mouse1's centre->nose vector and the
    #    vector from mouse1's centre to mouse2's centre
    head_x = nose1_x - center1_x
    head_y = nose1_y - center1_y
    to_other_x = center2_x - center1_x
    to_other_y = center2_y - center1_y
    facing_dot = head_x * to_other_x + head_y * to_other_y
    head_len = length(head_x, head_y)
    to_other_len = length(to_other_x, to_other_y)

    def curvature(mouse):
        # |v x a| / (|v|^2 + eps)^1.5, NaN replaced by 0.
        vx, vy = pl.col(f"vx{mouse}"), pl.col(f"vy{mouse}")
        ax, ay = pl.col(f"ax{mouse}"), pl.col(f"ay{mouse}")
        return ((vx * ay - vy * ax).abs() / ((vx**2 + vy**2 + 1e-6)**1.5)).fill_nan(0)

    def heading(mouse):
        # 4. Direction of travel (angle of the velocity vector).
        return pl.arctan2(pl.col(f"vy{mouse}"), pl.col(f"vx{mouse}"))

    speed1 = pl.col("velocity_m1")

    stage3 = stage2.with_columns([
        # Acceleration magnitude (difference of the speed)
        delta(speed1).alias("accel_m1"),
        delta(pl.col("velocity_m2")).alias("accel_m2"),

        # Spine alignment: -1 (spines opposed), 1 (parallel, same direction)
        (spine_dot / (spine1_len * spine2_len + 1e-6)).fill_nan(0).alias("spine_alignment"),

        # Elongation: nose-to-tail-base body length
        spine1_len.alias("elongation_m1"),
        spine2_len.alias("elongation_m2"),

        # Grooming score: nose moves a lot / body moves little
        (nose_wobble / (speed1 + 0.5)).fill_nan(0).alias("grooming_score_m1"),

        # Facing angle
        (facing_dot / (head_len * to_other_len + 1e-6)).fill_nan(0).fill_null(0).alias('facing_angle_m1'),

        # Curvature
        curvature(1).alias("curvature_m1"),
        curvature(2).alias("curvature_m2"),

        # Turn rate & speed ratio
        heading(1).diff().abs().fill_null(0).alias("turn_rate_m1"),
        heading(2).diff().abs().fill_null(0).alias("turn_rate_m2"),
        (speed1 / (speed1.rolling_mean(30).fill_null(0) + 1e-6)).alias("speed_ratio_m1"),
    ])

    # --- STAGE 4: ROLLING WINDOWS (CONTEXT & JITTER) ---
    # Context: mean over 10 rows
    mean_specs = [
        ("distance", "dist_mean_10"),
        ("velocity_m1", "vel_m1_mean_10"),
        ("velocity_m2", "vel_m2_mean_10"),
        ("facing_angle_m1", "facing_mean_10"),
        ("curvature_m1", "curvature_m1_mean_10"),
    ]
    rolling = [
        pl.col(source).rolling_mean(10).fill_null(0).alias(target)
        for source, target in mean_specs
    ]
    # Jitter / tremor: standard deviation of the speed over 10 rows
    # (author's note: strong signal for grooming - stationary but trembling).
    rolling += [
        pl.col(f"velocity_m{mouse}").rolling_std(10).fill_null(0).alias(f"vel_m{mouse}_std_10")
        for mouse in (1, 2)
    ]

    return stage3.with_columns(rolling)

def standardize_tracking_data(df_wide, mouse_ids):
    """Ensure every mouse has body-centre, nose and tail-base coordinate columns.

    For each id in *mouse_ids* (column prefix ``mouse<id>_``):

    * if ``body_center_x`` is missing, the body centre is set to the row-wise
      mean of that mouse's existing ``*_x`` / ``*_y`` columns, or to 0.0 when
      the mouse has no ``*_x`` column at all;
    * if ``nose_x`` / ``tail_base_x`` is missing, both coordinates of that
      part are copied from the body centre.

    Only the ``_x`` column is inspected to decide whether a pair is missing.

    Args:
        df_wide: pandas DataFrame with string column names; modified in place.
        mouse_ids: iterable of mouse identifiers.

    Returns:
        The same ``df_wide`` object, with any missing columns added.
    """
    for mouse in mouse_ids:
        tag = f"mouse{mouse}_"
        cx_name = f"{tag}body_center_x"
        cy_name = f"{tag}body_center_y"

        if cx_name not in df_wide.columns:
            # Collect this mouse's coordinate columns before adding anything.
            own_cols = [name for name in df_wide.columns if name.startswith(tag)]
            xs = [name for name in own_cols if name.endswith('_x')]
            ys = [name for name in own_cols if name.endswith('_y')]
            if not xs:
                # Nothing tracked for this mouse: fall back to the origin.
                df_wide[cx_name] = 0.0
                df_wide[cy_name] = 0.0
            else:
                df_wide[cx_name] = df_wide[xs].mean(axis=1)
                df_wide[cy_name] = df_wide[ys].mean(axis=1)

        for part in ('nose', 'tail_base'):
            part_x = f"{tag}{part}_x"
            part_y = f"{tag}{part}_y"
            if part_x in df_wide.columns:
                continue
            # Missing body part: substitute the body centre.
            df_wide[part_x] = df_wide[cx_name]
            df_wide[part_y] = df_wide[cy_name]

    return df_wide

# --- MAIN INFERENCE LOOP ---
def _load_models(model_dir):
    """Load every XGBoost (*.pkl) and CatBoost (*.cbm) model found in *model_dir*.

    A file that fails to load is reported and skipped.

    Returns:
        List of ``(kind, model)`` tuples with ``kind`` in ``{'xgb', 'cat'}``.
    """
    models = []

    # A. XGBoost: every .pkl except the metadata pickles. The markers are
    # matched against the file name only, so a directory whose name happens
    # to contain one of them cannot filter out every model.
    metadata_markers = ('label_encoder', 'threshold', 'feature_names')
    xgb_files = [
        f for f in glob.glob(f'{model_dir}/*.pkl')
        if not any(marker in os.path.basename(f) for marker in metadata_markers)
    ]
    if xgb_files:
        print(f"   üîç Found {len(xgb_files)} XGBoost models.")
        for f in xgb_files:
            try:
                # NOTE(review): joblib.load unpickles arbitrary objects; only
                # point MODEL_PATH at files you produced yourself.
                model = joblib.load(f)
            except Exception as exc:
                print(f"     ‚ùå Failed to load: {os.path.basename(f)} ({exc})")
            else:
                models.append(('xgb', model))
                print(f"     Loaded: {os.path.basename(f)}")
    else:
        print("   ‚ö†Ô∏è No XGBoost models found.")

    # B. CatBoost: every .cbm file.
    cat_files = glob.glob(f'{model_dir}/*.cbm')
    if cat_files:
        print(f"   üîç Found {len(cat_files)} CatBoost models.")
        for f in cat_files:
            try:
                clf = CatBoostClassifier()
                clf.load_model(f)
            except Exception as exc:
                print(f"     ‚ùå Failed to load: {os.path.basename(f)} ({exc})")
            else:
                models.append(('cat', clf))
                print(f"     Loaded: {os.path.basename(f)}")
    else:
        print("   ‚ö†Ô∏è No CatBoost models found.")

    return models


def _apply_class_thresholds(probs, thresholds, other_idx, default_threshold=0.3):
    """Return per-row arg-max class indices, demoting low-confidence rows.

    A row whose winning class is not *other_idx* and whose winning
    probability is below that class's threshold is reassigned to
    *other_idx* (which may be the sentinel -1).

    Args:
        probs: array of shape (rows, classes), as built by the caller.
        thresholds: mapping looked up by integer class index via ``.get``.
            NOTE(review): assumes the keys are class indices, not class
            names - confirm against the cell that saved best_thresholds.pkl.
        other_idx: index of the 'other' class, or -1 when it does not exist.
        default_threshold: threshold for classes missing from *thresholds*.
    """
    pred_idx = np.argmax(probs, axis=1)
    per_class = np.array(
        [thresholds.get(k, default_threshold) for k in range(probs.shape[1])],
        dtype=float,
    )
    top_prob = probs[np.arange(len(pred_idx)), pred_idx]
    too_weak = (pred_idx != other_idx) & (top_prob < per_class[pred_idx])
    pred_idx[too_weak] = other_idx
    return pred_idx


def _extract_segments(pred_idx, frames, class_names, min_len=2):
    """Collapse per-row class indices into ``(action, start_frame, stop_frame)`` runs.

    Runs of 'other', runs of a negative (sentinel) index and runs shorter
    than *min_len* rows are dropped. ``stop_frame`` is the frame of the
    last row of the run.
    """
    n_rows = len(pred_idx)
    if n_rows == 0:
        return []

    # Row positions where the predicted class changes.
    cuts = np.flatnonzero(np.diff(pred_idx) != 0) + 1
    run_starts = np.concatenate(([0], cuts))
    run_stops = np.concatenate((cuts, [n_rows]))

    segments = []
    for start, stop in zip(run_starts, run_stops):
        cls = pred_idx[start]
        if cls < 0:
            # Sentinel for "no 'other' class in the encoder": treat as
            # background instead of letting -1 wrap around to the last class.
            continue
        action = class_names[cls]
        if action == 'other' or (stop - start) < min_len:
            continue
        segments.append((action, frames[start], frames[stop - 1]))
    return segments


def run_global_inference():
    """Run ensemble inference on every test video and write ``SUBMISSION_FILE``.

    Steps:
      1. Load all models, the label encoder and the per-class thresholds
         from ``MODEL_PATH``; abort (return) if any of these is unavailable.
      2. Re-create ``SUBMISSION_FILE`` with only the header row.
      3. For each row of ``test.csv``: read the tracking parquet, pivot it to
         one column per mouse/body-part coordinate, fill gaps, divide by
         ``pix_per_cm_approx``, and for every ordered pair of mice compute
         the features, average ``predict_proba`` over all models, apply the
         thresholds and turn the per-row classes into segments. Segments
         are appended to the CSV per video.

    A failure inside one video is printed and that video is skipped.

    Returns:
        None. The result is the CSV file on disk.
    """
    # Not among the file-level imports; imported once here rather than on
    # every video iteration.
    import itertools

    print("üöÄ Starting Global Inference (Platinum Features)...")

    # 1. LOAD MODELS
    print(f"üìÇ Searching models in: {MODEL_PATH}")
    models = _load_models(MODEL_PATH)
    if not models:
        print("‚ùå CRITICAL: Kh√¥ng load ƒë∆∞·ª£c b·∫•t k·ª≥ model n√†o!")
        return

    print(f"‚úÖ TOTAL MODELS LOADED: {len(models)}")

    # Load encoder & thresholds
    try:
        le = joblib.load(f'{MODEL_PATH}/label_encoder.pkl')
        best_ths = joblib.load(f'{MODEL_PATH}/best_thresholds.pkl')
        print(f"‚úÖ Configs loaded. Thresholds: {len(best_ths)} classes")
    except Exception as e:
        print(f"‚ùå CRITICAL ERROR: Kh√¥ng load ƒë∆∞·ª£c configs! L·ªói: {e}")
        return

    # Index of the 'other' class, or -1 when the encoder has no such class.
    # The -1 sentinel is never used to index le.classes_ (see
    # _extract_segments), so it cannot alias the last class.
    other_idx = next(
        (i for i, name in enumerate(le.classes_) if name == 'other'), -1
    )

    # 2. INIT SUBMISSION
    if os.path.exists(SUBMISSION_FILE):
        os.remove(SUBMISSION_FILE)
    pd.DataFrame(columns=['row_id', 'video_id', 'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame']).to_csv(SUBMISSION_FILE, index=False)

    test_meta = pd.read_csv(f'{TEST_DATA_PATH}/test.csv')
    row_id_counter = 0

    # Equal-weight soft voting over every loaded model.
    weight_per_model = 1.0 / len(models)
    n_classes = len(le.classes_)

    # 3. PROCESS VIDEOS
    for _, row in tqdm(test_meta.iterrows(), total=len(test_meta), desc="Inferencing"):
        vid = row['video_id']
        lab = row['lab_id']
        # Missing / non-positive scale falls back to 1.0 (NaN > 0 is False).
        pix_per_cm = row['pix_per_cm_approx'] if row['pix_per_cm_approx'] > 0 else 1.0

        t_path = f'{TEST_DATA_PATH}/test_tracking/{lab}/{vid}.parquet'
        if not os.path.exists(t_path):
            continue

        try:
            # ETL: long format -> one column per "mouse<id>_<bodypart>_<axis>"
            track_df = pd.read_parquet(t_path)
            track_df['col_name'] = 'mouse' + track_df['mouse_id'].astype(str) + '_' + track_df['bodypart']
            px = track_df.pivot(index='video_frame', columns='col_name', values='x')
            py = track_df.pivot(index='video_frame', columns='col_name', values='y')
            px.columns = [c + '_x' for c in px.columns]
            py.columns = [c + '_y' for c in py.columns]
            df_wide = pd.concat([px, py], axis=1).sort_index()

            mouse_ids = sorted({
                int(c.split('_')[0].replace('mouse', ''))
                for c in px.columns if 'mouse' in c
            })
            df_wide = standardize_tracking_data(df_wide, mouse_ids)

            # Author's note: limit=5 matches training and avoids fabricating
            # data across long gaps.
            df_wide = df_wide.interpolate(limit=5)
            # Sequential fill. ffill()/bfill() replace the deprecated
            # fillna(method=...) form, which newer pandas no longer accepts.
            df_wide = df_wide.ffill().bfill().fillna(0)

            # The frame number is still the index here, so it is not scaled.
            df_wide = df_wide / pix_per_cm
            df_wide = df_wide.reset_index().rename(columns={'video_frame': 'frame'})

            pl_wide = pl.from_pandas(df_wide)
            video_submissions = []

            for m1, m2 in itertools.permutations(mouse_ids, 2):
                # Present the (agent, target) pair as mouse1 / mouse2.
                col_map = {}
                for c in pl_wide.columns:
                    if f'mouse{m1}_' in c:
                        col_map[c] = c.replace(f'mouse{m1}_', 'mouse1_')
                    elif f'mouse{m2}_' in c:
                        col_map[c] = c.replace(f'mouse{m2}_', 'mouse2_')

                pair_pl = pl_wide.select(['frame'] + list(col_map.keys())).rename(col_map)
                pair_pl = calculate_features_polars(pair_pl)
                X_test = pair_pl.select(FEATURE_COLS).to_pandas().astype(np.float32)

                # PREDICT: average the class probabilities of all models.
                final_probs = np.zeros((len(X_test), n_classes))
                for _name, m in models:
                    final_probs += m.predict_proba(X_test) * weight_per_model

                # Safe thresholding: demote under-confident rows to 'other'.
                y_pred_idx = _apply_class_thresholds(final_probs, best_ths, other_idx)

                # POST-PROCESS: keep runs of at least 2 rows.
                frames = pair_pl['frame'].to_numpy()
                for action, start_frame, stop_frame in _extract_segments(
                        y_pred_idx, frames, le.classes_, min_len=2):
                    video_submissions.append({
                        'row_id': -1, 'video_id': vid,
                        'agent_id': f'mouse{m1}', 'target_id': f'mouse{m2}',
                        'action': action, 'start_frame': start_frame, 'stop_frame': stop_frame
                    })

                # Release per-pair buffers here, where they are guaranteed to
                # exist (a video with fewer than two mice never creates them).
                del pair_pl, X_test, final_probs

            if video_submissions:
                batch_df = pd.DataFrame(video_submissions)
                batch_df['row_id'] = range(row_id_counter, row_id_counter + len(batch_df))
                row_id_counter += len(batch_df)
                batch_df.to_csv(SUBMISSION_FILE, mode='a', header=False, index=False)

            del track_df, px, py, df_wide, pl_wide
            gc.collect()

        except Exception as e:
            # Per-video boundary: report and move on to the next video.
            print(f"‚ö†Ô∏è Err {vid}: {e}")
            continue

    print(f"‚úÖ DONE! Generated {row_id_counter} predictions.")

# Run: execute the inference pipeline when this cell is run.
run_global_inference()

‚úÖ ƒê∆∞·ªùng d·∫´n model OK: /kaggle/input/model-testdata/models
   T√¨m th·∫•y 5 file .pkl
üöÄ Starting Global Inference (Platinum Features)...
üìÇ Searching models in: /kaggle/input/model-testdata/models
   üîç Found 3 XGBoost models.
     Loaded: xgb_seed42.pkl
     Loaded: xgb_seed2023.pkl
     Loaded: xgb_seed9999.pkl
   üîç Found 3 CatBoost models.
     Loaded: cat_seed2023.cbm
     Loaded: cat_seed9999.cbm
     Loaded: cat_seed42.cbm
‚úÖ TOTAL MODELS LOADED: 6
‚úÖ Configs loaded. Thresholds: 27 classes


Inferencing:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ DONE! Generated 1201 predictions.
