# In [None]:
import pandas as pd
import numpy as np
import os
import gc
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
import itertools

# --- 1. CONFIGURATION ---
# Root directory that holds train.csv / test.csv and the tracking/annotation parquet folders.
DATA_PATH = '/kaggle/input/MABe-mouse-behavior-detection/'
# Upper bound on how many rows of train.csv are loaded for training.
NUM_VIDEOS_TRAIN = 50 
MIN_DURATION_FRAMES = 3  # Drop predicted actions shorter than 3 frames (about 0.1 s)

# --- 2. ADVANCED FEATURE ENGINEERING FUNCTION (PHYSICS + SOCIAL) ---
def calculate_features_safe(df):
    """Add pairwise kinematic and social features to a wide tracking frame.

    The frame is expected to use ``mouse1_<bodypart>_x/_y`` and
    ``mouse2_<bodypart>_x/_y`` column names; consecutive rows are treated as
    consecutive time steps (``diff`` / ``rolling`` run over row order).

    A feature is computed only when BOTH coordinates of every keypoint it
    needs are present; otherwise it is left at 0.0 instead of raising.

    Args:
        df: pandas DataFrame of keypoint coordinates. The feature columns are
            added to this object in place.

    Returns:
        ``df.fillna(0)`` -- a NaN-free frame containing the original columns
        plus: ``distance``, ``velocity_m1``, ``velocity_m2``, ``accel_m1``,
        ``accel_m2``, ``nose1_to_tail2``, ``nose1_to_nose2``,
        ``facing_angle_m1`` and 10-row rolling means of six of them.
    """
    def has_point(name):
        # A keypoint is usable only when both of its coordinates exist.
        return f'{name}_x' in df.columns and f'{name}_y' in df.columns

    def separation(a, b):
        # Row-wise Euclidean distance between keypoints a and b.
        dx = df[f'{a}_x'] - df[f'{b}_x']
        dy = df[f'{a}_y'] - df[f'{b}_y']
        return np.sqrt(dx**2 + dy**2)

    def step_size(name):
        # Row-to-row displacement of a keypoint; the first row has no
        # predecessor and is defined as 0.
        vx = df[f'{name}_x'].diff().fillna(0)
        vy = df[f'{name}_y'].diff().fillna(0)
        return np.sqrt(vx**2 + vy**2)

    body1, body2 = 'mouse1_body_center', 'mouse2_body_center'
    nose1, nose2, tail2 = 'mouse1_nose', 'mouse2_nose', 'mouse2_tail_base'

    # Defaults, so the columns exist even when the inputs are missing.
    df['distance'] = 0.0
    df['velocity_m1'] = 0.0
    df['velocity_m2'] = 0.0

    # 1. Body-centre distance between the two mice.
    if has_point(body1) and has_point(body2):
        df['distance'] = separation(body1, body2)

    # 2./3. Per-row speed of each mouse.
    if has_point(body1):
        df['velocity_m1'] = step_size(body1)
    if has_point(body2):
        df['velocity_m2'] = step_size(body2)

    # 4. Acceleration: row-to-row change in speed.
    df['accel_m1'] = df['velocity_m1'].diff().fillna(0)
    df['accel_m2'] = df['velocity_m2'].diff().fillna(0)

    # 5. Social feature: nose of mouse 1 to tail base of mouse 2
    #    (intended for chase / genital-sniff behaviours).
    if has_point(nose1) and has_point(tail2):
        df['nose1_to_tail2'] = separation(nose1, tail2)
    else:
        df['nose1_to_tail2'] = 0.0

    # 6. Social feature: nose-to-nose distance (intended for face sniffing).
    if has_point(nose1) and has_point(nose2):
        df['nose1_to_nose2'] = separation(nose1, nose2)
    else:
        df['nose1_to_nose2'] = 0.0

    # 7. Social feature: is mouse 1 facing mouse 2?  Despite the column name
    #    this is the COSINE of the angle between mouse 1's body->nose vector
    #    and the vector from mouse 1's body centre to mouse 2's body centre.
    if has_point(nose1) and has_point(body1) and has_point(body2):
        heading_x = df['mouse1_nose_x'] - df['mouse1_body_center_x']
        heading_y = df['mouse1_nose_y'] - df['mouse1_body_center_y']
        to_other_x = df['mouse2_body_center_x'] - df['mouse1_body_center_x']
        to_other_y = df['mouse2_body_center_y'] - df['mouse1_body_center_y']

        dot_product = heading_x * to_other_x + heading_y * to_other_y
        heading_norm = np.sqrt(heading_x**2 + heading_y**2)
        to_other_norm = np.sqrt(to_other_x**2 + to_other_y**2)
        # The epsilon keeps the ratio finite when either vector has zero length.
        df['facing_angle_m1'] = dot_product / (heading_norm * to_other_norm + 1e-6)
    else:
        df['facing_angle_m1'] = 0.0

    # 8. Rolling means (short-term memory).  Rows with fewer than `w`
    #    predecessors have no full window and are filled with 0.
    w = 10
    cols_to_roll = ['distance', 'velocity_m1', 'velocity_m2', 'accel_m1', 'nose1_to_tail2', 'facing_angle_m1']
    for col in cols_to_roll:
        df[f'{col}_mean_{w}'] = df[col].rolling(window=w).mean().fillna(0)

    return df.fillna(0)

# Updated feature list
features = [
    # Instantaneous kinematics
    'distance',
    'velocity_m1',
    'velocity_m2',
    'accel_m1',
    'accel_m2',
    # Instantaneous social geometry
    'nose1_to_tail2',
    'nose1_to_nose2',
    'facing_angle_m1',
    # 10-row rolling means
    'distance_mean_10',
    'velocity_m1_mean_10',
    'velocity_m2_mean_10',
    'accel_m1_mean_10',
    'nose1_to_tail2_mean_10',
    'facing_angle_m1_mean_10',
]

# --- 3. DATA LOADING FUNCTION (LOGIC UNCHANGED, ONLY CALLS THE NEW FEATURES) ---
def load_train_data_all_pairs(meta_row):
    """Load one training video and return labelled frames for every mouse pair.

    The long-format tracking parquet is pivoted to one row per ``video_frame``
    with ``mouse<id>_<bodypart>_x/_y`` columns, short gaps are interpolated
    (up to 5 rows), remaining gaps become 0, and coordinates are divided by
    ``pix_per_cm_approx``.  For every unordered pair of mouse ids the two
    animals' columns are renamed to ``mouse1_*`` / ``mouse2_*`` and each frame
    gets a ``label``: the annotated action for that pair, or ``'other'``.

    Args:
        meta_row: mapping / Series with ``video_id``, ``lab_id`` and
            ``pix_per_cm_approx`` (non-positive or NaN values fall back to 1.0).

    Returns:
        A DataFrame with all pairs stacked under a fresh 0..n-1 index, or
        ``None`` when the files are missing, no pair could be built, or
        loading failed (the failure is printed, not raised).

    NOTE(review): because the pairs are stacked under a reset index, any
    caller that runs ``diff``/``rolling`` over the result mixes the last
    frames of one pair with the first frames of the next.
    NOTE(review): annotations are matched in both directions (agent/target
    swapped) while only one column orientation per pair is emitted, so the
    label does not say which of ``mouse1``/``mouse2`` is the agent -- confirm
    this is intended given that prediction iterates ordered pairs.
    """
    video_id = None  # bound before the try so the error report can always name it
    try:
        video_id = meta_row['video_id']
        lab_id = meta_row['lab_id']
        pix_per_cm = meta_row['pix_per_cm_approx'] if meta_row['pix_per_cm_approx'] > 0 else 1.0

        t_path = os.path.join(DATA_PATH, 'train_tracking', lab_id, f'{video_id}.parquet')
        a_path = os.path.join(DATA_PATH, 'train_annotation', lab_id, f'{video_id}.parquet')

        if not (os.path.exists(t_path) and os.path.exists(a_path)):
            return None

        # Long -> wide: one row per frame, one column per (mouse, bodypart, axis).
        df_track = pd.read_parquet(t_path)
        px = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='x')
        px.columns = [f"mouse{m}_{bp}_x" for m, bp in px.columns]
        py = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='y')
        py.columns = [f"mouse{m}_{bp}_y" for m, bp in py.columns]
        df_wide = pd.concat([px, py], axis=1).sort_index(axis=1)

        df_wide = df_wide.interpolate(limit=5).fillna(0)
        df_wide = df_wide / pix_per_cm

        df_annot = pd.read_parquet(a_path)

        mouse_ids = sorted({
            int(c.split('_')[0].replace('mouse', ''))
            for c in df_wide.columns if 'mouse' in c
        })
        last_frame = df_wide.index.max()  # same for every pair; computed once
        pairs_data = []

        for m1, m2 in itertools.combinations(mouse_ids, 2):
            prefix1, prefix2 = f'mouse{m1}_', f'mouse{m2}_'
            # Prefix match: every column name starts with its owner's "mouse<id>_".
            cols1 = [c for c in df_wide.columns if c.startswith(prefix1)]
            cols2 = [c for c in df_wide.columns if c.startswith(prefix2)]
            if not cols1 or not cols2:
                continue

            # Canonical names so the same feature code works for every pair.
            rename_dict = {c: 'mouse1_' + c[len(prefix1):] for c in cols1}
            rename_dict.update({c: 'mouse2_' + c[len(prefix2):] for c in cols2})
            df_pair = df_wide[cols1 + cols2].rename(columns=rename_dict)

            df_pair['label'] = 'other'
            mask = ((df_annot['agent_id'] == m1) & (df_annot['target_id'] == m2)) | \
                   ((df_annot['agent_id'] == m2) & (df_annot['target_id'] == m1))

            for ann in df_annot[mask].itertuples(index=False):
                # Annotations running past the last tracked frame are skipped.
                if ann.stop_frame <= last_frame:
                    # Label-based slice: both ends inclusive.
                    df_pair.loc[ann.start_frame:ann.stop_frame, 'label'] = ann.action

            pairs_data.append(df_pair)

        if pairs_data:
            return pd.concat(pairs_data, ignore_index=True)
    except Exception as e:
        # Best-effort loader: one bad video must not abort the run, but the
        # reason is reported instead of being silently discarded.
        print(f"Error loading train video {video_id}: {e}")
        return None
    return None

# --- 4. PREPARE DATA (NO 1:1 DOWNSAMPLING) ---
print("‚è≥ ƒêang t·∫°o d·ªØ li·ªáu 'All-Pairs' t·ª´ 50 video...")
try:
    df_meta = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
except Exception as e:
    # Best-effort: continue with an empty frame (nothing gets trained), but
    # say why, and do not trap KeyboardInterrupt/SystemExit as a bare except would.
    print(f"Could not read train.csv: {e}")
    df_meta = pd.DataFrame()

all_train_dfs = []
for i in tqdm(range(min(NUM_VIDEOS_TRAIN, len(df_meta)))):
    df = load_train_data_all_pairs(df_meta.iloc[i])
    if df is not None:
        df = calculate_features_safe(df)
        # Keep only model inputs + target to limit memory.
        all_train_dfs.append(df[features + ['label']])

if all_train_dfs:
    df_train_big = pd.concat(all_train_dfs, ignore_index=True)
    del all_train_dfs
    gc.collect()

    print(f"‚úÖ D·ªØ li·ªáu th√¥: {df_train_big.shape}")

    # --- 5. HANDLE CLASS IMBALANCE ---
    # Instead of discarding 'other' frames, keep everything (a light
    # downsample such as 3:1 would be the alternative) and let LightGBM
    # compensate through class weights.
    print("‚öñÔ∏è Kh√¥ng downsample th√¥ b·∫°o. S·ª≠ d·ª•ng Class Weights...")

    # Label encoding: string actions -> integer class ids.
    le = LabelEncoder()
    y_train = le.fit_transform(df_train_big['label'])
    X_train = df_train_big[features]

    # 'balanced' class weights, keyed by encoded class id.
    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    weight_dict = dict(zip(classes, weights))
    print(f"‚öñÔ∏è Tr·ªçng s·ªë l·ªõp (Class Weights): {weight_dict}")

    # --- 6. TRAIN LIGHTGBM ---
    print("üöÄ ƒêang hu·∫•n luy·ªán LightGBM v·ªõi Class Weights...")
    params = {
        'objective': 'multiclass',
        'num_class': len(le.classes_),
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'random_state': 42,
        'learning_rate': 0.05,
        'n_estimators': 1000,         # more trees because there is more data
        'class_weight': weight_dict,  # the key setting: counteracts class imbalance
        'colsample_bytree': 0.8,      # feature subsampling against overfitting
        'subsample': 0.8,             # row subsampling against overfitting
        'subsample_freq': 1,          # required: LightGBM ignores `subsample` while this is 0
        'verbosity': -1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    print("üéâ Hu·∫•n luy·ªán xong!")

else:
    print("‚ùå Kh√¥ng load ƒë∆∞·ª£c d·ªØ li·ªáu n√†o.")

# --- 7. POST-PROCESSING FUNCTION (UPDATE: MIN DURATION) ---
def run_length_encoding_pro(predictions, agent_id, target_id, min_duration=None):
    """Collapse per-frame labels into action events (run-length encoding).

    Consecutive equal labels form one run.  Runs labelled ``'other'`` and runs
    shorter than ``min_duration`` are dropped (noise suppression).

    Args:
        predictions: sequence of per-frame labels.
        agent_id: value copied into each event's ``agent_id``.
        target_id: value copied into each event's ``target_id``.
        min_duration: minimum run length (in frames) to keep.  ``None``
            (the default) uses the module-level ``MIN_DURATION_FRAMES``.

    Returns:
        List of dicts with keys ``agent_id``, ``target_id``, ``action``,
        ``start_frame`` and ``stop_frame``.  Frames are 0-based POSITIONS in
        ``predictions`` (``stop_frame`` inclusive), not necessarily the
        original video frame numbers.
    """
    events = []
    if len(predictions) == 0:
        return events

    if min_duration is None:
        min_duration = MIN_DURATION_FRAMES

    position = 0
    for label, run in itertools.groupby(predictions):
        run_length = sum(1 for _ in run)
        start = position
        position += run_length
        # Skip background and runs too short to be a real action.
        if label == 'other' or run_length < min_duration:
            continue
        events.append({
            'agent_id': agent_id,
            'target_id': target_id,
            'action': label,
            'start_frame': start,
            'stop_frame': position - 1,
        })
    return events

# --- 8. BUILD SUBMISSION ---
# Runs only when the training section above produced a model.
if 'model' in locals():
    print("üìù ƒêang t·∫°o submission...")
    try:
        df_test_meta = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
    except Exception as e:
        # Best-effort: fall back to an empty submission, but report the cause.
        print(f"Could not read test.csv: {e}")
        df_test_meta = pd.DataFrame()

    submission_rows = []
    row_id_counter = 0

    for _, row in tqdm(df_test_meta.iterrows(), total=len(df_test_meta)):
        video_id = None  # reset so the error report never shows a stale or unbound id
        try:
            video_id = row['video_id']
            lab_id = row['lab_id']
            pix_per_cm = row['pix_per_cm_approx'] if row['pix_per_cm_approx'] > 0 else 1.0

            t_path = os.path.join(DATA_PATH, 'test_tracking', lab_id, f'{video_id}.parquet')
            if not os.path.exists(t_path):
                continue

            # Same long -> wide preprocessing as used for training.
            df_track = pd.read_parquet(t_path)
            px = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='x')
            px.columns = [f"mouse{m}_{bp}_x" for m, bp in px.columns]
            py = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='y')
            py.columns = [f"mouse{m}_{bp}_y" for m, bp in py.columns]
            df_wide = pd.concat([px, py], axis=1).sort_index(axis=1)

            df_wide = df_wide.interpolate(limit=5).fillna(0)
            df_wide = df_wide / pix_per_cm

            # Row position -> actual video_frame value.  Training labels are
            # assigned by video_frame, so events must be reported in the same
            # units even if the index does not start at 0 or has gaps.
            frame_numbers = df_wide.index.to_numpy()

            mouse_ids = sorted({
                int(c.split('_')[0].replace('mouse', ''))
                for c in df_wide.columns if 'mouse' in c
            })

            # Ordered pairs: each mouse is tried as agent against every other mouse.
            for m1, m2 in itertools.permutations(mouse_ids, 2):
                cols1 = [c for c in df_wide.columns if f'mouse{m1}_' in c]
                cols2 = [c for c in df_wide.columns if f'mouse{m2}_' in c]
                if not cols1 or not cols2:
                    continue

                rename_dict = {c: c.replace(f'mouse{m1}_', 'mouse1_') for c in cols1}
                rename_dict.update({c: c.replace(f'mouse{m2}_', 'mouse2_') for c in cols2})
                df_pair = df_wide[cols1 + cols2].rename(columns=rename_dict)

                # Feature engineering
                df_pair = calculate_features_safe(df_pair)

                # Exactly the training columns, in training order; any feature
                # that could not be produced is filled with 0.0.
                X_test = df_pair.reindex(columns=features, fill_value=0.0)

                # Predict
                y_pred_idx = model.predict(X_test)
                y_pred_lbl = le.inverse_transform(y_pred_idx)

                agent_str = f"mouse{m1}"
                target_str = f"mouse{m2}"

                # Post-processing (run-length encoding + short-run filtering)
                events = run_length_encoding_pro(y_pred_lbl, agent_str, target_str)

                for event in events:
                    submission_rows.append({
                        'row_id': row_id_counter,
                        'video_id': video_id,
                        'agent_id': event['agent_id'],
                        'target_id': event['target_id'],
                        'action': event['action'],
                        # Events carry row positions; translate to frame numbers.
                        'start_frame': frame_numbers[event['start_frame']],
                        'stop_frame': frame_numbers[event['stop_frame']],
                    })
                    row_id_counter += 1

            del df_wide
            gc.collect()
        except Exception as e:
            print(f"Error processing test video {video_id}: {e}")
            continue

    # Save submission
    sub = pd.DataFrame(submission_rows)
    if sub.empty:
        # Keep the expected header even when nothing was predicted.
        sub = pd.DataFrame(columns=['row_id', 'video_id', 'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame'])

    sub.to_csv('submission.csv', index=False)
    print(f"‚úÖ ƒê√£ t·∫°o submission.csv M·ªöI v·ªõi {len(sub)} d√≤ng d·ª± ƒëo√°n.")
    display(sub.head())