In [1]:
import pandas as pd
import numpy as np
import os
from tqdm.auto import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import resample

# --- 1. CONFIGURATION ---
DATA_PATH = '/kaggle/input/MABe-mouse-behavior-detection/'
NUM_VIDEOS_TO_TRAIN = 20  # Number of videos used for training (the more the better)

# Read the training metadata table.
try:
    df_train_meta = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
except FileNotFoundError:
    print("‚ö†Ô∏è L·ªói: Kh√¥ng t√¨m th·∫•y file train.csv")
    # Keep the name defined (as an empty table) so the failure stays tied to
    # the missing file instead of surfacing later as an unrelated NameError.
    df_train_meta = pd.DataFrame()

# --- 2. FEATURE ENGINEERING FUNCTION ---
def calculate_features_with_memory(df):
    """Add distance, speed and rolling-window features for the mouse 1/2 pair.

    ``df`` is modified in place and also returned. The function reads the
    ``mouse1_body_center_{x,y}`` and ``mouse2_body_center_{x,y}`` columns when
    they exist. Any feature whose input columns are missing is set to 0
    instead of raising ``KeyError``; this now also covers mouse 1, which the
    distance and mouse-2 branches already tolerated.

    Columns written:
        distance      -- Euclidean distance between the two body centres.
        velocity_m1   -- frame-to-frame displacement of mouse 1.
        velocity_m2   -- frame-to-frame displacement of mouse 2.
        dist_mean_10, dist_std_10, vel1_mean_10, vel2_mean_10
                      -- 10-row rolling statistics of the columns above.

    Args:
        df: pandas DataFrame with one row per frame.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    m1_x, m1_y = 'mouse1_body_center_x', 'mouse1_body_center_y'
    m2_x, m2_y = 'mouse2_body_center_x', 'mouse2_body_center_y'
    has_m1 = m1_x in df.columns and m1_y in df.columns
    has_m2 = m2_x in df.columns and m2_y in df.columns

    # a. Physics: inter-mouse distance (needs both animals).
    if has_m1 and has_m2:
        dx = df[m1_x] - df[m2_x]
        dy = df[m1_y] - df[m2_y]
        df['distance'] = np.sqrt(dx**2 + dy**2)
    else:
        df['distance'] = 0

    # Per-mouse speed: magnitude of the row-to-row displacement. The first
    # row has no predecessor, so its diff is NaN and is replaced by 0.
    for out_col, present, x_col, y_col in (
        ('velocity_m1', has_m1, m1_x, m1_y),
        ('velocity_m2', has_m2, m2_x, m2_y),
    ):
        if present:
            step_x = df[x_col].diff().fillna(0)
            step_y = df[y_col].diff().fillna(0)
            df[out_col] = np.sqrt(step_x**2 + step_y**2)
        else:
            df[out_col] = 0

    # b. Memory: rolling statistics over the last 10 rows. Rows whose window
    # is not yet full produce NaN, which is replaced by 0.
    w = 10
    df['dist_mean_10'] = df['distance'].rolling(window=w).mean().fillna(0)
    df['dist_std_10'] = df['distance'].rolling(window=w).std().fillna(0)
    df['vel1_mean_10'] = df['velocity_m1'].rolling(window=w).mean().fillna(0)
    df['vel2_mean_10'] = df['velocity_m2'].rolling(window=w).mean().fillna(0)

    return df

# --- 3. DATA LOADING & FILTERING FUNCTION (ONLY THE 1-2 PAIR IS USED FOR TRAINING) ---
def get_train_data(idx):
    """Load one training video as a wide per-frame table labelled for the 1/2 pair.

    Row ``idx`` of the module-level ``df_train_meta`` supplies ``lab_id``,
    ``video_id`` and ``pix_per_cm_approx``. The tracking parquet is pivoted
    into one row per ``video_frame`` with columns named
    ``mouse{mouse_id}_{bodypart}_{x|y}``, and every coordinate is divided by
    ``pix_per_cm_approx`` (1.0 is used when that value is not > 0).

    A ``label`` column starts as ``'other'`` and is overwritten with the
    annotated ``action`` on frames ``start_frame..stop_frame`` for
    annotations whose agent/target are (1, 2) or (2, 1). Annotations whose
    ``stop_frame`` lies beyond the last tracked frame are ignored.

    Args:
        idx: positional row index into ``df_train_meta``.

    Returns:
        The wide DataFrame with remaining NaN values replaced by 0, or
        ``None`` when the tracking file is missing or the annotations could
        not be loaded or applied.
    """
    row = df_train_meta.iloc[idx]
    lab_id, video_id = row['lab_id'], row['video_id']
    pix_per_cm = row['pix_per_cm_approx'] if row['pix_per_cm_approx'] > 0 else 1.0

    # Locate the tracking and annotation files for this video.
    t_path = os.path.join(DATA_PATH, 'train_tracking', lab_id, f'{video_id}.parquet')
    a_path = os.path.join(DATA_PATH, 'train_annotation', lab_id, f'{video_id}.parquet')

    try:
        df_track = pd.read_parquet(t_path)
    except FileNotFoundError:
        return None

    # Pivot long -> wide (one column per mouse/bodypart/axis) and divide by
    # the pixels-per-cm factor.
    px = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='x')
    px.columns = [f"mouse{m}_{bp}_x" for m, bp in px.columns]
    py = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='y')
    py.columns = [f"mouse{m}_{bp}_y" for m, bp in py.columns]
    df_wide = pd.concat([px, py], axis=1).sort_index(axis=1)
    df_wide = df_wide / pix_per_cm

    # Load annotations and keep only interactions between mice 1 and 2.
    try:
        df_annot = pd.read_parquet(a_path)
        df_wide['label'] = 'other'
        mask = ((df_annot['agent_id'] == 1) & (df_annot['target_id'] == 2)) | \
               ((df_annot['agent_id'] == 2) & (df_annot['target_id'] == 1))
        pair_annot = df_annot[mask]

        last_frame = df_wide.index.max()  # loop-invariant, computed once
        for _, r in pair_annot.iterrows():
            if r['stop_frame'] <= last_frame:
                # .loc slicing is label-based and inclusive on both ends.
                df_wide.loc[r['start_frame']:r['stop_frame'], 'label'] = r['action']
    except Exception:
        # Best effort: skip videos whose labels cannot be read or applied.
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

    return df_wide.fillna(0)

# --- 4. MAIN PIPELINE: MERGE DATA -> BALANCE -> TRAIN ---
# Feature columns fed to the model (all produced by calculate_features_with_memory).
features = ['distance', 'velocity_m1', 'velocity_m2',
            'dist_mean_10', 'dist_std_10', 'vel1_mean_10', 'vel2_mean_10']

# A. Merge the data of several videos.
all_data = []
print(f"‚è≥ ƒêang x·ª≠ l√Ω {NUM_VIDEOS_TO_TRAIN} video...")
# Never index past the end of the metadata table: with fewer than
# NUM_VIDEOS_TO_TRAIN rows, `iloc` inside get_train_data would raise IndexError.
for i in tqdm(range(min(NUM_VIDEOS_TO_TRAIN, len(df_train_meta)))):
    df = get_train_data(i)
    if df is not None and len(df) > 0:
        df = calculate_features_with_memory(df)
        all_data.append(df[features + ['label']])

if not all_data:
    # pd.concat([]) would raise an opaque ValueError; name the real cause
    # while keeping the same exception type.
    raise ValueError("No training data could be loaded; check DATA_PATH and the dataset files.")

df_big_train = pd.concat(all_data, ignore_index=True)
print(f"‚úÖ K√≠ch th∆∞·ªõc t·∫≠p Train th√¥: {df_big_train.shape}")

# B. Balance the data (undersampling without replacement).
print("‚öñÔ∏è ƒêang c√¢n b·∫±ng d·ªØ li·ªáu...")
others = df_big_train[df_big_train['label'] == 'other']
actions = df_big_train[df_big_train['label'] != 'other']

# Both groups are cut down to the size of the smaller one.
min_sample = min(len(others), len(actions))
df_bal = pd.concat([
    resample(others, replace=False, n_samples=min_sample, random_state=42),
    resample(actions, replace=False, n_samples=min_sample, random_state=42)
])
print(f"‚úÖ D·ªØ li·ªáu sau c√¢n b·∫±ng: {df_bal.shape} (M·ªói phe {min_sample} d√≤ng)")

# C. Train the model.
print("üöÄ ƒêang hu·∫•n luy·ªán Random Forest...")
le = LabelEncoder()
y_train = le.fit_transform(df_bal['label'])
X_train = df_bal[features]

model_big = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
model_big.fit(X_train, y_train)
print("üéâ ƒê√£ hu·∫•n luy·ªán xong model_big!")
print(f"C√°c nh√£n ƒë√£ h·ªçc: {le.classes_}")

‚è≥ ƒêang x·ª≠ l√Ω 20 video...


  0%|          | 0/20 [00:00<?, ?it/s]

‚úÖ K√≠ch th∆∞·ªõc t·∫≠p Train th√¥: (2512390, 8)
‚öñÔ∏è ƒêang c√¢n b·∫±ng d·ªØ li·ªáu...
‚úÖ D·ªØ li·ªáu sau c√¢n b·∫±ng: (48634, 8) (M·ªói phe 24317 d√≤ng)
üöÄ ƒêang hu·∫•n luy·ªán Random Forest...
üéâ ƒê√£ hu·∫•n luy·ªán xong model_big!
C√°c nh√£n ƒë√£ h·ªçc: ['approach' 'attack' 'avoid' 'chase' 'chaseattack' 'other' 'shepherd'
 'submit']


In [2]:
import pandas as pd
import numpy as np
import os
import gc
import lightgbm as lgb
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
import itertools

# --- 1. CONFIGURATION ---
DATA_PATH = '/kaggle/input/MABe-mouse-behavior-detection/'
NUM_VIDEOS_TRAIN = 50  # Maximum number of training videos to load

# --- 2. SAFE FEATURE ENGINEERING FUNCTION (FIXES THE KEYERROR) ---
def calculate_features_safe(df):
    """Add distance, speed and rolling-mean features without raising on missing columns.

    ``df`` is modified in place and also returned. Every feature column is
    first created with the default 0.0 and only overwritten when BOTH the
    ``_x`` and ``_y`` body-centre columns it needs are present. The previous
    check looked at ``_x`` only and could still raise ``KeyError`` on a
    missing ``_y`` column.

    Columns written:
        distance     -- Euclidean distance between the two body centres.
        velocity_m1  -- frame-to-frame displacement of mouse 1.
        velocity_m2  -- frame-to-frame displacement of mouse 2.
        dist_mean_10, vel1_mean_10, vel2_mean_10
                     -- 10-row rolling means of the columns above.

    Args:
        df: pandas DataFrame with one row per frame; reads the
            ``mouse1_body_center_{x,y}`` / ``mouse2_body_center_{x,y}``
            columns when they exist.

    Returns:
        The same DataFrame object, with the feature columns added.
    """
    m1_x, m1_y = 'mouse1_body_center_x', 'mouse1_body_center_y'
    m2_x, m2_y = 'mouse2_body_center_x', 'mouse2_body_center_y'
    has_m1 = m1_x in df.columns and m1_y in df.columns
    has_m2 = m2_x in df.columns and m2_y in df.columns

    # Defaults first, so the rolling step below always finds its inputs.
    df['distance'] = 0.0
    df['velocity_m1'] = 0.0
    df['velocity_m2'] = 0.0

    # 1. Distance (only when both mice are fully available).
    if has_m1 and has_m2:
        dx = df[m1_x] - df[m2_x]
        dy = df[m1_y] - df[m2_y]
        df['distance'] = np.sqrt(dx**2 + dy**2)

    # 2. Speed of mouse 1 (first row has no predecessor -> 0).
    if has_m1:
        vx1 = df[m1_x].diff().fillna(0)
        vy1 = df[m1_y].diff().fillna(0)
        df['velocity_m1'] = np.sqrt(vx1**2 + vy1**2)

    # 3. Speed of mouse 2.
    if has_m2:
        vx2 = df[m2_x].diff().fillna(0)
        vy2 = df[m2_y].diff().fillna(0)
        df['velocity_m2'] = np.sqrt(vx2**2 + vy2**2)

    # 4. Memory: rolling means over the last 10 rows. Rows whose window is
    # not yet full produce NaN, which is replaced by 0.
    w = 10
    df['dist_mean_10'] = df['distance'].rolling(window=w).mean().fillna(0)
    df['vel1_mean_10'] = df['velocity_m1'].rolling(window=w).mean().fillna(0)
    df['vel2_mean_10'] = df['velocity_m2'].rolling(window=w).mean().fillna(0)

    return df

# Feature columns fed to the model, in this order; all six are created by
# calculate_features_safe.
features = ['distance', 'velocity_m1', 'velocity_m2', 'dist_mean_10', 'vel1_mean_10', 'vel2_mean_10']

# --- 3. "ALL-PAIRS" DATA LOADING FUNCTION (TRY-EXCEPT ADDED) ---
def load_train_data_all_pairs(meta_row):
    """Build a labelled training table covering every unordered mouse pair of one video.

    The tracking parquet is pivoted into one row per ``video_frame`` with
    columns ``mouse{mouse_id}_{bodypart}_{x|y}``. Gaps are interpolated
    (at most 5 consecutive rows), remaining NaN become 0, and coordinates
    are divided by ``pix_per_cm_approx`` (1.0 when that value is not > 0).

    For each pair ``(m1, m2)`` from ``itertools.combinations`` the columns
    of ``m1`` are renamed to ``mouse1_*`` and those of ``m2`` to
    ``mouse2_*``. A ``label`` column starts as ``'other'`` and is
    overwritten with the annotated ``action`` on frames
    ``start_frame..stop_frame`` for annotations between the two mice in
    either direction. Annotations whose ``stop_frame`` lies beyond the last
    tracked frame are ignored.

    NOTE: the per-pair tables are concatenated with ``ignore_index=True``,
    so the result has no frame index and pairs follow one another directly.
    Row-to-row features computed on the returned table span pair boundaries.

    Args:
        meta_row: mapping-like metadata row providing ``video_id``,
            ``lab_id`` and ``pix_per_cm_approx``.

    Returns:
        The concatenated DataFrame, or ``None`` when a file is missing, no
        pair could be built, or any error occurred (a ``RuntimeWarning``
        naming the video is emitted in the error case).
    """
    import warnings  # local import keeps this block self-contained

    video_id = None  # bound up-front so the except branch can always report it
    try:
        video_id = meta_row['video_id']
        lab_id = meta_row['lab_id']
        pix_per_cm = meta_row['pix_per_cm_approx'] if meta_row['pix_per_cm_approx'] > 0 else 1.0

        t_path = os.path.join(DATA_PATH, 'train_tracking', lab_id, f'{video_id}.parquet')
        a_path = os.path.join(DATA_PATH, 'train_annotation', lab_id, f'{video_id}.parquet')

        if not os.path.exists(t_path) or not os.path.exists(a_path):
            return None

        # Load & pivot long -> wide.
        df_track = pd.read_parquet(t_path)
        px = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='x')
        px.columns = [f"mouse{m}_{bp}_x" for m, bp in px.columns]
        py = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='y')
        py.columns = [f"mouse{m}_{bp}_y" for m, bp in py.columns]
        df_wide = pd.concat([px, py], axis=1).sort_index(axis=1)

        # Interpolate short gaps, zero-fill the rest, then normalise.
        df_wide = df_wide.interpolate(limit=5).fillna(0)
        df_wide = df_wide / pix_per_cm

        # Load annotations.
        df_annot = pd.read_parquet(a_path)

        # Mouse ids are recovered from the "mouse{id}_" column prefix.
        mouse_ids = sorted({
            int(c.split('_')[0].replace('mouse', ''))
            for c in df_wide.columns if 'mouse' in c
        })
        last_frame = df_wide.index.max()  # loop-invariant, computed once
        pairs_data = []

        for m1, m2 in itertools.combinations(mouse_ids, 2):
            prefix1, prefix2 = f'mouse{m1}_', f'mouse{m2}_'
            # Match on the prefix only, so a bodypart name can never be
            # mistaken for a mouse id.
            cols1 = [c for c in df_wide.columns if c.startswith(prefix1)]
            cols2 = [c for c in df_wide.columns if c.startswith(prefix2)]
            if not cols1 or not cols2:
                continue

            # Re-label the pair as mouse1/mouse2. Columns the feature
            # function needs may still be missing; it handles that case.
            rename_dict = {c: 'mouse1_' + c[len(prefix1):] for c in cols1}
            rename_dict.update({c: 'mouse2_' + c[len(prefix2):] for c in cols2})
            df_pair = df_wide[cols1 + cols2].rename(columns=rename_dict)

            df_pair['label'] = 'other'
            mask = ((df_annot['agent_id'] == m1) & (df_annot['target_id'] == m2)) | \
                   ((df_annot['agent_id'] == m2) & (df_annot['target_id'] == m1))

            for _, r in df_annot[mask].iterrows():
                if r['stop_frame'] <= last_frame:
                    # .loc slicing is label-based and inclusive on both ends.
                    df_pair.loc[r['start_frame']:r['stop_frame'], 'label'] = r['action']

            pairs_data.append(df_pair)

        if pairs_data:
            return pd.concat(pairs_data, ignore_index=True)
    except Exception as exc:
        # Best effort: a broken video is skipped, but no longer silently.
        warnings.warn(f"Skipping video {video_id}: {exc!r}", RuntimeWarning, stacklevel=2)
        return None

    return None

# --- 4. PREPARE THE TRAINING DATA ---
print("‚è≥ ƒêang t·∫°o d·ªØ li·ªáu 'All-Pairs' t·ª´ 50 video (Safe Mode)...")
try:
    df_meta = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
except Exception:
    # Best effort: an unreadable metadata file yields an empty table, which
    # leads to the "no data" message below. `Exception` (not a bare except)
    # lets KeyboardInterrupt and SystemExit propagate.
    df_meta = pd.DataFrame()

all_train_dfs = []
# tqdm shows progress over the videos.
for i in tqdm(range(min(NUM_VIDEOS_TRAIN, len(df_meta)))):
    df = load_train_data_all_pairs(df_meta.iloc[i])
    if df is not None:
        # Use the SAFE feature function rather than the plain one.
        df = calculate_features_safe(df)
        all_train_dfs.append(df[features + ['label']])

if all_train_dfs:
    df_train_big = pd.concat(all_train_dfs, ignore_index=True)
    del all_train_dfs
    gc.collect()

    print(f"‚úÖ D·ªØ li·ªáu th√¥: {df_train_big.shape}")

    # --- 5. BALANCE THE DATA ---
    print("‚öñÔ∏è ƒêang c√¢n b·∫±ng d·ªØ li·ªáu...")
    others = df_train_big[df_train_big['label'] == 'other']
    actions = df_train_big[df_train_big['label'] != 'other']

    # Keep as many 'other' rows as there are action rows. With no action
    # rows at all, fall back to 1000 'other' rows. Either way the request is
    # capped at len(others), because resample(replace=False) raises when
    # asked for more rows than exist.
    n_sample = len(actions) if len(actions) > 0 else 1000
    n_sample = min(n_sample, len(others))

    if n_sample > 0:
        others_sampled = resample(others, replace=False, n_samples=n_sample, random_state=42)
    else:
        # n_sample == 0 only when `others` is empty; nothing to sample.
        others_sampled = others

    df_train_bal = pd.concat([
        others_sampled,
        actions  # Keep every action row (actions are usually rarer than 'other')
    ])

    # If actions outnumber 'other' (rare) they could be resampled as well,
    # but keeping every rare behaviour takes priority here.

    print(f"‚úÖ D·ªØ li·ªáu train cu·ªëi c√πng: {df_train_bal.shape}")

    # Label encoding
    le = LabelEncoder()
    y_train = le.fit_transform(df_train_bal['label'])
    X_train = df_train_bal[features]

    # --- 6. TRAIN LIGHTGBM ---
    print("üöÄ ƒêang hu·∫•n luy·ªán LightGBM...")
    params = {
        'objective': 'multiclass',
        'num_class': len(le.classes_),
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'n_jobs': -1,
        'random_state': 42,
        'learning_rate': 0.05,
        'n_estimators': 500,
        'verbosity': -1
    }

    model = lgb.LGBMClassifier(**params)
    model.fit(X_train, y_train)
    print("üéâ Hu·∫•n luy·ªán xong!")

else:
    print("‚ùå Kh√¥ng load ƒë∆∞·ª£c d·ªØ li·ªáu n√†o. Ki·ªÉm tra l·∫°i ƒë∆∞·ªùng d·∫´n ho·∫∑c dataset.")

# --- 7. POST-PROCESSING FUNCTION ---
def run_length_encoding_pro(predictions, agent_id, target_id):
    """Collapse a per-frame label sequence into a list of action events.

    Consecutive equal labels form one run. Each run whose label is not
    ``'other'`` becomes a dict with the keys ``agent_id``, ``target_id``,
    ``action``, ``start_frame`` and ``stop_frame``. Both frame values are
    0-based positions within ``predictions``; ``stop_frame`` is the last
    position of the run (inclusive).

    Args:
        predictions: indexable sequence of labels, one per frame.
        agent_id: value copied into every event's ``agent_id``.
        target_id: value copied into every event's ``target_id``.

    Returns:
        List of event dicts in order of appearance; empty for empty input.
    """
    n_frames = len(predictions)
    if n_frames == 0:
        return []

    def _event(label, first, last):
        # One output record for a finished run.
        return {
            'agent_id': agent_id,
            'target_id': target_id,
            'action': label,
            'start_frame': first,
            'stop_frame': last,
        }

    runs = []
    run_label, run_start = predictions[0], 0

    pos = 1
    while pos < n_frames:
        label = predictions[pos]
        if label != run_label:
            # The label changed: close the run that just ended, unless it
            # was background.
            if run_label != 'other':
                runs.append(_event(run_label, run_start, pos - 1))
            run_label, run_start = label, pos
        pos += 1

    # The final run is still open once the sequence is exhausted.
    if run_label != 'other':
        runs.append(_event(run_label, run_start, n_frames - 1))
    return runs

# --- 8. CREATE THE SUBMISSION (SAFE MODE) ---
if 'model' in locals():  # Only run when training succeeded
    print("üìù ƒêang t·∫°o submission...")
    try:
        df_test_meta = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
    except Exception:
        # Best effort: no readable test metadata -> empty submission.
        df_test_meta = pd.DataFrame()

    submission_rows = []
    row_id_counter = 0

    for idx, row in tqdm(df_test_meta.iterrows(), total=len(df_test_meta)):
        # Reset per video so the error message below never shows a stale id.
        video_id = None
        try:
            video_id = row['video_id']
            lab_id = row['lab_id']
            pix_per_cm = row['pix_per_cm_approx'] if row['pix_per_cm_approx'] > 0 else 1.0

            t_path = os.path.join(DATA_PATH, 'test_tracking', lab_id, f'{video_id}.parquet')
            if not os.path.exists(t_path):
                continue

            # Same preprocessing as training: pivot, interpolate, normalise.
            df_track = pd.read_parquet(t_path)
            px = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='x')
            px.columns = [f"mouse{m}_{bp}_x" for m, bp in px.columns]
            py = df_track.pivot(index='video_frame', columns=['mouse_id', 'bodypart'], values='y')
            py.columns = [f"mouse{m}_{bp}_y" for m, bp in py.columns]
            df_wide = pd.concat([px, py], axis=1).sort_index(axis=1)

            df_wide = df_wide.interpolate(limit=5).fillna(0)
            df_wide = df_wide / pix_per_cm

            # Row position -> actual video_frame value. The run-length
            # encoder reports row positions, which match frame numbers only
            # when the index is exactly 0..N-1. Training labels are applied
            # by video_frame value, so events are reported the same way.
            frame_ids = df_wide.index.to_numpy()

            mouse_ids = sorted({
                int(c.split('_')[0].replace('mouse', ''))
                for c in df_wide.columns if 'mouse' in c
            })

            # Ordered pairs: each (agent, target) direction is predicted.
            for m1, m2 in itertools.permutations(mouse_ids, 2):
                cols1 = [c for c in df_wide.columns if f'mouse{m1}_' in c]
                cols2 = [c for c in df_wide.columns if f'mouse{m2}_' in c]
                if not cols1 or not cols2:
                    continue

                df_pair = df_wide[cols1 + cols2].copy()
                rename_dict = {}
                for c in cols1:
                    rename_dict[c] = c.replace(f'mouse{m1}_', 'mouse1_')
                for c in cols2:
                    rename_dict[c] = c.replace(f'mouse{m2}_', 'mouse2_')
                df_pair.rename(columns=rename_dict, inplace=True)

                # calculate_features_safe always creates every column in
                # `features`, so they can be selected directly.
                df_pair = calculate_features_safe(df_pair)
                X_test = df_pair[features]

                # Predict one label per frame and decode back to names.
                y_pred_idx = model.predict(X_test)
                y_pred_lbl = le.inverse_transform(y_pred_idx)

                events = run_length_encoding_pro(y_pred_lbl, m1, m2)

                # NOTE(review): stop_frame is the last frame of the run
                # (inclusive) -- confirm this matches the scoring convention.
                for event in events:
                    submission_rows.append({
                        'row_id': row_id_counter,
                        'video_id': video_id,
                        'agent_id': event['agent_id'],
                        'target_id': event['target_id'],
                        'action': event['action'],
                        'start_frame': frame_ids[event['start_frame']],
                        'stop_frame': frame_ids[event['stop_frame']]
                    })
                    row_id_counter += 1

            del df_wide
            gc.collect()
        except Exception as e:
            print(f"Error processing test video {video_id}: {e}")
            continue

    # Save the file (an empty submission still gets the expected header).
    sub = pd.DataFrame(submission_rows)
    if len(sub) == 0:
        sub = pd.DataFrame(columns=['row_id', 'video_id', 'agent_id', 'target_id', 'action', 'start_frame', 'stop_frame'])

    sub.to_csv('submission.csv', index=False)
    print(f"‚úÖ ƒê√£ t·∫°o submission.csv v·ªõi {len(sub)} d√≤ng.")
    display(sub.head())

‚è≥ ƒêang t·∫°o d·ªØ li·ªáu 'All-Pairs' t·ª´ 50 video (Safe Mode)...


  0%|          | 0/50 [00:00<?, ?it/s]

‚úÖ D·ªØ li·ªáu th√¥: (8829739, 7)
‚öñÔ∏è ƒêang c√¢n b·∫±ng d·ªØ li·ªáu...
‚úÖ D·ªØ li·ªáu train cu·ªëi c√πng: (418352, 7)
üöÄ ƒêang hu·∫•n luy·ªán LightGBM...
üéâ Hu·∫•n luy·ªán xong!
üìù ƒêang t·∫°o submission...


  0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ ƒê√£ t·∫°o submission.csv v·ªõi 6285 d√≤ng.


Unnamed: 0,row_id,video_id,agent_id,target_id,action,start_frame,stop_frame
0,0,438887472,1,2,attack,0,0
1,1,438887472,1,2,approach,133,133
2,2,438887472,1,2,avoid,138,138
3,3,438887472,1,2,avoid,140,150
4,4,438887472,1,2,avoid,152,152
