##  Environment and Path Setup

In [1]:
import os
import sys
import pickle
import cv2
import numpy as np
import xml.etree.ElementTree as ET
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

# --- Path Configuration ---
# The base path now includes the 'JAAD' subfolder.
BASE_INPUT_PATH = '/kaggle/input/jaad-with-annotations/JAAD'
OUTPUT_DATA_PATH = '/kaggle/working/jaad_output'

# Create the output directory if it doesn't exist
if not os.path.exists(OUTPUT_DATA_PATH):
    print(f"Creating output directory at: {OUTPUT_DATA_PATH}")
    os.makedirs(OUTPUT_DATA_PATH)

print("Paths configured:")
print(f"Base Input Path: {BASE_INPUT_PATH}")
print(f"Output Path: {OUTPUT_DATA_PATH}")

Creating output directory at: /kaggle/working/jaad_output
Paths configured:
Base Input Path: /kaggle/input/jaad-with-annotations/JAAD
Output Path: /kaggle/working/jaad_output


## File Mapping

In [2]:
# --- Define sub-directory paths (UPDATED for multi-level annotations) ---
VIDEOS_PATH = os.path.join(BASE_INPUT_PATH, 'JAAD clips')

# Create a dictionary to hold the paths for all annotation types
ANNOTATION_PATHS = {
    'primary':    os.path.join(BASE_INPUT_PATH, 'JAAD annotations', 'annotations'),
    'appearance': os.path.join(BASE_INPUT_PATH, 'JAAD annotations', 'annotations_appearance'),
    'attributes': os.path.join(BASE_INPUT_PATH, 'JAAD annotations', 'annotations_attributes'),
    'traffic':    os.path.join(BASE_INPUT_PATH, 'JAAD annotations', 'annotations_traffic'),
    'vehicle':    os.path.join(BASE_INPUT_PATH, 'JAAD annotations', 'annotations_vehicle'),
}

print("--- Paths Configuration ---")
print(f"Videos Path: {VIDEOS_PATH}")
for key, path in ANNOTATION_PATHS.items():
    print(f"Annotations ({key}): {path}")

# --- List all video and annotation files ---
video_files = sorted([f for f in os.listdir(VIDEOS_PATH) if f.endswith('.mp4')])
print(f"\nFound {len(video_files)} video files.")

# Create a dictionary of file lists for faster lookups.
annotation_file_lists = {}
for key, path in ANNOTATION_PATHS.items():
    if os.path.exists(path):
        annotation_file_lists[key] = set(os.listdir(path))
        print(f"Found {len(annotation_file_lists[key])} '{key}' annotation files.")
    else:
        print(f"Warning: Directory not found for '{key}' annotations: {path}")
        annotation_file_lists[key] = set()

# --- Create the multi-level mapping ---
file_mapping = {}

# Iterate through the video files
for video_filename in video_files:
    base_name = os.path.splitext(video_filename)[0]
    current_video_map = {
        'video_path': os.path.join(VIDEOS_PATH, video_filename),
        'annotations': {}
    }
    
    primary_annot_file = f"{base_name}.xml"
    if 'primary' in annotation_file_lists and primary_annot_file in annotation_file_lists['primary']:
        current_video_map['annotations']['primary'] = os.path.join(ANNOTATION_PATHS['primary'], primary_annot_file)

    for key in ['appearance', 'attributes', 'traffic', 'vehicle']:
        expected_annot_file = f"{base_name}_{key}.xml"
        if key in annotation_file_lists and expected_annot_file in annotation_file_lists[key]:
            current_video_map['annotations'][key] = os.path.join(ANNOTATION_PATHS[key], expected_annot_file)
            
    file_mapping[base_name] = current_video_map

print(f"\nSuccessfully created a mapping for {len(file_mapping)} video clips.")
print("\n--- Example of Mapped Files ---")
example_key = list(file_mapping.keys())[0]
print(f"ID: {example_key}")
print(f"  -> Video Path: {file_mapping[example_key]['video_path']}")
for key, path in file_mapping[example_key]['annotations'].items():
    print(f"    - {key}: {path}")

--- Paths Configuration ---
Videos Path: /kaggle/input/jaad-with-annotations/JAAD/JAAD clips
Annotations (primary): /kaggle/input/jaad-with-annotations/JAAD/JAAD annotations/annotations
Annotations (appearance): /kaggle/input/jaad-with-annotations/JAAD/JAAD annotations/annotations_appearance
Annotations (attributes): /kaggle/input/jaad-with-annotations/JAAD/JAAD annotations/annotations_attributes
Annotations (traffic): /kaggle/input/jaad-with-annotations/JAAD/JAAD annotations/annotations_traffic
Annotations (vehicle): /kaggle/input/jaad-with-annotations/JAAD/JAAD annotations/annotations_vehicle

Found 346 video files.
Found 346 'primary' annotation files.
Found 346 'appearance' annotation files.
Found 346 'attributes' annotation files.
Found 346 'traffic' annotation files.
Found 346 'vehicle' annotation files.

Successfully created a mapping for 346 video clips.

--- Example of Mapped Files ---
ID: video_0001
  -> Video Path: /kaggle/input/jaad-with-annotations/JAAD/JAAD clips/video_00

## Data Parsing

In [3]:
def parse_jaad_xml(xml_path):
    """
    Parses a JAAD primary annotation XML file to extract pedestrian data.
    
    Args:
        xml_path (str): The file path to the XML annotation.

    Returns:
        dict: A dictionary where keys are pedestrian IDs and values are another
              dictionary mapping frame numbers to their attributes (bbox, behavior).
    """
    if not os.path.exists(xml_path):
        print(f"Warning: XML path does not exist: {xml_path}")
        return {}
        
    try:
        tree = ET.parse(xml_path)
    except ET.ParseError:
        print(f"Warning: Could not parse XML file: {xml_path}")
        return {}
        
    root = tree.getroot()
    
    pedestrians_data = {}
    
    # Find all 'track' elements with label 'pedestrian'
    for track in root.findall(".//track[@label='pedestrian']"):
        ped_id = track.get('id')
        pedestrians_data[ped_id] = {}
        
        # Iterate over each frame's bounding box for this pedestrian
        for box in track.findall('box'):
            frame_num = int(box.get('frame'))
            
            # Bounding box coordinates
            bbox = {
                'xtl': float(box.get('xtl')),
                'ytl': float(box.get('ytl')),
                'xbr': float(box.get('xbr')),
                'ybr': float(box.get('ybr')),
            }
            
            # --- FIX STARTS HERE ---
            behavior = {}
            # First, find the parent 'behavior' attribute element
            behavior_element = box.find('attribute[@name="behavior"]')
            
            # CRITICAL: Check if this element exists before trying to iterate its children
            if behavior_element is not None:
                # If it exists, now we can safely loop through its children
                for beh in behavior_element:
                    behavior[beh.get('name')] = beh.text
            # --- FIX ENDS HERE ---

            pedestrians_data[ped_id][frame_num] = {
                'bbox': bbox,
                'behavior': behavior # This will be an empty dict {} if no behavior tag was found
            }
            
    return pedestrians_data

# --- Example of parsing a single file ---
example_video_id = 'video_0001'
# Ensure the key exists before accessing it
if example_video_id in file_mapping and 'primary' in file_mapping[example_video_id]['annotations']:
    example_xml_path = file_mapping[example_video_id]['annotations']['primary']
    parsed_data_example = parse_jaad_xml(example_xml_path)

    # Print a snippet of the parsed data for one pedestrian
    if parsed_data_example:
        example_ped_id = list(parsed_data_example.keys())[0]
        print(f"--- Parsed Data Example for Video '{example_video_id}', Pedestrian '{example_ped_id}' ---")
        # Check if frames exist for the pedestrian
        if parsed_data_example[example_ped_id]:
            example_frame_1 = parsed_data_example[example_ped_id].get(1, "No data for frame 1")
            example_frame_2 = parsed_data_example[example_ped_id].get(2, "No data for frame 2")
            print(f"Frame 1 data: {example_frame_1}")
            print(f"Frame 2 data: {example_frame_2}")
        else:
            print(f"No frames found for pedestrian {example_ped_id}")
    else:
        print(f"No pedestrian data parsed for video {example_video_id}")
else:
    print(f"Primary annotation for '{example_video_id}' not found in file mapping.")

--- Parsed Data Example for Video 'video_0001', Pedestrian 'None' ---
Frame 1 data: {'bbox': {'xtl': 1402.0, 'ytl': 655.0, 'xbr': 1490.0, 'ybr': 894.0}, 'behavior': {}}
Frame 2 data: {'bbox': {'xtl': 1406.0, 'ytl': 656.0, 'xbr': 1493.0, 'ybr': 897.0}, 'behavior': {}}


## Data Acquisition and Splitting

In [4]:
all_videos_data = {}

print("Parsing annotations for all videos...")
# Using tqdm for a progress bar
for video_id, paths in tqdm(file_mapping.items(), desc="Processing Videos"):
    if 'primary' in paths['annotations']:
        xml_path = paths['annotations']['primary']
        all_videos_data[video_id] = parse_jaad_xml(xml_path)

print(f"\nSuccessfully parsed data for {len(all_videos_data)} videos.")

# --- Split video IDs into train, validation, and test sets ---
video_ids = list(all_videos_data.keys())
train_val_ids, test_ids = train_test_split(video_ids, test_size=0.15, random_state=42)
train_ids, val_ids = train_test_split(train_val_ids, test_size=0.15, random_state=42) # 0.15 * 0.85 = ~13%

print(f"\nDataset Split:")
print(f"Total Videos: {len(video_ids)}")
print(f"Training set: {len(train_ids)} videos")
print(f"Validation set: {len(val_ids)} videos")
print(f"Test set: {len(test_ids)} videos")

# Store the splits for later use
dataset_splits = {
    'train': train_ids,
    'val': val_ids,
    'test': test_ids
}

Parsing annotations for all videos...


Processing Videos:   0%|          | 0/346 [00:00<?, ?it/s]


Successfully parsed data for 346 videos.

Dataset Split:
Total Videos: 346
Training set: 249 videos
Validation set: 45 videos
Test set: 52 videos


In [5]:
!pip install -q ultralytics torch torchvision numpy opencv-python tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m91.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m66.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m

In [16]:
import torch
import torch.nn as nn
import numpy as np
import cv2
from ultralytics import YOLO
from tqdm.notebook import tqdm

# --- 1. Motion Activity: Transformer Architecture ---
class PoseTransformer(nn.Module):
    def __init__(self, num_keypoints=17, embed_dim=64, nhead=4, num_layers=2, num_classes=4):
        super().__init__()
        self.input_projection = nn.Linear(num_keypoints * 2, embed_dim)
        self.pos_encoder = nn.Parameter(torch.randn(1, 50, embed_dim))
        encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=nhead, dim_feedforward=128, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        B, T, F = x.shape
        x = self.input_projection(x) + self.pos_encoder[:, :T, :]
        x = self.transformer(x)
        output = self.classifier(x[:, -1, :])
        return output

# --- Helper for Loading Models ---
def load_models():
    pose_model = YOLO('yolov8n-pose.pt')
    motion_model = PoseTransformer()
    motion_model.eval()
    return pose_model, motion_model

# --- Main Feature Extractor Class with FIX ---
class JAADFeatureExtractor:
    def __init__(self):
        print("Loading extraction models...")
        # Removed seg_model as it wasn't used in the final implementation
        self.pose_model, self.motion_model = load_models()
        print("Models loaded.")
        self.focal_length_px = 1000
        self.avg_human_height_m = 1.7

    def get_pose_keypoints(self, img, bbox):
        x1, y1, x2, y2 = map(int, [bbox['xtl'], bbox['ytl'], bbox['xbr'], bbox['ybr']])
        h_img, w_img = img.shape[:2]
        x1, y1 = max(0, x1), max(0, y1)
        x2, y2 = min(w_img, x2), min(h_img, y2)
        if x2 - x1 < 10 or y2 - y1 < 10: return None
        crop = img[y1:y2, x1:x2]
        results = self.pose_model(crop, verbose=False)
        if results[0].keypoints is not None and results[0].keypoints.xy.shape[0] > 0:
            kpts = results[0].keypoints.xy[0].cpu().numpy()
            kpts[:, 0] += x1; kpts[:, 1] += y1
            return kpts
        return None

    def extract_motion_activity(self, pose_sequence):
        if len(pose_sequence) < 2: return "undefined (insufficient frames)"
        seq_tensor = torch.tensor(np.array(pose_sequence)).float().flatten(start_dim=1).unsqueeze(0)
        with torch.no_grad():
            logits = self.motion_model(seq_tensor)
            action_idx = torch.argmax(logits, dim=1).item()
        actions = {0: 'standing', 1: 'walking', 2: 'starting_to_cross', 3: 'crossing'}
        return actions.get(action_idx, 'unknown')

    # --- CORRECTED METHOD SIGNATURE ---
    def extract_proximity_to_road(self, img_shape, bbox):
        h, w = img_shape[:2]
        road_mask = np.zeros((h, w), dtype=np.uint8); road_mask[int(h*0.7):, :] = 1
        ped_x, ped_y = int((bbox['xtl'] + bbox['xbr']) / 2), int(bbox['ybr'])
        ped_y, ped_x = min(ped_y, h - 1), min(ped_x, w - 1)
        if road_mask[ped_y, ped_x] > 0: return 0.0, True
        dist_matrix = cv2.distanceTransform(1 - road_mask, cv2.DIST_L2, 5)
        distance_px = dist_matrix[ped_y, ped_x]
        return float(distance_px), bool(distance_px < 50)

    def extract_distance(self, bbox):
        h_img = bbox['ybr'] - bbox['ytl']
        if h_img <= 0: return None
        return (self.focal_length_px * self.avg_human_height_m) / h_img

    def extract_orientation(self, kpts):
        if kpts is None or np.all(kpts[5] == 0) or np.all(kpts[6] == 0): return None
        shoulder_vec = kpts[5] - kpts[6]
        return np.degrees(np.arctan2(shoulder_vec[1], shoulder_vec[0]))

    def extract_gaze(self, kpts):
        if kpts is None or np.all(kpts[0] == 0) or (np.all(kpts[1] == 0) and np.all(kpts[2] == 0)): return None
        eye_mid = (kpts[1] + kpts[2]) / 2.0
        gaze_vec = kpts[0] - eye_mid
        gaze_angle = np.degrees(np.arctan2(gaze_vec[1], gaze_vec[0]))
        return {'vector': gaze_vec.tolist(), 'angle_2d': gaze_angle}

In [17]:
# Initialize the extractor once. This will load the new class definition from Cell 1.
extractor = JAADFeatureExtractor()

# This dictionary will store all results
all_videos_features = {}

# Main loop to iterate over every video in the dataset
for vid_id in tqdm(all_videos_data.keys(), desc="Processing All Videos"):
    video_path = file_mapping[vid_id]['video_path']
    ped_data_for_video = all_videos_data[vid_id]
    
    if not ped_data_for_video:
        continue

    features_for_current_video = {}
    cap = cv2.VideoCapture(video_path)
    
    # Inner loop for every pedestrian in the current video
    for ped_id, frames_data in ped_data_for_video.items():
        pedestrian_features = {}
        pose_history = []
        
        for frame_num in sorted(frames_data.keys()):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_num - 1)
            ret, frame = cap.read()
            if not ret: break
            
            bbox = frames_data[frame_num]['bbox']
            kpts = extractor.get_pose_keypoints(frame, bbox)
            
            if kpts is not None:
                pose_history.append(kpts)
            if len(pose_history) > 16:
                pose_history.pop(0)

            # --- CORRECTED AND EFFICIENT CALL ---
            # Call the function once and store its two return values
            prox_dist, is_near = extractor.extract_proximity_to_road(frame.shape, bbox)

            pedestrian_features[frame_num] = {
                'F1_Motion': extractor.extract_motion_activity(pose_history),
                'F2_Proximity_px': prox_dist,
                'F2_IsNearRoad': is_near,
                'F3_Distance_m': extractor.extract_distance(bbox),
                'F4_Orientation_deg': extractor.extract_orientation(kpts),
                'F5_Gaze': extractor.extract_gaze(kpts)
            }
            
        features_for_current_video[ped_id] = pedestrian_features

    cap.release()
    all_videos_features[vid_id] = features_for_current_video

print("\n--- Full Dataset Extraction Complete ---")
print(f"Successfully processed {len(all_videos_features)} videos.")

# Display a sample of the extracted data
if all_videos_features:
    sample_vid_id = list(all_videos_features.keys())[0]
    if all_videos_features[sample_vid_id]:
        sample_ped_id = list(all_videos_features[sample_vid_id].keys())[0]
        if all_videos_features[sample_vid_id][sample_ped_id]:
            sample_frame_num = list(all_videos_features[sample_vid_id][sample_ped_id].keys())[0]
            print("\n--- Example of Extracted Data Structure ---")
            print(f"Video ID: {sample_vid_id}")
            print(f"  -> Pedestrian ID: {sample_ped_id}")
            print(f"    -> Frame Number: {sample_frame_num}")
            print(f"      -> Features: {all_videos_features[sample_vid_id][sample_ped_id][sample_frame_num]}")

Loading extraction models...
Models loaded.


Processing All Videos:   0%|          | 0/346 [00:00<?, ?it/s]


--- Full Dataset Extraction Complete ---
Successfully processed 320 videos.

--- Example of Extracted Data Structure ---
Video ID: video_0001
  -> Pedestrian ID: None
    -> Frame Number: 0
      -> Features: {'F1_Motion': 'undefined (insufficient frames)', 'F2_Proximity_px': 0.0, 'F2_IsNearRoad': True, 'F3_Distance_m': 7.142857142857143, 'F4_Orientation_deg': -0.99080575, 'F5_Gaze': {'vector': [1.058837890625, 3.96221923828125], 'angle_2d': 75.03826}}


In [18]:
import pickle
import json
import os

# Define the output path in the Kaggle working directory
output_dir = "/kaggle/working/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# --- Option 1: Save as a Pickle file (Recommended for Python) ---
# Pickle is fast, efficient, and preserves Python data types perfectly.
# It's the ideal choice if you plan to load this data back into another Python script.
try:
    pickle_path = os.path.join(output_dir, "jaad_extracted_features.pkl")
    print(f"\n--- Saving data to Pickle file ---")
    print(f"File path: {pickle_path}")

    with open(pickle_path, 'wb') as f:
        # 'wb' stands for 'write binary', which is required for pickle
        pickle.dump(all_videos_features, f)

    print("✅ Successfully saved features to Pickle file.")
    file_size_mb = os.path.getsize(pickle_path) / (1024 * 1024)
    print(f"File size: {file_size_mb:.2f} MB")

except Exception as e:
    print(f"❌ Error saving to Pickle: {e}")


# --- Option 2: Save as a JSON file (For readability or use with other languages) ---
# JSON is a human-readable text format. It's great for inspecting the data manually
# or sharing it with applications written in other languages (e.g., JavaScript).
# Note: This can be much slower and result in a larger file than pickle for large datasets.

# Uncomment the following lines to also save as JSON
# try:
#     json_path = os.path.join(output_dir, "jaad_extracted_features.json")
#     print(f"\n--- Saving data to JSON file ---")
#     print(f"File path: {json_path}")
#
#     with open(json_path, 'w') as f:
#         # 'w' stands for 'write text'
#         # indent=4 makes the file readable but increases its size
#         json.dump(all_videos_features, f, indent=4)
#
#     print("✅ Successfully saved features to JSON file.")
#     file_size_mb = os.path.getsize(json_path) / (1024 * 1024)
#     print(f"File size: {file_size_mb:.2f} MB")
#
# except Exception as e:
#     print(f"❌ Error saving to JSON: {e}")


--- Saving data to Pickle file ---
File path: /kaggle/working/jaad_extracted_features.pkl
✅ Successfully saved features to Pickle file.
File size: 4.91 MB
