In [15]:
import os
import cv2
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from pathlib import Path  
from scipy import stats


In [2]:
def get_segment_parameters(segment_name):
    """Parse a segment folder name into its duration and overlap flag.

    The name is split on underscores and the second token is read as an
    integer duration, e.g. ``"segmented_10_overlap"`` -> ``10``. The overlap
    flag is true whenever the substring ``"overlap"`` occurs anywhere in
    the name.

    Args:
        segment_name: Folder name such as ``"segmented_5"`` or
            ``"segmented_15_overlap"``.

    Returns:
        tuple: ``(duration, overlap)`` as ``(int, bool)``.

    Raises:
        IndexError: If the name contains no underscore.
        ValueError: If the second token is not an integer literal.
    """
    duration_token = segment_name.split('_')[1]
    return int(duration_token), 'overlap' in segment_name

In [3]:
def load_frames(video_path):
    """Decode every frame of a video and return them as grayscale images.

    Args:
        video_path: Location of the video file; it is converted with
            ``str()`` before being handed to ``cv2.VideoCapture``.

    Returns:
        list: One grayscale image per decoded frame, in read order. The
        list is empty when the very first read reports failure.
    """
    cap = cv2.VideoCapture(str(video_path))
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames (or nothing could be read at all).
                break
            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY))
    finally:
        # Release the capture handle even when decoding/conversion raises,
        # so a bad clip does not leak it for the rest of a long batch run.
        cap.release()
    return frames

In [4]:
def calculate_motion_features(frames, window_seconds, fps):
    """Summarise the motion in a frame sequence as a flat dict of statistics.

    For every frame ``i`` dense Farneback optical flow is computed from the
    reference frame ``max(0, i - int(window_seconds * fps))`` to frame ``i``.
    The per-pixel flow magnitudes and orientations of all frames are pooled,
    and the mean absolute difference between consecutive frames is recorded
    as a "temporal gradient".

    Args:
        frames: Sequence of images to analyse. Assumed to be equally sized
            single-channel arrays such as ``load_frames`` returns - TODO
            confirm against callers.
        window_seconds: Look-back window used to pick the flow reference
            frame.
        fps: Frame rate used to convert ``window_seconds`` to a frame count.

    Returns:
        dict: ``flow_mean``, ``flow_std``, ``flow_max`` (pooled flow
        magnitude), ``orientation_mean``, ``orientation_std`` (circular
        statistics of the pooled flow direction), ``temporal_grad_mean``,
        ``temporal_grad_std``, ``temporal_grad_max`` (frame-to-frame mean
        absolute difference), ``dominant_freq``, ``flow_entropy`` (entropy of
        a 20-bin magnitude histogram) and ``motion_consistency``
        (``flow_std / (flow_mean + 1e-6)``). Every value is ``0`` when
        ``frames`` is empty.
    """
    feature_names = (
        'flow_mean', 'flow_std', 'flow_max',
        'orientation_mean', 'orientation_std',
        'temporal_grad_mean', 'temporal_grad_std', 'temporal_grad_max',
        'dominant_freq', 'flow_entropy', 'motion_consistency',
    )
    if len(frames) == 0:
        # Nothing to measure: return neutral features instead of failing.
        return {name: 0 for name in feature_names}

    window_frames = int(window_seconds * fps)
    # One flat array per frame, concatenated once at the end. Extending a
    # Python list with every pixel value creates one Python object per pixel
    # and is dramatically slower and more memory hungry for the same result.
    magnitude_chunks = []
    orientation_chunks = []
    temporal_gradients = []

    for i in range(len(frames)):
        # Optical flow features
        ref_idx = max(0, i - window_frames)
        # Positional arguments after ``None`` (per the OpenCV signature):
        # pyr_scale=0.5, levels=3, winsize=15, iterations=3, poly_n=5,
        # poly_sigma=1.2, flags=0.
        flow = cv2.calcOpticalFlowFarneback(
            frames[ref_idx], frames[i], None, 0.5, 3, 15, 3, 5, 1.2, 0
        )
        magnitude = np.sqrt(flow[..., 0]**2 + flow[..., 1]**2)
        orientation = np.arctan2(flow[..., 1], flow[..., 0])

        magnitude_chunks.append(magnitude.ravel())
        orientation_chunks.append(orientation.ravel())

        # Temporal gradient features
        if i > 0:
            diff = cv2.absdiff(frames[i], frames[i-1])
            temporal_gradients.append(np.mean(diff))

    flow_magnitudes = np.concatenate(magnitude_chunks)
    flow_orientations = np.concatenate(orientation_chunks)
    has_flow = flow_magnitudes.size > 0

    # Optical flow statistics
    flow_mean = np.nanmean(flow_magnitudes) if has_flow else 0
    flow_std = np.nanstd(flow_magnitudes) if has_flow else 0

    # Orientation circular statistics
    orientation_sin = np.mean(np.sin(flow_orientations))
    orientation_cos = np.mean(np.cos(flow_orientations))
    orientation_mean = np.arctan2(orientation_sin, orientation_cos)
    # The mean resultant length lies in [0, 1] mathematically, but rounding
    # can push it slightly above 1 (sqrt of a negative number -> NaN) and
    # perfectly cancelling directions give 0 (log(0) -> inf). Clamp it so
    # the circular standard deviation is always finite.
    resultant_length = np.clip(
        np.hypot(orientation_sin, orientation_cos), 1e-12, 1.0
    )
    orientation_std = np.sqrt(-2 * np.log(resultant_length))

    # Temporal gradient statistics
    grad_mean = np.mean(temporal_gradients) if temporal_gradients else 0
    grad_std = np.std(temporal_gradients) if temporal_gradients else 0

    # Frequency analysis
    # NOTE(review): the spectrum is taken over the pooled per-pixel sequence
    # (all pixels of frame 0, then frame 1, ...) and ``fftfreq`` is called
    # without a sample spacing, so the result is in cycles per sample rather
    # than Hz. Kept as-is to leave existing feature values unchanged -
    # confirm this is the intended "dominant frequency".
    if len(flow_magnitudes) > 1:
        fft = np.fft.fft(flow_magnitudes)
        fft_freq = np.fft.fftfreq(len(fft))
        # Skip bin 0 (the DC component) when looking for the peak.
        dominant_freq = np.abs(fft_freq[np.argmax(np.abs(fft[1:])) + 1])
    else:
        dominant_freq = 0

    return {
        # Basic flow features
        'flow_mean': flow_mean,
        'flow_std': flow_std,
        'flow_max': np.max(flow_magnitudes) if has_flow else 0,

        # Orientation features
        'orientation_mean': orientation_mean,
        'orientation_std': orientation_std,

        # Temporal dynamics
        'temporal_grad_mean': grad_mean,
        'temporal_grad_std': grad_std,
        'temporal_grad_max': np.max(temporal_gradients) if temporal_gradients else 0,

        # Frequency analysis
        'dominant_freq': dominant_freq,

        # Motion complexity
        'flow_entropy': stats.entropy(np.histogram(flow_magnitudes, bins=20)[0])
                        if has_flow else 0,
        'motion_consistency': flow_std / (flow_mean + 1e-6)
    }

In [5]:
def extract_segment_features(video_path, segment_name):
    """Build the feature record for one segmented video clip.

    Args:
        video_path: Location of the clip, as a ``str`` or ``Path``. The
            grandparent directory name is recorded as the ``state`` label.
        segment_name: Name of the segment folder; parsed by
            ``get_segment_parameters`` into duration and overlap flag.

    Returns:
        dict | None: Metadata (``video_name``, ``state``, ``view``,
        ``segment_duration``, ``is_overlap``, ``total_frames``,
        ``actual_fps``) merged with the motion features, or ``None`` when
        no frame could be loaded from the clip.

    Raises:
        ZeroDivisionError: If the parsed segment duration is 0.
    """
    # Normalise once so both ``str`` and ``Path`` inputs work below.
    video_path = Path(video_path)

    # Get segment parameters
    duration_seconds, is_overlap = get_segment_parameters(segment_name)

    # Load video data
    frames = load_frames(video_path)
    if not frames:
        return None

    # Frame rate implied by the frame count and the nominal segment duration
    # (not read from the container metadata).
    fps = len(frames) / duration_seconds

    # NOTE(review): the window passed here is the full segment duration, so
    # window_seconds * fps is about len(frames) and the optical-flow
    # reference index never leaves frame 0 - confirm this is intended.
    motion_features = calculate_motion_features(frames, duration_seconds, fps)

    # Create feature dictionary
    return {
        'video_name': video_path.name,
        'state': video_path.parent.parent.name,
        # Any file name containing "angle" is an angle view; all others
        # are treated as front view.
        'view': 'angle' if 'angle' in video_path.name.lower() else 'front',
        'segment_duration': duration_seconds,
        'is_overlap': is_overlap,
        'total_frames': len(frames),
        'actual_fps': fps,
        **motion_features
    }


In [6]:
def process_all_segments(base_dir, output_path):
    """Extract features for every ``*.avi`` clip under ``base_dir`` and save them.

    The expected layout is ``base_dir/<state>/<segment>/<clip>.avi``.
    Non-directory entries at the state and segment levels are ignored.
    Directories and clips are visited in sorted order so the row order of
    the resulting CSV is reproducible across runs and filesystems.

    Args:
        base_dir: Root directory containing one sub-directory per state.
        output_path: Destination of the combined feature CSV. Nothing is
            written when no features could be extracted.

    Returns:
        None. A status line is printed in either case.
    """
    base_path = Path(base_dir)
    all_features = []

    # ``iterdir``/``glob`` yield entries in arbitrary order; sort them.
    for state_dir in sorted(base_path.iterdir()):
        if not state_dir.is_dir():
            continue

        # Process each segment type
        for segment_dir in sorted(state_dir.iterdir()):
            if not segment_dir.is_dir():
                continue

            segment_name = segment_dir.name
            video_files = sorted(segment_dir.glob('*.avi'))

            # Process videos with progress bar
            for video_path in tqdm(video_files,
                                   desc=f"{state_dir.name} - {segment_name}"):
                features = extract_segment_features(video_path, segment_name)
                # Clips that yielded no frames come back as None; skip them.
                if features:
                    all_features.append(features)

    # Create DataFrame and save
    if all_features:
        df = pd.DataFrame(all_features)
        df.to_csv(output_path, index=False)
        print(f"✅ Features saved to {output_path}")
    else:
        print("⚠️ No features extracted - check input data")

if __name__ == "__main__":
    # Input root (one sub-directory per state) and destination CSV.
    BASE_DIR = "evm_segmented_videos"
    OUTPUT_CSV = "optical_flow_features.csv"

    # Walk the whole directory tree and write the combined feature table.
    process_all_segments(base_dir=BASE_DIR, output_path=OUTPUT_CSV)

Bearing_fault - segmented_10: 100%|██████████| 18/18 [02:16<00:00,  7.56s/it]
Bearing_fault - segmented_10_overlap: 100%|██████████| 34/34 [04:36<00:00,  8.13s/it]
Bearing_fault - segmented_15: 100%|██████████| 12/12 [02:23<00:00, 11.99s/it]
Bearing_fault - segmented_15_overlap: 100%|██████████| 22/22 [05:41<00:00, 15.54s/it]
Bearing_fault - segmented_5: 100%|██████████| 36/36 [03:03<00:00,  5.10s/it]
Bearing_fault - segmented_5_overlap: 100%|██████████| 72/72 [06:27<00:00,  5.38s/it]
Normal_state - segmented_10: 100%|██████████| 18/18 [03:11<00:00, 10.61s/it]
Normal_state - segmented_10_overlap: 100%|██████████| 34/34 [06:14<00:00, 11.00s/it]
Normal_state - segmented_15: 100%|██████████| 12/12 [02:43<00:00, 13.64s/it]
Normal_state - segmented_15_overlap: 100%|██████████| 22/22 [04:25<00:00, 12.05s/it]
Normal_state - segmented_5: 100%|██████████| 36/36 [02:20<00:00,  3.90s/it]
Normal_state - segmented_5_overlap: 100%|██████████| 72/72 [04:42<00:00,  3.92s/it]
Unbalance_weight - segment

✅ Features saved to optical_flow_features.csv


In [18]:
def create_segmented_datasets(input_csv='optical_flow_features.csv',
                              output_dir='final_dataset_csv'):
    """Split the combined feature table into one CSV per segment configuration.

    Rows are selected by their ``segment_duration`` and ``is_overlap``
    columns for the six configurations 5/10/15 seconds, each with and
    without overlap. A file is written for every configuration, even when
    no row matches it.

    Args:
        input_csv: Path of the combined feature CSV to read.
        output_dir: Directory that receives ``5s.csv``, ``5s_overlap.csv``,
            ``10s.csv``, ``10s_overlap.csv``, ``15s.csv`` and
            ``15s_overlap.csv``; created if it does not exist.

    Returns:
        None. One summary line is printed per file written.
    """
    df = pd.read_csv(input_csv)
    os.makedirs(output_dir, exist_ok=True)

    # (output file stem, segment_duration value, is_overlap value)
    segments = [
        ('5s', 5, False),
        ('5s_overlap', 5, True),
        ('10s', 10, False),
        ('10s_overlap', 10, True),
        ('15s', 15, False),
        ('15s_overlap', 15, True)
    ]

    for name, duration, overlap in segments:
        mask = (df['segment_duration'] == duration) & (df['is_overlap'] == overlap)
        df_seg = df[mask]
        df_seg.to_csv(os.path.join(output_dir, f'{name}.csv'), index=False)
        print(f"Created {name}.csv with {len(df_seg)} samples")

# Entry point for this cell: split the combined feature CSV into the
# per-duration / per-overlap CSV files using the default paths.
if __name__ == "__main__":
    create_segmented_datasets()

Created 5s.csv with 108 samples
Created 5s_overlap.csv with 216 samples
Created 10s.csv with 54 samples
Created 10s_overlap.csv with 102 samples
Created 15s.csv with 36 samples
Created 15s_overlap.csv with 66 samples
