In [6]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import logging

class DataPreprocessor:
    def __init__(self, data_paths, log_path="data_cleaning.log"):
        self.data_paths = data_paths  # List of directories to process

        # Set up logging
        logging.basicConfig(filename=log_path, level=logging.DEBUG,
                            format='%(asctime)s - %(levelname)s - %(message)s')
        logging.info("DataPreprocessor initialized")

    def check_data_quality(self, df, file_name):
        """Check quality of skeleton data with detailed logging."""
        quality_pass = True
        
        # Check for NaN values
        if df.isna().any().any():
            nan_rows = df[df.isna().any(axis=1)]
            logging.warning(f"{file_name} failed quality check due to NaN values in rows {nan_rows.index.tolist()}")
            quality_pass = False
        
        # Check for zero values in multiple consecutive frames
        zero_frames = (df == 0).all(axis=1)
        consecutive_zeros = zero_frames.rolling(window=5, min_periods=5).sum() > 0
        if consecutive_zeros.any():
            zero_rows = consecutive_zeros[consecutive_zeros].index.tolist()
            logging.warning(f"{file_name} failed quality check due to consecutive zero frames in rows {zero_rows}")
            quality_pass = False
            
        # Check sudden large jumps in joint positions
        diffs = np.abs(df.diff())
        large_jump_rows = diffs[(diffs > 1000).any(axis=1)].index.tolist()
        if large_jump_rows:
            logging.warning(f"{file_name} failed quality check due to unrealistic movement in rows {large_jump_rows}")
            quality_pass = False
            
        # Check if total frames are too short
        if len(df) < 50:
            logging.warning(f"{file_name} failed quality check due to insufficient frames ({len(df)} frames)")
            quality_pass = False
            
        return quality_pass

    def process_data_files(self):
        """Process all data files and identify good quality samples with detailed logging."""
        good_files = []
        bad_files = []
        
        for data_path in self.data_paths:
            all_files = list(Path(data_path).glob("*.csv"))
            
            for file in all_files:
                try:
                    df = pd.read_csv(file, header=None)
                    hip_indices = [12, 13]  # Use specific joint indices (e.g., hip) for quality analysis
                    hip_motion = df.iloc[:, np.array([j for i in hip_indices for j in range(i*3, (i+1)*3)])]
                    
                    if self.check_data_quality(hip_motion, file):
                        good_files.append(str(file))
                        logging.info(f"{file} passed quality check")
                    else:
                        bad_files.append(str(file))
                        logging.info(f"{file} failed quality check")
                        
                except Exception as e:
                    logging.error(f"Error processing {file}: {e}")
                    bad_files.append(str(file))

        return good_files, bad_files

    def get_data_stats(self):
        """Get statistics about data quality with detailed logging."""
        good_files, bad_files = self.process_data_files()
        
        # Overall statistics
        stats = {
            "total_files": len(good_files) + len(bad_files),
            "good_quality": len(good_files),
            "bad_quality": len(bad_files),
            "quality_ratio": len(good_files) / (len(good_files) + len(bad_files)) if (len(good_files) + len(bad_files)) > 0 else 0
        }
        
        logging.info(f"Overall Statistics: Total files: {stats['total_files']}, Good quality: {stats['good_quality']}, "
                     f"Bad quality: {stats['bad_quality']}, Quality ratio: {stats['quality_ratio']:.2%}")
        
        return stats

    def create_clean_dataset(self):
        """Create clean datasets for each path's cleaned data in separate directories."""
        good_files, _ = self.process_data_files()
        
        for file in good_files:
            file_name = os.path.basename(file)
            df = pd.read_csv(file, header=None)
            
            # Determine the output path based on the source path
            if "data/smartfallmm/old/sk" in file:
                output_path = "data/smartfallmm/old/skeleton"
            elif "data/smartfallmm/young/sk" in file:
                output_path = "data/smartfallmm/young/skeleton"
            else:
                logging.warning(f"{file} does not match any predefined path.")
                continue

            # Ensure the directory exists
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            
            # Save the file in the appropriate directory
            output_file = os.path.join(output_path, file_name)
            df.to_csv(output_file, index=False, header=False)
            logging.info(f"File {file} copied to {output_file}")
        
        logging.info("Clean datasets created in separate directories for each path")


# Console logging at DEBUG level for this run.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Directories whose CSV files will be screened.
paths_to_clean = [
    "data/smartfallmm/old/sk",
    "data/smartfallmm/young/sk",
]

# Build the preprocessor for those directories.
preprocessor = DataPreprocessor(paths_to_clean, log_path="data_cleaning.log")

logging.debug("Starting data processing with detailed debugging...")

# Quality scan: collect the summary, then report it.
stats = preprocessor.get_data_stats()
logging.debug("Completed quality analysis. Overall statistics retrieved.")

print("\nOverall Statistics:")
print(
    f"Total files: {stats['total_files']}",
    f"Good quality: {stats['good_quality']}",
    f"Bad quality: {stats['bad_quality']}",
    f"Quality ratio: {stats['quality_ratio']:.2%}",
    sep="\n",
)

# Write the files that passed into their output directories.
preprocessor.create_clean_dataset()
logging.debug("Clean dataset creation completed.")


2024-11-11 01:57:12,488 - INFO - DataPreprocessor initialized
2024-11-11 01:57:12,490 - DEBUG - Starting data processing with detailed debugging...
2024-11-11 01:57:12,504 - INFO - data/smartfallmm/old/sk/S02A06T02.csv passed quality check
2024-11-11 01:57:12,516 - INFO - data/smartfallmm/old/sk/S03A02T04.csv failed quality check
2024-11-11 01:57:12,526 - INFO - data/smartfallmm/old/sk/S19A05T05.csv failed quality check
2024-11-11 01:57:12,533 - INFO - data/smartfallmm/old/sk/S05A02T04.csv passed quality check
2024-11-11 01:57:12,543 - INFO - data/smartfallmm/old/sk/S21A01T04.csv passed quality check
2024-11-11 01:57:12,557 - INFO - data/smartfallmm/old/sk/S16A04T01.csv passed quality check
2024-11-11 01:57:12,577 - INFO - data/smartfallmm/old/sk/S04A03T05.csv failed quality check
2024-11-11 01:57:12,587 - INFO - data/smartfallmm/old/sk/S05A06T04.csv passed quality check
2024-11-11 01:57:12,600 - INFO - data/smartfallmm/old/sk/S07A02T03.csv passed quality check
2024-11-11 01:57:12,617 


Overall Statistics:
Total files: 1344
Good quality: 1199
Bad quality: 145
Quality ratio: 89.21%


2024-11-11 01:57:27,421 - INFO - data/smartfallmm/old/sk/S22A04T01.csv passed quality check
2024-11-11 01:57:27,429 - INFO - data/smartfallmm/old/sk/S18A04T02.csv passed quality check
2024-11-11 01:57:27,436 - INFO - data/smartfallmm/old/sk/S16A02T04.csv passed quality check
2024-11-11 01:57:27,446 - INFO - data/smartfallmm/old/sk/S10A06T05.csv passed quality check
2024-11-11 01:57:27,461 - INFO - data/smartfallmm/old/sk/S11A03T04.csv failed quality check
2024-11-11 01:57:27,470 - INFO - data/smartfallmm/old/sk/S21A07T02.csv passed quality check
2024-11-11 01:57:27,478 - INFO - data/smartfallmm/old/sk/S05A07T02.csv passed quality check
2024-11-11 01:57:27,487 - INFO - data/smartfallmm/old/sk/S16A01T04.csv passed quality check
2024-11-11 01:57:27,495 - INFO - data/smartfallmm/old/sk/S15A01T05.csv passed quality check
2024-11-11 01:57:27,506 - INFO - data/smartfallmm/old/sk/S26A07T02.csv passed quality check
2024-11-11 01:57:27,521 - INFO - data/smartfallmm/old/sk/S07A03T05.csv passed qu

In [4]:
import logging

# Driver for a per-fall-type quality report.
# NOTE(review): the DataPreprocessor definitions visible in this file expose
# get_data_stats() (keys: total_files, good_quality, bad_quality,
# quality_ratio) and no get_fall_stats(); this cell presumably targets an
# earlier version of the class -- confirm before re-running.

# Set up debugging level logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# Define paths to data directories
paths_to_clean = ["data/old/sk", "data/young/sk"]

# Initialize the DataPreprocessor
preprocessor = DataPreprocessor(data_paths=paths_to_clean, log_path="data_cleaning.log")

# Run the cleaning process and store results in designated directories
logging.debug("Starting fall data processing with detailed debugging...")

# Get statistics and print output
# Expects a pair: overall stats dict and a dict keyed by fall type.
stats, fall_type_stats = preprocessor.get_fall_stats()
logging.debug("Completed quality analysis. Overall statistics and per-fall-type breakdown.")

print("\nOverall Statistics:")
# NOTE(review): the recorded run of this cell stopped with
# KeyError: 'total_falls' -- the returned stats dict did not carry that key.
print(f"Total falls: {stats['total_falls']}")
print(f"Good quality: {stats['good_quality']}")
print(f"Bad quality: {stats['bad_quality']}")
print(f"Quality ratio: {stats['quality_ratio']:.2%}")

print("\nStatistics by fall type:")
# Each per-type entry is read through the keys 'good', 'bad' and 'ratio'.
for fall_type, type_stats in fall_type_stats.items():
    print(f"\n{fall_type.upper()}:")
    print(f"Good samples: {type_stats['good']}")
    print(f"Bad samples: {type_stats['bad']}")
    print(f"Quality ratio: {type_stats['ratio']:.2%}")

# Create clean datasets and log actions
preprocessor.create_clean_dataset()
logging.debug("Clean dataset creation completed.")


2024-11-11 01:52:30,039 - INFO - DataPreprocessor initialized
2024-11-11 01:52:30,042 - DEBUG - Starting fall data processing with detailed debugging...
2024-11-11 01:52:30,074 - INFO - data/old/sk/S02A06T02.csv passed quality check
2024-11-11 01:52:30,101 - INFO - data/old/sk/S03A02T04.csv failed quality check
2024-11-11 01:52:30,124 - INFO - data/old/sk/S19A05T05.csv failed quality check
2024-11-11 01:52:30,142 - INFO - data/old/sk/S05A02T04.csv passed quality check
2024-11-11 01:52:30,157 - INFO - data/old/sk/S21A01T04.csv passed quality check
2024-11-11 01:52:30,187 - INFO - data/old/sk/S16A04T01.csv passed quality check
2024-11-11 01:52:30,217 - INFO - data/old/sk/S04A03T05.csv failed quality check
2024-11-11 01:52:30,234 - INFO - data/old/sk/S05A06T04.csv passed quality check
2024-11-11 01:52:30,253 - INFO - data/old/sk/S07A02T03.csv passed quality check
2024-11-11 01:52:30,270 - INFO - data/old/sk/S06A06T01.csv passed quality check
2024-11-11 01:52:30,284 - INFO - data/old/sk/S0


Overall Statistics:


KeyError: 'total_falls'

In [5]:
import numpy as np
import pandas as pd
from pathlib import Path
import os
import logging

class DataPreprocessor:
    def __init__(self, data_paths, log_path="data_cleaning.log"):
        self.data_paths = data_paths  # List of directories to process

        # Set up logging
        logging.basicConfig(filename=log_path, level=logging.DEBUG,
                            format='%(asctime)s - %(levelname)s - %(message)s')
        logging.info("DataPreprocessor initialized")

    def check_data_quality(self, df, file_name):
        """Check quality of skeleton data with detailed logging."""
        quality_pass = True
        
        # Check for NaN values
        if df.isna().any().any():
            nan_rows = df[df.isna().any(axis=1)]
            logging.warning(f"{file_name} failed quality check due to NaN values in rows {nan_rows.index.tolist()}")
            quality_pass = False
        
        # Check for zero values in multiple consecutive frames
        zero_frames = (df == 0).all(axis=1)
        consecutive_zeros = zero_frames.rolling(window=5, min_periods=5).sum() > 0
        if consecutive_zeros.any():
            zero_rows = consecutive_zeros[consecutive_zeros].index.tolist()
            logging.warning(f"{file_name} failed quality check due to consecutive zero frames in rows {zero_rows}")
            quality_pass = False
            
        # Check sudden large jumps in joint positions
        diffs = np.abs(df.diff())
        large_jump_rows = diffs[(diffs > 1000).any(axis=1)].index.tolist()
        if large_jump_rows:
            logging.warning(f"{file_name} failed quality check due to unrealistic movement in rows {large_jump_rows}")
            quality_pass = False
            
        # Check if total frames are too short
        if len(df) < 50:
            logging.warning(f"{file_name} failed quality check due to insufficient frames ({len(df)} frames)")
            quality_pass = False
            
        return quality_pass

    def process_data_files(self):
        """Process all data files and identify good quality samples with detailed logging."""
        good_files = []
        bad_files = []
        
        for data_path in self.data_paths:
            all_files = list(Path(data_path).glob("*.csv"))
            
            for file in all_files:
                try:
                    df = pd.read_csv(file, header=None)
                    hip_indices = [12, 13]  # Use specific joint indices (e.g., hip) for quality analysis
                    hip_motion = df.iloc[:, np.array([j for i in hip_indices for j in range(i*3, (i+1)*3)])]
                    
                    if self.check_data_quality(hip_motion, file):
                        good_files.append(str(file))
                        logging.info(f"{file} passed quality check")
                    else:
                        bad_files.append(str(file))
                        logging.info(f"{file} failed quality check")
                        
                except Exception as e:
                    logging.error(f"Error processing {file}: {e}")
                    bad_files.append(str(file))

        return good_files, bad_files

    def get_data_stats(self):
        """Get statistics about data quality with detailed logging."""
        good_files, bad_files = self.process_data_files()
        
        # Overall statistics
        stats = {
            "total_files": len(good_files) + len(bad_files),
            "good_quality": len(good_files),
            "bad_quality": len(bad_files),
            "quality_ratio": len(good_files) / (len(good_files) + len(bad_files)) if (len(good_files) + len(bad_files)) > 0 else 0
        }
        
        logging.info(f"Overall Statistics: Total files: {stats['total_files']}, Good quality: {stats['good_quality']}, "
                     f"Bad quality: {stats['bad_quality']}, Quality ratio: {stats['quality_ratio']:.2%}")
        
        return stats

    def create_clean_dataset(self):
        """Create clean datasets for each path's cleaned data in separate directories."""
        good_files, _ = self.process_data_files()
        
        for file in good_files:
            file_name = os.path.basename(file)
            df = pd.read_csv(file, header=None)
            
            # Determine the output path based on the source path
            if "data/old/sk" in file:
                output_path = "data/old/skeletonClean"
            elif "data/young/sk" in file:
                output_path = "data/young/skeletonClean"
            else:
                logging.warning(f"{file} does not match any predefined path.")
                continue

            # Ensure the directory exists
            if not os.path.exists(output_path):
                os.makedirs(output_path)
            
            # Save the file in the appropriate directory
            output_file = os.path.join(output_path, file_name)
            df.to_csv(output_file, index=False, header=False)
            logging.info(f"File {file} copied to {output_file}")
        
        logging.info("Clean datasets created in separate directories for each path")


# Console logging at DEBUG level for this run.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Directories whose CSV files will be screened.
paths_to_clean = [
    "data/old/sk",
    "data/young/sk",
]

# Build the preprocessor for those directories.
preprocessor = DataPreprocessor(paths_to_clean, log_path="data_cleaning.log")

logging.debug("Starting data processing with detailed debugging...")

# Quality scan: collect the summary, then report it.
stats = preprocessor.get_data_stats()
logging.debug("Completed quality analysis. Overall statistics retrieved.")

print("\nOverall Statistics:")
print(
    f"Total files: {stats['total_files']}",
    f"Good quality: {stats['good_quality']}",
    f"Bad quality: {stats['bad_quality']}",
    f"Quality ratio: {stats['quality_ratio']:.2%}",
    sep="\n",
)

# Write the files that passed into their output directories.
preprocessor.create_clean_dataset()
logging.debug("Clean dataset creation completed.")


2024-11-11 01:53:44,629 - INFO - DataPreprocessor initialized
2024-11-11 01:53:44,631 - DEBUG - Starting data processing with detailed debugging...
2024-11-11 01:53:44,662 - INFO - data/old/sk/S02A06T02.csv passed quality check
2024-11-11 01:53:44,686 - INFO - data/old/sk/S03A02T04.csv failed quality check
2024-11-11 01:53:44,708 - INFO - data/old/sk/S19A05T05.csv failed quality check
2024-11-11 01:53:44,729 - INFO - data/old/sk/S05A02T04.csv passed quality check
2024-11-11 01:53:44,745 - INFO - data/old/sk/S21A01T04.csv passed quality check
2024-11-11 01:53:44,761 - INFO - data/old/sk/S16A04T01.csv passed quality check
2024-11-11 01:53:44,777 - INFO - data/old/sk/S04A03T05.csv failed quality check
2024-11-11 01:53:44,792 - INFO - data/old/sk/S05A06T04.csv passed quality check
2024-11-11 01:53:44,804 - INFO - data/old/sk/S07A02T03.csv passed quality check
2024-11-11 01:53:44,816 - INFO - data/old/sk/S06A06T01.csv passed quality check
2024-11-11 01:53:44,829 - INFO - data/old/sk/S07A06T


Overall Statistics:
Total files: 1344
Good quality: 1199
Bad quality: 145
Quality ratio: 89.21%


2024-11-11 01:54:01,679 - INFO - data/old/sk/S18A04T02.csv passed quality check
2024-11-11 01:54:01,687 - INFO - data/old/sk/S16A02T04.csv passed quality check
2024-11-11 01:54:01,699 - INFO - data/old/sk/S10A06T05.csv passed quality check
2024-11-11 01:54:01,715 - INFO - data/old/sk/S11A03T04.csv failed quality check
2024-11-11 01:54:01,724 - INFO - data/old/sk/S21A07T02.csv passed quality check
2024-11-11 01:54:01,734 - INFO - data/old/sk/S05A07T02.csv passed quality check
2024-11-11 01:54:01,741 - INFO - data/old/sk/S16A01T04.csv passed quality check
2024-11-11 01:54:01,749 - INFO - data/old/sk/S15A01T05.csv passed quality check
2024-11-11 01:54:01,758 - INFO - data/old/sk/S26A07T02.csv passed quality check
2024-11-11 01:54:01,769 - INFO - data/old/sk/S07A03T05.csv passed quality check
2024-11-11 01:54:01,784 - INFO - data/old/sk/S07A08T03.csv failed quality check
2024-11-11 01:54:01,795 - INFO - data/old/sk/S15A04T04.csv passed quality check
2024-11-11 01:54:01,807 - INFO - data/ol