## Preprocess Doppler data from mmWave radar

#### Setup

In [1]:
import os
import sys
from glob import glob

import pandas as pd
import plotly.express as px
import polars as pl

sys.path.append("../")  # must run before the project-local `helper` imports below
from helper.gdrive_downloader import download_with_rclone
from helper.outlier_detector import RadarOutlierDetector
from helper.preprocess_utils import export_to_parquet, analyze_data_quality

#### Download data from Google Drive

In [2]:
# Google Drive folder used as the download source
remote_path = "https://drive.google.com/drive/folders/1fI7C13G-UNubbeyqzopRXs2d2cwGM0F5"
# Local data directory handed to the download and export helpers
data_raw = "../data"

# Download is currently disabled; uncomment the call below to fetch into `data_raw`
# download_with_rclone(remote_path, data_raw)

#### Analyze Doppler data

1. Load the data as dataframes and sort by datetime
2. Identify: Duplicates, missing values, & anomalies
3. Question: What is the distribution of `rangeIdx`? It shows how far away the objects in the dataset are and reveals any bias in distance
    - Same for `dopplerIdx`
4. Distribution of number of objects in the scenes: plot in bins of 5
5. Distribution of `peakVal` in the dataset
6. What is the position of objects in the scene?
7. Distribution and bias in orientation of objects in the dataset
8. Distribution and bias in activity and activity class of objects in the dataset

In [None]:
# Export the data under `data_raw` to parquet.
# NOTE(review): presumably this produces the parquet file(s) read in the next
# cell (e.g. micro_df.parquet) — confirm against helper.preprocess_utils.
export_to_parquet(data_raw)

In [3]:
# Load the micro-activity parquet file as a polars DataFrame.
# The path is built from `data_raw` (set in the download cell) so that the data
# directory is configured in exactly one place.
micro_df = pl.read_parquet(os.path.join(data_raw, "micro_df.parquet"))

In [4]:
# Preview the first rows to check the column names and dtypes
micro_df.head()

datetime,rangeIdx,dopplerIdx,numDetectedObj,range,peakVal,x_coord,y_coord,doppz,activity,activity_class
datetime[ns],list[i64],list[i64],f64,list[f64],list[i64],list[f64],list[f64],list[list[i64]],i64,str
2023-03-06 22:58:59,"[1, 10, … 63]","[0, 0, … 0]",6.0,"[0.12932, 1.248754, … 7.875]","[6142, 3312, … 1959]","[-0.117188, 0.234375, … 0.0]","[0.0546875, 1.2265625, … 7.875]","[[18095, 16591, … 17600], [18842, 18620, … 17509], … [18550, 18433, … 18326]]",16,"""micro"""
2023-03-06 22:59:00,"[1, 10, … 63]","[0, 0, … 0]",6.0,"[0.12932, 1.249047, … 7.878131]","[6143, 3391, … 1962]","[-0.117188, 0.2734375, … -7.140625]","[0.0546875, 1.21875, … 3.328125]","[[19575, 19419, … 18241], [18494, 18615, … 17190], … [19129, 19013, … 17823]]",16,"""micro"""
2023-03-06 22:59:00,"[1, 42, … 63]","[0, 0, … 0]",7.0,"[0.12932, 5.252098, … 7.878131]","[6140, 1215, … 1960]","[-0.117188, -0.65625, … -7.140625]","[0.0546875, 5.2109375, … 3.328125]","[[15937, 17874, … 16997], [17924, 17291, … 16826], … [18101, 18402, … 17850]]",16,"""micro"""
2023-03-06 22:59:01,"[1, 42, … 63]","[0, 0, … 0]",7.0,"[0.12932, 5.252098, … 7.878131]","[6145, 1136, … 1961]","[-0.117188, -0.65625, … -7.140625]","[0.0546875, 5.2109375, … 3.328125]","[[18342, 17252, … 17881], [18869, 17568, … 18350], … [18495, 18155, … 18521]]",16,"""micro"""
2023-03-06 22:59:01,"[1, 42, … 63]","[0, 0, … 0]",9.0,"[0.12932, 5.252098, … 7.878131]","[6140, 1149, … 1958]","[-0.117188, -0.65625, … -7.140625]","[0.0546875, 5.2109375, … 3.328125]","[[18114, 19102, … 18196], [18289, 19046, … 18045], … [18701, 19002, … 17740]]",16,"""micro"""


#### Identify the data quality issues
1. Handle empty rows
2. Handle duplicate rows
3. Handle rows with zero objects
4. Handle rows with duplicate objects

In [5]:
# Print a data-quality report for the frame; the recorded output covers shape,
# duplicate rows, per-column missing values and empty rows
analyze_data_quality(micro_df, name="Micro Activity Dataset")

Analyzing Micro Activity Dataset
[Info] Shape: (25256, 11)
[Info] Duplicate Rows: 0
[Info] Missing Values per column:
shape: (1, 11)
┌──────────┬──────────┬────────────┬───────────────┬───┬─────────┬───────┬──────────┬──────────────┐
│ datetime ┆ rangeIdx ┆ dopplerIdx ┆ numDetectedOb ┆ … ┆ y_coord ┆ doppz ┆ activity ┆ activity_cla │
│ ---      ┆ ---      ┆ ---        ┆ j             ┆   ┆ ---     ┆ ---   ┆ ---      ┆ ss           │
│ u32      ┆ u32      ┆ u32        ┆ ---           ┆   ┆ u32     ┆ u32   ┆ u32      ┆ ---          │
│          ┆          ┆            ┆ u32           ┆   ┆         ┆       ┆          ┆ u32          │
╞══════════╪══════════╪════════════╪═══════════════╪═══╪═════════╪═══════╪══════════╪══════════════╡
│ 0        ┆ 109      ┆ 109        ┆ 109           ┆ … ┆ 109     ┆ 0     ┆ 0        ┆ 0            │
└──────────┴──────────┴────────────┴───────────────┴───┴─────────┴───────┴──────────┴──────────────┘
[Info] No empty rows found in the DataFrame.
[Info] No rows

In [6]:
# Count the frames whose timestamp is missing
null_times = micro_df["datetime"].null_count()

if null_times:
    print(f"Warning: {null_times} rows have corrupted timestamps.")
    # [Note] A time-series frame without a timestamp is of little use, so drop it
    micro_df = micro_df.drop_nulls(subset=["datetime"])

[TO-DO] Drop rows with zero objects.
Since no rows with zero objects were found, this step is skipped.

#### Identify outliers in the dataset

In [14]:
# Flag anomalous frames with the project's RadarOutlierDetector, which is
# imported in the setup cell together with the other `helper` modules.
# The recorded run logs an Isolation Forest being trained on all 25256 frames.
detector = RadarOutlierDetector(micro_df)

# contamination=0.01 flagged 253 frames (1.00%) in the recorded run;
# the value can also be set to "auto".
# NOTE(review): no random seed is set anywhere in this notebook — if the
# detector exposes one, pin it so the flagged frames are reproducible.
outlier_df = detector.detect_frame_anomalies(contamination=0.01)

# Plot the distribution of the anomaly scores
detector.show_distribution_score().show()

[Info] Training Isolation Forest (n=25256)
[Info] Detection Complete
[Info] Identified 253 anomalous frames (1.00%)


In [15]:
# Visualize the detector results as a scatter plot: number of detected objects
# on the x-axis against mean intensity on the y-axis.
# NOTE(review): `mean_intensity` is not a column of micro_df; presumably the
# detector derives it per frame — confirm in helper.outlier_detector.
detector.show_anomalies_scatter(x_axis="numDetectedObj", y_axis="mean_intensity")