In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
import os
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
import polars as pl
from scipy.stats import iqr, entropy

# Functions

In [2]:
def simple_stat(df, target_col=None):
    """Build a table of descriptive statistics, one row per target column.

    Parameters
    ----------
    df : polars.DataFrame
        Frame holding the columns to summarise.
    target_col : sequence of str, optional
        Names of the columns to summarise. When omitted, the columns found at
        positions 6-9 and 12-13 of ``df`` are used, so the result then depends
        on the column order of the input file.

    Returns
    -------
    polars.DataFrame
        Columns ``Column``, ``Mean``, ``Std Deviation``, ``25th Quantile``,
        ``Median``, ``75th Quantile``, ``IQR``, ``Skewness``, ``Kurtosis`` and
        ``Differential Entropy``; one row per summarised column.
    """
    if target_col is None:
        target_col = df.columns[6:10] + df.columns[12:14]
    target_col = list(target_col)

    # Output label -> function computing that statistic from one column.
    # The insertion order fixes the column order of the returned frame.
    summaries = {
        'Mean': lambda s: s.mean(),
        'Std Deviation': lambda s: s.std(),
        '25th Quantile': lambda s: s.quantile(0.25),
        'Median': lambda s: s.median(),
        '75th Quantile': lambda s: s.quantile(0.75),
        'IQR': iqr,
        'Skewness': lambda s: s.skew(),
        'Kurtosis': lambda s: s.kurtosis(),
        # NOTE(review): scipy.stats.entropy normalises the values to sum to 1
        # and returns the Shannon entropy of that vector; it is not an
        # estimator of differential entropy despite the label used here.
        # scipy.stats.differential_entropy exists for that purpose -- confirm
        # which quantity is intended before relying on this column.
        'Differential Entropy': entropy,
    }

    stats_cal = {'Column': target_col}
    for label, summarise in summaries.items():
        stats_cal[label] = [summarise(df[column]) for column in target_col]

    return pl.DataFrame(stats_cal)

In [3]:
def average_stat(df, target_col=None):
    """Average each target column per ``PAXDAYWM`` and add the spread of those averages.

    Parameters
    ----------
    df : polars.DataFrame
        Frame containing a ``PAXDAYWM`` column and the target columns.
    target_col : sequence of str, optional
        Names of the columns to average. When omitted, the columns found at
        positions 6-9 and 12-13 of ``df`` are used.

    Returns
    -------
    polars.DataFrame
        One row per distinct ``PAXDAYWM`` value, sorted by it, with a
        ``<col>_Mean`` column per target column followed by a
        ``<col>_Std_across_day`` column per target column. Each
        ``_Std_across_day`` value is the standard deviation of the
        corresponding ``_Mean`` column over all rows of the result, so it is
        the same on every row.
    """
    if target_col is None:
        target_col = df.columns[6:10] + df.columns[12:14]

    daily_means = df.group_by('PAXDAYWM').agg(
        [pl.col(c).mean().alias(f"{c}_Mean") for c in target_col]
    ).sort('PAXDAYWM')

    # An aggregating expression inside with_columns is broadcast to every row,
    # which replaces the former repeat_by/explode/hstack sequence. Building the
    # alias from the source name (instead of str.replace on "Mean") keeps
    # column names that themselves contain "Mean" intact.
    return daily_means.with_columns(
        [pl.col(f"{c}_Mean").std().alias(f"{c}_Std_across_day") for c in target_col]
    )

In [4]:
def act_lv(df, sedentary_max=15.9, light_max=19.6):
    """Average minutes per day spent in each activity level, per ``PAXDAYWM``.

    Every row is labelled from its ``PAXMTSM`` value:

    * below ``sedentary_max``                    -> "Sedentary_Activity (minutes)"
    * from ``sedentary_max`` through ``light_max`` -> "Light_Activity (minutes)"
    * anything else                              -> "Moderate_Vigorous_Activity (minutes)"

    ``PAXTSM`` is then summed per (``PAXDAYWM``, ``PAXDAYM``, level) and divided
    by 60, those per-day totals are averaged over the ``PAXDAYM`` values sharing
    a ``PAXDAYWM``, and the result is pivoted so each level becomes a column.

    Parameters
    ----------
    df : polars.DataFrame
        Frame with ``PAXMTSM``, ``PAXTSM``, ``PAXDAYWM`` and ``PAXDAYM`` columns.
    sedentary_max : float, optional
        Values strictly below this are labelled sedentary (default 15.9).
    light_max : float, optional
        Upper bound, inclusive, of the light-activity label (default 19.6).

    Returns
    -------
    polars.DataFrame
        One row per ``PAXDAYWM`` and one column per activity level that occurs
        in ``df``.
    """
    mims = pl.col("PAXMTSM")

    # The labels are wrapped in pl.lit because current polars parses a bare
    # string passed to then()/otherwise() as a column name, not as a literal.
    # NOTE(review): rows whose PAXMTSM is null satisfy neither condition and
    # therefore presumably receive the "otherwise" label -- confirm intended.
    labelled = df.with_columns(
        pl.when(mims < sedentary_max)
        .then(pl.lit("Sedentary_Activity (minutes)"))
        .when((mims >= sedentary_max) & (mims <= light_max))
        .then(pl.lit("Light_Activity (minutes)"))
        .otherwise(pl.lit("Moderate_Vigorous_Activity (minutes)"))
        .alias("ActivityLevel")
    )

    # Total per individual day of wear and level (PAXTSM sum / 60).
    minutes_per_day = labelled.group_by(['PAXDAYWM', 'PAXDAYM', 'ActivityLevel']).agg(
        (pl.col('PAXTSM').sum() / 60).alias('minutes_per_day')
    )

    # Average those daily totals over the days sharing a PAXDAYWM value, then
    # spread the levels into columns. Each (PAXDAYWM, ActivityLevel) pair is
    # unique at this point, so the pivot's "sum" never combines several values.
    return (
        minutes_per_day
        .group_by(['PAXDAYWM', 'ActivityLevel'])
        .agg(pl.col('minutes_per_day').mean())
        .sort('PAXDAYWM')
        .pivot(values="minutes_per_day", index="PAXDAYWM", columns="ActivityLevel", aggregate_function="sum")
    )

## Variable explanations

In [5]:
# https://wwwn.cdc.gov/Nchs/Nhanes/2011-2012/PAXMIN_G.htm#Analytic_Notes

# PAXDAYM  - x-th day of wear
# PAXDAYWM - day of the week (Sun/Mon/Tue/...; exact coding unconfirmed, see the link above)
# PAXSSNMP - starting data point number for the minute summary record (from the 80 Hz sampled data)
# PAXTSM   - total seconds in this minute
# PAXAISMM - sleep
# PAXPREDM - sleep/wake/non-wear
# PAXTRANM - sleep/wake/non-wear
# PAXMTSM  - activity
# PAXMXM   - activity
# PAXMYM   - activity
# PAXMZM   - activity
# PAXLXMM  - ambient light
# PAXLXSDM - ambient light

# Main

In [6]:
# Directory holding the CSV files to process (the trailing separator is kept
# as part of the string).
csv_path = r"A:\fyp\output\\"

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order, and therefore which file is handled last, reproducible between runs.
files = sorted(os.listdir(csv_path))
csv_files = [f for f in files if f.endswith('.csv')]

In [7]:
# Extract the features of every CSV file and keep them keyed by SEQN, so the
# results of earlier files are not thrown away when the loop moves on.
features_by_seqn = {}

for csv_file in csv_files:
    df = pl.read_csv(os.path.join(csv_path, csv_file))
    # Extract Features for each df
    # Read SEQN from the first row so that a one-row file is still accepted.
    seqn = df['SEQN'][0]
    simple_stat_df = simple_stat(df)
    average_df = average_stat(df)
    act_lv_df = act_lv(df)
    final_df = average_df.join(act_lv_df, on='PAXDAYWM')
    features_by_seqn[seqn] = (simple_stat_df, final_df)

# Preview only the last processed file to keep the cell output short; the
# other results remain available in features_by_seqn.
if features_by_seqn:
    print(f"SEQN: {seqn}")
    print("")
    display(simple_stat_df)
    print("")
    display(final_df)
else:
    print(f"No CSV files found in {csv_path}")

SEQN: 83724.0



Column,Mean,Std Deviation,25th Quantile,Median,75th Quantile,IQR,Skewness,Kurtosis,Differential Entropy
str,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""PAXMTSM""",7.203038,9.210318,0.0,2.676,12.734,12.734,1.378154,1.689278,8.559735
"""PAXMXM""",2.337442,3.000021,0.0,0.937,4.094,4.094,1.677668,4.5117,8.572121
"""PAXMYM""",2.209647,3.06661,0.0,0.635,3.677,3.677,2.131912,10.066621,8.498193
"""PAXMZM""",2.655942,3.432947,0.0,0.942,4.584,4.584,1.385171,1.758142,8.54484
"""PAXLXMM""",83.029432,270.30274,0.0,0.0,33.75,33.75,5.18161,30.551231,7.381256
"""PAXLXSDM""",46.493177,143.302641,0.0,0.0,23.17,23.17,4.683475,23.409549,7.478779





PAXDAYWM,PAXMTSM_Mean,PAXMXM_Mean,PAXMYM_Mean,PAXMZM_Mean,PAXLXMM_Mean,PAXLXSDM_Mean,PAXMTSM_Std_across_day,PAXMXM_Std_across_day,PAXMYM_Std_across_day,PAXMZM_Std_across_day,PAXLXMM_Std_across_day,PAXLXSDM_Std_across_day,Sedentary_Activity (minutes),Light_Activity (minutes),Moderate_Vigorous_Activity (minutes)
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
1,7.041083,2.259839,2.116431,2.664817,73.456507,47.639868,0.73227,0.220978,0.240379,0.275125,32.260094,16.391827,1192.0,85.0,163.0
2,7.715472,2.495824,2.389937,2.829713,117.034785,62.94725,0.73227,0.220978,0.240379,0.275125,32.260094,16.391827,1151.0,99.0,190.0
3,7.491888,2.447647,2.309921,2.734319,20.866021,14.155736,0.73227,0.220978,0.240379,0.275125,32.260094,16.391827,1142.0,82.0,216.0
4,8.169442,2.626548,2.544589,2.998291,84.76465,55.941732,0.73227,0.220978,0.240379,0.275125,32.260094,16.391827,713.0,75.5,126.5
5,5.944157,1.964738,1.819807,2.159602,109.711857,48.456751,0.73227,0.220978,0.240379,0.275125,32.260094,16.391827,1054.483333,63.0,132.0
6,7.773369,2.501676,2.373227,2.898456,92.072403,56.236562,0.73227,0.220978,0.240379,0.275125,32.260094,16.391827,1106.0,112.0,222.0
7,6.949924,2.261616,2.109596,2.578694,63.207146,36.071313,0.73227,0.220978,0.240379,0.275125,32.260094,16.391827,1168.0,90.0,182.0
