In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import numpy as np
from sklearn.utils import resample
import os

In [2]:
# Function to convert nanoseconds to HH:MM:SS
def nanoseconds_to_time(nanoseconds):
    """
    Format a nanosecond count as a zero-padded 'HH:MM:SS' string.

    The value is split into whole hours, whole minutes and whole seconds;
    any sub-second remainder is discarded.
    """
    ns_per_second = 10**9
    ns_per_minute = 60 * ns_per_second
    ns_per_hour = 60 * ns_per_minute

    # Peel off the largest unit first, carrying the remainder down.
    hours, leftover = divmod(nanoseconds, ns_per_hour)
    minutes, leftover = divmod(leftover, ns_per_minute)
    seconds = leftover // ns_per_second

    return "{:02}:{:02}:{:02}".format(int(hours), int(minutes), int(seconds))

In [3]:
#Define a function to segment the day
def segment_day(hour):
    """
    Map an hour of the day to a coarse day segment.

    Returns 'morning' for 5 <= hour < 12, 'afternoon' for 12 <= hour < 17,
    'evening' for 17 <= hour < 21, and 'night' for every other value.
    """
    # (inclusive lower bound, exclusive upper bound, label)
    daytime_windows = (
        (5, 12, 'morning'),
        (12, 17, 'afternoon'),
        (17, 21, 'evening'),
    )
    for lower, upper, label in daytime_windows:
        if lower <= hour < upper:
            return label
    # Anything outside the windows above falls through to night.
    return 'night'

In [4]:
def preprocess_data(df):
    """
    Add clock-time and day-segment columns to `df` and return it.

    Writes two columns into the DataFrame that was passed in:
      - 'time_of_day_hms': 'time_of_day' rendered through nanoseconds_to_time
        and parsed with the '%H:%M:%S' format.
      - 'day_segment': the label segment_day gives to the hour of that time.
    """
    clock_text = df['time_of_day'].apply(nanoseconds_to_time)
    df['time_of_day_hms'] = pd.to_datetime(clock_text, format='%H:%M:%S')

    hour_of_day = df['time_of_day_hms'].dt.hour
    df['day_segment'] = hour_of_day.apply(segment_day)
    return df
    

In [32]:
def compute_global_stats(df_id):
    """
    Compute mean, standard deviation and median of 'enmo', 'anglez' and
    'light' over the rows where 'non-wear_flag' equals 0.

    Returns a dict keyed '<prefix>_mean', '<prefix>_std', '<prefix>_median',
    where the prefix is 'accNorm' for 'enmo', and the column name itself
    for 'anglez' and 'light'.
    """
    # Keep only the rows recorded while the watch was worn.
    worn_rows = df_id.loc[df_id['non-wear_flag'] == 0]

    # (output key prefix, source column) - 'enmo' is reported as 'accNorm'.
    prefix_to_column = (
        ('accNorm', 'enmo'),
        ('anglez', 'anglez'),
        ('light', 'light'),
    )

    stats_dict = {}
    for prefix, column in prefix_to_column:
        values = worn_rows[column]
        stats_dict[f'{prefix}_mean'] = values.mean()
        stats_dict[f'{prefix}_std'] = values.std()
        stats_dict[f'{prefix}_median'] = values.median()

    return stats_dict

In [34]:
def compute_segmented_stats(df):
    """
    Compute mean, standard deviation and median of 'light', 'enmo' and
    'anglez' separately for each value found in 'day_segment', using only
    rows where 'non-wear_flag' equals 0.

    Returns a dict keyed '<segment>_<variable>_<stat>'. A segment that has
    no worn rows contributes no keys.
    """
    worn_rows = df.loc[df["non-wear_flag"] == 0]

    stats_dict = {}
    for seg, subset in worn_rows.groupby('day_segment'):
        for var in ('light', 'enmo', 'anglez'):
            values = subset[var]
            stats_dict.update({
                f"{seg}_{var}_mean": values.mean(),
                f"{seg}_{var}_std": values.std(),
                f"{seg}_{var}_median": values.median(),
            })

    return stats_dict


In [10]:
def find_successive_sequences_with_stats_per_day(
    df: pd.DataFrame,
    threshold: float = 0.5,
) -> pd.DataFrame:
    """
    Detect, day by day, runs of consecutive rows that satisfy the inactivity
    condition and summarise each run.

    A row qualifies when ``|X| < threshold``, ``|Y| < threshold`` and
    ``non-wear_flag == 0``. Within each 'relative_date_PCIAT' group, rows that
    qualify one directly after another (in the group's row order) form one
    sequence.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'relative_date_PCIAT', 'X', 'Y',
        'non-wear_flag', 'step', 'light', 'anglez' and 'time_of_day_hms'.
    threshold : float, default 0.5
        Exclusive upper bound applied to the absolute value of 'X' and 'Y'.

    Returns
    -------
    pd.DataFrame
        One row per sequence, sorted by ('relative_date_PCIAT', 'start_index'):
        - bloc_id: 1-based counter of the sequence within its day
        - start_index: the smallest 'step' value in the sequence
        - length: the number of rows in the sequence
        - light, anglez: mean of the column over the sequence
        - date_start: 'time_of_day_hms' of the first row of the sequence
        - relative_date_PCIAT: the day the sequence belongs to

        When no row qualifies at all, an empty DataFrame with these same
        columns is returned, so callers can still group on
        'relative_date_PCIAT' without a KeyError.
    """
    per_day_results = []

    for day, group in df.groupby("relative_date_PCIAT"):
        condition = (
            (group["X"].abs() < threshold)
            & (group["Y"].abs() < threshold)
            & (group["non-wear_flag"] == 0)
        )

        if not condition.any():
            # No qualifying row on this day.
            continue

        # A sequence starts on a qualifying row whose predecessor does not
        # qualify; fill_value=False makes the first row of the day count as
        # having a non-qualifying predecessor.
        previous = condition.shift(fill_value=False)
        sequence_start = condition & ~previous
        # Running count of starts = identifier of the sequence a row is in.
        bloc_ids = sequence_start.cumsum()

        qualifying = group[condition].copy()
        qualifying["bloc_id"] = bloc_ids[condition]

        res = (
            qualifying
            .groupby("bloc_id", as_index=False)
            .agg(
                start_index=("step", "min"),
                length=("step", "size"),
                light=("light", "mean"),
                anglez=("anglez", "mean"),
                date_start=("time_of_day_hms", "first"),
            )
        )
        res["relative_date_PCIAT"] = day
        per_day_results.append(res)

    if not per_day_results:
        # Keep the schema even when there is nothing to report.
        return pd.DataFrame({
            "bloc_id": pd.Series(dtype="int64"),
            "start_index": pd.Series(dtype="int64"),
            "length": pd.Series(dtype="int64"),
            "light": pd.Series(dtype="float64"),
            "anglez": pd.Series(dtype="float64"),
            "date_start": pd.Series(dtype="datetime64[ns]"),
            "relative_date_PCIAT": pd.Series(dtype="float64"),
        })

    return (
        pd.concat(per_day_results, ignore_index=True)
        .sort_values(["relative_date_PCIAT", "start_index"])
        .reset_index(drop=True)
    )


In [21]:
def compute_daily_inactivity_stats(
    sequences: pd.DataFrame,
    data: pd.DataFrame,
    sample_period_s: float = 5.0,
) -> pd.DataFrame:
    """
    Compute per-day inactivity statistics from a table of inactivity
    sequences, and relate them to the time the watch was worn that day.

    Parameters
    ----------
    sequences : pd.DataFrame
        One row per inactivity sequence. Columns read:
          - 'relative_date_PCIAT' (identifier of the day)
          - 'length' (number of consecutive samples in the sequence)
          - 'light' (average light during the sequence)
          - 'anglez' (average anglez during the sequence)
        May be empty (with or without columns).

    data : pd.DataFrame
        The sample-level table from which daily wear time is derived.
        Columns read:
          - 'relative_date_PCIAT'
          - 'non-wear_flag' (samples equal to 0 are counted as worn)

    sample_period_s : float, default 5.0
        Duration of one sample in seconds, used to convert sample counts
        into minutes.

    Returns
    -------
    pd.DataFrame
        One row per 'relative_date_PCIAT' present in `sequences`, with columns:
          * relative_date_PCIAT
          * num_inactive_sequences        (count of inactivity sequences)
          * mean_light_in_sequences
          * max_light_in_sequences
          * std_light_in_sequences
          * median_light_in_sequences
          * mean_anglez_in_sequences
          * max_anglez_in_sequences
          * std_anglez_in_sequences
          * median_anglez_in_sequences
          * total_inactive_duration_min   (summed 'length' converted to minutes)
          * daily_wear_time_min           (worn samples that day, in minutes)
          * pct_inactive_day              (inactive minutes / wear minutes * 100;
                                           0.0 when wear time is not positive)
        If `sequences` is empty, an empty DataFrame with these columns is
        returned.
    """
    output_columns = [
        'relative_date_PCIAT',
        'num_inactive_sequences',
        'mean_light_in_sequences',
        'max_light_in_sequences',
        'std_light_in_sequences',
        'median_light_in_sequences',
        'mean_anglez_in_sequences',
        'max_anglez_in_sequences',
        'std_anglez_in_sequences',
        'median_anglez_in_sequences',
        'total_inactive_duration_min',
        'daily_wear_time_min',
        'pct_inactive_day',
    ]

    # Nothing to aggregate: return the schema only, instead of failing on a
    # frame that may not even carry the grouping column.
    if sequences.empty or 'relative_date_PCIAT' not in sequences.columns:
        return pd.DataFrame(columns=output_columns, dtype='float64')

    # --------------------------------------------------------------------------
    # 1) Inactivity statistics per day
    # --------------------------------------------------------------------------
    stats_df = (
        sequences.groupby('relative_date_PCIAT')
        .agg(
            num_inactive_sequences=('length', 'count'),
            total_length=('length', 'sum'),

            # Light statistics
            mean_light_in_sequences=('light', 'mean'),
            max_light_in_sequences=('light', 'max'),
            std_light_in_sequences=('light', 'std'),
            median_light_in_sequences=('light', 'median'),

            # anglez statistics
            mean_anglez_in_sequences=('anglez', 'mean'),
            max_anglez_in_sequences=('anglez', 'max'),
            std_anglez_in_sequences=('anglez', 'std'),
            median_anglez_in_sequences=('anglez', 'median'),
        )
        .reset_index()
    )

    # Samples -> minutes.
    stats_df['total_inactive_duration_min'] = (
        stats_df['total_length'] * sample_period_s / 60.0
    )

    # --------------------------------------------------------------------------
    # 2) Daily wear time in minutes (samples with 'non-wear_flag' == 0)
    # --------------------------------------------------------------------------
    wear_time_df = (
        data.groupby('relative_date_PCIAT')
        .agg(num_wear_samples=('non-wear_flag', lambda col: (col == 0).sum()))
        .reset_index()
    )
    wear_time_df['daily_wear_time_min'] = (
        wear_time_df['num_wear_samples'] * sample_period_s / 60.0
    )

    stats_df = stats_df.merge(
        wear_time_df[['relative_date_PCIAT', 'daily_wear_time_min']],
        on='relative_date_PCIAT',
        how='left'
    )

    # --------------------------------------------------------------------------
    # 3) pct_inactive_day = total_inactive_duration_min / daily_wear_time_min * 100
    # --------------------------------------------------------------------------
    wear_time = stats_df['daily_wear_time_min']
    inactive_pct = (stats_df['total_inactive_duration_min'] / wear_time) * 100.0
    # Days with zero or missing wear time get 0.0 rather than inf/NaN.
    stats_df['pct_inactive_day'] = inactive_pct.where(wear_time > 0, 0.0)

    # --------------------------------------------------------------------------
    # 4) 'total_length' was only an intermediate
    # --------------------------------------------------------------------------
    return stats_df.drop(columns=['total_length'])


In [11]:
def compute_global_inactivity_stats(daily_stats: pd.DataFrame) -> dict:
    """
    Collapse a per-day statistics table into one flat dictionary of
    summary features.

    Parameters
    ----------
    daily_stats : pd.DataFrame
        Typically one row per day. Every numeric column except
        'relative_date_PCIAT' is summarised; non-numeric columns are ignored.

    Returns
    -------
    dict
        For each summarised column, four entries in this order:
        '<column>_mean', '<column>_std', '<column>_min', '<column>_max'.
    """
    # Name of the Series method doubles as the key suffix.
    summary_methods = ("mean", "std", "min", "max")

    summarised_columns = [
        name
        for name in daily_stats.select_dtypes(include=['number']).columns
        if name != 'relative_date_PCIAT'  # the day identifier is not a feature
    ]

    return {
        f"{name}_{method}": getattr(daily_stats[name], method)()
        for name in summarised_columns
        for method in summary_methods
    }


In [22]:
def process_ids_and_extract_features(
    data: pd.DataFrame,
    series_dir: str = "Downloads/series_train.parquet",
) -> pd.DataFrame:
    """
    For each row of `data`, load that ID's time series and append the
    features computed from it as new columns on the row.

    For each ID the function:
      1. Looks for '<series_dir>/id=<id>/part-0.parquet'; if it is missing or
         cannot be read, a message is printed and the ID is skipped.
      2. Applies, in order:
         - preprocess_data(df) -> processed DataFrame.
         - compute_global_stats(df) -> dict of global stats.
         - compute_segmented_stats(df) -> dict of per-day-segment stats.
         - find_successive_sequences_with_stats_per_day(df) -> sequences DataFrame.
         - compute_daily_inactivity_stats(sequences_df, df) -> daily-stats DataFrame.
         - compute_global_inactivity_stats(daily_stats_df) -> dict of global
           inactivity stats.
      3. Merges the three dictionaries and writes each key/value pair into
         `data` at that row.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a column 'id'. It is modified in place.
    series_dir : str, default "Downloads/series_train.parquet"
        Directory that holds one 'id=<id>' sub-folder per participant.

    Returns
    -------
    pd.DataFrame
        The same `data` object, with the feature columns added. Rows whose
        ID was skipped hold no value in those columns.
    """

    for i, row in data.iterrows():
        current_id = row['id']
        folder = os.path.join(series_dir, f"id={current_id}")
        parquet_file = os.path.join(folder, "part-0.parquet")

        # Skip IDs that have no series file.
        if not os.path.exists(parquet_file):
            print(f"Folder or file not found for ID={current_id}. Skipping.")
            continue

        # Best effort: an unreadable file is reported and skipped so that the
        # remaining IDs are still processed.
        try:
            df_parquet = pd.read_parquet(parquet_file)
        except Exception as e:
            print(f"Error reading parquet file for ID={current_id}: {e}")
            continue

        # 1) Preprocess the data
        df_processed = preprocess_data(df_parquet)

        # 2) Global stats (dict)
        global_stats_dict = compute_global_stats(df_processed)

        # 3) Per-day-segment stats (dict)
        segmented_stats_dict = compute_segmented_stats(df_processed)

        # 4) Inactivity sequences per day (DataFrame)
        sequences_df = find_successive_sequences_with_stats_per_day(df_processed)

        # 5) Daily inactivity stats; needs the full samples for wear time
        daily_inactivity_df = compute_daily_inactivity_stats(sequences_df, df_processed)

        # 6) Inactivity stats aggregated over all days (dict)
        global_inactivity_dict = compute_global_inactivity_stats(daily_inactivity_df)

        # Later dictionaries win if a key were ever repeated.
        combined_features = {
            **global_stats_dict,
            **segmented_stats_dict,
            **global_inactivity_dict
        }

        # Write the features onto this ID's row; a column is created the first
        # time its key is seen.
        for col_name, col_value in combined_features.items():
            data.at[i, col_name] = col_value

    return data


In [23]:
# Load the tabular training set; the feature extraction below reads its 'id' column.
data = pd.read_csv("train (1).csv")

In [24]:
# Preview the loaded table.
data

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,1.0,1.0,0.0,32.0,Winter,35.0,50.0,Fall,1.0,1.0
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,Winter,0.0,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,1.0,0.0,1.0,31.0,Winter,56.0,77.0,Fall,0.0,1.0
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,1.0,1.0,1.0,19.0,Spring,33.0,47.0,Spring,1.0,0.0


In [26]:
# Compute the per-ID series features. IDs whose parquet file is not found are
# reported in the output and skipped.
final_data = process_ids_and_extract_features(data)

Folder or file not found for ID=00008ff9. Skipping.
Folder or file not found for ID=000fd460. Skipping.
Folder or file not found for ID=00105258. Skipping.
Folder or file not found for ID=0016bb22. Skipping.
Folder or file not found for ID=0038ba98. Skipping.
Folder or file not found for ID=0068a485. Skipping.
Folder or file not found for ID=0069fbed. Skipping.
Folder or file not found for ID=0083e397. Skipping.
Folder or file not found for ID=0087dd65. Skipping.
Folder or file not found for ID=00abe655. Skipping.
Folder or file not found for ID=00ae59c9. Skipping.
Folder or file not found for ID=00af6387. Skipping.
Folder or file not found for ID=00bd4359. Skipping.
Folder or file not found for ID=00c0cd71. Skipping.
Folder or file not found for ID=00d56d4b. Skipping.
Folder or file not found for ID=00d9913d. Skipping.
Folder or file not found for ID=00e6167c. Skipping.
Folder or file not found for ID=00ebc35d. Skipping.
Folder or file not found for ID=00f574e9. Skipping.
Folder or fi

In [27]:
# Preview the table with the appended feature columns.
final_data

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,total_inactive_duration_min_min,total_inactive_duration_min_max,daily_wear_time_min_mean,daily_wear_time_min_std,daily_wear_time_min_min,daily_wear_time_min_max,pct_inactive_day_mean,pct_inactive_day_std,pct_inactive_day_min,pct_inactive_day_max
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,0.083333,138.000000,100.071759,152.028051,0.250000,648.166667,55.349947,34.038769,2.127660,100.000000
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,,,,,,,,,,
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,,,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,89.000000,638.833333,1356.996667,254.763046,403.583333,1440.000000,21.975680,17.158864,8.986020,99.649032
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,15.333333,695.833333,525.013889,423.576140,15.333333,1440.000000,59.657624,36.489241,7.888012,100.000000


In [28]:
# process_ids_and_extract_features writes into its argument and returns it,
# so `data` shows the same added columns as `final_data`.
data

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,total_inactive_duration_min_min,total_inactive_duration_min_max,daily_wear_time_min_mean,daily_wear_time_min_std,daily_wear_time_min_min,daily_wear_time_min_max,pct_inactive_day_mean,pct_inactive_day_std,pct_inactive_day_min,pct_inactive_day_max
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,,,,,,,
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,0.083333,138.000000,100.071759,152.028051,0.250000,648.166667,55.349947,34.038769,2.127660,100.000000
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,,,,,,,,,,
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,,,
3957,ffcd4dbd,Fall,11,0,Spring,68.0,Winter,21.441500,60.0,109.8,...,89.000000,638.833333,1356.996667,254.763046,403.583333,1440.000000,21.975680,17.158864,8.986020,99.649032
3958,ffed1dd5,Spring,13,0,Spring,70.0,Winter,12.235895,70.7,87.0,...,15.333333,695.833333,525.013889,423.576140,15.333333,1440.000000,59.657624,36.489241,7.888012,100.000000


In [30]:
# Rows with no value for 'pct_inactive_day_min' (for example, IDs that were
# skipped because no series file was found).
final_data[final_data["pct_inactive_day_min"].isnull()]

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,total_inactive_duration_min_min,total_inactive_duration_min_max,daily_wear_time_min_mean,daily_wear_time_min_std,daily_wear_time_min_min,daily_wear_time_min_max,pct_inactive_day_mean,pct_inactive_day_std,pct_inactive_day_min,pct_inactive_day_max
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,,,,,,,,,,
1,000fd460,Summer,9,0,,,Fall,14.035590,48.0,46.0,...,,,,,,,,,,
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,,,,,,,
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,
6,0038ba98,Fall,10,0,,,Fall,19.660760,55.0,84.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3953,ff6c2bb8,Fall,8,0,,,Fall,17.139810,52.5,67.2,...,,,,,,,,,,
3954,ff759544,Summer,7,1,,,Summer,13.927006,48.5,46.6,...,,,,,,,,,,
3955,ff8a2de4,Fall,13,0,Spring,60.0,Fall,16.362460,59.5,82.4,...,,,,,,,,,,
3956,ffa9794a,Winter,10,0,,,Spring,18.764678,53.5,76.4,...,,,,,,,,,,


In [31]:
# Save the enriched table; index=False keeps the DataFrame index out of the file.
final_data.to_csv("final_data_train.csv", index= False)