In [19]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import sys
sys.path.append('.')
sys.path.append('../')
import os
import torch
import numpy as np
import pandas as pd
import config as cfg
from src.utils.io import get_results_path
import config as config
from src.utils.online_eval import all_online_eval
from collections import Counter

# Script imports
from src.utils.io import (get_results_path,
                          get_metadata_path,
                            get_matched_data_path)

from src.utils.data_prep import (adjust_behavior_and_durations,
                                 )


In [21]:
# Graphing Parameters
import matplotlib as mpl

# Apply every plotting default in one call: line/marker sizes, a shared font
# size of 25 for ticks, axis labels, legend and titles, and LaTeX text rendering.
mpl.rcParams.update({
    'lines.markersize': 12,
    'lines.linewidth': 2,
    'xtick.labelsize': 25,
    'ytick.labelsize': 25,
    'axes.labelsize': 25,
    'legend.fontsize': 25,
    'axes.titlesize': 25,
    'text.usetex': True,
})

# Use the 'cuda:3' device when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:3'
else:
    device = 'cpu'

In [22]:
# Duration of one window (presumably in seconds — TODO confirm against cfg.SAMPLING_RATE units).
window_duration = 12.937
# Number of samples in one window; int() truncates any fractional sample.
window_length = int(window_duration * cfg.SAMPLING_RATE)

In [23]:
def calculate_monthly_behavior_proportions_optimized(data_folder: str, behaviors: list) -> "pd.DataFrame | None":
    """
    Aggregate per-file behavior counts into monthly counts per individual.

    Every ``.csv`` file in ``data_folder`` is summarised on its own (only the
    'Most probable behavior' column is read and reduced to value counts) before
    the summaries are combined, so the full data is never held in memory.

    File names are parsed as ``<individual>_<date>[_...].csv``: the text before
    the first underscore is the individual, the second underscore-separated
    part is parsed with ``pd.to_datetime``. Files that cannot be parsed or read
    are reported with a warning and skipped.

    Note: despite the function name, the returned values are *counts*, not
    proportions; converting to proportions is left to the caller.

    Args:
        data_folder (str): The path to the folder containing the CSV files.
        behaviors (list): Behavior labels to use as output columns, in this
            order. Labels never observed get a count of 0; observed labels not
            listed here are dropped.

    Returns:
        pd.DataFrame | None: One row per (individual, month) with columns
        'individual', 'month' (monthly period) and one count column per entry
        of ``behaviors``. ``None`` if the folder does not exist or no file
        could be processed.
    """
    if not os.path.isdir(data_folder):
        print(f"Error: Folder not found at '{data_folder}'")
        return None

    # A list to hold the small per-file count DataFrames
    all_daily_counts = []

    print("Reading files and calculating daily counts...")
    # Sorted so that files (and any warnings) are processed in a stable order;
    # the aggregated result does not depend on the order.
    for filename in sorted(os.listdir(data_folder)):
        if not filename.endswith('.csv'):
            continue

        file_path = os.path.join(data_folder, filename)
        try:
            filename_stem = os.path.splitext(filename)[0]
            parts = filename_stem.split('_')
            individual_name = parts[0]
            date_str = parts[1]

            # Read only the necessary column
            behavior_col = pd.read_csv(file_path, usecols=['Most probable behavior'])

            if behavior_col.empty:
                continue

            # Reduce the file to its value counts immediately
            daily_counts = behavior_col['Most probable behavior'].value_counts()

            # Convert this small summary Series into a DataFrame
            counts_df = daily_counts.reset_index()
            counts_df.columns = ['behavior', 'count']

            # Add the metadata parsed from the file name
            counts_df['individual'] = individual_name
            counts_df['date'] = pd.to_datetime(date_str)

            all_daily_counts.append(counts_df)

        except Exception as e:
            # Best effort: report the offending file and keep going.
            # `file_path` is a plain str (no `.name` attribute), so the file
            # name is reported directly.
            print(f"Warning: Could not process file '{filename}'. Reason: {e}")

    if not all_daily_counts:
        print("No valid data files were found to process.")
        return None

    # Concatenate the list of per-file summaries
    counts_summary_df = pd.concat(all_daily_counts, ignore_index=True)

    # Add a month column for the final aggregation
    counts_summary_df['month'] = counts_summary_df['date'].dt.to_period('M')

    print("\nAggregating counts and calculating final proportions...")
    # Sum the per-file counts to get monthly totals
    monthly_totals = counts_summary_df.groupby(
        ['individual', 'month', 'behavior']
    )['count'].sum()

    # One column per behavior; behaviors absent from the data become 0, and
    # (individual, month) pairs lacking a behavior get 0 instead of NaN.
    wide_df = monthly_totals.unstack(level='behavior')
    wide_df = wide_df.reindex(columns=behaviors, fill_value=0)
    wide_df = wide_df.fillna(0)
    final_df = wide_df.reset_index().round(2)

    return final_df

In [24]:
data_folder = os.path.join(cfg.VECTRONICS_BEHAVIOR_EVAL_PATH, 'evals')
monthly_counts = calculate_monthly_behavior_proportions_optimized(data_folder, cfg.RAW_BEHAVIORS)

# The helper returns None when the folder is missing or no file could be
# processed. Check *before* touching the result, and stop here with a clear
# message instead of an AttributeError from `.copy()` on None.
if monthly_counts is None:
    raise RuntimeError(f"No monthly behavior counts could be computed from '{data_folder}'.")

# Work on a copy so the raw counts stay available to later cells.
monthly_proportions = monthly_counts.copy()

print("\n--- Monthly Behavior Proportions ---")
# Convert each (individual, month) row of counts into percentages of that row's
# total; rows with a zero total would give NaN and are reported as 0 instead.
row_totals = monthly_proportions[cfg.RAW_BEHAVIORS].sum(axis=1)
monthly_proportions[cfg.RAW_BEHAVIORS] = monthly_proportions[cfg.RAW_BEHAVIORS].div(row_totals, axis=0).fillna(0)*100
print(monthly_proportions)


Reading files and calculating daily counts...

Aggregating counts and calculating final proportions...

--- Monthly Behavior Proportions ---
behavior individual    month     Feeding     Moving    Running  Stationary
0               ash  2021-03    0.000000   0.000000   0.000000  100.000000
1               ash  2021-08    1.427320   0.201401   0.000000   98.371278
2               ash  2021-10    0.745714   8.349208   1.038424   89.866654
3               ash  2021-11    1.172513   7.629580   1.339257   89.858650
4               ash  2021-12    1.087518   5.271076   1.302266   92.339140
..              ...      ...         ...        ...        ...         ...
63            palus  2022-06   15.817318  64.604257  19.578425    0.000000
64            palus  2022-07   93.789477   4.851069   1.359454    0.000000
65            palus  2022-08  100.000000   0.000000   0.000000    0.000000
66            palus  2022-09  100.000000   0.000000   0.000000    0.000000
67            palus  2022-10  100.

In [30]:
# Keep only the (individual, month) rows in which every behavior's percentage
# exceeds 0.05.
exceeds_threshold = monthly_proportions[cfg.RAW_BEHAVIORS].gt(0.05)
monthly_proportions_valid = monthly_proportions[exceeds_threshold.all(axis=1)]

print("\n--- Monthly Behavior Proportions on Valid Months ---")
print(monthly_proportions_valid)



--- Monthly Behavior Proportions on Valid Months ---
behavior individual    month   Feeding    Moving   Running  Stationary
2               ash  2021-10  0.745714  8.349208  1.038424   89.866654
3               ash  2021-11  1.172513  7.629580  1.339257   89.858650
4               ash  2021-12  1.087518  5.271076  1.302266   92.339140
5               ash  2022-01  0.735552  4.363854  1.685616   93.214978
6               ash  2022-02  1.007428  3.461600  1.035483   94.495489
7               ash  2022-03  1.316724  3.492554  1.215349   93.975372
8               ash  2022-04  0.987513  3.433691  1.064254   94.514542
9               ash  2022-05  1.472397  4.395826  0.989104   93.142672
10              ash  2022-06  0.721172  1.266507  0.321379   97.690941
11              ash  2022-07  0.885547  1.888053  1.071645   96.154755
12              ash  2022-08  1.169179  3.734383  1.932209   93.164229
13              ash  2022-09  0.615497  0.593137  1.462246   97.329120
14              ash  20

In [31]:
def bootstrap_class_distribution(df: pd.DataFrame, behavior_cols: list, alpha: float = 0.05) -> pd.DataFrame:
    """
    Estimates the class distribution and an empirical percentile interval for it.

    The point estimate is the pooled proportion of each behavior over all rows.
    The interval is taken from the distribution of *per-row* proportions: each
    row of ``df`` (e.g. one individual-month) contributes one sample, and the
    ``alpha/2`` and ``1 - alpha/2`` quantiles of those samples form the bounds.
    No resampling with replacement is performed, so the number of samples
    equals the number of rows with a non-zero total.

    Args:
        df (pd.DataFrame): DataFrame with one row per sample and a numeric
            count column for every entry of ``behavior_cols``. Other columns
            are ignored.
        behavior_cols (list): A list of the column names that contain behavior counts.
        alpha (float): The significance level for the interval (e.g., 0.05 for
            a 95% interval).

    Returns:
        pd.DataFrame: One row per behavior with columns 'behavior',
        'overall_proportion', 'ci_lower_bound' and 'ci_upper_bound', all three
        values expressed in percent and rounded to 4 decimals. When the grand
        total is zero the proportion is NaN; when no row has a non-zero total
        the bounds are NaN.
    """
    n_behaviors = len(behavior_cols)
    counts = df[behavior_cols].astype(float)

    # --- 1. Overall proportion (point estimate) pooled over every row ---
    total_counts = counts.sum().values
    grand_total = total_counts.sum()
    if grand_total == 0:
        # Nothing observed at all: report NaN rather than dividing by zero.
        overall_proportions = np.full(n_behaviors, np.nan)
    else:
        overall_proportions = total_counts / grand_total

    # --- 2. Per-row proportions: one sample of the distribution per row ---
    print(f"Running {len(df)} bootstrap iterations...")
    count_matrix = counts.to_numpy()
    row_totals = count_matrix.sum(axis=1)
    # Rows whose total is exactly zero carry no information and are skipped.
    usable = row_totals != 0
    row_proportions = pd.DataFrame(count_matrix[usable] / row_totals[usable, np.newaxis])

    # --- 3. Percentile interval from the per-row distribution ---
    lower_quantile = alpha / 2.0  # e.g., 0.05 / 2 = 0.025
    upper_quantile = 1.0 - (alpha / 2.0)  # e.g., 1 - 0.025 = 0.975

    if row_proportions.empty:
        # No usable rows: the bounds are undefined.
        ci_lower = np.full(n_behaviors, np.nan)
        ci_upper = np.full(n_behaviors, np.nan)
    else:
        ci_lower = row_proportions.quantile(lower_quantile).to_numpy()
        ci_upper = row_proportions.quantile(upper_quantile).to_numpy()

    # --- 4. Combine results; every column is a plain array in behavior_cols order ---
    result_df = pd.DataFrame({
        'behavior': list(behavior_cols),
        'overall_proportion': overall_proportions * 100,
        'ci_lower_bound': ci_lower * 100,
        'ci_upper_bound': ci_upper * 100,
    })

    return result_df.round(4)  # Round for cleaner output

In [32]:
# Restrict to (individual, month) rows in which every behavior has a non-zero
# count, then compute the pooled proportions and their percentile bounds.
all_behaviors_present = monthly_counts[cfg.RAW_BEHAVIORS].ne(0).all(axis=1)
monthly_counts = monthly_counts[all_behaviors_present]

result_df = bootstrap_class_distribution(
    monthly_counts, cfg.RAW_BEHAVIORS, alpha=0.05
).set_index('behavior')
print(result_df)

Running 30 bootstrap iterations...
            overall_proportion  ci_lower_bound  ci_upper_bound
behavior                                                      
Feeding                 1.0243          0.6006          1.5685
Moving                  4.8962          1.0813          7.8812
Running                 1.6199          0.5431          2.8903
Stationary             92.4597         89.4760         97.4286


In [33]:
# Load the matched annotation data and pass it through the project helper that
# adjusts behaviors and durations using the configured collapse mapping.
acc_data = adjust_behavior_and_durations(
    pd.read_csv(get_matched_data_path()),
    cfg.RAW_COLLAPSE_BEHAVIORS_MAPPING,
    cfg.RAW_BEHAVIORS,
)

# Total duration per behavior, divided by 3600 to express it in hours
# (presumably 'duration' is stored in seconds — TODO confirm) and rounded to 4 decimals.
duration_table = (
    acc_data
    .groupby('behavior')['duration']
    .sum()
    .div(3600)
    .round(4)
)

print("DURATION SUMMARY FOR MATCHED ANNOTATIONS")
print(duration_table)


DURATION SUMMARY FOR MATCHED ANNOTATIONS
behavior
Feeding        4.0269
Moving        10.7502
Running        5.4570
Stationary    68.0157
Name: duration, dtype: float64


In [34]:
# duration_table holds summed durations in hours, while the bounds in result_df
# are percentages. Convert the durations to each behavior's percentage share of
# the total annotated time so both sides of the comparison use the same unit.
duration_share = duration_table / duration_table.sum() * 100

# Align the bounds on the behavior labels of duration_share so the comparison
# does not depend on both tables listing behaviors in the same order; a
# behavior without bounds compares as outside the interval.
ci_bounds = result_df.reindex(duration_share.index)
within_ci = (duration_share >= ci_bounds['ci_lower_bound']) & (duration_share <= ci_bounds['ci_upper_bound'])

for behavior, is_within in within_ci.items():
    print(f"{behavior}: {'Within CI' if is_within else 'Outside CI'}")

Feeding: Outside CI
Moving: Outside CI
Running: Outside CI
Stationary: Outside CI
