Create Summary CSVs
This notebook will create .csv files that contain summaries from multiple csvs of behavioral scoring data exported from ELAN. 
Input: Put all .csv files (i.e. one generated for each video and exported using ELAN) in a single directory. The format should be column 1: behavior name; column 2: blank; column 3: start time in sec.msec; column 4: end time; column 5: duration. This script assumes 30 minute videos have been scored.
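Output: The output will include a csv for cumulative time spent in a behavior over 5 minute bins in a 30 minute window for each unique behavior scored. It will also include a summary csv with the cumulative duration of each behavior for each video, and summary csvs with the latency to the first occurrence and the number of bouts of each behavior for each video. All output files are written to a "summary" subfolder inside the input directory.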

Create a separate CSV with cumulative duration in 5 minute time bins over the 30 minute session for each behavior

In [2]:
#import relevant packages
import pandas as pd
import numpy as np
import os
import re

# Directory containing all ELAN-exported CSV files to summarize.
# NOTE(review): this is an absolute, machine-specific path; edit it before running elsewhere.
csv_path = r"/Volumes/avn006/behaviorScoring/202409_FoodDeprivation/female/selfgroom"


Cumulative Duration:
Create a csv with cumulative time spent in each behavior over 5 minute bins over the 30 minute social interaction

In [4]:
def split_behavior_duration(start_time, duration, bins):
    """Distribute one behavior bout across the time bins it overlaps.

    Parameters
    ----------
    start_time : number
        Onset of the bout, in the same units as ``bins``.
    duration : number
        Length of the bout.
    bins : sequence of numbers
        Ascending bin edges; bin ``i`` spans ``bins[i]`` to ``bins[i + 1]``.

    Returns
    -------
    list of (int, number)
        One ``(bin_index, overlap_duration)`` pair for every bin in which the
        bout spends a strictly positive amount of time, in bin order.
    """
    bout_end = start_time + duration
    pieces = []

    # Walk consecutive edge pairs; each pair delimits one bin.
    for bin_index, (lower_edge, upper_edge) in enumerate(zip(bins[:-1], bins[1:])):
        # Skip bins the bout never touches.
        if not (start_time < upper_edge and bout_end > lower_edge):
            continue

        # Clip the bout to the bin to get the time spent inside it.
        time_in_bin = min(bout_end, upper_edge) - max(start_time, lower_edge)
        if time_in_bin > 0:
            pieces.append((bin_index, time_in_bin))

    return pieces

def createCumulativeDurationscsv(directory, session_length=1800, bin_width=300):
    """Write one CSV per behavior with cumulative time spent in it per time bin.

    Every ``*.csv`` file directly inside ``directory`` is read without a header
    row (column 0 = behavior name, column 2 = start time, column 4 = duration).
    Bouts are split across the time bins they overlap, the per-bin totals are
    accumulated left to right, and for each behavior seen in any file a table
    with one row per video is written to
    ``<directory>/summary/cumulative_<behavior>_times.csv``.

    Behavior names are lower-cased and every character other than ``a``-``z``
    is removed, so spelling variants of the same label are pooled. Videos in
    which a behavior never occurs (including files with no rows at all) get a
    row of zeros for that behavior.

    Parameters
    ----------
    directory : str
        Folder containing the per-video CSV files.
    session_length : number, optional
        Length of the scored session in seconds (default 1800).
    bin_width : number, optional
        Width of each time bin in seconds (default 300).

    Returns
    -------
    None
        Results are written to disk and a confirmation line is printed per
        behavior.
    """
    summary_dir = os.path.join(directory, 'summary')
    os.makedirs(summary_dir, exist_ok=True)

    # Bin edges and labels are identical for every file, so build them once.
    bins = np.arange(0, session_length + bin_width, bin_width)
    bin_labels = [f'{int(b / 60)}-{int((b + bin_width) / 60)} min' for b in bins[:-1]]
    n_bins = len(bin_labels)

    # video name -> {behavior -> array of (non-cumulative) seconds per bin}
    binned_per_video = {}
    all_behaviors = set()

    # Single pass: each file is read once and both the behavior inventory and
    # the binned durations are collected from it.
    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        video_name = os.path.splitext(filename)[0]
        binned = {}

        try:
            df = pd.read_csv(os.path.join(directory, filename), header=None)
        except pd.errors.EmptyDataError:
            # A file with no content makes read_csv raise instead of returning
            # an empty frame; treat it as a video with no scored behavior.
            df = None

        if df is not None and not df.empty:
            behaviors = df[0].astype(str).str.lower().str.replace(r'[^a-z]', '', regex=True)
            start_times = pd.to_numeric(df[2], errors='coerce')
            durations = pd.to_numeric(df[4], errors='coerce')

            for behavior, start_time, duration in zip(behaviors, start_times, durations):
                if behavior not in binned:
                    binned[behavior] = np.zeros(n_bins)
                # Rows whose times could not be parsed still register the
                # behavior but contribute no time.
                if pd.notnull(start_time) and pd.notnull(duration):
                    for bin_index, bin_duration in split_behavior_duration(start_time, duration, bins):
                        binned[behavior][bin_index] += bin_duration

        all_behaviors.update(binned)
        binned_per_video[video_name] = binned

    # Write one CSV per behavior, combining data across all videos.
    video_names = sorted(binned_per_video)
    for behavior in sorted(all_behaviors):
        rows = []
        for video_name in video_names:
            per_bin = binned_per_video[video_name].get(behavior)
            if per_bin is None:
                # Behavior absent from this video: all-zero row.
                per_bin = np.zeros(n_bins)
            rows.append([video_name] + list(np.cumsum(per_bin)))

        behavior_df = pd.DataFrame(rows, columns=['video_name'] + bin_labels)
        output_file = f'cumulative_{behavior}_times.csv'
        output_path = os.path.join(summary_dir, output_file)
        behavior_df.to_csv(output_path, index=False)
        print(f"Saved cumulative durations for {behavior} to {output_file}")

# Build the per-behavior cumulative-duration CSVs from the .csv files in csv_path.
createCumulativeDurationscsv(csv_path)

Saved cumulative durations for selfgrooming to cumulative_selfgrooming_times.csv


Total behavior duration:
Create a CSV with the total time spent doing each behavior for each video

In [3]:
def createDurationscsv(directory):
    """Write a per-video summary of the total time spent in each behavior.

    Every ``*.csv`` file directly inside ``directory`` is read without a header
    row (column 0 = behavior name, column 4 = bout duration). Durations are
    summed per behavior and the result is written to
    ``<directory>/summary/behavior_durations_summary.csv`` with one row per
    file (``Video`` column holds the file name, extension included) and one
    column per behavior, sorted alphabetically.

    Behavior names are lower-cased and every character other than ``a``-``z``
    is removed, so spelling variants of the same label are pooled. Durations
    that cannot be parsed as numbers are ignored. A file with no content
    yields a row of zeros rather than aborting the run.

    Parameters
    ----------
    directory : str
        Folder containing the per-video CSV files.

    Returns
    -------
    None
        The summary is written to disk and its path is printed.
    """
    # Create a 'summary' folder inside the input directory for the output CSV
    summary_dir = os.path.join(directory, 'summary')
    os.makedirs(summary_dir, exist_ok=True)

    behavior_durations_per_video = {}

    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        try:
            df = pd.read_csv(filepath, header=None)
        except pd.errors.EmptyDataError:
            # read_csv raises on a file with no content; record the video with
            # no behaviors so it still appears (as zeros) in the summary.
            behavior_durations_per_video[filename] = {}
            continue

        # Columns are assumed to be: [Behavior, Unknown, Start Time, End Time, Duration]
        behaviors = [re.sub(r'[^a-z]', '', str(name).strip().lower()) for name in df[0]]
        durations = pd.to_numeric(df[4], errors='coerce')

        # Total duration for each behavior in the current file
        behavior_total_duration = {}
        for behavior, duration in zip(behaviors, durations):
            running_total = behavior_total_duration.get(behavior, 0)
            # Skip unparseable durations so one bad cell cannot turn the
            # whole total into NaN.
            if pd.notnull(duration):
                running_total += duration
            behavior_total_duration[behavior] = running_total

        # Keyed by the file name exactly as listed (extension included).
        behavior_durations_per_video[filename] = behavior_total_duration

    # Union of behaviors across all videos, sorted for a stable column order
    all_behaviors = set()
    for behavior_durations in behavior_durations_per_video.values():
        all_behaviors.update(behavior_durations.keys())
    all_behaviors = sorted(all_behaviors)

    # One record per video; behaviors absent from a video are filled with 0
    summary_data = [
        {'Video': video, **{behavior: behavior_durations.get(behavior, 0) for behavior in all_behaviors}}
        for video, behavior_durations in behavior_durations_per_video.items()
    ]

    # Passing the columns explicitly keeps the 'Video' column present even
    # when no CSV file was found, so the sort below cannot fail.
    summary_df = pd.DataFrame(summary_data, columns=['Video'] + all_behaviors)
    summary_df = summary_df.sort_values(by='Video')

    # Save the summary DataFrame to a CSV file inside the 'summary' folder
    summary_csv = os.path.join(summary_dir, 'behavior_durations_summary.csv')
    summary_df.to_csv(summary_csv, index=False)

    print(f"Video behavior durations saved to {summary_csv}")

# Build the total-duration summary CSV from the .csv files in csv_path.
createDurationscsv(csv_path)


Video behavior durations saved to /Volumes/avn006/behaviorScoring/202409_FoodDeprivation/female/selfgroom/summary/behavior_durations_summary.csv


Latency to behavior:
Create a CSV containing the latency for each behavior
If a behavior doesn't occur in a video, assumes a 30 minute video and sets the latency to 1800 s (max)
i.e. take the first start time for each behavior

In [4]:
def createLatenciescsv(directory, max_latency=1800):
    """Write a per-video summary of the latency to each behavior.

    Every ``*.csv`` file directly inside ``directory`` is read without a header
    row (column 0 = behavior name, column 2 = start time). The latency of a
    behavior in a video is the smallest parseable start time recorded for it.
    The result is written to
    ``<directory>/summary/behavior_latencies_summary.csv`` with the file names
    as the (sorted) row index and one alphabetically sorted column per
    behavior.

    Behavior names are lower-cased and every character other than ``a``-``z``
    is removed, so spelling variants of the same label are pooled. A file with
    no content is treated as a video in which no behavior occurred.

    Parameters
    ----------
    directory : str
        Folder containing the per-video CSV files.
    max_latency : number, optional
        Value assigned when a behavior has no valid start time in a video
        (default 1800).

    Returns
    -------
    None
        The summary is written to disk and its path is printed.
    """
    summary_dir = os.path.join(directory, 'summary')
    os.makedirs(summary_dir, exist_ok=True)

    behavior_latencies_per_video = {}
    all_behaviors = set()

    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        try:
            df = pd.read_csv(filepath, header=None)
        except pd.errors.EmptyDataError:
            # read_csv raises on a file with no content; keep the video so it
            # receives the fallback latency for every behavior.
            behavior_latencies_per_video[filename] = {}
            continue

        behaviors = [re.sub(r'[^a-z]', '', str(name).strip().lower()) for name in df[0]]
        start_times = pd.to_numeric(df[2], errors='coerce')

        behavior_latencies = {}
        for behavior, start_time in zip(behaviors, start_times):
            # A behavior counts as "seen" even if this row's time is unusable.
            all_behaviors.add(behavior)
            if pd.notnull(start_time):
                earliest = behavior_latencies.get(behavior)
                behavior_latencies[behavior] = (
                    start_time if earliest is None else min(earliest, start_time)
                )

        behavior_latencies_per_video[filename] = behavior_latencies

    # Build the table column by column with an explicit index so that videos
    # are kept even when no behavior was found in any file.
    videos = sorted(behavior_latencies_per_video)
    summary_df = pd.DataFrame(
        {
            behavior: [
                behavior_latencies_per_video[video].get(behavior, max_latency)
                for video in videos
            ]
            for behavior in sorted(all_behaviors)
        },
        index=videos,
    )

    summary_path = os.path.join(summary_dir, 'behavior_latencies_summary.csv')
    summary_df.to_csv(summary_path)

    print(f"Latency summary saved to: {summary_path}")

# Build the latency summary CSV from the .csv files in csv_path.
createLatenciescsv(csv_path)

Latency summary saved to: /Volumes/avn006/behaviorScoring/202409_FoodDeprivation/female/selfgroom/summary/behavior_latencies_summary.csv


Bouts of behavior
Create a CSV containing the total # of bouts or episodes of each behavior
Defines a bout as an episode of behavior that starts more than 2 seconds after the previous episode of that behavior ended (episodes separated by 2 seconds or less are combined into one bout)

In [5]:
def createBoutscsv(directory, min_gap=2):
    """Write a per-video summary of the number of bouts of each behavior.

    Every ``*.csv`` file directly inside ``directory`` is read without a header
    row (column 0 = behavior name, column 2 = start time, column 3 = end
    time). For each behavior the episodes are ordered by start time; a new
    bout is counted whenever an episode starts more than ``min_gap`` seconds
    after the previous episode of that behavior ended, otherwise the two are
    merged. The result is written to
    ``<directory>/summary/behavior_bouts_summary.csv`` with one row per file
    (``Video`` column holds the file name) and one column per behavior.

    Behavior names are lower-cased and every character other than ``a``-``z``
    is removed, so spelling variants of the same label are pooled. A file with
    no content yields a row of zeros rather than aborting the run.

    Parameters
    ----------
    directory : str
        Folder containing the per-video CSV files.
    min_gap : number, optional
        Gap in seconds that must be exceeded for two consecutive episodes to
        count as separate bouts (default 2).

    Returns
    -------
    None
        The summary is written to disk and its path is printed.
    """
    summary_dir = os.path.join(directory, 'summary')
    os.makedirs(summary_dir, exist_ok=True)

    behavior_bouts_per_video = {}
    all_behaviors = set()

    for filename in os.listdir(directory):
        if not filename.endswith(".csv"):
            continue

        filepath = os.path.join(directory, filename)
        try:
            df = pd.read_csv(filepath, header=None)
        except pd.errors.EmptyDataError:
            # read_csv raises on a file with no content; record the video with
            # no behaviors so it still appears (as zeros) in the summary.
            behavior_bouts_per_video[filename] = {}
            continue

        # Normalize behavior names and convert times
        df['behavior'] = df[0].astype(str).str.lower().str.replace(r'[^a-z]', '', regex=True)
        df['start_time'] = pd.to_numeric(df[2], errors='coerce')
        df['end_time'] = pd.to_numeric(df[3], errors='coerce')

        bout_counts = {}
        for behavior in df['behavior'].unique():
            episodes = df[df['behavior'] == behavior].sort_values(by='start_time')
            starts = episodes['start_time'].to_numpy()
            ends = episodes['end_time'].to_numpy()

            # Gap between each episode's start and the previous episode's end.
            # Comparisons against NaN are False, so unparseable times never
            # open a new bout.
            gaps = starts[1:] - ends[:-1]
            first_bout = 1 if len(episodes) else 0
            bout_counts[behavior] = first_bout + int((gaps > min_gap).sum())
            all_behaviors.add(behavior)

        behavior_bouts_per_video[filename] = bout_counts

    # Ensure every video has a value for every behavior (fill missing with 0)
    behavior_columns = sorted(all_behaviors)
    summary_data = [
        {'Video': video, **{behavior: bout_counts.get(behavior, 0) for behavior in behavior_columns}}
        for video, bout_counts in behavior_bouts_per_video.items()
    ]

    # Passing the columns explicitly keeps the 'Video' column present even
    # when no CSV file was found, so the sort below cannot fail.
    summary_df = pd.DataFrame(summary_data, columns=['Video'] + behavior_columns)
    summary_df = summary_df.sort_values(by='Video')
    summary_csv = os.path.join(summary_dir, 'behavior_bouts_summary.csv')
    summary_df.to_csv(summary_csv, index=False)
    print(f"Behavior bouts summary saved to {summary_csv}")

# Build the bout-count summary CSV from the .csv files in csv_path.
createBoutscsv(csv_path)

Behavior bouts summary saved to /Volumes/avn006/behaviorScoring/202409_FoodDeprivation/female/selfgroom/summary/behavior_bouts_summary.csv
