##Create Summary CSVs
#This notebook will create .csv files that contain summaries from multiple csvs of behavioral scoring data exported from ELAN software. 
#Input: Place all raw .csv files (i.e. generated for each video and exported using ELAN) in a single directory.
#The output will include a csv of cumulative time spent in a behavior over 5-minute bins in a 30-minute window for each unique behavior scored. It will also include a summary csv with the cumulative duration of each behavior for each video.

Create a separate CSV with cumulative duration in 5-minute time bins over a 30-minute session for each behavior

In [None]:
import pandas as pd
import numpy as np
import os
from tkinter import Tk, filedialog

# Step 1: Function to get a list of CSV files from a user-specified directory
def get_csv_files_from_directory(directory):
    """Return the names of the CSV files located directly inside *directory*.

    Only regular files whose name ends in '.csv' are returned; a sub-folder
    that happens to be named '*.csv' is skipped because it cannot be read as
    a table. The result contains bare file names (no directory part) in
    sorted order, so downstream output does not depend on the arbitrary
    order in which the operating system lists the folder.

    Args:
        directory: Path of the folder to scan (not searched recursively).

    Returns:
        A sorted list of CSV file names.

    Raises:
        ValueError: If *directory* is not an existing directory, or if it
            contains no CSV files.
    """
    # Ensure the directory exists
    if not os.path.isdir(directory):
        raise ValueError(f"The directory {directory} does not exist.")

    # Keep regular '.csv' files only, in a deterministic order
    csv_files = sorted(
        name
        for name in os.listdir(directory)
        if name.endswith('.csv') and os.path.isfile(os.path.join(directory, name))
    )

    # If no CSV files are found, raise an error
    if not csv_files:
        raise ValueError(f"No CSV files found in the directory {directory}.")

    return csv_files

# Use Tkinter to interactively select the directory
def select_directory():
    """Show a folder-picker dialog and return the chosen directory path.

    A hidden Tk root window is created only to host the dialog and is
    destroyed as soon as the dialog closes, so no stray window or Tk
    interpreter is left behind (which matters when this runs repeatedly in
    a notebook kernel).

    Returns:
        The path selected by the user.

    Raises:
        ValueError: If the dialog is cancelled without choosing a directory.
    """
    root = Tk()
    root.withdraw()  # Hide the main window
    try:
        directory = filedialog.askdirectory(title="Select Directory Containing CSV Files")
    finally:
        # Release the hidden root window whether or not the dialog succeeded
        root.destroy()
    if not directory:
        raise ValueError("No directory selected.")
    return directory

# Prompt for the folder that holds the exported CSV files
directory = select_directory()

# Collect the names of the CSV files inside that folder
csv_files = get_csv_files_from_directory(directory)

# Step 2: Make sure a "summary" sub-folder exists inside the input directory
summary_dir = os.path.join(directory, 'summary')
os.makedirs(summary_dir, exist_ok=True)

# Step 3: Per-behavior accumulator, filled while the files are processed
cumulative_data = dict()

# Function to split behavior across time bins
def split_behavior_duration(start_time, duration, bins):
    """Distribute one behavior bout over the time bins it overlaps.

    Args:
        start_time: Onset of the bout, in the same units as *bins*.
        duration: Length of the bout, in the same units as *bins*.
        bins: Ordered sequence of bin edges; consecutive edges delimit a bin.

    Returns:
        A list of ``(bin_index, overlap)`` tuples, one for every bin that
        shares a strictly positive amount of time with the bout. Any part of
        the bout that falls outside the outermost edges is not reported.
    """
    end_time = start_time + duration
    pieces = []

    # Walk over the bins as (lower edge, upper edge) pairs
    for idx, (lower, upper) in enumerate(zip(bins[:-1], bins[1:])):
        # Length of the intersection; zero or negative means "no overlap"
        shared = min(end_time, upper) - max(start_time, lower)
        if shared > 0:
            pieces.append((idx, shared))

    return pieces

# Step 3: Iterate through each CSV file

# The time bins are identical for every file, so build them once up front:
# 30 minutes (30*60=1800 seconds) split into 5-minute (300 second) intervals.
bins = np.arange(0, 1800 + 300, 300)  # Bins for 0-5, 5-10, ..., 25-30 minutes
bin_labels = [f'{int(b/60)}-{int(b/60 + 5)} min' for b in bins[:-1]]

for file in csv_files:
    # csv_files holds bare file names, so resolve each one against the selected
    # directory; reading the bare name only works when the current working
    # directory happens to be that folder.
    df = pd.read_csv(os.path.join(directory, file), header=None)

    # Extract the video name from the file name (without the .csv extension)
    video_name = os.path.splitext(file)[0]

    # Rows without a behavior label cannot be attributed to any behavior and
    # would otherwise surface as a NaN key further down; drop them.
    df = df.dropna(subset=[0]).copy()

    # Access columns by index (behavior, start_time, duration)
    df['behavior'] = df[0].astype(str).str.lower()  # Normalize behavior to lowercase
    df['start_time'] = pd.to_numeric(df[2], errors='coerce')  # 3rd column for start time
    df['duration'] = pd.to_numeric(df[4], errors='coerce')    # 5th column for duration

    # One zero-filled array of per-bin totals for every behavior seen in this file
    cumulative_times = {behavior: np.zeros(len(bin_labels)) for behavior in df['behavior'].unique()}

    # Step 4: Iterate through each row and split behavior durations into bins
    for behavior, start_time, duration in zip(df['behavior'], df['start_time'], df['duration']):
        # Skip rows whose start time or duration could not be parsed as a number
        if pd.isnull(start_time) or pd.isnull(duration):
            continue

        # Add the part of this bout that falls in each bin to that bin's total;
        # time outside the 0-30 minute window is not counted.
        for bin_index, bin_duration in split_behavior_duration(start_time, duration, bins):
            cumulative_times[behavior][bin_index] += bin_duration

    # Step 5: Create a DataFrame to store the cumulative times for this file
    # (one row per behavior, one column per time bin)
    cumulative_df = pd.DataFrame(cumulative_times, index=bin_labels).T

    # Step 6: Apply cumulative sum across time bins **for each behavior individually**
    cumulative_df = cumulative_df.cumsum(axis=1)

    # Step 7: Add the video name as the first column
    cumulative_df.insert(0, 'video_name', video_name)

    # Step 8: Append cumulative data for each behavior
    # (labels are already lowercase, so they can be used as keys directly)
    for behavior in cumulative_df.index:
        # Keep the row as a one-row DataFrame so the rows can be concatenated later
        cumulative_data.setdefault(behavior, []).append(cumulative_df.loc[[behavior]])

# Step 9: Write one CSV per behavior, combining data across all videos
for behavior, data_list in cumulative_data.items():
    # Concatenate the one-row frames collected for this behavior (one per video)
    behavior_df = pd.concat(data_list)

    # Behavior labels come straight from the scored data, so replace characters
    # that are not allowed in file names (path separators, wildcards, ...).
    safe_behavior = ''.join('_' if ch in '<>:"/\\|?*' else ch for ch in behavior)

    # Write into the "summary" folder created in Step 2 rather than the current
    # working directory. This also keeps the outputs out of the input folder,
    # so a later run does not pick them up as raw data.
    output_file = os.path.join(summary_dir, f'cumulative_{safe_behavior}_times.csv')

    # If the file already exists, append the rows without repeating the header;
    # otherwise the file is created and the header is written first.
    # NOTE: re-running on the same videos therefore adds duplicate rows.
    file_exists = os.path.exists(output_file)
    behavior_df.to_csv(output_file, mode='a', header=not file_exists, index=False)

    # Notify user that the file has been saved or appended
    print(f"Saved cumulative durations for {behavior} to {output_file}")


Create All Behaviors Duration Summary CSV

In [None]:
import os
import pandas as pd
import numpy as np

def process_csv_files(directory):
    """Write a per-video summary of the total duration of each behavior.

    Every '.csv' file directly inside *directory* is read as a header-less
    table (the same way the first cell of this notebook reads these files),
    with the behavior label in the first column and the duration in the
    fifth column. Durations are summed per behavior for each file, and the
    result is saved as 'video_behavior_durations.csv' in a 'summary'
    sub-folder of *directory*: one row per video, one column per behavior,
    0 where a behavior does not occur in a video.

    Rows with a missing behavior label or a non-numeric duration are
    ignored.

    Args:
        directory: Path of the folder containing the raw CSV files.

    Raises:
        ValueError: If *directory* is not an existing directory, or if it
            contains no CSV files.
    """
    # Validate first: os.makedirs below would otherwise silently create a
    # mistyped path and produce an empty summary.
    if not os.path.isdir(directory):
        raise ValueError(f"The directory {directory} does not exist.")

    behavior_durations_per_video = {}

    # Sorted so the row order of the summary is the same on every run
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)
        if not (filename.endswith(".csv") and os.path.isfile(filepath)):
            continue

        # header=None keeps the first annotation row as data instead of
        # consuming it as column names, and makes columns addressable as 0..4.
        df = pd.read_csv(filepath, header=None)

        # Assuming the columns are: [Behavior, Unknown, Start Time, End Time, Duration]
        behaviors = df[0]
        durations = pd.to_numeric(df[4], errors='coerce')

        # Total duration for each behavior in the current file, using only
        # rows that have both a label and a numeric duration
        usable = behaviors.notna() & durations.notna()
        totals = durations[usable].groupby(behaviors[usable], sort=False).sum()

        # Use the filename (without extension) as the key
        video = os.path.splitext(filename)[0]
        behavior_durations_per_video[video] = totals.to_dict()

    if not behavior_durations_per_video:
        raise ValueError(f"No CSV files found in the directory {directory}.")

    # Union of all behaviors across videos, sorted for a consistent column order
    all_behaviors = sorted(
        {behavior for totals in behavior_durations_per_video.values() for behavior in totals}
    )

    # One row per video; fill with 0 where a behavior is not present
    summary_data = [
        {'Video': video, **{behavior: totals.get(behavior, 0) for behavior in all_behaviors}}
        for video, totals in behavior_durations_per_video.items()
    ]
    summary_df = pd.DataFrame(summary_data)

    # Save the summary DataFrame to a CSV file inside the 'summary' folder
    summary_dir = os.path.join(directory, 'summary')
    os.makedirs(summary_dir, exist_ok=True)
    summary_csv = os.path.join(summary_dir, 'video_behavior_durations.csv')
    summary_df.to_csv(summary_csv, index=False)

    print(f"Video behavior durations saved to {summary_csv}")

# Ask the user for the input directory containing the CSV files.
# Paths pasted from a file manager or terminal often carry stray whitespace or
# surrounding quotes, which would make the directory lookup fail; trim them.
directory_path = input("Enter the path to the directory containing CSV files: ").strip().strip('"\'')

process_csv_files(directory_path)


ModuleNotFoundError: No module named 'pandas'