# Notebook 1: Extract Z-scored LFP

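Extracts band-pass filtered, notch-filtered, downsampled and z-scored LFP traces for each mapped brain region from the SpikeGadgets recordings, then loads the SLEAP pose tracks for the matching videos and rescales them from pixels to cm.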

In [1]:
# Imports of all used packages and libraries
import os
import glob
import sys


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
import spikeinterface.extractors as se
import spikeinterface.preprocessing as sp

In [4]:
import h5py
from scipy.interpolate import interp1d
from scipy.signal import savgol_filter
import matplotlib.cm as cm
import itertools

In [5]:
# setting path
sys.path.append('../../src')

In [6]:
import trodes.read_exported

## Inputs & Data

Explanation of each input and where it comes from.

In [7]:
# Inputs and required data loading.
# Input variable names are in all-caps snake case.
# Whenever an input is changed or used for processing,
# the derived variables are in lower snake case.
# NOTE(review): OUTPUT_DIR is assigned again further down in the inputs (as "./proc", without
# the trailing slash); the later assignment is the one in effect for the rest of the notebook.
OUTPUT_DIR = r"./proc/" # where data is saved should always be shown in the inputs
# Create the output folder up front; exist_ok avoids an error when it already exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [8]:
# Per-subject table of channel numbers per brain region (eib_* and spike_interface_* columns).
CHANNEL_MAPPING_DF = pd.read_excel("../../data/channel_mapping.xlsx")
# Tone timestamp table; the first spreadsheet column is used as the index.
TONE_TIMESTAMP_DF = pd.read_excel("../../data/rce_tone_timestamp.xlsx", index_col=0)

In [9]:
# Presumably the name of the digital-input export that carries tone events — TODO confirm.
TONE_DIN = "dio_ECU_Din1"
# Presumably the digital-input state value that marks a tone — TODO confirm.
TONE_STATE = 1

In [10]:
# Sampling rate of the raw recording (presumably Hz — TODO confirm).
EPHYS_SAMPLING_RATE = 20000
# Target rate the LFP is resampled to (passed to sp.resample as resample_rate).
LFP_SAMPLING_RATE = 1000
# Presumably seconds per trial — TODO confirm units.
TRIAL_DURATION = 10
# Presumably video frames per second — TODO confirm. Assigned again with the same value below.
FRAME_RATE = 22
# Stream identifiers passed to se.read_spikegadgets.
ECU_STREAM_ID = "ECU"
TRODES_STREAM_ID = "trodes"
# Band-pass limits passed to sp.bandpass_filter (freq_min / freq_max).
LFP_FREQ_MIN = 0.5
LFP_FREQ_MAX = 300
# Frequency removed by sp.notch_filter.
ELECTRIC_NOISE_FREQ = 60
# Glob pattern used to find recording files inside each session folder.
RECORDING_EXTENTION = "*.rec"

In [11]:
# List of every path ending in .rec under the 2023_06_18 omission data folder.
# NOTE(review): absolute, machine-specific path; consider deriving it from a configurable data root.
ALL_SESSION_DIR = glob.glob("/scratch/back_up/reward_competition_extention/data/omission/2023_06_18/*.rec")

In [12]:
# One row per video file with its frame range and the subjects it contains.
VIDEO_TO_FRAME_AND_SUBJECT_DF = pd.read_excel("../../data/video_to_frame_and_subject.xlsx")
SLEAP_DIR = "/scratch/back_up/reward_competition_extention/proc/id_corrected"
# NOTE(review): OUTPUT_DIR was already set to "./proc/" (and created) earlier in the inputs;
# this reassignment drops the trailing slash. Consider keeping a single definition.
OUTPUT_DIR = r"./proc" # where data is saved should always be shown in the inputs
# Reference width and height used for the pixel-to-cm ratios
# (presumably the physical box size in cm — TODO confirm).
MED_PC_WIDTH = 29.5
MED_PC_HEIGHT = 24
# NOTE(review): duplicate of the FRAME_RATE defined with the ephys constants (same value).
FRAME_RATE = 22

## Outputs

Describe each output that the notebook creates. 

- Is it a plot or is it data?

- How valuable is the output and why is it valuable or useful?

# Functions

In [13]:
def compute_sorted_index(group, value_column='Value', index_column='SortedIndex'):
    """
    Rank each row's value among the distinct values of its group.

    The distinct values of `value_column` are sorted in ascending order and every
    row receives the 0-based position of its value in that sorted list, so rows
    with equal values share the same index.

    Parameters:
    - group (pd.DataFrame): A group of data. It is modified in place.
    - value_column (str): Name of the column containing the values to be sorted.
    - index_column (str): Name of the new column that will contain the indices.

    Returns:
    - pd.DataFrame: The same `group` object with the additional index column.
    """
    distinct_sorted = sorted(set(group[value_column].tolist()))
    # Build the value -> position lookup once instead of rescanning the sorted
    # list with list.index() for every row.
    position_of = {value: position for position, value in enumerate(distinct_sorted)}
    group[index_column] = group[value_column].apply(lambda value: position_of[value])
    return group

In [14]:
def find_closest_index(sorted_list=None, target=0):
    """
    Returns the index of the number in the sorted list that is closest to the target.

    This function performs a binary search on a sorted list to find the closest number to
    a given target. If the target exists in the list, its index is returned. If not, the
    function returns the index of the number that is closest to the target. When the target
    is exactly halfway between two neighbours, the lower index is returned.

    Parameters:
    - sorted_list (sequence of int or float): Numbers sorted in ascending order.
    - target (int or float): The target number to find the closest value to.

    Returns:
    - int: The index of the closest number in the sorted list to the target.
           If the sorted list is None or empty, returns None.

    Example:
    >>> sorted_nums = [1, 3, 5, 8, 10, 15, 18, 20, 24, 27, 30]
    >>> find_closest_index(sorted_nums, 6)
    2

    Note:
    The list should be sorted in ascending order.
    """
    # len() is used instead of truthiness so numpy arrays are accepted as well.
    if sorted_list is None or len(sorted_list) == 0:
        return None
    # Targets outside the covered range clamp to the first / last element.
    if target <= sorted_list[0]:
        return 0
    if target >= sorted_list[-1]:
        return len(sorted_list) - 1

    # Binary search
    left, right = 0, len(sorted_list) - 1
    while left <= right:
        mid = (left + right) // 2

        if sorted_list[mid] == target:
            return mid
        elif sorted_list[mid] < target:
            left = mid + 1
        else:
            right = mid - 1

    # After the search, right == left - 1 and the target lies strictly between
    # sorted_list[right] and sorted_list[left]; return whichever is closer.
    if abs(sorted_list[left] - target) < abs(sorted_list[right] - target):
        return left
    else:
        return right

In [15]:
def get_sleap_tracks_from_h5(filename):
    """
    Load the 'tracks' dataset of a SLEAP h5 file and return it transposed.

    Convenient as the callable for ``Series.apply`` on a column of file paths.

    Parameters:
    ----------
    filename : str
        Path to the SLEAP h5 file containing pose tracking data.

    Returns:
    -------
    np.ndarray
        The full 'tracks' dataset with its axes reversed (``.T``).

    Example:
    --------
    df['tracks'] = df['filename_column'].apply(get_sleap_tracks_from_h5)

    """
    with h5py.File(filename, "r") as sleap_file:
        # [:] reads the whole dataset into memory before the file is closed
        tracks = sleap_file["tracks"][:]
        return tracks.T

In [16]:
def get_sleap_track_names_from_h5(filename):
    """
    Read the 'track_names' dataset of a SLEAP h5 file as a list of strings.

    Convenient as the callable for ``Series.apply`` on a column of file paths.

    Parameters:
    ----------
    filename : str
        Path to the SLEAP h5 file containing pose tracking data.

    Returns:
    -------
    list of str
        One UTF-8 decoded name per entry of the 'track_names' dataset.

    Example:
    --------
    df['track_names'] = df['filename_column'].apply(get_sleap_track_names_from_h5)

    """
    with h5py.File(filename, "r") as sleap_file:
        raw_names = sleap_file["track_names"][:]
        decoded_names = []
        for raw_name in raw_names:
            # each entry is converted to raw bytes first, then decoded
            decoded_names.append(raw_name.tobytes().decode('utf-8'))
        return decoded_names


In [17]:
def get_node_names_from_sleap(filename):
    """
    Read the 'node_names' dataset of a SLEAP h5 file.

    Parameters:
    - filename (str): Path to the SLEAP h5 file.

    Returns:
    - list of str: The decoded node names, in dataset order.
    """
    with h5py.File(filename, "r") as sleap_file:
        node_names = []
        for raw_name in sleap_file["node_names"][:]:
            node_names.append(raw_name.decode())
        return node_names

In [18]:

def fill_missing(Y, kind="linear"):
    """
    Fill NaNs independently along the first axis of every trailing-dimension slice.

    Interior gaps are interpolated with `scipy.interpolate.interp1d` using `kind`;
    leading and trailing NaNs are then filled with the nearest valid value.
    Slices with no valid sample are left as NaN, and slices with a single valid
    sample are filled with that value (previously both cases raised).

    Parameters:
    - Y (numpy.ndarray): Array whose first axis is the one to interpolate along.
      NOTE: when the internal reshape returns a view, Y is modified in place.
    - kind (str): Interpolation kind forwarded to `interp1d`. Default "linear".

    Returns:
    - numpy.ndarray: Array of the same shape as the input with gaps filled.
    """

    # Store initial shape.
    initial_shape = Y.shape

    # Flatten after first dim.
    Y = Y.reshape((initial_shape[0], -1))

    # Interpolate along each slice.
    for i in range(Y.shape[-1]):
        y = Y[:, i]

        missing = np.isnan(y)
        valid_idx = np.flatnonzero(~missing)

        # Nothing to interpolate from (all NaN) or nothing to fill (no NaN).
        if valid_idx.size == 0 or valid_idx.size == y.size:
            continue

        # interp1d needs at least two support points; with a single valid
        # sample the nearest-value fill below does the whole job.
        if valid_idx.size > 1:
            f = interp1d(valid_idx, y[valid_idx], kind=kind, fill_value=np.nan, bounds_error=False)
            missing_idx = np.flatnonzero(missing)
            y[missing_idx] = f(missing_idx)

        # Fill leading or trailing NaNs with the nearest non-NaN values
        mask = np.isnan(y)
        y[mask] = np.interp(np.flatnonzero(mask), np.flatnonzero(~mask), y[~mask])

        # Save slice
        Y[:, i] = y

    # Restore to initial shape.
    Y = Y.reshape(initial_shape)

    return Y

In [19]:
def compute_velocity(node_loc, window_size=25, polynomial_order=3):
    """
    Speed of a tracked node per frame, from Savitzky-Golay smoothed derivatives.

    Each coordinate column is passed through a Savitzky-Golay filter with
    ``deriv=1`` and the per-frame magnitude of the resulting vector is returned.

    Parameters:
    ----------
    node_loc : numpy.ndarray
        Node locations of shape [frames, 2]; each row holds the x and y
        coordinates for one frame.

    window_size : int, optional
        Window length of the Savitzky-Golay filter, i.e. the number of
        consecutive samples used for each local fit. Default is 25.

    polynomial_order : int, optional
        Order of the polynomial fitted inside each window. Default is 3.

    Returns:
    -------
    numpy.ndarray
        One value per frame: the norm of the first-derivative vector.

    """
    # Same shape and dtype as the input; filled one coordinate at a time
    derivative = np.zeros_like(node_loc)

    n_coordinates = node_loc.shape[-1]
    for coordinate in range(n_coordinates):
        derivative[:, coordinate] = savgol_filter(
            node_loc[:, coordinate],
            window_length=window_size,
            polyorder=polynomial_order,
            deriv=1,
        )

    # Magnitude of the derivative vector for each frame
    return np.linalg.norm(derivative, axis=1)

In [20]:
def extract_sleap_data(filename):
    """
    Read coordinates, node names and track names from a SLEAP h5 file.

    Parameters:
    - filename (str): Path to the SLEAP file.

    Returns:
    - dict with the keys:
        * "location" (numpy.ndarray): The 'tracks' dataset, transposed.
        * "node_names" (list of str): Decoded entries of 'node_names'.
        * "track_names" (list of str): Decoded entries of 'track_names'.

    Example:
    >>> sleap_data = extract_sleap_data("path/to/sleap/file.h5")
    >>> location = sleap_data["location"]
    """
    with h5py.File(filename, "r") as sleap_file:
        location = sleap_file["tracks"][:].T
        node_names = [raw.decode() for raw in sleap_file["node_names"][:]]
        track_names = [raw.decode() for raw in sleap_file["track_names"][:]]

    return {
        "location": location,
        "node_names": node_names,
        "track_names": track_names,
    }

In [21]:
def rescale_dimension_in_array(arr, dimension=0, ratio=1):
    """
    Multiply one slice of the third axis of an array by a scaling factor, in place.

    Parameters:
    - arr (numpy.ndarray): A 3D numpy array; it is modified in place.
    - dimension (int, default=0): Index along the third axis to rescale.
                                  In many contexts 0 is the x-coordinate and
                                  1 is the y-coordinate.
    - ratio (float, default=1): The scaling factor to be applied.

    Returns:
    - numpy.ndarray: The same `arr` object after rescaling. Pass a copy if the
                     original values must be kept.
    """
    # Basic indexing returns a view, so the in-place multiply writes into `arr`
    selected = arr[:, :, dimension]
    selected *= ratio
    return arr

In [22]:
def rolling_average(arr, window_size):
    """
    Moving average of a 1D array over fully overlapping windows only.

    Parameters:
        arr (numpy.array): The input array to compute the rolling average for.
        window_size (int): The size of the rolling window.

    Returns:
        numpy.array: The rolling averages ('valid' convolution, so the result
        is shorter than the input by ``window_size - 1``).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if not window_size >= 1:
        raise ValueError("Window size must be at least 1.")

    # Uniform kernel whose weights sum to one
    kernel = np.ones(window_size) / window_size

    averaged = np.convolve(arr, kernel, mode='valid')
    return averaged



In [23]:
def chunked_average(arr, chunk_size):
    """
    Mean of consecutive, non-overlapping chunks of an array.

    Trailing elements that do not fill a complete chunk are ignored.

    Parameters:
        arr (numpy.array): The input array.
        chunk_size (int): The size of each chunk.

    Returns:
        numpy.array: One average per complete chunk.
    """
    complete_chunks = len(arr) // chunk_size
    usable_length = complete_chunks * chunk_size

    # One row per chunk, then average across each row
    chunk_matrix = arr[:usable_length].reshape(complete_chunks, chunk_size)
    return chunk_matrix.mean(axis=1)

In [24]:
def sliding_window_average(arr, window_size, step=1):
    """
    Average a 1D numpy array over sliding windows of a fixed size.

    :param arr: Input 1D numpy array.
    :param window_size: Size of the window.
    :param step: Number of elements the window advances between averages. Default is 1.
    :return: A 1D numpy array holding the mean of each complete window.
    """
    # Only windows that fit entirely inside the array are counted
    window_count = (arr.size - window_size) // step + 1

    averages = np.zeros(window_count)

    for window_number in range(window_count):
        first = window_number * step
        averages[window_number] = np.mean(arr[first:first + window_size])

    return averages

In [25]:
def calculate_all_window_indices(original_index, window_size, step, array_length):
    """
    Calculate all the start and stop indices for sliding windows based on an original start index.

    Windows start at `original_index` and advance by `step`; a window is kept while
    its stop index does not exceed `original_index + array_length`.

    :param original_index: The original index from which the first window should start.
    :param window_size: The size of each sliding window.
    :param step: The step size or number of elements to slide the window by. Must be positive.
    :param array_length: The span, counted from `original_index`, that the windows may cover.
    :return: A list of tuples, each containing the start and stop indices for a sliding window.
             Empty when not even the first window fits.
    :raises ValueError: If at least one window fits and `step` is not positive
                        (the loop would otherwise never terminate).
    """
    # Last admissible stop index for any window
    last_stop = original_index + array_length

    # No window fits: nothing to generate, whatever the step is
    if original_index + window_size > last_stop:
        return []

    if step <= 0:
        raise ValueError("step must be positive, got {}".format(step))

    windows = []
    current_start_index = original_index

    while current_start_index + window_size <= last_stop:
        windows.append((current_start_index, current_start_index + window_size))
        current_start_index += step

    return windows

## Processing

Describe what is done to the data here and how inputs are manipulated to generate outputs. 

In [26]:
# As much code and as many cells as required
# includes EDA and playing with data
# GO HAM!

# Ideally functions are defined here first and then data is processed using the functions

# function names are short and in snake case all lowercase
# a function name should be unique but does not have to describe the function
# doc strings describe functions not function names




# Electrophysiology

## Getting timestamps for each spikegadgets sample

In [27]:
# Maps each session name to whatever organize_all_trodes_export returns for that session folder
session_to_dir = {}
for session_path in ALL_SESSION_DIR:
    try:
        # Session name = last path component without its extension
        session_basename = os.path.basename(os.path.splitext(session_path)[0])
        print("Current Session: {}".format(session_basename))

        session_to_dir[session_basename] = trodes.read_exported.organize_all_trodes_export(session_path)
    except Exception as error:
        # Best effort: report the problem and continue with the next session
        print(error)

Current Session: 20230618_100646_standard_comp_to_omission_D2_subj_2-4_and_2-1
Current Session: 20230618_100636_standard_comp_to_omission_D2_subj_1-4_and_1-1
Skipping file 20230618_100636_standard_comp_to_omission_D2_subj_1_4_t4b3L_box1_merged.timestampoffset.txt due to error: Settings format not supported


  return np.dtype(dtype_spec)


Skipping file 20230618_100636_standard_comp_to_omission_D2_subj_1_1_t1b2L_box2_merged.timestampoffset.txt due to error: Settings format not supported


In [29]:
# Build one record per (session, recording) holding the raw sample timestamps.
session_to_recording_to_timestamps = []
# NOTE(review): the loop variable `dir` shadows the builtin of the same name for the rest of the notebook.
for dir, rec_dict in session_to_dir.items():
    for rec_file, value in rec_dict.items():
        # Take the first field of every timestamp record.
        # NOTE(review): this is a per-element Python loop over the whole recording; if "data" is a
        # structured array, selecting the field directly would avoid it — confirm the layout first.
        voltage_timestamp_array = np.array([lst[0] for lst in np.array(value["raw"]["timestamps"]["data"])])
        # Sample index relative to the first timestamp of this recording.
        voltage_index_array = voltage_timestamp_array - voltage_timestamp_array[0]
        
        session_to_recording_to_timestamps.append({"recording_session": dir, "ephys_name": rec_file, "voltage_timestamps": voltage_timestamp_array, "voltage_indexes": voltage_index_array})

In [30]:
# One row per recording; columns come from the keys of the record dictionaries.
recording_sessions_df = pd.DataFrame(session_to_recording_to_timestamps)

In [31]:
recording_sessions_df

Unnamed: 0,recording_session,ephys_name,voltage_timestamps,voltage_indexes
0,20230618_100636_standard_comp_to_omission_D2_s...,20230618_100636_standard_comp_to_omission_D2_s...,"[835680, 835681, 835682, 835683, 835684, 83568...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."
1,20230618_100636_standard_comp_to_omission_D2_s...,20230618_100636_standard_comp_to_omission_D2_s...,"[835680, 835681, 835682, 835683, 835684, 83568...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,..."


In [35]:
recording_sessions_df["ephys_name"].iloc[0]

'20230618_100636_standard_comp_to_omission_D2_subj_1_1_t1b2L_box2_merged'

In [38]:
def _subject_id_from_ephys_name(ephys_name):
    """Parse the subject ID out of a recording name, e.g. '..._subj_1_1_t1b2L_...' -> '1.1'."""
    # Text after the last "subj", without surrounding underscores
    after_subj = ephys_name.split("subj")[-1].strip("_")
    # Keep only what precedes the first "t"
    before_t = after_subj.split("t")[0].strip("_")
    # Subject IDs use "." as separator
    return before_t.replace("-", ".").replace("_", ".")


recording_sessions_df["subject_id"] = recording_sessions_df["ephys_name"].apply(_subject_id_from_ephys_name)

In [39]:
recording_sessions_df["subject_id"] 

0    1.1
1    1.4
Name: subject_id, dtype: object

In [40]:
session_to_recording_to_timestamps

[{'recording_session': '20230618_100636_standard_comp_to_omission_D2_subj_1-4_and_1-1',
  'ephys_name': '20230618_100636_standard_comp_to_omission_D2_subj_1_1_t1b2L_box2_merged',
  'voltage_timestamps': array([  835680,   835681,   835682, ..., 66439619, 66439620, 66439621],
        dtype=uint32),
  'voltage_indexes': array([       0,        1,        2, ..., 65603939, 65603940, 65603941],
        dtype=uint32)},
 {'recording_session': '20230618_100636_standard_comp_to_omission_D2_subj_1-4_and_1-1',
  'ephys_name': '20230618_100636_standard_comp_to_omission_D2_subj_1_4_t4b3L_box1_merged',
  'voltage_timestamps': array([  835680,   835681,   835682, ..., 69429290, 69429291, 69429292],
        dtype=uint32),
  'voltage_indexes': array([       0,        1,        2, ..., 68593610, 68593611, 68593612],
        dtype=uint32)}]

In [41]:
recording_sessions_df["ephys_name"].iloc[0]

'20230618_100636_standard_comp_to_omission_D2_subj_1_1_t1b2L_box2_merged'

- Adding the ephys channels

In [42]:
CHANNEL_MAPPING_DF

Unnamed: 0,Cohort,Subject,eib_mPFC,eib_vHPC,eib_BLA,eib_LH,eib_MD,spike_interface_mPFC,spike_interface_vHPC,spike_interface_BLA,spike_interface_LH,spike_interface_MD
0,1,6.1,,15,14,13,31,21.0,15.0,14.0,13.0,16.0
1,1,6.2,,15,14,13,31,,,,,
2,1,6.3,,15,14,13,31,,,,,
3,1,6.4,,15,14,13,31,,,,,
4,2,1.1,,16,17,18,19,5.0,31.0,30.0,29.0,28.0
5,2,1.2,,31,30,29,28,10.0,31.0,30.0,29.0,28.0
6,2,1.3,,15,14,13,12,9.0,31.0,30.0,29.0,28.0
7,2,1.4,,15,14,13,12,15.0,31.0,30.0,29.0,28.0


In [43]:
# Remove every column whose name contains "eib"; the remaining channel columns are the spike_interface_* ones
eib_columns = [name for name in CHANNEL_MAPPING_DF.columns if "eib" in name]
CHANNEL_MAPPING_DF = CHANNEL_MAPPING_DF.drop(columns=eib_columns, errors="ignore")

- Adding the brain-region-to-channel mapping for each subject

In [44]:
# Subject IDs are converted to strings so they can be matched against the string
# "subject_id" parsed from the recording names.
CHANNEL_MAPPING_DF["Subject"] = CHANNEL_MAPPING_DF["Subject"].astype(str)

In [45]:
# Attach the channel mapping of each subject. The default (inner) join keeps only
# recordings whose subject_id has a row in the channel mapping.
recording_sessions_df = recording_sessions_df.merge(
    CHANNEL_MAPPING_DF,
    left_on="subject_id",
    right_on="Subject",
)

In [46]:
recording_sessions_df

Unnamed: 0,recording_session,ephys_name,voltage_timestamps,voltage_indexes,subject_id,Cohort,Subject,spike_interface_mPFC,spike_interface_vHPC,spike_interface_BLA,spike_interface_LH,spike_interface_MD
0,20230618_100636_standard_comp_to_omission_D2_s...,20230618_100636_standard_comp_to_omission_D2_s...,"[835680, 835681, 835682, 835683, 835684, 83568...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1.1,2,1.1,5.0,31.0,30.0,29.0,28.0
1,20230618_100636_standard_comp_to_omission_D2_s...,20230618_100636_standard_comp_to_omission_D2_s...,"[835680, 835681, 835682, 835683, 835684, 83568...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1.4,2,1.4,15.0,31.0,30.0,29.0,28.0


In [47]:
recording_sessions_df["ephys_name"].iloc[0]

'20230618_100636_standard_comp_to_omission_D2_subj_1_1_t1b2L_box2_merged'

In [48]:
recording_sessions_df["voltage_timestamps"].iloc[0].shape

(65603942,)

# Extracting the LFP

In [49]:
# Maps each recording basename to its preprocessed (lazy) SpikeInterface recording.
recording_name_to_all_ch_lfp = {}
# Going through all the recording sessions 
for session_dir in ALL_SESSION_DIR:
    # Going through all the recordings in each session
    for recording_path in glob.glob(os.path.join(session_dir, RECORDING_EXTENTION)):
        try:
            recording_basename = os.path.splitext(os.path.basename(recording_path))[0]
            # Checking that the recording has an ECU stream: this read raises if it does not,
            # which skips the recording. Its result is discarded by the next line.
            current_recording = se.read_spikegadgets(recording_path, stream_id=ECU_STREAM_ID)
            # The neural data itself comes from the trodes stream; recordings without it
            # raise here and are skipped as well.
            current_recording = se.read_spikegadgets(recording_path, stream_id=TRODES_STREAM_ID)
            print(recording_basename)
            # Preprocessing the LFP: band-pass, notch out line noise, resample, then z-score
            current_recording = sp.bandpass_filter(current_recording, freq_min=LFP_FREQ_MIN, freq_max=LFP_FREQ_MAX)
            current_recording = sp.notch_filter(current_recording, freq=ELECTRIC_NOISE_FREQ)
            current_recording = sp.resample(current_recording, resample_rate=LFP_SAMPLING_RATE)
            current_recording = sp.zscore(current_recording)
            recording_name_to_all_ch_lfp[recording_basename] = current_recording
        except Exception as error:
            # Best effort: report the failure and continue with the next recording
            print("An exception occurred:", error) # e.g. "stream_id trodes is not in ['ECU']"




An exception occurred: stream_id trodes is not in ['ECU']
20230618_100636_standard_comp_to_omission_D2_subj_1_4_t4b3L_box1_merged
20230618_100636_standard_comp_to_omission_D2_subj_1_1_t1b2L_box2_merged
An exception occurred: stream_id trodes is not in ['ECU']


In [50]:
# Look up the preprocessed recording of each row by its ephys_name;
# names missing from the dictionary map to NaN.
recording_sessions_df["all_ch_lfp"] = recording_sessions_df["ephys_name"].map(recording_name_to_all_ch_lfp)

In [51]:
recording_sessions_df

Unnamed: 0,recording_session,ephys_name,voltage_timestamps,voltage_indexes,subject_id,Cohort,Subject,spike_interface_mPFC,spike_interface_vHPC,spike_interface_BLA,spike_interface_LH,spike_interface_MD,all_ch_lfp
0,20230618_100636_standard_comp_to_omission_D2_s...,20230618_100636_standard_comp_to_omission_D2_s...,"[835680, 835681, 835682, 835683, 835684, 83568...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1.1,2,1.1,5.0,31.0,30.0,29.0,28.0,ZScoreRecording: 32 channels - 1.0kHz - 1 segm...
1,20230618_100636_standard_comp_to_omission_D2_s...,20230618_100636_standard_comp_to_omission_D2_s...,"[835680, 835681, 835682, 835683, 835684, 83568...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",1.4,2,1.4,15.0,31.0,30.0,29.0,28.0,ZScoreRecording: 32 channels - 1.0kHz - 1 segm...


- Getting the LFP for each brain region

In [None]:
# Columns holding the SpikeInterface channel of each brain region
region_columns = [name for name in recording_sessions_df.columns if "spike_interface" in name]

In [None]:
region_columns

In [None]:
for col in region_columns:
    # Channel numbers are converted to integer strings before being used as channel ids
    recording_sessions_df[col] = recording_sessions_df[col].astype(int).astype(str)
    # Region name is the last "_"-separated token, e.g. "spike_interface_mPFC" -> "mPFC"
    region = col.split("_")[-1]
    print(region)
    # Pull the single-channel trace of this region out of each row's recording
    recording_sessions_df["{}_lfp_trace".format(region)] = recording_sessions_df.apply(
        lambda row: row["all_ch_lfp"].get_traces(channel_ids=[row[col]]).T[0],
        axis=1,
    )

In [None]:
# The per-region traces have been extracted, so the recording objects are no longer needed.
# errors="ignore" keeps the cell re-runnable after the column is gone.
recording_sessions_df = recording_sessions_df.drop(columns=["all_ch_lfp"], errors="ignore")

In [None]:
# Names of the per-region LFP trace columns
trace_columns = [name for name in recording_sessions_df.columns if "trace" in name]

In [None]:
# Number of raw samples per LFP sample (20 with the current sampling rates)
lfp_decimation = EPHYS_SAMPLING_RATE // LFP_SAMPLING_RATE
# Raw-sample index of every LFP sample: exactly one index per sample of the trace.
# (The previous arange(0, 20 * n + 1, 20) produced n + 1 indexes, one more than the
# trace length and than the matching lfp_timestamps.)
recording_sessions_df["lfp_indexes"] = recording_sessions_df[trace_columns[0]].apply(
    lambda trace: np.arange(trace.shape[0]) * lfp_decimation
)

In [None]:
recording_sessions_df["lfp_indexes"].iloc[0]

In [None]:
recording_sessions_df["voltage_indexes"].iloc[0]

In [None]:
recording_sessions_df

- Getting the timestamps of each LFP sample

In [None]:
# Number of raw samples per LFP sample, derived from the configured rates instead of a literal 20
lfp_decimation = EPHYS_SAMPLING_RATE // LFP_SAMPLING_RATE
# Timestamp of every LFP sample: every lfp_decimation-th raw timestamp, one per trace sample
recording_sessions_df["lfp_timestamps"] = recording_sessions_df.apply(
    lambda row: row["voltage_timestamps"][0:lfp_decimation * row["mPFC_lfp_trace"].shape[0]:lfp_decimation],
    axis=1,
)

In [None]:
recording_sessions_df

# Get the video frames

# Reading in the h5 files between recordings

In [52]:
# Keep only videos that have a start frame. `subset` is given as a list because a bare
# string is only accepted by newer pandas versions, and the result is copied so that the
# column assignments in the following cells act on an independent frame.
VIDEO_TO_FRAME_AND_SUBJECT_DF = VIDEO_TO_FRAME_AND_SUBJECT_DF.dropna(subset=["start_frame"]).copy()

In [53]:
VIDEO_TO_FRAME_AND_SUBJECT_DF.head()

Unnamed: 0,file_path,start_frame,stop_frame,individual_subj,all_subj
1,/scratch/back_up/reward_competition_extention/...,1.0,25000.0,6.3,6.1_6.3
2,/scratch/back_up/reward_competition_extention/...,27500.0,73601.0,6.1_6.3,6.1_6.3
3,/scratch/back_up/reward_competition_extention/...,51500.0,76455.0,6.3,6.1_6.3
4,/scratch/back_up/reward_competition_extention/...,1.0,48500.0,6.1_6.3,6.1_6.3
5,/scratch/back_up/reward_competition_extention/...,41000.0,79051.0,1.1,1.1_1.2


In [54]:
# Frame numbers are used for indexing later on, so both columns are cast from float to int
VIDEO_TO_FRAME_AND_SUBJECT_DF = VIDEO_TO_FRAME_AND_SUBJECT_DF.astype(
    {"start_frame": int, "stop_frame": int}
)

In [None]:
# File name (without directories) of every pose-tracking file
video_basenames = VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].apply(os.path.basename)
# video_name keeps the first two "."-separated parts of the file name, recording_name only the first
VIDEO_TO_FRAME_AND_SUBJECT_DF["video_name"] = video_basenames.apply(lambda name: ".".join(name.split(".")[:2]))
VIDEO_TO_FRAME_AND_SUBJECT_DF["recording_name"] = video_basenames.apply(lambda name: name.split(".")[0])

In [None]:
# Read location, node names and track names of every file in one pass
VIDEO_TO_FRAME_AND_SUBJECT_DF["all_sleap_data"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].apply(extract_sleap_data)


In [None]:
# Pull the coordinate array out of each SLEAP data dictionary
VIDEO_TO_FRAME_AND_SUBJECT_DF["location"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["all_sleap_data"].apply(
    lambda sleap_data: sleap_data["location"]
)

In [None]:
# Pull the list of track names out of each SLEAP data dictionary
VIDEO_TO_FRAME_AND_SUBJECT_DF["track_names"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["all_sleap_data"].apply(
    lambda sleap_data: sleap_data["track_names"]
)

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF.head()

In [None]:
# Coordinates of all the body parts for all the animals over the entire recording.
# NOTE(review): "location" and "track_names" were already filled from "all_sleap_data";
# this cell re-reads every file and overwrites both columns.
VIDEO_TO_FRAME_AND_SUBJECT_DF["location"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].apply(get_sleap_tracks_from_h5)
# Track names, which correspond to the animal ids
VIDEO_TO_FRAME_AND_SUBJECT_DF["track_names"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].apply(get_sleap_track_names_from_h5)

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["location"].iloc[0].shape

In [None]:
# Making sure all the subject IDs are strings instead of floating point numbers,
# because they are split on "_" and matched against the track names later on
VIDEO_TO_FRAME_AND_SUBJECT_DF["individual_subj"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["individual_subj"].astype(str)

In [None]:
# For every subject listed in "individual_subj", its position in the row's track list
VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_to_index"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: {
        subject: row["track_names"].index(subject)
        for subject in row["individual_subj"].split("_")
    },
    axis=1,
)

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_to_index"]

In [None]:
# NOTE(review): this column is overwritten by the next cell, which rebuilds
# "subject_to_tracks" from "subject_to_index" and "location"; this assignment has no lasting effect.
VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_to_tracks"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_to_index"].copy()#.apply(lambda x: {k:v for k, v in x["subject_to_index"].items()}, axis=1)

In [None]:
# Slice each subject's own track out of the last axis of the location array
VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_to_tracks"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: {
        subject: row["location"][:, :, :, track_index]
        for subject, track_index in row["subject_to_index"].items()
    },
    axis=1,
)

In [None]:
# Turn each {subject: tracks} dictionary into a list of (subject, tracks) pairs so it can be exploded
VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_and_tracks_list"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_to_tracks"].apply(
    lambda tracks_by_subject: [pair for pair in tracks_by_subject.items()]
)

In [None]:
# One row per (video, subject) pair, with a fresh 0..n-1 index
VIDEO_TO_FRAME_AND_SUBJECT_DF = VIDEO_TO_FRAME_AND_SUBJECT_DF.explode(
    ["subject_and_tracks_list"],
    ignore_index=True,
)

In [None]:
# Split every (subject, tracks) pair into its own two columns, aligned on the current index
subject_track_pairs = VIDEO_TO_FRAME_AND_SUBJECT_DF["subject_and_tracks_list"].tolist()
VIDEO_TO_FRAME_AND_SUBJECT_DF[['subject_id', "full-recording_subject_location_all-frames_original"]] = pd.DataFrame(
    subject_track_pairs,
    index=VIDEO_TO_FRAME_AND_SUBJECT_DF.index,
)

In [None]:
# Every subject of the video other than the row's own subject
VIDEO_TO_FRAME_AND_SUBJECT_DF["agent_id"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: [
        candidate
        for candidate in row["individual_subj"].split("_")
        if row["subject_id"] != candidate
    ],
    axis=1,
)

In [None]:
# Keep the first other subject as the agent; an empty string marks rows without one
VIDEO_TO_FRAME_AND_SUBJECT_DF["agent_id"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["agent_id"].apply(
    lambda other_subjects: other_subjects[0] if other_subjects else ""
)

In [None]:
# Tracks of the agent, or NaN when the row has no agent in its subject_to_tracks dictionary
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_agent_location_all-frames_original"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: row["subject_to_tracks"].get(row["agent_id"], np.nan),
    axis=1,
)

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF.head()

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF.head()

In [None]:
# The subject and agent tracks now live in their own columns, so the dictionary column is dropped.
# NOTE(review): no errors="ignore" here, so re-running this cell raises once the column is gone.
VIDEO_TO_FRAME_AND_SUBJECT_DF = VIDEO_TO_FRAME_AND_SUBJECT_DF.drop(columns=["subject_to_tracks"])

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF.head()

## Getting the coordinates of the corners

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].iloc[0]

In [None]:
# Each corner file is in the same folder and has the same basename as the pose tracking file:
# everything before the first "fixed" in the path, followed by ".fixed.corner.h5"
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_path"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].apply(
    lambda pose_path: "{}.fixed.corner.h5".format(pose_path.split("fixed")[0].strip("."))
)

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_path"].iloc[0]

In [None]:
# Node names of each corner file; their order gives the index of every corner
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_parts"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_path"].apply(get_node_names_from_sleap)

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_parts"]

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["video_name"].iloc[11]

In [None]:
# Raw tracks array of every corner file
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_path"].apply(get_sleap_tracks_from_h5)

In [None]:
# Replace the raw array by a {corner name: coordinates} dictionary, taking the
# slice at each corner's position along the second axis
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: {
        corner_name: row["corner_to_coordinate"][:, corner_index, :, :]
        for corner_index, corner_name in enumerate(row["corner_parts"])
    },
    axis=1,
)

In [None]:
# Only one frame is labeled, so every other entry is NaN; keep just the non-NaN values
# (boolean masking flattens each corner's array to 1D)
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: {
        corner_name: coordinates[~np.isnan(coordinates)]
        for corner_name, coordinates in row["corner_to_coordinate"].items()
    },
    axis=1,
)

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"].iloc[0]

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"].iloc[5]

# Getting the distances between corners

- Getting the average width and height so that we can convert pixels to cm

In [None]:
# Width of the box in pixels along its bottom and top edges (difference of x-coordinates, element 0)
VIDEO_TO_FRAME_AND_SUBJECT_DF["bottom_width"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"].apply(
    lambda corners: corners["box_bottom_right"][0] - corners["box_bottom_left"][0]
)
VIDEO_TO_FRAME_AND_SUBJECT_DF["top_width"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"].apply(
    lambda corners: corners["box_top_right"][0] - corners["box_top_left"][0]
)


In [None]:
# Height of the box in pixels along its right and left edges (difference of y-coordinates, element 1)
VIDEO_TO_FRAME_AND_SUBJECT_DF["right_height"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"].apply(
    lambda corners: corners["box_bottom_right"][1] - corners["box_top_right"][1]
)
VIDEO_TO_FRAME_AND_SUBJECT_DF["left_height"] = VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"].apply(
    lambda corners: corners["box_bottom_left"][1] - corners["box_top_left"][1]
)


In [None]:
# Mean of the two opposite sides, computed column-wise
VIDEO_TO_FRAME_AND_SUBJECT_DF["average_height"] = (
    VIDEO_TO_FRAME_AND_SUBJECT_DF["right_height"] + VIDEO_TO_FRAME_AND_SUBJECT_DF["left_height"]
) / 2
VIDEO_TO_FRAME_AND_SUBJECT_DF["average_width"] = (
    VIDEO_TO_FRAME_AND_SUBJECT_DF["bottom_width"] + VIDEO_TO_FRAME_AND_SUBJECT_DF["top_width"]
) / 2

- Getting the pixel to cm ratio

In [None]:
# Conversion factors: reference box size divided by the box size measured in the video
# (presumably cm per pixel — TODO confirm the units of MED_PC_WIDTH / MED_PC_HEIGHT).
VIDEO_TO_FRAME_AND_SUBJECT_DF["width_ratio"] = MED_PC_WIDTH / VIDEO_TO_FRAME_AND_SUBJECT_DF["average_width"]
VIDEO_TO_FRAME_AND_SUBJECT_DF["height_ratio"] = MED_PC_HEIGHT / VIDEO_TO_FRAME_AND_SUBJECT_DF["average_height"]

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["height_ratio"]

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["width_ratio"]

## Converting Pixels to cm

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_original"][0].shape

- Converting the X-dimension

In [None]:
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_original"]

In [None]:
# rescale_dimension_in_array multiplies in place, so a copy is passed to keep the
# "_original" arrays untouched. dimension=0 scales the x-coordinates by the width ratio.
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_rescaled"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: rescale_dimension_in_array(
        row["full-recording_subject_location_all-frames_original"].copy(),
        dimension=0,
        ratio=row["width_ratio"],
    ),
    axis=1,
)

# Rows without an agent hold NaN instead of an array. The type is tested rather than
# `is not np.nan`: that identity check misses any NaN that is not the np.nan object
# itself, and .copy() would then be called on a float.
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_agent_location_all-frames_rescaled"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: rescale_dimension_in_array(
        row["full-recording_agent_location_all-frames_original"].copy(),
        dimension=0,
        ratio=row["width_ratio"],
    )
    if isinstance(row["full-recording_agent_location_all-frames_original"], np.ndarray)
    else np.nan,
    axis=1,
)

- Converting the Y-dimension

In [None]:
def _rescale_agent_y(location_array, height_ratio):
    """Rescale dimension 1 of an agent location array, passing missing values through.

    Missing agent data is stored as a float NaN. It is detected by value because
    ``x is not np.nan`` only recognises the ``np.nan`` singleton and would let
    any other NaN object through to ``.copy()``.
    """
    if isinstance(location_array, (float, np.floating)) and np.isnan(location_array):
        return np.nan
    return rescale_dimension_in_array(location_array.copy(), dimension=1, ratio=height_ratio)


# This cell reads and overwrites the "_rescaled" columns, so running it a
# second time applies the height ratio twice.
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_rescaled"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: rescale_dimension_in_array(
        row["full-recording_subject_location_all-frames_rescaled"].copy(),
        dimension=1,
        ratio=row["height_ratio"],
    ),
    axis=1,
)

# The missing-value test is done on the value that is actually rescaled.
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_agent_location_all-frames_rescaled"] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
    lambda row: _rescale_agent_y(
        row["full-recording_agent_location_all-frames_rescaled"], row["height_ratio"]
    ),
    axis=1,
)

In [None]:
# Display the original arrays again, for comparison with the rescaled column shown next.
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_original"]

In [None]:
# Display the subject location arrays after both dimensions were rescaled.
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_rescaled"]

In [None]:
# Display the per-video dictionaries of box-corner coordinates.
VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"]

In [None]:
# Expand the dictionary column into one column per dictionary key.
normalized = pd.json_normalize(VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_to_coordinate"].tolist())
# pd.concat(axis=1) pairs rows by index label. Give the expanded frame the
# same index as the source frame so rows stay matched even when the source
# index is not the default 0..n-1 range.
normalized.index = VIDEO_TO_FRAME_AND_SUBJECT_DF.index

# Drop the original column and concat the normalized DataFrame.
# Not re-runnable: the second run fails because the column is gone.
VIDEO_TO_FRAME_AND_SUBJECT_DF = pd.concat(
    [VIDEO_TO_FRAME_AND_SUBJECT_DF.drop(["corner_to_coordinate"], axis=1), normalized],
    axis=1,
)


In [None]:
def _scale_corner(row, corner_name):
    """Return the [x, y] corner of ``row`` scaled by that row's width and height ratios."""
    x_value, y_value = row[corner_name][0], row[corner_name][1]
    return [x_value * row["width_ratio"], y_value * row["height_ratio"]]


# The corner names are taken from the first row's "corner_parts" entry.
# The corner columns are overwritten in place, so re-running this cell scales them again.
for corner in VIDEO_TO_FRAME_AND_SUBJECT_DF["corner_parts"].iloc[0]:
    VIDEO_TO_FRAME_AND_SUBJECT_DF[corner] = VIDEO_TO_FRAME_AND_SUBJECT_DF.apply(
        _scale_corner, axis=1, args=(corner,)
    )

## Looking over the tracks

In [None]:
# Positional row (used with .iloc below) of the video whose tracks are inspected.
FILE_INDEX = 2

In [None]:
# Path of the tracking file selected by FILE_INDEX.
VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].iloc[FILE_INDEX]

In [None]:
# Open the selected tracking file read-only to list its datasets and node names.
# The location array itself comes from the dataframe, not from the file.
selected_file_path = VIDEO_TO_FRAME_AND_SUBJECT_DF["file_path"].iloc[FILE_INDEX]
with h5py.File(selected_file_path, "r") as f:
    dset_names = list(f.keys())
    location = VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_rescaled"].iloc[FILE_INDEX]
    node_names = [raw_name.decode() for raw_name in f["node_names"][:]]

# Each section is a header, its content, then a blank line.
print("===HDF5 datasets===", dset_names, "", sep="\n")

print("===location data shape===", location.shape, "", sep="\n")

print("===nodes===")
for i, name in enumerate(node_names):
    print("{}: {}".format(i, name))
print()

In [None]:
# Select the entry at THORAX_INDEX along axis 1, keeping everything on axes 0 and 2.
# THORAX_INDEX is defined outside this section of the notebook.
thorax_loc = location[:, THORAX_INDEX, :]

In [None]:
fig, ax = plt.subplots()

# Column 0 is drawn as X; column 1 is drawn negated so that the two traces
# do not lie on top of each other.
ax.plot(thorax_loc[:, 0], label="X-coordinates")
ax.plot(-1 * thorax_loc[:, 1], label="Y-coordinates")

ax.legend(loc="center right")
ax.set_title("Thorax location")
ax.set_xlabel("Time in frames")
ax.set_ylabel("Coordinate Position")

In [None]:
# Path of the thorax in the X/Y plane on a square figure.
plt.figure(figsize=(7, 7))
track_ax = plt.gca()
track_ax.plot(thorax_loc[:, 0], thorax_loc[:, 1])

track_ax.set_title("Thorax tracks")
track_ax.set_xlabel("X-Coordinates")
track_ax.set_ylabel("Y-Coordinates")


In [None]:
# Shape of the rescaled subject location array in the first row (positional).
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_rescaled"].iloc[0].shape

In [None]:
def _fill_missing_if_present(location_array):
    """Apply ``fill_missing`` to an agent location array, passing missing values through.

    Rows without agent data hold a float NaN. It is detected by value because
    ``x is not np.nan`` only recognises the ``np.nan`` singleton and would hand
    any other NaN object to ``fill_missing``.
    """
    if isinstance(location_array, (float, np.floating)) and np.isnan(location_array):
        return np.nan
    return fill_missing(location_array)


# Replace the rescaled location arrays with the output of fill_missing.
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_rescaled"] = VIDEO_TO_FRAME_AND_SUBJECT_DF[
    "full-recording_subject_location_all-frames_rescaled"
].apply(fill_missing)
VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_agent_location_all-frames_rescaled"] = VIDEO_TO_FRAME_AND_SUBJECT_DF[
    "full-recording_agent_location_all-frames_rescaled"
].apply(_fill_missing_if_present)

In [None]:
# Re-read the location array for FILE_INDEX, now that the column has been
# replaced by the fill_missing output.
location = VIDEO_TO_FRAME_AND_SUBJECT_DF["full-recording_subject_location_all-frames_rescaled"].iloc[FILE_INDEX]


In [None]:
# Same selection as before (THORAX_INDEX along axis 1), taken from the re-read array.
thorax_loc = location[:, THORAX_INDEX, :]

In [None]:
fig, ax = plt.subplots()

# Same plot as before, drawn from the re-read location array.
# Column 1 is negated so that the two traces do not lie on top of each other.
ax.plot(thorax_loc[:, 0], label="X-coordinates")
ax.plot(-1 * thorax_loc[:, 1], label="Y-coordinates")

ax.legend(loc="center right")
ax.set_title("Thorax location")
ax.set_xlabel("Time in frames")
ax.set_ylabel("Coordinate Position")

In [None]:
# Path of the thorax in the X/Y plane, drawn from the re-read location array.
plt.figure(figsize=(7, 7))
track_ax = plt.gca()
track_ax.plot(thorax_loc[:, 0], thorax_loc[:, 1])

track_ax.set_title("Thorax tracks")
track_ax.set_xlabel("X-Coordinates")
track_ax.set_ylabel("Y-Coordinates")


# OLD Code below

In [None]:
# Presumably a deliberate stop: raising here halts "Run All" before the
# cells under the "OLD Code below" heading are executed.
raise ValueError()

- Filtering for all trials that we got the LFP for

In [None]:
# Keep only the trials whose recording_file is a key of recording_name_to_all_ch_lfp.
has_lfp = all_trials_df["recording_file"].isin(recording_name_to_all_ch_lfp.keys())
all_trials_df = all_trials_df.loc[has_lfp].reset_index(drop=True)

In [None]:
# Preview the filtered trials.
all_trials_df.head()

- Adding trial numbers based on timestamp ordering for each recording

In [None]:
def _number_trials_within_recording(recording_trials):
    """Call compute_sorted_index on one recording's trials, ordering by 'time' into 'trial_number'."""
    return compute_sorted_index(recording_trials, value_column='time', index_column='trial_number')


# The numbering is done independently for every recording_file group.
all_trials_df = (
    all_trials_df
    .groupby('recording_file')
    .apply(_number_trials_within_recording)
    .reset_index(drop=True)
)

In [None]:
# Shift the trial numbers up by one (presumably from 0-based to 1-based - confirm
# against compute_sorted_index). Re-running this cell shifts them again.
all_trials_df["trial_number"] += 1

## Adding the LFP trace information

In [None]:
# Left join: every trial is kept and receives the channel mapping of its
# current_subject (matched against the "Subject" column of the mapping table).
channel_map_and_all_trials_df = pd.merge(
    all_trials_df,
    CHANNEL_MAPPING_DF,
    how="left",
    left_on="current_subject",
    right_on="Subject",
)

In [None]:
# Remove every column whose name contains "eib".
eib_columns = [
    column_name
    for column_name in channel_map_and_all_trials_df.columns
    if "eib" in column_name
]
channel_map_and_all_trials_df = channel_map_and_all_trials_df.drop(columns=eib_columns, errors="ignore")

In [None]:
# Remove the merge key that came from CHANNEL_MAPPING_DF; errors="ignore"
# keeps the cell from failing when the column is already gone.
channel_map_and_all_trials_df = channel_map_and_all_trials_df.drop(columns=["Subject"], errors="ignore")

In [None]:
# Preview the trials merged with the channel mapping.
channel_map_and_all_trials_df.head()

In [None]:
# Save the trial metadata as CSV under OUTPUT_DIR (the declared output
# location) instead of a hard-coded "./proc/" path.
channel_map_and_all_trials_df.to_csv(os.path.join(OUTPUT_DIR, "trial_metadata.csv"))

In [None]:
# Save the same table as a pickle under OUTPUT_DIR (the declared output
# location) instead of a hard-coded "./proc/" path.
channel_map_and_all_trials_df.to_pickle(os.path.join(OUTPUT_DIR, "trial_metadata.pkl"))

In [None]:
# List the columns of the merged table.
channel_map_and_all_trials_df.columns

- Linking up all LFP calculations with all the trials

In [None]:
# Look up, for every trial, the entry of recording_name_to_all_ch_lfp that
# belongs to its recording_file; recordings without an entry become NaN.
channel_map_and_all_trials_df["all_ch_lfp"] = channel_map_and_all_trials_df["recording_file"].map(recording_name_to_all_ch_lfp)

- Creating a new row for each brain region

In [None]:
# Columns of the channel mapping whose name contains "spike_interface".
brain_region_col = [
    column_name
    for column_name in CHANNEL_MAPPING_DF.columns
    if "spike_interface" in column_name
]

In [None]:
# Every column of the merged table that is not one of the brain-region channel columns.
id_cols = [
    column_name
    for column_name in channel_map_and_all_trials_df.columns
    if column_name not in brain_region_col
]

In [None]:
# Display the brain-region channel column names.
brain_region_col

In [None]:
# Turn the channel numbers into integer-formatted strings (e.g. 5.0 -> "5").
# These values are later passed as channel_ids to get_traces.
for col in brain_region_col:
    channel_numbers = channel_map_and_all_trials_df[col].astype(int)
    channel_map_and_all_trials_df[col] = channel_numbers.astype(str)

In [None]:
# List the columns of the merged table.
channel_map_and_all_trials_df.columns

In [None]:
def _extract_channel_trace(row, channel_column, range_column):
    """Return the trace of one channel of ``row["all_ch_lfp"]`` over one frame range.

    ``row[channel_column]`` is the channel id and ``row[range_column]`` holds the
    (start_frame, end_frame) pair. ``get_traces`` is called for that single
    channel and the first row of the transposed result is returned.
    """
    start_frame, end_frame = row[range_column][0], row[range_column][1]
    return row["all_ch_lfp"].get_traces(
        channel_ids=[row[channel_column]], start_frame=start_frame, end_frame=end_frame
    ).T[0]


for col in brain_region_col:
    print(col)
    # Remove the literal "spike_interface" marker from the column name.
    # str.strip("spike_interface") treats its argument as a set of characters
    # and would also eat matching letters at either end of the region name.
    region_name = col.replace("spike_interface", "").strip("_")

    channel_map_and_all_trials_df["{}_baseline_lfp_trace".format(region_name)] = channel_map_and_all_trials_df.apply(
        _extract_channel_trace, axis=1, args=(col, "baseline_lfp_timestamp_range")
    )

    channel_map_and_all_trials_df["{}_trial_lfp_trace".format(region_name)] = channel_map_and_all_trials_df.apply(
        _extract_channel_trace, axis=1, args=(col, "trial_lfp_timestamp_range")
    )


In [None]:
# Remove the helper column before saving; the traces have already been extracted from it.
channel_map_and_all_trials_df = channel_map_and_all_trials_df.drop(columns=["all_ch_lfp"], errors="ignore")

In [None]:
# Save the table with the baseline and trial traces under OUTPUT_DIR (the
# declared output location) instead of a hard-coded "./proc/" path.
channel_map_and_all_trials_df.to_pickle(os.path.join(OUTPUT_DIR, "full_baseline_and_trial_lfp_traces.pkl"))

In [None]:
# Preview the table that was saved.
channel_map_and_all_trials_df.head()

In [None]:
# List the columns of the table that was saved.
channel_map_and_all_trials_df.columns

In [None]:
# Scratch check: difference of two hard-coded values divided by 20000 (the
# value of EPHYS_SAMPLING_RATE) - presumably a duration in seconds.
# NOTE(review): the origin of the two numbers is not shown here.
(83177118-3478533)/20000

# LOOP 2: Extracting the timestamps for the raw ephys recording

In [None]:
# For every session, collect the state changes of all DIO channels whose name
# contains "in", across all recordings, into one dataframe per session.
session_to_din_state_df = {}
for session, file_to_data in session_to_dir.items():
    all_recording_din_state_df = []
    for recording_name, recording_data in file_to_data.items():
        print(recording_name)
        current_recording_din_state_df = []

        try:
            voltage_timestamp_array = recording_data["raw"]["timestamps"]["data"]
            for key, dio_channel in recording_data["DIO"].items():
                if "in" not in key:
                    continue
                print(key)
                din_state_array = dio_channel["data"]
                current_din_state_df = pd.DataFrame(din_state_array)
                current_din_state_df["recording_dir"] = session
                current_din_state_df["recording_file"] = recording_name
                current_din_state_df["din"] = key
                current_recording_din_state_df.append(current_din_state_df)
                if key == TONE_DIN:
                    # Visual check of the tone channel: state against timestamp.
                    plt.plot([tup[0] for tup in din_state_array], [tup[1] for tup in din_state_array])
                    plt.xlabel("Timestamp")
                    plt.ylabel("State")
                    plt.title("Din State Change against Timestamps for {} in {}".format(key, recording_name))
                    plt.show()
                    plt.close()
            concatted_per_recording_din_state_df = (
                pd.concat(current_recording_din_state_df)
                .sort_values(by=["recording_file", "din"])
                .reset_index(drop=True)
            )
            # Event time relative to the first timestamp of the raw recording.
            concatted_per_recording_din_state_df["time_stamp_index"] = (
                concatted_per_recording_din_state_df["time"] - voltage_timestamp_array[0][0]
            )
            all_recording_din_state_df.append(concatted_per_recording_din_state_df)
        except Exception as e:
            # Best effort: a recording that cannot be processed is skipped, but
            # the message now says which recording failed and why.
            print("Skipping recording {}: {!r}".format(recording_name, e))

    # pd.concat raises on an empty list; skip sessions where every recording failed.
    if not all_recording_din_state_df:
        print("No DIN data extracted for session {}".format(session))
        continue
    concatted_all_recording_din_state_df = pd.concat(all_recording_din_state_df)
    session_to_din_state_df[session] = concatted_all_recording_din_state_df

In [None]:
# Shape of the timestamp array left over from the last recording processed in the loop above.
voltage_timestamp_array.shape

In [None]:
# Scratch check: a hard-coded value floor-divided by 20000 (the value of
# EPHYS_SAMPLING_RATE). NOTE(review): the origin of the number is not shown here.
79698586//20000

In [None]:
# DIN state dataframe left over from the last recording processed in the loop above.
concatted_per_recording_din_state_df

In [None]:
# List of per-recording dataframes left over from the last session processed in the loop above.
all_recording_din_state_df

# LOOP 3: Adding the video timestamps

In [None]:
# Sessions for which a DIN state dataframe was built.
session_to_din_state_df.keys()

In [None]:
def _pad_with_nan(values, length):
    """Right-pad ``values`` with NaN up to ``length``.

    Arrays that already have the target length are returned unchanged. Shorter
    arrays are converted to float first, because NaN cannot be stored in an
    integer array.
    """
    missing = length - len(values)
    if missing == 0:
        return values
    return np.pad(np.asarray(values, dtype=float), (0, missing), mode='constant', constant_values=np.nan)


session_to_din_with_frames_df = {}
for session_path in all_session_files:
    try:
        session_basename = os.path.splitext(os.path.basename(session_path))[0]
        print("Current Session: {}".format(session_basename))
        file_to_video_timestamps = {}
        for video_timestamps in glob.glob(os.path.join(session_path, "*cameraHWSync")):
            video_basename = os.path.basename(video_timestamps)
            print("Current Video Name: {}".format(video_basename))
            timestamp_array = trodes.read_exported.read_trodes_extracted_data_file(video_timestamps)["data"]["PosTimestamp"]
            file_to_video_timestamps[video_basename] = timestamp_array
            # One new column per video file, holding the find_closest_index
            # result of every DIN event time against that video's timestamps.
            din_state_df = session_to_din_state_df[session_basename]
            din_state_df[video_basename] = din_state_df["time"].apply(
                lambda x: find_closest_index(sorted_list=timestamp_array, target=x)
            )

        # Without any timestamp file there is nothing to pad or save.
        if not file_to_video_timestamps:
            print("No *cameraHWSync files found in {}".format(session_path))
            continue

        # Find the maximum length of the arrays in the dictionary
        max_length = max(map(len, file_to_video_timestamps.values()))

        # Pad each array with NaN values to make them all the same length
        padded_data = {k: _pad_with_nan(v, max_length) for k, v in file_to_video_timestamps.items()}

        # Convert the padded data to a dataframe
        session_to_din_with_frames_df[session_basename] = pd.DataFrame(padded_data)
        session_to_din_with_frames_df[session_basename].to_csv(os.path.join(OUTPUT_DIR, "{}.frame_to_timestamps.csv".format(session_basename)))
    except Exception as e:
        # Best effort: a failing session is skipped, but the message now says
        # which session failed and why.
        print("Skipping session {}: {!r}".format(session_path, e))

session_to_din_state_df['20230612_101430_standard_comp_to_training_D1_subj_1-4_and_1-3'].head()

session_to_din_with_frames_df['20230612_101430_standard_comp_to_training_D1_subj_1-4_and_1-3'].head()

# LOOP 4: Combining the video columns

In [None]:
def _video_number(video_file):
    """Return the last dot-separated field that precedes the "videoTimeStamps.cameraHWSync" suffix.

    The suffix is removed as a whole. ``str.strip("videoTimeStamps.cameraHWSync")``
    treats its argument as a set of characters and can remove parts of the
    file name that merely consist of those characters.
    """
    suffix = "videoTimeStamps.cameraHWSync"
    stem = video_file[: -len(suffix)] if video_file.endswith(suffix) else video_file
    return stem.rstrip(".").split(".")[-1]


def _subject_info(recording_file):
    """Return the text after the last "subject", without a trailing "merged" and without edge underscores.

    The "merged" suffix is removed as a whole. ``str.strip("merged")`` removes
    any of the characters m, e, r, g, d from both ends instead.
    """
    info = recording_file.split("subject")[-1].strip("_")
    if info.endswith("merged"):
        info = info[: -len("merged")]
    return info.strip("_")


session_to_tone_stamp_df = {}
for session, timestamps_df in session_to_din_state_df.items():
    # Keep only the rows of the tone channel that are in TONE_STATE.
    is_tone_event = (timestamps_df["din"] == TONE_DIN) & (timestamps_df["state"] == TONE_STATE)
    current_timestamps_df = timestamps_df[is_tone_event].reset_index(drop=True)
    camera_col = [col for col in current_timestamps_df.columns if "cameraHWSync" in col]
    id_col = [col for col in current_timestamps_df.columns if "cameraHWSync" not in col]

    # One row per (tone event, video file) instead of one column per video file.
    current_timestamps_df = current_timestamps_df.melt(id_vars=id_col, value_vars=camera_col, var_name='video_file', value_name='video_frame')
    current_timestamps_df["video_number"] = current_timestamps_df["video_file"].apply(_video_number)
    current_timestamps_df["subject_info"] = current_timestamps_df["recording_file"].apply(_subject_info)
    # Empty column, created as NaN for every row.
    current_timestamps_df["condition"] = np.nan
    session_to_tone_stamp_df[session] = current_timestamps_df

In [None]:
# `session` is the loop variable left over from the loop above, so this shows
# the tone dataframe of the last session processed.
session_to_tone_stamp_df[session]

In [None]:
# Stack the per-session tone dataframes into one table with a fresh 0..n-1 index.
concatenated_tone_stamp_df = pd.concat(session_to_tone_stamp_df.values(), ignore_index=True)

In [None]:
# Preview the combined tone table.
concatenated_tone_stamp_df.head()

In [None]:
# Display the combined tone table.
concatenated_tone_stamp_df

In [None]:
# Save the combined tone table under OUTPUT_DIR.
# NOTE(review): OUTPUT_PREFIX is not among the inputs declared at the top of
# the notebook - confirm it is defined before this cell runs.
concatenated_tone_stamp_df.to_csv(os.path.join(OUTPUT_DIR, "{}_tone_timestamp.csv".format(OUTPUT_PREFIX)))

In [None]:
# Scratch list of 0/1 values that is displayed but not assigned to a name.
# NOTE(review): its meaning is not stated here - document or remove.
[1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1]

# Reformatting Dataframe

- Dropping all rows that have not been labeled

In [None]:
# Keep only the rows whose "condition" has been filled in. The result is a
# new frame, so TONE_TIMESTAMP_DF itself stays untouched.
labeled_trials_df = TONE_TIMESTAMP_DF.dropna(subset=["condition"])
all_trials_df = labeled_trials_df.reset_index(drop=True)

In [None]:
# Sorted list of the distinct recording directories among the labelled trials.
sorted(all_trials_df["recording_dir"].unique())

- Making the video frame number usable

In [None]:
# Cast the frame numbers to int so they can be used in the frame-range
# arithmetic further down; the cast fails if any video_frame is missing.
all_trials_df["video_frame"] = all_trials_df["video_frame"].astype(int)

- Getting the name of the video so that we can sync it up with the ephys recording

In [None]:
# Recover the video name by removing the timestamp-file suffix as a whole.
# str.strip(".videoTimeStamps.cameraHWSync") treats its argument as a set of
# characters and can also remove matching characters from either end of the
# video name itself.
video_timestamp_suffix = ".videoTimeStamps.cameraHWSync"
all_trials_df["video_name"] = all_trials_df["video_file"].apply(
    lambda video_file: video_file[: -len(video_timestamp_suffix)]
    if video_file.endswith(video_timestamp_suffix)
    else video_file
)

- Getting all subject IDs for a given recording

In [None]:
def _subject_section(recording_dir):
    """Return the text the subject ids are parsed from.

    Names containing "2023" are used as they are; for any other name the last
    five underscore-separated fields are re-joined and prefixed with "subj".
    """
    if "2023" in recording_dir:
        return recording_dir
    return "subj" + "_".join(recording_dir.split("_")[-5:])


def _parse_subject_ids(subject_text):
    """Return the sorted tuple of subject ids found after the last "subj".

    Example: "..._subj_1-4_and_1-3" -> ("1.3", "1.4").
    """
    id_section = subject_text.replace("-", "_").split("subj")[-1].strip("_")
    subject_ids = [piece.strip("_").replace("_", ".") for piece in id_section.split("and")]
    return tuple(sorted(subject_ids))


# Different id extractions are used for the different file-name formats.
all_trials_df["all_subjects"] = (
    all_trials_df["recording_dir"].apply(_subject_section).apply(_parse_subject_ids)
)

In [None]:
# Distinct subject-id tuples, as a check of the parsing above.
all_trials_df["all_subjects"].unique()

In [None]:
def _current_subject_id(subject_info):
    """Join the first two '-' or '_' separated fields with a dot, e.g. "1-4_..." -> "1.4"."""
    leading_fields = subject_info.replace("-", "_").split("_")[:2]
    return ".".join(leading_fields)


all_trials_df["current_subject"] = all_trials_df["subject_info"].apply(_current_subject_id).astype(str)

In [None]:
# Distinct ids of the recorded subject, as a check of the parsing above.
all_trials_df["current_subject"].unique()

- Converting the trial label to win or lose based on who won the trial

In [None]:
def _trial_outcome(row):
    """Translate the "condition" label into the outcome for the current subject.

    Returns "win" when the label equals the current subject, "lose" when it
    names another subject of the recording, and the unchanged label otherwise.
    Surrounding whitespace is removed once and used for both comparisons;
    previously only the "win" comparison ignored it.
    """
    condition_label = str(row["condition"]).strip()
    if condition_label == str(row["current_subject"]):
        return "win"
    if condition_label in row["all_subjects"]:
        return "lose"
    return row["condition"]


all_trials_df["trial_outcome"] = all_trials_df.apply(_trial_outcome, axis=1)

In [None]:
# Distinct outcome values, as a check of the conversion above.
all_trials_df["trial_outcome"].unique()

- Adding the competition closeness as a column

In [None]:
def _closeness_category(label):
    """Map a raw closeness label to "non_comp", "comp" or NaN.

    Labels whose text contains "only" (case-insensitive) become "non_comp",
    any other string becomes "comp", and non-string values become NaN.
    """
    if "only" in str(label).lower():
        return "non_comp"
    if type(label) is str:
        return "comp"
    return np.nan


competition_closeness_map = {
    label: _closeness_category(label)
    for label in all_trials_df["competition_closeness"].unique()
}

In [None]:
# Display the raw-label to category mapping.
competition_closeness_map

In [None]:
# Replace the raw labels by their category; the column is overwritten in place.
all_trials_df["competition_closeness"] = all_trials_df["competition_closeness"].map(competition_closeness_map)

In [None]:
def _join_outcome_and_closeness(row):
    """Join trial outcome and closeness category with "_", leaving out missing parts.

    Missing values are dropped before joining. The earlier
    ``.strip("nan")`` removed the characters 'n' and 'a' from both ends of the
    joined text, which could also shorten a label that starts or ends with
    one of those letters.
    """
    parts = []
    for value in (row["trial_outcome"], row["competition_closeness"]):
        if pd.isna(value) or str(value) == "nan":
            continue
        parts.append(str(value))
    return "_".join(parts).strip("_")


all_trials_df["competition_closeness"] = all_trials_df.apply(_join_outcome_and_closeness, axis=1)

In [None]:
# Distinct combined outcome/closeness labels.
all_trials_df["competition_closeness"].unique()

- Adding the LFP index

In [None]:
# Number of ephys samples per LFP sample (20000 / 1000 with the declared inputs).
ephys_samples_per_lfp_sample = EPHYS_SAMPLING_RATE / LFP_SAMPLING_RATE
# Floor division turns the ephys sample index into the matching LFP sample index.
all_trials_df["lfp_index"] = (
    all_trials_df["time_stamp_index"] // ephys_samples_per_lfp_sample
).astype(int)

In [None]:
# Store the event time as an integer.
all_trials_df["time"] = all_trials_df["time"].astype(int)

In [None]:
# Store the sample index as an integer; it is used in the range arithmetic below.
all_trials_df["time_stamp_index"] = all_trials_df["time_stamp_index"].astype(int)

- Removing unnecessary columns

In [None]:
# Remove columns that are no longer needed; errors="ignore" skips names that
# are absent. NOTE(review): "Unnamed: 13" is presumably a stray spreadsheet
# column - confirm.
all_trials_df = all_trials_df.drop(columns=["state", "din", "condition", "Unnamed: 13"], errors="ignore")

In [None]:
# Preview the trials after the clean-up.
all_trials_df.head()

In [None]:
# Number of non-missing values per column for every outcome/closeness label.
all_trials_df.groupby(["competition_closeness"]).count()

- Making columns of the different timestamps

In [None]:
# (start, end) in LFP samples of the TRIAL_DURATION seconds that end at the event.
lfp_samples_per_window = TRIAL_DURATION * LFP_SAMPLING_RATE
all_trials_df["baseline_lfp_timestamp_range"] = all_trials_df["lfp_index"].apply(
    lambda event_index: (event_index - lfp_samples_per_window, event_index)
)

In [None]:
# (start, end) in LFP samples of the TRIAL_DURATION seconds that start at the event.
lfp_samples_per_window = TRIAL_DURATION * LFP_SAMPLING_RATE
all_trials_df["trial_lfp_timestamp_range"] = all_trials_df["lfp_index"].apply(
    lambda event_index: (event_index, event_index + lfp_samples_per_window)
)

In [None]:
# (start, end) in ephys samples of the TRIAL_DURATION seconds that end at the event.
ephys_samples_per_window = TRIAL_DURATION * EPHYS_SAMPLING_RATE
all_trials_df["baseline_ephys_timestamp_range"] = all_trials_df["time_stamp_index"].apply(
    lambda event_index: (event_index - ephys_samples_per_window, event_index)
)

In [None]:
# (start, end) in ephys samples of the TRIAL_DURATION seconds that start at the event.
ephys_samples_per_window = TRIAL_DURATION * EPHYS_SAMPLING_RATE
all_trials_df["trial_ephys_timestamp_range"] = all_trials_df["time_stamp_index"].apply(
    lambda event_index: (event_index, event_index + ephys_samples_per_window)
)

In [None]:
# (start, end) in video frames of the TRIAL_DURATION seconds that end at the event.
frames_per_window = TRIAL_DURATION * FRAME_RATE
all_trials_df["baseline_videoframe_range"] = all_trials_df["video_frame"].apply(
    lambda event_frame: (event_frame - frames_per_window, event_frame)
)

In [None]:
# (start, end) in video frames of the TRIAL_DURATION seconds that start at the event.
frames_per_window = TRIAL_DURATION * FRAME_RATE
all_trials_df["trial_videoframe_range"] = all_trials_df["video_frame"].apply(
    lambda event_frame: (event_frame, event_frame + frames_per_window)
)

# OLD CODE BELOW

In [None]:
# Presumably a deliberate stop: raising here halts "Run All" before the
# cells under the "OLD CODE BELOW" heading are executed.
raise ValueError()

# Adding the SLEAP data

In [None]:
# Scratch inspection: LFP timestamps of the first recording session.
# recording_sessions_df is not defined in this section of the notebook.
recording_sessions_df["lfp_timestamps"].iloc[0]

In [None]:
# Scratch inspection: shape of the first session's mPFC LFP trace.
recording_sessions_df["mPFC_lfp_trace"].iloc[0].shape

In [None]:
# Scratch check: offsets 0, 20, 40, ... up to 20 x the trace length
# (20 equals EPHYS_SAMPLING_RATE / LFP_SAMPLING_RATE for the declared inputs).
np.arange(0, 20 * recording_sessions_df["mPFC_lfp_trace"].iloc[0].shape[0] + 1, 20)

In [None]:
# Scratch inspection: voltage timestamps of the first recording session.
recording_sessions_df["voltage_timestamps"].iloc[0]

In [None]:
# Scratch check: shape after taking every 20th voltage timestamp, limited to
# 20 x the length of the mPFC LFP trace.
recording_sessions_df["voltage_timestamps"].iloc[0][0:20 * recording_sessions_df["mPFC_lfp_trace"].iloc[0].shape[0]:20].shape

In [None]:
# Scratch arithmetic on two hard-coded values.
# NOTE(review): their origin is not shown here.
83177118-3478533

In [None]:
# Scratch check: shape after taking every 20th voltage timestamp of the first session.
recording_sessions_df["voltage_timestamps"].iloc[0][::20].shape

In [None]:
# Scratch inspection: shape of the first session's mPFC LFP trace, for
# comparison with the shape shown just before.
recording_sessions_df["mPFC_lfp_trace"].iloc[0].shape

In [None]:
# Scratch inspection of `rec`, which is not defined in this section of the notebook.
rec

In [None]:
# Presumably a deliberate stop so that "Run All" does not continue into the cells below.
raise ValueError()

In [None]:
# Scratch inspection of `region`, which is not defined in this section of the notebook.
region

In [None]:
# Scratch inspection: view the recording-name to LFP mapping as a dataframe.
pd.DataFrame.from_dict(recording_name_to_all_ch_lfp)

In [None]:
# Scratch arithmetic: a hard-coded value divided by 20 (the value of
# EPHYS_SAMPLING_RATE / LFP_SAMPLING_RATE). NOTE(review): origin of the number not shown here.
79698586/20

In [None]:
# Presumably a deliberate stop marking the end of the scratch cells.
raise ValueError()