# Data Preprocessing Notebook

### Imports and helper functions

In [None]:
import numpy as np
from scipy import signal
from tqdm import tqdm
import matplotlib.pyplot as plt
from core import load_ephys
import os

### Defining the data directory and parameters

In [None]:
# Root folder of the raw recordings; the Open Ephys sessions are read from its "open-ephys" sub-folder.
data_dir = r"E:\clickbait-ephys\data"
# Destination root for the preprocessed arrays (created below if it does not exist yet).
save_dir = r"E:\clickbait-ephys\data\preprocessed"
# Mouse identifiers; a session folder is attributed to a mouse when its name contains the identifier.
mice = ['6000']
# Session numbers to process; every other session is skipped.
run_sessions = [14]
# Subtracted from the last two characters of the 10-character date prefix of a
# session folder name to obtain the session number.
session_offset = 3
# Scale factor multiplied into the raw int16 ephys samples before filtering.
# NOTE(review): presumably microvolts per ADC bit (0.195) — confirm against the acquisition hardware.
gain = 0.1949999928474426

os.makedirs(save_dir, exist_ok=True)
open_ephys_dir = os.path.join(data_dir, "open-ephys")

### 📊 Extracting OpenEphys Data (Sniff, Ephys, TTLs) and Restructuring the Data

The script processes raw OpenEphys data, extracting the relevant signals (LFP, MUA, sniff, TTLs) and saving them in a structured format. The preprocessed data are stored under `save_dir` (the `preprocessed` directory) with the following structure:


```
preprocessed
├── <animal_id>
│   ├── <session_id>
│   │   ├── lfp.npy
│   │   ├── mua.npy
│   │   ├── sniff.npy
│   │   └── ttls.npy
```



#### 🗂️ **Data Descriptions:**

- **`lfp.npy`** *(Local Field Potential)*  
  - **Description:** Represents the slow oscillatory activity of the brain.  
  - **Processing:** Low-pass filtered at **300 Hz** to capture relevant neural dynamics.

- **`mua.npy`** *(Multi-Unit Activity)*  
  - **Description:** Captures high-frequency spiking activity from multiple neurons.  
  - **Processing:** Band-pass filtered between **300 Hz and 6 kHz** and **median common average referenced** within regions to reduce noise.

- **`sniff.npy`** *(Sniff Data)*  
  - **Description:** Raw sniffing signal representing respiratory behavior.  
  - **Processing:** Extracted directly from the raw data without additional filtering.

- **`ttls.npy`** *(TTL Pulses)*  
  - **Description:** Raw TTL (Transistor-Transistor Logic) pulses used as event markers.  
  - **Processing:** Saved in raw format to preserve precise event timing.

---





In [None]:
# Extract LFP, MUA, sniff and TTL arrays from every selected Open Ephys session
# and save them as <save_dir>/<mouse>/<session_num>/{lfp,mua,sniff,ttls}.npy.

# List all files in the open-ephys directory
all_files = os.listdir(open_ephys_dir)

# Names of the arrays written for every session
output_names = ('lfp.npy', 'mua.npy', 'sniff.npy', 'ttls.npy')


def _first_subdir(parent):
    """Return the name of the first sub-directory of *parent* in sorted order, or None."""
    # os.listdir() returns entries in arbitrary order and may include stray
    # files, so sort the names and keep only real directories.
    for name in sorted(os.listdir(parent)):
        if os.path.isdir(os.path.join(parent, name)):
            return name
    return None


# Loop through each mouse
for mouse in mice:
    # Filter files related to the current mouse
    mouse_sessions = [f.strip() for f in all_files if mouse in f]

    # Process each session for the current mouse
    for session in tqdm(mouse_sessions):
        print(f"Processing session: {session}")

        # The first 10 characters of the folder name are treated as the date;
        # its last two characters minus the offset give the session number
        date = session[:10]
        try:
            session_num = int(date[-2:]) - session_offset
        except ValueError:
            print(f"Invalid date format in session: {session}")
            continue

        if session_num not in run_sessions:
            print(f"Skipping session {session_num}")
            continue

        # Build the path to the data directory
        session_path = os.path.join(open_ephys_dir, session)

        # Check if the session directory exists
        if not os.path.isdir(session_path):
            print(f"Session directory not found: {session_path}")
            continue

        # Get the first subdirectory (assumed to be the "Record Node" folder)
        record_node = _first_subdir(session_path)
        if record_node is None:
            print(f"No files found in session: {session_path}")
            continue

        # Build the path to the continuous data directory
        continuous_dir = os.path.join(session_path, record_node, 'experiment2', 'recording1', 'continuous')

        # Check if the continuous directory exists
        if not os.path.isdir(continuous_dir):
            print(f"Continuous directory not found: {continuous_dir}")
            continue

        # Get the acquisition board folder
        acquisition_board = _first_subdir(continuous_dir)
        if acquisition_board is None:
            print(f"No acquisition board folder found in: {continuous_dir}")
            continue

        # Final path to the continuous.dat file
        data_path = os.path.join(continuous_dir, acquisition_board, 'continuous.dat')
        print(f"Data path: {data_path}")

        if not os.path.exists(data_path):
            print(f"Data path not found: {data_path}")
            continue

        # Load, filter and save; any failure is reported and the loop moves on
        # to the next session instead of aborting the whole run
        try:
            # Output folder for this session. A dedicated name is used so the
            # global `data_dir` (root of the raw data) is not overwritten.
            session_save_dir = os.path.join(save_dir, mouse, str(session_num))
            os.makedirs(session_save_dir, exist_ok=True)

            # Never overwrite existing results: skip if any output is already there
            if any(os.path.exists(os.path.join(session_save_dir, name)) for name in output_names):
                print(f"Files already exist for {session}, skipping...")
                continue

            # Loading the data
            # NOTE(review): assumes load_ephys returns a (channels, samples) array — confirm in core
            data = load_ephys(data_path, nchannels=40, dtype=np.int16, order='F')

            # Rows 0-31 are scaled by the gain and used as ephys; the last row
            # is taken as the sniff signal and the second to last as the TTLs
            ephys = (data[:32, :] * gain).astype(np.float64)
            sniff = data[-1, :]
            ttls = data[-2, :]

            # 4th-order Butterworth low-pass at 300 Hz (30 kHz sampling rate),
            # applied forward and backward along the last axis, to get the LFP
            sos = signal.butter(4, 300, 'lowpass', fs=30000, output='sos')
            lfps = signal.sosfiltfilt(sos, ephys)

            # 4th-order Butterworth band-pass (300 Hz - 6 kHz) to get the MUA
            sos = signal.butter(4, [300, 6000], 'bandpass', fs=30000, output='sos')
            mua = signal.sosfiltfilt(sos, ephys)

            # Median common average referencing, done separately for rows 0-15
            # and rows 16-31: subtract each group's per-sample median
            mua[:16, :] -= np.median(mua[:16, :], axis=0)
            mua[16:, :] -= np.median(mua[16:, :], axis=0)

            # converting back to int16
            lfps = lfps.astype(np.int16)
            mua = mua.astype(np.int16)
            sniff = sniff.astype(np.int16)
            ttls = ttls.astype(np.int16)

            # Saving the data as binary files
            np.save(os.path.join(session_save_dir, 'lfp.npy'), lfps)
            np.save(os.path.join(session_save_dir, 'mua.npy'), mua)
            np.save(os.path.join(session_save_dir, 'sniff.npy'), sniff)
            np.save(os.path.join(session_save_dir, 'ttls.npy'), ttls)

            print(f"Loaded data successfully for {session}\n")
        except Exception as e:
            print(f"Error loading data from {data_path}: {e}")

In [None]:
# Downsample the saved 30 kHz LFP arrays to 1 kHz, overwriting lfp.npy in place.
# NOTE: this cell rebinds `run_sessions` (now folder-name strings) and `mice`,
# replacing the values set in the configuration cell.
run_sessions = ['14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24']
mice = ['6001', '6002', '6003']

# Sampling rate of the lfp.npy files written by the extraction step
raw_fs = 30000
# 30x decimation split into two stages (5 * 6): SciPy recommends calling
# decimate several times when the factor is above 13 with the IIR filter
decimation_factors = (5, 6)

# 4th-order Butterworth low-pass at 300 Hz. The design must use the sampling
# rate of the data it is applied to (30 kHz); designing it with fs=1000 would
# put the actual cutoff at 9 kHz. It is the same for every file, so build it once.
sos = signal.butter(4, 300, 'lowpass', fs=raw_fs, output='sos')

# loop through the preprocessed data and downsample the lfp files to 1 kHz
for mouse in mice:
    mouse_dir = os.path.join(save_dir, mouse)

    # A mouse without any preprocessed data is skipped instead of crashing listdir
    if not os.path.isdir(mouse_dir):
        print(f"Mouse directory not found: {mouse_dir}")
        continue

    for session in tqdm(os.listdir(mouse_dir), desc=f"Processing {mouse}"):

        if session not in run_sessions:
            print(f"Skipping session {session}")
            continue

        session_dir = os.path.join(mouse_dir, session)
        lfp_path = os.path.join(session_dir, 'lfp.npy')
        if os.path.exists(lfp_path):
            print(f"Downsampling LFP for {session}")
            lfp = np.load(lfp_path)

            # lowpass filter the lfp to 300 Hz (forward-backward, along the last axis)
            lfp = signal.sosfiltfilt(sos, lfp)

            # downsample the lfp to 1 kHz along axis 1
            for factor in decimation_factors:
                lfp = signal.decimate(lfp, factor, axis=1)

            # NOTE(review): the file is overwritten in place, so running this
            # cell again on the same session would downsample it a second time.
            np.save(lfp_path, lfp)
        else:
            print(f"LFP file not found for {session}")

### Helping Kilosort with Amanda's data

In [None]:
# Band-pass filter, median-reference and concatenate raw recordings into one
# flat int16 binary file for Kilosort
files = [r"C:\Users\smearlab\clickbait-ephys\DOI\5001\050924_5001_DOI 1_Ephys.bin"]
output_file = r"C:\Users\smearlab\clickbait-ephys\DOI\5001\concatenated_data_bandpass.bin"

# 4th-order Butterworth band-pass (300 Hz - 6 kHz at a 30 kHz sampling rate);
# the design is identical for every file
sos = signal.butter(4, [300, 6000], 'bandpass', fs=30000, output='sos')

processed_chunks = []
for file in files:
    print(f"Loading {file}...")
    # Notice the data is stored as uint16
    data = load_ephys(file, nchannels=32, dtype=np.uint16, order='F')

    # forward-backward filtering along the last axis
    print(f"Filtering {file}")
    data = signal.sosfiltfilt(sos, data)

    # subtract, at every sample, the median taken across axis 0
    print(f"Median referencing {file}")
    data -= np.median(data, axis=0)

    # back to int16 for the output binary
    data = data.astype(np.int16)

    # quick visual check: first 30,000 samples of row 0
    print('plotting')
    plt.figure(figsize=(15, 7))
    plt.plot(data[0, :30_000])
    plt.show()

    processed_chunks.append(data)
    del data

# join the processed recordings along axis 1
print('concatenating')
concatenated_data = np.concatenate(processed_chunks, axis=1)
del processed_chunks  # drop the per-file arrays once they have been copied

# flatten in column-major order for kilosort
print('reshaping')
concatenated_data = concatenated_data.reshape(-1, order='F')

# write the flat array to a new file
print('saving')
with open(output_file, 'wb') as f:
    concatenated_data.tofile(f)
    


