# Preprocessing .mat for NWB

## Function

In [1]:
import numpy as np
import h5py
import os
import yaml
import pandas as pd
from datetime import datetime
from NWB_conversion import (convert_data_to_nwb_an)
from utils.server_paths import (get_nwb_folder, get_subject_analysis_folder, get_experimenter_analysis_folder,
                                get_subject_data_folder, get_dlc_file_path, get_facemap_file_path, EXPERIMENTER_MAP)


In [2]:


def search_and_open_mat(mouse_id: str, last_done_day: str, root: str = "/Volumes/WR") -> str:
    """
    Locate a session .mat file and check that it can be opened as HDF5.

    The file is expected at ``<root>/<mouse_id>_<last_done_day>.mat``. It is opened
    once with h5py (MATLAB v7.3 files are HDF5) purely as a readability check and
    closed again; only the path is returned, not an open handle.

    :param mouse_id: Mouse ID to search, e.g. "AO039"
    :param last_done_day: Last done day as a string, e.g. "20190626"
    :param root: Folder that contains the .mat files. Defaults to the network
                 drive mount point "/Volumes/WR" (macOS).
    :return: Path to the found .mat file
    :raises FileNotFoundError: if the file does not exist under ``root``
    :raises PermissionError: if the file exists but cannot be read
    :raises OSError: if the file cannot be opened by h5py
    """
    # Build the full path to the .mat file
    filename = f"{mouse_id}_{last_done_day}.mat"
    file_path = os.path.join(root, filename)

    # Fail early with an explicit message when the file is missing
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    # Open/close once to verify the file is readable HDF5 (.mat v7.3)
    try:
        with h5py.File(file_path, 'r'):
            print(f"✅ File '{filename}' opened successfully.")

    # PermissionError is a subclass of OSError, so it must be handled first
    except PermissionError:
        print("❌ Permission denied: Python can't access this file.")
        print("💡 Check macOS privacy settings (System Preferences > Privacy > Full Disk Access).")
        raise  # bare raise keeps the original traceback

    except OSError:
        print("❌ Error opening file. Is it an HDF5 (MATLAB v7.3) .mat file?")
        raise

    return file_path



In [4]:


# Citation stored in the 'related_publications' field of every generated config.
related_publications = (
    'Oryshchuk A, Sourmpis C, Weverbergh J, Asri R, Esmaeili V, Modirshanechi A, '
    'Gerstner W, Petersen CCH, Crochet S. '
    'Distributed and specific encoding of sensory, motor, and decision information '
    'in the mouse neocortex during goal-directed behavior. '
    'Cell Rep. 2024 Jan 23;43(1):113618. doi: 10.1016/j.celrep.2023.113618. '
    'Epub 2023 Dec 26. PMID: 38150365.'
)

# Semicolon-separated table with one row per session (subject + session metadata).
csv_file = "data/Subject_Session_Selection.csv"


def _decode_mat_string(raw):
    """Join an array of character codes (as read from the .mat file) into a str."""
    return ''.join(chr(int(code)) for code in np.asarray(raw).flatten())


def _to_float_or_unknown(value):
    """Return ``value`` as a float, or "Unknown" when it is missing or not numeric."""
    if pd.isna(value) or str(value).strip().lower() in ("", "nan"):
        return "Unknown"
    try:
        return float(value)
    except (TypeError, ValueError):
        return "Unknown"


def _clean_str(value, default=""):
    """Return ``value`` as a stripped str, or ``default`` when the CSV cell is missing (NaN)."""
    if pd.isna(value):
        return default
    return str(value).strip()


def _read_video_sr(raw, default=200):
    """Return the video sampling rate as an int, or ``default`` when absent, empty or NaN."""
    if raw is None:
        return default
    values = np.asarray(raw, dtype=float).ravel()
    if values.size == 0 or np.isnan(values[0]):
        return default
    return int(values[0])


def files_to_config(mat_file, output_folder="data", csv_path=None):
    """
    Build the .yaml configuration file of one session for the NWB pipeline.

    The mouse name, session date and video sampling rate are read from the .mat
    file (HDF5 / MATLAB v7.3); all other metadata come from the row of the session
    CSV whose 'Session' column equals "<mouse>_<date>". The resulting dictionary is
    written to ``<output_folder>/<mouse>_<date>_config.yaml``.

    Relies on the module-level names ``csv_file``, ``related_publications`` and
    ``EXPERIMENTER_MAP``.

    :param mat_file: Path to the .mat file
    :param output_folder: Folder that receives the YAML file (created if missing)
    :param csv_path: Optional path to the session CSV; defaults to the
                     module-level ``csv_file``
    :return: Tuple ``(output_path, config)``: path to the YAML file, then the
             configuration dictionary
    :raises ValueError: if the session is not listed in the CSV file
    """
    # Read only the three fields used below instead of loading every dataset
    # of the file into memory.
    with h5py.File(mat_file, 'r') as f:
        data_group = f['Data'] if 'Data' in f else f
        mouse = _decode_mat_string(data_group['mouse'][()])
        date = _decode_mat_string(data_group['date'][()])
        video_sr_raw = data_group['Video_sr'][()] if 'Video_sr' in data_group else None
    session_name = f"{mouse}_{date}"  # e.g., "AO039_20190626"

    # Load the CSV file and pick the row describing this session
    csv_data = pd.read_csv(csv_file if csv_path is None else csv_path, sep=";")
    csv_data.columns = csv_data.columns.str.strip()
    session_rows = csv_data[csv_data['Session'].astype(str).str.strip() == session_name]
    if session_rows.empty:
        raise ValueError(f"Session {session_name} not found in the CSV file.")
    subject_info = session_rows.iloc[0]

    ## Session metadata extraction
    # Camera settings are fixed values, shared by 'experiment_description'
    # and 'behavioral_metadata'.
    camera_exposure_time = 3
    camera_flag = 1

    ### Experiment_description
    # Keys previously stubbed out (still not written): wh_reward, aud_reward,
    # reward_proba, lick_threshold, no_stim_weight, wh_stim_weight,
    # aud_stim_weight, camera_start_delay, artifact_window.
    experiment_description = {
        'reference_weight': _to_float_or_unknown(subject_info.get("Weight of Reference", "")),
        'camera_flag': camera_flag,
        # Falls back to 200 when the .mat file has no usable Video_sr value
        'camera_freq': _read_video_sr(video_sr_raw),
        'camera_exposure_time': camera_exposure_time,
        'licence': _clean_str(subject_info.get("licence", "")),
        'ear tag': _clean_str(subject_info.get("Ear tag", "")),
    }

    ### Experimenter (looked up from the first two characters of the mouse name)
    experimenter = EXPERIMENTER_MAP.get(mouse[:2], 'Inconnu')

    ### Session_id, identifier, keywords
    # NOTE(review): the start time is used exactly as pandas parsed it; if the
    # column is read as an integer a leading zero (e.g. 090524) is lost. Confirm
    # the CSV format / downstream parser before relying on morning sessions.
    start_time = str(subject_info["Start Time (hhmmss)"])
    session_id = str(subject_info["Session"]).strip()
    identifier = session_id + "_" + start_time
    # TODO: ask whether these keywords are actually needed
    keywords = ["neurophysiology", "behaviour", "mouse", "electrophysiology"]

    ### Session start time
    session_start_time = str(subject_info["Session Date (yyymmdd)"]) + " " + start_time

    ## Subject metadata extraction

    ### Birth date and age
    birth_date = pd.to_datetime(subject_info["Birth date"], dayfirst=True)
    age_days = subject_info["Mouse Age (d)"]
    # A column containing missing values is read as float; avoid "P89.0D"
    if isinstance(age_days, float) and age_days.is_integer():
        age_days = int(age_days)
    age = f"P{age_days}D"

    ### Genotype (missing value means wild type)
    genotype = _clean_str(subject_info.get("mutations", ""))
    if genotype.lower() in ("", "nan"):
        genotype = "WT"

    ### Weight
    weight = _to_float_or_unknown(subject_info.get("Weight Session", ""))

    # Assemble the configuration dictionary
    config = {
        'session_metadata': {
            'experiment_description': experiment_description,
            'experimenter': experimenter,
            'identifier': identifier,
            'institution': "Ecole Polytechnique Federale de Lausanne",
            'keywords': keywords,
            'lab': "Laboratory of Sensory Processing",
            'notes': 'na',
            'pharmacology': 'na',
            'protocol': 'na',
            'related_publications': related_publications,
            'session_description': "ephys " + _clean_str(subject_info.get("Session Type", "Unknown"),
                                                         default="Unknown"),
            'session_id': session_id,
            'session_start_time': session_start_time,
            'slices': "na",
            'source_script': 'na',
            'source_script_file_name': 'na',
            'stimulus_notes': 'na',
            'surgery': 'na',
            'virus': 'na',
        },
        'subject_metadata': {
            'age': age,
            'age__reference': 'birth',
            # Explicit missing-value test: a missing birth date parses to NaT
            'date_of_birth': None if pd.isna(birth_date) else birth_date.strftime('%m/%d/%Y'),
            'description': mouse,
            'genotype': genotype,
            'sex': _clean_str(subject_info.get("Sex_bin", "")).upper(),
            'species': "Mus musculus",
            'strain': _clean_str(subject_info.get("strain", "")),
            'subject_id': mouse,
            'weight': weight,
        },
        'behavioral_metadata': {
            'camera_exposure_time': camera_exposure_time,
            'camera_flag': camera_flag,
        },
        'ephys_metadata': {
            'processed': 1
        }
    }

    # Save the config, creating the destination folder when needed
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, f"{session_name}_config.yaml")
    with open(output_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"✅ Config YAML sauvegardé à : {output_path}")

    return output_path, config




# VALIDATION

In [12]:
import scipy.io
import h5py

filename = "data/AO039_20190626.mat"

# List every entry of the 'Data' group with its shape and dtype.
# Entries reported with dtype 'object' hold HDF5 references rather than values.
with h5py.File(filename, 'r') as f:
    data_group = f['Data']
    print("Contenu de 'Data' :")
    for key, entry in data_group.items():
        print(f" - {key} → shape: {entry.shape}, dtype: {entry.dtype}")


Contenu de 'Data' :
 - ARAindex → shape: (75, 1), dtype: float64
 - Area → shape: (75, 1), dtype: object
 - BaselineFR_Mean → shape: (75, 1), dtype: float64
 - BaselineFR_Session → shape: (75, 1), dtype: object
 - CRIndices → shape: (1, 583), dtype: uint8
 - ClusterCounter → shape: (75, 1), dtype: float64
 - EngagedTrials → shape: (583, 1), dtype: uint8
 - FAIndices → shape: (1, 583), dtype: uint8
 - HitIndices → shape: (1, 583), dtype: uint8
 - ISI_Violation → shape: (75, 1), dtype: float64
 - ISO_Distance → shape: (75, 1), dtype: float64
 - JawOnsetsTms → shape: (583, 1), dtype: float64
 - JawTrace → shape: (583, 1000), dtype: float64
 - LFPs → shape: (3, 1), dtype: object
 - LickData → shape: (482142, 1), dtype: float64
 - LickTime → shape: (482142, 1), dtype: float64
 - LightIndices → shape: (1, 583), dtype: float64
 - MDS → shape: (1, 1), dtype: float64
 - ML_DV_AP → shape: (75, 1), dtype: object
 - ML_DV_AP_32 → shape: (75, 1), dtype: object
 - MissIndices → shape: (1, 583), dtyp

In [177]:
# Read every dataset of the 'Data' group (or of the file root when that group
# is absent) into an in-memory dict keyed by dataset name.
with h5py.File(filename, 'r') as f:
    if 'Data' in f:
        data_group = f['Data']
    else:
        data_group = f
    data = dict((key, data_group[key][()]) for key in data_group.keys())


# CSV


In [178]:

# Load the session table; header names are stripped of surrounding whitespace
csv_data = pd.read_csv("data/Subject_Session_Selection.csv", sep=";")
csv_data.columns = csv_data.columns.str.strip()

# Rebuild the session name ("<mouse>_<date>") from the character codes in the .mat data
mouse = ''.join(chr(c) for c in data['mouse'].flatten())
date = ''.join(chr(c) for c in data['date'].flatten())
session_name = f"{mouse}_{date}"

# Select the CSV row of this session and fail explicitly when it is not listed
session_rows = csv_data[csv_data['Session'].astype(str).str.strip() == session_name]
if session_rows.empty:
    raise ValueError(f"Session {session_name} not found in the CSV file.")
subject_info = session_rows.iloc[0]

# Last expression of the cell: display the matched row
subject_info

# Full pipeline: .mat → YAML config → NWB file

In [5]:
# Generate the YAML configuration for the example session
filename = "data/AO039_20190626.mat"
config_path, config = files_to_config(mat_file=filename, output_folder="data")


✅ Config YAML sauvegardé à : data/AO039_20190626_config.yaml


In [5]:
# Point at the YAML file written for the example session
config_path = "data/AO039_20190626_config.yaml"

# Build the NWB file from that configuration
convert_data_to_nwb_an(config_file=config_path, output_folder="data/nwb_output", with_time_string=False)

*********************
Start NWB conversion
 
Open NWB file and add metadata
Subject
Session
 
Saving NWB file
NWB file created at : data/nwb_output/AO039_20190626_160524.nwb


In [6]:
from pynwb import NWBHDF5IO

# Read the generated NWB file back and print a summary of its content
with NWBHDF5IO('data/nwb_output/AO039_20190626_160524.nwb', mode='r') as io:
    nwbfile = io.read()
    print(nwbfile)

root pynwb.file.NWBFile at 0x5063735808
Fields:
  experiment_description: {'camera_exposure_time': 3, 'camera_flag': 1, 'camera_freq': 500, 'ear tag': 'CPE-J34952', 'licence': '1628.7', 'reference_weight': 26.2}
  experimenter: ['Anastasiia Oryshchuk']
  file_create_date: [datetime.datetime(2025, 7, 9, 17, 21, 36, 115147, tzinfo=tzoffset(None, 7200))]
  identifier: AO039_20190626_160524
  institution: Ecole Polytechnique Federale de Lausanne
  keywords: <StrDataset for HDF5 dataset "keywords": shape (4,), type "|O">
  lab: Laboratory of Sensory Processing
  notes: na
  pharmacology: na
  protocol: na
  related_publications: ['Oryshchuk A, Sourmpis C, Weverbergh J, Asri R, Esmaeili V, Modirshanechi A, Gerstner W, Petersen CCH, Crochet S. Distributed and specific encoding of sensory, motor, and decision information in the mouse neocortex during goal-directed behavior. Cell Rep. 2024 Jan 23;43(1):113618. doi: 10.1016/j.celrep.2023.113618. Epub 2023 Dec 26. PMID: 38150365.']
  session_desc

In [7]:
from pynwb import NWBHDF5IO

# Re-open the file and print inside the context manager: the NWBFile object
# obtained from io.read() should only be used while its file is still open.
with NWBHDF5IO('data/nwb_output/AO039_20190626_160524.nwb', 'r') as io:
    nwbfile = io.read()
    print(nwbfile.subject)                   # subject (animal) information
    print(nwbfile.acquisition)              # recorded data
    print(nwbfile.processing)               # processing modules
    print(nwbfile.session_start_time)       # session start time


subject pynwb.file.Subject at 0x5062867024
Fields:
  age: P89D
  age__reference: birth
  date_of_birth: 2019-03-29 00:00:00+01:00
  description: AO039
  genotype: WT
  sex: M
  species: Mus musculus
  strain: C57BL/6
  subject_id: AO039
  weight: 23.1

{}
{}
2019-06-26 16:05:24+02:00


In [9]:
from pynwb import NWBHDF5IO
from pynwb.validate import validate

# Open the file read-only and run the pynwb validator on it
with NWBHDF5IO('data/nwb_output/AO039_20190626_160524.nwb', 'r') as io:
    errors = validate(io)

# Report the outcome: list each error, or confirm that none was found
if errors:
    print("❌ Erreurs détectées :")
    for err in errors:
        print("-", err)
else:
    print("✅ Fichier NWB valide !")


✅ Fichier NWB valide !
