In [None]:
import numpy as np
import pandas as pd
import os
import json
from sklearn.preprocessing import StandardScaler

# HEALTHY
def load_config(config_path=None):
    """Load the project configuration from a JSON file.

    Args:
        config_path: Path to the JSON config file. When ``None``, defaults to
            ``config.json`` in the parent of the current working directory.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If the config file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if config_path is None:
        # The project root is taken to be ONE level above the working directory
        # (the notebook is presumably run from a sub-folder of the project).
        project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
        config_path = os.path.join(project_root, 'config.json')

    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding (which is not UTF-8 on every OS).
    with open(config_path, 'r', encoding='utf-8') as config_file:
        return json.load(config_file)

def load_data(data_path, verbose=True):
    """Read a CSV file into a DataFrame.

    Args:
        data_path: Location of the CSV file; passed to ``pd.read_csv`` as given.
        verbose: When true, print the absolute form of ``data_path`` first.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    # Resolved only for the log line; the read below uses the path as given.
    resolved_path = os.path.abspath(data_path)
    if verbose:
        print(f"Loading data from {resolved_path}")
    return pd.read_csv(data_path)

def load_and_process_data(data_path=None, verbose=True, id_offset=0):
    """Load the dataset and add a numeric, mouse-specific ``unique_id`` column.

    Each row gets the label ``"<lab>_<mouseID>"``, where ``mouseID`` is
    left-padded with zeros to the longest ID (as text) within the same lab.
    The distinct labels are sorted and numbered from ``1 + id_offset``.

    Args:
        data_path: CSV file to read. When ``None``, the ``'data_path'`` entry
            of the config returned by ``load_config()`` is used.
        verbose: Forwarded to ``load_data``.
        id_offset: Constant added on top of the 1-based IDs, e.g. to keep the
            IDs of several datasets from overlapping. Defaults to 0.

    Returns:
        The loaded DataFrame with an extra int64 ``unique_id`` column.

    Raises:
        KeyError: If the data has no ``'lab'`` or ``'mouseID'`` column.
    """
    if data_path is None:
        config = load_config()
        data_path = config['data_path']

    data = load_data(data_path, verbose=verbose)

    # Work on the text form of both columns so the result does not depend on
    # how a mixed-dtype row would be upcast, and so string IDs pad like ints.
    lab_str = data['lab'].astype(str)
    mouse_str = data['mouseID'].astype(str)

    # Longest mouseID (in characters) within each lab, broadcast back to rows.
    pad_width = mouse_str.str.len().groupby(lab_str).transform('max')

    # Zero-pad on the left so that, within a lab, "02" sorts before "10".
    data['unique_id'] = [
        f"{lab}_{mouse.zfill(int(width))}"
        for lab, mouse, width in zip(lab_str, mouse_str, pad_width)
    ]

    # Category codes follow the sorted label order and start at 0. They come
    # back in a compact integer dtype, so widen before shifting to avoid any
    # overflow when a large offset is requested.
    codes = data['unique_id'].astype('category').cat.codes.astype('int64')
    data['unique_id'] = codes + 1 + id_offset

    return data

# Build the healthy-cohort table; with no arguments the CSV location comes
# from the 'data_path' entry of config.json.
data = load_and_process_data()

# Persist the table (including unique_id) in the current working directory.
data.to_csv(path_or_buf='data_v2_unique_id.csv', index=False)
print("Data saved to data_v2_unique_id.csv")

In [7]:
import numpy as np
import pandas as pd
import os
import json
from sklearn.preprocessing import StandardScaler

# NARCOLEPSY

def load_config(config_path=None):
    """Load the project configuration from a JSON file.

    Args:
        config_path: Path to the JSON config file. When ``None``, defaults to
            ``config.json`` in the parent of the current working directory.

    Returns:
        The parsed JSON content, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If the config file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if config_path is None:
        # The project root is taken to be ONE level above the working directory
        # (the notebook is presumably run from a sub-folder of the project).
        project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
        config_path = os.path.join(project_root, 'config.json')

    # JSON is UTF-8 by specification; do not depend on the platform's locale
    # encoding (which is not UTF-8 on every OS).
    with open(config_path, 'r', encoding='utf-8') as config_file:
        return json.load(config_file)

def load_data(data_path, verbose=True):
    """Load a CSV file as a pandas DataFrame.

    Args:
        data_path: CSV location, handed unchanged to ``pd.read_csv``.
        verbose: If true, announce the absolute path before reading.

    Returns:
        The DataFrame returned by ``pd.read_csv``.
    """
    # The absolute form is used for the message only, not for the read itself.
    full_path = os.path.abspath(data_path)
    if verbose:
        print(f"Loading data from {full_path}")
    frame = pd.read_csv(data_path)
    return frame

def load_and_process_data(data_path=None, verbose=True, id_offset=150):
    """Load the dataset and add a numeric, mouse-specific ``unique_id`` column.

    Each row gets the label ``"<lab>_<mouseID>"``, where ``mouseID`` is
    left-padded with zeros to the longest ID (as text) within the same lab.
    The distinct labels are sorted and numbered from ``1 + id_offset``.

    Args:
        data_path: CSV file to read. When ``None``, the ``'data_path'`` entry
            of the config returned by ``load_config()`` is used.
        verbose: Forwarded to ``load_data``.
        id_offset: Constant added on top of the 1-based IDs. Defaults to 150,
            so IDs start at 151 and stay apart from the healthy dataset's IDs
            (assumes the healthy data has at most 150 mice; TODO confirm).

    Returns:
        The loaded DataFrame with an extra int64 ``unique_id`` column.

    Raises:
        KeyError: If the data has no ``'lab'`` or ``'mouseID'`` column.
    """
    if data_path is None:
        config = load_config()
        data_path = config['data_path']

    data = load_data(data_path, verbose=verbose)

    # Work on the text form of both columns so the result does not depend on
    # how a mixed-dtype row would be upcast, and so string IDs pad like ints.
    lab_str = data['lab'].astype(str)
    mouse_str = data['mouseID'].astype(str)

    # Longest mouseID (in characters) within each lab, broadcast back to rows.
    pad_width = mouse_str.str.len().groupby(lab_str).transform('max')

    # Zero-pad on the left so that, within a lab, "02" sorts before "10".
    data['unique_id'] = [
        f"{lab}_{mouse.zfill(int(width))}"
        for lab, mouse, width in zip(lab_str, mouse_str, pad_width)
    ]

    # Category codes follow the sorted label order and start at 0. They come
    # back in a compact integer dtype (int8 for small datasets), which cannot
    # hold the shifted IDs, so widen to int64 before adding the offset.
    codes = data['unique_id'].astype('category').cat.codes.astype('int64')
    data['unique_id'] = codes + 1 + id_offset

    return data

# Build the narcolepsy table. With no arguments the CSV location comes from
# the 'data_path' entry of config.json, so that entry must point at the
# narcolepsy data when this cell is run.
data = load_and_process_data()

# Write the table to the current working directory. The confirmation message
# is built from the same path, so it always names the file actually written.
output_path = 'narcolepsy_unique_id.csv'
data.to_csv(output_path, index=False)
print(f"Data saved to {output_path}")

Loading data from /Users/noahryu/Library/CloudStorage/OneDrive-Personal/DTU/Semester 4 - 2024/Project_work/sleepy_mice/data/narcolepsy.csv
Data saved to data_v2_unique_id_unique_id.csv
