In [None]:
#| default_exp config

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import os
import json
import numpy as np
import pandas as pd
import shutil

In [None]:
#| export
# Color names shared across the package. The values look like matplotlib
# color specs ("k", "C0", ...) — NOTE(review): confirm against the plotting code.
REF_COLOR = "k"
FEMALE_COLOR = "C1"
MALE_COLOR = "C0"
ALL_COLOR = "C5"

# Colors for glucose / food series (presumably in plots; usage is not visible here).
GLUC_COLOR = "C0"
FOOD_COLOR = "C1"

# Default settings. The config-loading code in this module replaces some of
# them with values read from the first config file it finds.
DATASETS_PATH = '/home/ec2-user/studies/hpp/'
COHORT = None
EVENTS_DATASET = 'events'
ERROR_ACTION = 'raise'
# Candidate config file locations, tried in this order; the first existing file wins.
CONFIG_FILES = ['.pheno/config.json', '~/.pheno/config.json', '/efs/.pheno/config.json']
BULK_DATA_PATH = {}

# Set to True by the config-loading loop once one of CONFIG_FILES is found.
config_found = False



In [None]:
#| export

def copy_tre_config():
    """
    Install the bundled TRE config as the user's config when the TRE data is present.

    Reads 'config_setup/config_tre.json' located next to this module. If the
    'DATASETS_PATH' named in that file exists on this machine, prints
    "TRE Mode" and copies the file to '~/.pheno/config.json', creating the
    '~/.pheno' directory if needed.

    Returns:
        True if the TRE datasets path exists and the config was copied.
        False otherwise, including when the bundled TRE config file is
        missing or does not define 'DATASETS_PATH'.
    """
    script_path = os.path.dirname(os.path.abspath(__file__))
    absolute_config_path = os.path.join(script_path, 'config_setup/config_tre.json')

    # A missing bundled config simply means "not TRE"; returning False lets the
    # caller report the missing-config situation instead of a FileNotFoundError.
    if not os.path.isfile(absolute_config_path):
        return False

    with open(absolute_config_path, 'r') as openfile:
        json_object = json.load(openfile)

    datasets_full_path = json_object.get('DATASETS_PATH')
    if not datasets_full_path or not os.path.exists(datasets_full_path):
        return False

    print("TRE Mode")
    pheno_dir = os.path.expanduser('~/.pheno')
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(pheno_dir, exist_ok=True)
    shutil.copy2(absolute_config_path, os.path.join(pheno_dir, 'config.json'))

    return True


In [None]:
#| export

# Load settings from the first config file that exists; later candidates are ignored.
for cf in CONFIG_FILES:
    cf = os.path.expanduser(cf)
    if not os.path.isfile(cf):
        continue

    config_found = True

    # Context manager so the file handle is closed even if JSON parsing fails.
    with open(cf) as f:
        config = json.load(f)

    if 'DATASETS_PATH' in config:
        DATASETS_PATH = config['DATASETS_PATH']
    if 'BULK_DATA_PATH' in config:
        BULK_DATA_PATH = config['BULK_DATA_PATH']
    if 'EVENTS_DATASET' in config:
        EVENTS_DATASET = config['EVENTS_DATASET']
    if 'COHORT' in config:
        # 0, JSON null and the string 'None' all mean "no cohort"; any other
        # value is the configured cohort and must not be silently dropped.
        if config['COHORT'] in (0, 'None', None):
            COHORT = None
        else:
            COHORT = config['COHORT']
    if 'ERROR_ACTION' in config:
        ERROR_ACTION = config['ERROR_ACTION']
    break

if not config_found:
    # No config file anywhere: try installing the bundled TRE config.
    # NOTE(review): a freshly copied config is not re-read here, so the defaults
    # above stay in effect for this import — confirm this is intended.
    if not copy_tre_config():
        raise ValueError('Missing Config file, please read the README file and run config_setup/create_default_config.py')
        
    
    

In [None]:
#| export

def generate_synthetic_data(n: int = 1000) -> pd.DataFrame:
    """
    Generate a synthetic per-participant DataFrame for examples and tests.

    Values are drawn from NumPy's global random state, so results differ
    between calls unless the caller seeds `np.random` first.

    Args:
        n: The number of rows in the generated DataFrame.

    Returns:
        A pandas DataFrame indexed by 'participant_id' (0..n-1) with columns
        'date_of_research_stage', 'age_at_research_stage', 'sex', 'val1'
        and 'val2'.
    """
    pids = np.arange(n)

    # Random dates between 2020-01-01 and now, truncated to whole days.
    start_date = pd.Timestamp('2020-01-01')
    end_date = pd.Timestamp('now')
    random_ns = np.random.uniform(start_date.value, end_date.value, n).astype(np.int64)
    dates = pd.to_datetime(pd.to_datetime(random_ns).date)

    ages = np.random.uniform(35, 73, size=n)
    genders = np.random.choice([0, 1], size=n)
    # val1 is linear in age and sex plus Gaussian noise (sd 20).
    vals = np.random.normal(30 + 1 * ages + 40 * genders, 20, size=n)

    data = pd.DataFrame(
        data={
            "participant_id": pids,
            "date_of_research_stage": dates,
            "age_at_research_stage": ages,
            "sex": genders,
            "val1": vals,
        }
    ).set_index("participant_id")

    # Noise must be drawn per row (size=n). A single scalar draw would add the
    # same constant to every row, making val2 an exact function of val1 and sex.
    noise = np.random.normal(0, 50, size=n)
    data["val2"] = data["val1"] * 0.3 + 0.5 * noise + 0.2 * 10 * data["sex"]
    return data

In [None]:
#| export

def generate_synthetic_data_like(df: pd.DataFrame, n: int = 1000, random_seed: int = 42) -> pd.DataFrame:
    """
    Generate a sample DataFrame shaped like `df`, filled with shuffled values from `df`.

    Every column of `df` (index levels included) is shuffled independently, then
    `n` rows are sampled from the result. A 'participant_id' column or index
    level is set to 0..n-1. Columns in which more than half of the values look
    like file paths are replaced by the placeholder '/path/to/file'.

    Note: this seeds NumPy's global random state with `random_seed`.

    Args:
        df: The DataFrame whose columns, index levels and values should be used.
        n: The number of rows in the generated DataFrame. Rows are sampled with
            replacement only when `n` exceeds `len(df)`.
        random_seed: Seed passed to `np.random.seed` for reproducible output.

    Returns:
        A pandas DataFrame with the same columns and index level names as `df`
        (plus a 'participant_id' column if `df` had none).
    """
    np.random.seed(random_seed)
    pids = np.arange(n)
    # Sampling more rows than `df` has is only possible with replacement.
    replace = n > len(df)

    flat = df.reset_index()
    # reset_index() places the former index levels first. Taking their labels
    # from `flat` also works for unnamed levels (e.g. 'index'), where looking
    # them up via df.index.names ([None]) would raise a KeyError.
    index_cols = list(flat.columns[:df.index.nlevels])

    null = flat.apply(lambda x: x.sample(frac=1).values)\
        .sample(n=n, replace=replace).assign(participant_id=pids)\
        .set_index(index_cols)
    # Restore the original level names (None for unnamed levels).
    null.index.names = list(df.index.names)

    def is_path_string(x):
        return isinstance(x, str) and (x.count('/') > 1)

    # handle specific columns
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
    elementwise = null.map if hasattr(pd.DataFrame, 'map') else null.applymap
    null.loc[:, elementwise(is_path_string).mean() > 0.5] = '/path/to/file'
    if ('collection_timestamp' in null.columns) and ('collection_date' in null.columns):
        null['collection_date'] = null['collection_timestamp'].dt.date

    return null

In [None]:
# Demo: build the default-size (n=1000) synthetic dataset and preview its first rows.
data = generate_synthetic_data()
data.head()

Unnamed: 0_level_0,date_of_research_stage,age_at_research_stage,sex,val1,val2
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2023-07-19,46.80722,1,104.696121,53.943414
1,2023-05-26,54.077442,0,78.191494,43.992026
2,2023-04-04,37.621566,1,98.444702,52.067989
3,2023-04-02,61.779274,0,75.206789,43.096615
4,2021-05-28,43.872331,1,140.198745,64.594202


In [None]:
# Demo: shuffle the five previewed rows column by column; n matches the input
# length, so rows are sampled without replacement.
generate_synthetic_data_like(data.head(), n=5)

Unnamed: 0_level_0,date_of_research_stage,age_at_research_stage,sex,val1,val2
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2023-04-04,61.779274,1,75.206789,53.943414
1,2023-04-02,54.077442,1,104.696121,64.594202
2,2021-05-28,37.621566,1,140.198745,43.992026
3,2023-05-26,46.80722,0,98.444702,52.067989
4,2023-07-19,43.872331,0,78.191494,43.096615


In [None]:
#| hide
# Run nbdev's export step (writes the `#| export` cells out as library code).
import nbdev; nbdev.nbdev_export()