In [None]:
#| default_exp config

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import os

import numpy as np
import pandas as pd

In [None]:
#| export
# Colour codes shared across the package (presumably matplotlib colour specs).
REF_COLOR = "k"
FEMALE_COLOR = "C1"
MALE_COLOR = "C0"
ALL_COLOR = "C5"

GLUC_COLOR = "C0"
FOOD_COLOR = "C1"

# Built-in defaults; each may be overridden by a `KEY = value` line in a config file.
DATASETS_PATH = '/home/ec2-user/studies/hpp/'
COHORT = '10k'
POPULATION_DATASET = 'population'
ERROR_ACTION = 'raise'
# Candidate config files in priority order: only the FIRST one that exists is read.
CONFIG_FILES = ['.pheno/config', '~/.pheno/config', '/efs/.pheno/config']

for cf in CONFIG_FILES:
    cf = os.path.expanduser(cf)
    if not os.path.isfile(cf):
        continue
    with open(cf, 'r') as f:
        for line in f:
            # Split on the first '=' only, so values that themselves contain '='
            # (e.g. a path such as /data/run=1/) are kept intact.
            _key, _sep, _value = line.partition('=')
            if not _sep:
                # No '=' on this line (blank, comment, bare key): nothing to assign.
                continue
            _key = _key.strip()
            _value = _value.strip()
            # Keys are compared exactly, so e.g. COHORT_FOO does not override COHORT.
            if _key == 'DATASETS_PATH':
                DATASETS_PATH = _value
            elif _key == 'POPULATION_DATASET':
                POPULATION_DATASET = _value
            elif _key == 'COHORT':
                # An empty value or the literal text 'None' means "no cohort".
                COHORT = None if _value in ('', 'None') else _value
            elif _key == 'ERROR_ACTION':
                ERROR_ACTION = _value
    # Stop after the first config file found; lower-priority files are ignored.
    break


In [None]:
#| export

def generate_synthetic_data(n: int = 1000) -> pd.DataFrame:
    """
    Generate a synthetic per-participant DataFrame for demos and tests.

    All draws come from NumPy's global random state, so call
    `np.random.seed(...)` beforehand if reproducible output is needed.

    Args:
        n: The number of rows in the generated DataFrame.

    Returns:
        A pandas DataFrame indexed by 'participant_id' (0 .. n-1) with columns:
        'date_of_research_stage' (dates between 2020-01-01 and now, time of day dropped),
        'age_at_research_stage' (uniform in [35, 73)),
        'sex' (0 or 1),
        'val1' (normal around 30 + age + 40 * sex, standard deviation 20),
        'val2' (0.3 * val1 + 2 * sex + independent per-row noise).
    """
    pids = np.arange(n)
    # Set start and end dates
    start_date = pd.Timestamp('2020-01-01')
    end_date = pd.Timestamp('now')
    # Sample nanosecond timestamps uniformly in the window, then round-trip through
    # `.date` to discard the time of day.
    stamps = np.random.uniform(start_date.value, end_date.value, n).astype(np.int64)
    dates = pd.to_datetime(pd.to_datetime(stamps).date)
    ages = np.random.uniform(35, 73, size=n)
    genders = np.random.choice([0, 1], size=n)
    vals = np.random.normal(30 + 1 * ages + 40 * genders, 20, size=n)

    data = pd.DataFrame(
        data={
            "participant_id": pids,
            "date_of_research_stage": dates,
            "age_at_research_stage": ages,
            "sex": genders,
            "val1": vals,
        }
    ).set_index("participant_id")
    # Draw one noise value PER ROW. A single scalar draw would shift every row by the
    # same constant and make val2 an exact linear function of val1 and sex.
    noise = np.random.normal(0, 50, size=n)
    data["val2"] = data["val1"] * 0.3 + 0.5 * noise + 0.2 * 10 * data["sex"]
    return data

In [None]:
#| export

def generate_synthetic_data_like(df: pd.DataFrame, n: int = 1000, random_seed: int = 42) -> pd.DataFrame:
    """
    Generate a sample DataFrame containing the same columns as `df`, but with random data.

    Every column of `df` (index levels included) is shuffled independently, `n` rows
    are then drawn from the shuffled frame, and 'participant_id' is overwritten with
    0 .. n-1 before the original index levels are restored.

    Args:
        df: The DataFrame whose columns should be used. Its index levels are
            re-applied by name via `set_index(df.index.names)`.
        n: The number of rows in the generated DataFrame. Rows are drawn with
            replacement only when `n` exceeds `len(df)`.
        random_seed: Seed for the private random generator used for all shuffling
            and sampling; NumPy's global random state is not modified.

    Returns:
        A pandas DataFrame with the same columns as `df`.
    """
    # A private generator seeded exactly like `np.random.seed(random_seed)` would seed
    # the global one: the draws are the same, but the caller's RNG is left untouched.
    rng = np.random.RandomState(random_seed)
    pids = np.arange(n)
    # Sampling without replacement is only possible while the template has enough rows.
    replace = n > len(df)

    null = (
        df.reset_index()
        # shuffle each column on its own to break the links between columns
        .apply(lambda col: col.sample(frac=1, random_state=rng).values)
        .sample(n=n, replace=replace, random_state=rng)
        .assign(participant_id=pids)
        .set_index(df.index.names)
    )

    def is_path_string(x):
        return isinstance(x, str) and (x.count('/') > 1)

    # handle specific columns
    # `DataFrame.applymap` is deprecated since pandas 2.1 in favour of `DataFrame.map`;
    # look the method up on the class so a column named "map" cannot shadow it.
    elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, "map") else pd.DataFrame.applymap
    # Columns in which most values look like file paths get a fixed placeholder.
    mostly_paths = elementwise(null, is_path_string).mean() > 0.5
    null.loc[:, mostly_paths] = '/path/to/file'
    # Shuffling broke the link between these two columns; re-derive the date.
    if ('collection_timestamp' in null.columns) and ('collection_date' in null.columns):
        null['collection_date'] = null['collection_timestamp'].dt.date

    return null

In [None]:
# Build a synthetic frame with the default number of rows and preview the first few.
data = generate_synthetic_data()
data.head()

Unnamed: 0_level_0,date_of_research_stage,age_at_research_stage,sex,val1,val2
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2020-04-11,71.21653,1,122.487606,27.996005
1,2023-01-06,66.211243,1,141.050268,33.564803
2,2022-03-13,38.190023,1,86.009637,17.052614
3,2023-04-07,65.760406,1,135.596689,31.92873
4,2022-05-08,66.785952,1,140.012493,33.253471


In [None]:
# Use the five-row preview as a template: same columns and index name, shuffled values.
generate_synthetic_data_like(data.head(), n=5)

Unnamed: 0_level_0,date_of_research_stage,age_at_research_stage,sex,val1,val2
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2022-03-13,65.760406,1,135.596689,27.996005
1,2023-04-07,66.211243,1,122.487606,33.253471
2,2022-05-08,38.190023,1,140.012493,33.564803
3,2023-01-06,71.21653,1,86.009637,17.052614
4,2020-04-11,66.785952,1,141.050268,31.92873


In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the `config` module).
import nbdev; nbdev.nbdev_export()