In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
df = pd.read_csv('data/3-9-48-72months_383CpGs_153indivs_age_related.csv')

In [3]:
def convert_df_to_dict(df, unique_ids, sorted_probeids=None, sorted_months=None):
    """Pack long-format rows into dense per-ID arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns 'ID', 'ProbeID', 'Months', 'Input' and 'Output'.
    unique_ids : iterable
        IDs to include. Every ID gets an entry in the result, even when it
        has no rows in ``df`` (its arrays then stay all-zero).
    sorted_probeids : sequence, optional
        Probe IDs in the order they occupy the first array axis. Defaults to
        the sorted unique 'ProbeID' values of the whole ``df``.
    sorted_months : sequence, optional
        Month values in the order they occupy the second array axis. Defaults
        to the sorted unique 'Months' values of the whole ``df``.

    Returns
    -------
    dict
        ``{'data': per_id, 'probeids': sorted_probeids, 'months': sorted_months}``
        where ``per_id[id]`` is a dict holding

        * 'data' -- float32 array (n_probes, n_months) of 'Output' values,
          0 where nothing was observed;
        * 'mask' -- int array (n_probes, n_months), 1 where 'data' was filled;
        * 'time' -- float32 array (n_months, 1) holding the 'Input' value of an
          observed row for that month (a slot is written only while it is 0).

    Raises
    ------
    KeyError
        If a row with a non-missing 'Output' refers to a probe or month that
        is absent from the orderings in use.
    """
    # The default orderings come from the full frame (not only the selected
    # IDs) so that every split built from the same frame shares one layout.
    if sorted_probeids is None:
        sorted_probeids = np.sort(df['ProbeID'].unique())
    if sorted_months is None:
        sorted_months = np.sort(df['Months'].unique())

    # Value -> axis position lookups
    probeid_to_idx = {probeid: idx for idx, probeid in enumerate(sorted_probeids)}
    month_to_idx = {month: idx for idx, month in enumerate(sorted_months)}

    # Pre-allocate zero-filled arrays for every requested ID
    n_probes, n_months = len(sorted_probeids), len(sorted_months)
    data = {}
    for id_val in unique_ids:
        data[id_val] = {
            'data': np.zeros((n_probes, n_months), dtype=np.float32),
            'mask': np.zeros((n_probes, n_months), dtype=int),
            'time': np.zeros((n_months, 1), dtype=np.float32),
        }

    # Drop rows without a measurement *before* any index lookup, so a missing
    # value attached to an out-of-layout probe/month is skipped, not an error.
    observed = df[df['ID'].isin(unique_ids) & df['Output'].notna()]

    # Iterate the needed columns in lock-step; this avoids building a Series
    # per row (as iterrows does) while visiting rows in the same order.
    columns = (observed[name] for name in ('ID', 'ProbeID', 'Months', 'Input', 'Output'))
    for id_val, probe_id, month, inputs, output in zip(*columns):
        probe_idx = probeid_to_idx[probe_id]
        month_idx = month_to_idx[month]
        entry = data[id_val]

        entry['data'][probe_idx, month_idx] = output
        entry['mask'][probe_idx, month_idx] = 1
        # Only write the time while the slot is still at its initial 0
        if entry['time'][month_idx, 0] == 0:
            entry['time'][month_idx, 0] = inputs

    return {'data': data, 'probeids': sorted_probeids, 'months': sorted_months}

In [4]:
# Seeded generator so the subject split is reproducible
rng = np.random.default_rng(42)

# Distinct values of each key column in the full frame
unique_subj_ids = df['ID'].unique()
unique_probe_ids = df['ProbeID'].unique()
unique_times = df['Months'].unique()

# Randomise the subject order in place before partitioning
rng.shuffle(unique_subj_ids)

# Partition boundaries: first 80% -> train, next 10% -> validation, remainder -> test
n_subjects = len(unique_subj_ids)
train_end = int(0.8 * n_subjects)
val_end = int(0.9 * n_subjects)
train_ids = unique_subj_ids[:train_end]
val_ids = unique_subj_ids[train_end:val_end]
test_ids = unique_subj_ids[val_end:]

# Row-level views of the frame for each split
train_df, val_df, test_df = (
    df[df['ID'].isin(split_ids)] for split_ids in (train_ids, val_ids, test_ids)
)

# Per-split dictionaries; validation and test reuse the probe/month layout
# returned with the training dictionary so all three share the same axes
train_dict = convert_df_to_dict(df, train_ids)
shared_layout = (train_dict['probeids'], train_dict['months'])
val_dict = convert_df_to_dict(df, val_ids, *shared_layout)
test_dict = convert_df_to_dict(df, test_ids, *shared_layout)

In [5]:
# Persist each split's dictionary to its own pickle file
split_outputs = {
    'data/3-9-48-72months_383CpGs_153indivs_train.pkl': train_dict,
    'data/3-9-48-72months_383CpGs_153indivs_val.pkl': val_dict,
    'data/3-9-48-72months_383CpGs_153indivs_test.pkl': test_dict,
}
for out_path, split_dict in split_outputs.items():
    with open(out_path, 'wb') as out_file:
        pickle.dump(split_dict, out_file)

In [5]:
# Show the probe-ID ordering stored with the training split; position i in
# this array corresponds to row i of every per-ID 'data' / 'mask' array.
train_dict['probeids']

array(['cg00017842', 'cg00073460', 'cg00194146', 'cg00257455',
       'cg00343092', 'cg00347775', 'cg00431549', 'cg00454305',
       'cg00503840', 'cg00522231', 'cg00582628', 'cg00593462',
       'cg00658652', 'cg00702638', 'cg00748589', 'cg00753885',
       'cg00760938', 'cg00795927', 'cg00991848', 'cg01059398',
       'cg01243072', 'cg01262913', 'cg01447660', 'cg01459453',
       'cg01511232', 'cg01527307', 'cg01528542', 'cg01560871',
       'cg01570885', 'cg01620164', 'cg01740766', 'cg01752203',
       'cg01820374', 'cg01844642', 'cg01883408', 'cg01949403',
       'cg02071305', 'cg02085953', 'cg02228185', 'cg02244028',
       'cg02275294', 'cg02281167', 'cg02328239', 'cg02331561',
       'cg02383785', 'cg02532488', 'cg02650266', 'cg02821342',
       'cg02867102', 'cg02871659', 'cg02901139', 'cg03019000',
       'cg03032497', 'cg03183882', 'cg03361973', 'cg03364683',
       'cg03473532', 'cg03486383', 'cg03555227', 'cg03607117',
       'cg03643998', 'cg03670162', 'cg03746976', 'cg037