In [42]:
import pickle
import os
import pandas as pd
import numpy as np

"""
This script takes the progressions dataset saved previously and reconstructs it into a numpy array where each
row is an RID and each column is a certain measurement on a certain visit.
The array is going to have a lot of NaNs in it since some of the patients have more visits than others.
"""

# Path to the pickled dataset object. The directory names are kept verbatim
# (including their spelling) because they must match what exists on disk.
file_path = os.path.join(
    'Datasets',
    'Distance Matricies',
    'Progression Variables 2024-11-09_DX_bl_wrapped euclidean_selection-ADNI1_2025-Jan-21-@-01-44',
    'distances.pkl',
)

# Load the pickled dataset object.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load pickles that were produced locally by a trusted run.
with open(file_path, 'rb') as file:
    ADNI_ds = pickle.load(file)

df = ADNI_ds.progressions_dataframe

# Group the rows by RID; the largest group determines how many row slots
# every RID is padded out to.
grouped = df.groupby('RID')
max_size = grouped.size().max()

# Within each group, renumber the rows 0..n-1 in their existing order and then
# reindex to 0..max_size-1, so shorter groups are padded with all-NaN rows.
# NOTE(review): if 'RID' is a regular column (not an index level), recent
# pandas versions warn that apply() operating on the grouping column is
# deprecated -- confirm whether the RID_* columns are wanted in the output.
reindexed = grouped.apply(
    lambda x: x.reset_index(drop=True).reindex(range(max_size))
)

# Reshape to wide format: the inner (row-position) index level is pivoted into
# the columns, leaving one row per RID and (column, position) column tuples.
reshaped_df = reindexed.unstack()

# Flatten the (column, position) tuples into '<column>_<position>' labels.
# The labelled frame is kept so the meaning of each array column can still be
# looked up after the conversion below.
reshaped_df.columns = ['{}_{}'.format(col, i) for col, i in reshaped_df.columns]

# Plain numpy array: one row per RID, one column per '<column>_<position>'.
reshaped = np.array(reshaped_df)

print(reshaped)



[[4.33 6.33 6.   ...  nan  nan  nan]
 [3.67 4.   2.67 ...  nan  nan  nan]
 [7.33 6.67 8.33 ...  nan  nan  nan]
 ...
 [2.67 2.33 4.   ...  nan  nan  nan]
 [4.   3.67 3.33 ...  nan  nan  nan]
 [5.   5.67 5.67 ...  nan  nan  nan]]


In [2]:
# Every [letter, number] pairing of 'abcd' with 0..3; the letter varies
# slowest and the number fastest.
print(list(
    [symbol, index]
    for symbol in 'abcd'
    for index in range(4)
))

[['a', 0], ['a', 1], ['a', 2], ['a', 3], ['b', 0], ['b', 1], ['b', 2], ['b', 3], ['c', 0], ['c', 1], ['c', 2], ['c', 3], ['d', 0], ['d', 1], ['d', 2], ['d', 3]]
