In [168]:
import pandas as pd
import numpy as np
import scipy

In [169]:
def mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and the half-width of its t-based confidence interval.

    Parameters
    ----------
    data : array-like of numbers
        Observations. Multi-dimensional input is flattened and treated as a
        single sample so that the mean, the standard error and the degrees of
        freedom all refer to the same set of values.
    confidence : float, default 0.95
        Two-sided confidence level.

    Returns
    -------
    (m, h) : tuple of float
        ``m`` is the mean of ``data``; the interval is ``m - h`` .. ``m + h``.
        ``h`` is NaN when fewer than two observations are supplied (the
        standard error is undefined), and both values are NaN for empty input.
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not load
    # ``scipy.stats`` on older SciPy releases.
    from scipy import stats

    a = np.asarray(data, dtype=float).ravel()
    n = a.size

    if n == 0:
        return float("nan"), float("nan")

    m = np.mean(a)
    if n < 2:
        # A single observation has no spread; avoid the degrees-of-freedom
        # warnings and return an undefined half-width directly.
        return m, float("nan")

    se = stats.sem(a)
    # Two-sided interval: use the (1 + confidence) / 2 quantile with n - 1 dof.
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h

In [191]:
def calculate_distribution(dataset_reference, demographics_reference):
    """Print demographic summaries for every diagnosis class and data split.

    Parameters
    ----------
    dataset_reference : str or path-like
        Path to the dataset reference CSV. It must contain a ``CLASS`` column
        (0 = CN, 1 = AD, 2 = sMCI, 3 = pMCI) and a ``TYPE`` column with the
        values ``'TRAIN'``, ``'VAL'`` and ``'TEST'``.
    demographics_reference : str or path-like
        Path to the demographics CSV; forwarded unchanged to
        ``get_demographics`` for every subset.

    For each class the report is printed for all images, then for the
    training, validation and test splits, in the order CN, sMCI, pMCI, AD.
    """
    dataset = pd.read_csv(dataset_reference)

    # (report label, CLASS code) in reporting order.
    class_codes = (("CN", 0), ("sMCI", 2), ("pMCI", 3), ("AD", 1))
    # (report label, TYPE value); None selects every image of the class.
    splits = (
        ("ALL", None),
        ("TRAIN", "TRAIN"),
        ("VALIDATION", "VAL"),
        ("TEST", "TEST"),
    )

    for class_label, class_code in class_codes:
        images_in_class = dataset[dataset["CLASS"] == class_code]

        for split_label, split_value in splits:
            if split_value is None:
                subset = images_in_class
            else:
                subset = images_in_class[images_in_class["TYPE"] == split_value]

            get_demographics(subset, demographics_reference, f"{class_label} {split_label}")



def get_demographics(dataset_reference, demographics_reference, name):
    """Print age and sex statistics for a set of images and for its unique patients.

    Parameters
    ----------
    dataset_reference : pandas.DataFrame
        Image rows to summarise. The first column is read as the image
        identifier (it is prefixed with ``"I"`` and matched against the
        ``Image Data ID`` column of the demographics table), and a
        ``PATIENT_ID`` column is used to keep one image per patient.
    demographics_reference : str, path-like or pandas.DataFrame
        Demographics table, or the path of a CSV containing it, with the
        columns ``Image Data ID``, ``Age`` and ``Sex``. Passing an
        already-loaded DataFrame avoids re-reading the file on every call.
    name : str
        Label inserted into the printed section headings.

    Images without a matching demographics row are ignored. The spread printed
    after ``±`` is the population standard deviation (``ddof=0``). Subsets
    containing a single sex, or no matched images at all, are reported instead
    of raising ``IndexError``.
    """
    if isinstance(demographics_reference, pd.DataFrame):
        demographics = demographics_reference
    else:
        demographics = pd.read_csv(demographics_reference)

    print(f'\nDemographics for {name} IMAGES')
    _print_demographics_summary(_match_demographics(dataset_reference, demographics))

    # One row per patient: the first image listed for each PATIENT_ID.
    dataset_reference_unique = dataset_reference.drop_duplicates(keep='first', subset=["PATIENT_ID"])

    print(f'Demographics for {name} PATIENTS')
    _print_demographics_summary(_match_demographics(dataset_reference_unique, demographics))

    print('-------------------------------')


def _match_demographics(images, demographics):
    """Return one Age/Sex row per image in ``images`` that has demographics."""
    # The first column is taken positionally as the numeric image ID.
    # NOTE(review): confirm this column really is the image ID in the CSV.
    image_ids = "I" + images.iloc[:, 0].astype(str)
    wanted = pd.DataFrame({"Image Data ID": image_ids.to_numpy()})

    # Keep the first demographics row per image ID so the join cannot fan out.
    lookup = demographics.drop_duplicates(subset="Image Data ID", keep="first")
    lookup = lookup[["Image Data ID", "Age", "Sex"]]

    # A single hash join replaces one full-table query per image row.
    return wanted.merge(lookup, on="Image Data ID", how="inner")


def _print_demographics_summary(matched):
    """Print the mean/std of ``Age`` and the per-sex counts of ``matched``."""
    ages = matched["Age"].to_numpy(dtype=float)
    if ages.size:
        mean_age, std_age = ages.mean(), ages.std()
    else:
        # Nothing matched: report NaN without triggering empty-slice warnings.
        mean_age = std_age = float("nan")

    print("Mean Ages:   %1.3f ±%1.3f" % (mean_age, std_age))

    unique, counts = np.unique([str(sex) for sex in matched["Sex"]], return_counts=True)

    # Join however many categories are present; with two sexes this yields
    # the familiar "Count (F/M): total(f/m)" line.
    sex_labels = '/'.join(str(label) for label in unique)
    sex_counts = '/'.join(str(count) for count in counts)
    print('Count (' + sex_labels + '): ' + str(counts.sum()) + '(' + sex_counts + ')\n')


In [192]:
# Run the full demographics report. Both paths are absolute and specific to
# the author's machine; adjust them before re-running elsewhere.
calculate_distribution(
    dataset_reference='/Users/olath/Documents/GitHub/Master-thesis/Datasets/reference_all_classes_timewindow_singular.csv',
    demographics_reference='/Users/olath/Downloads/Download_collection_4_08_2025.csv',
)


Demographics for CN ALL IMAGES
Mean Ages:   76.528 ±6.683
Count (F/M): 1701(877/824)

Demographics for CN ALL PATIENTS
Mean Ages:   74.282 ±6.474
Count (F/M): 305(158/147)

-------------------------------

Demographics for CN TRAIN IMAGES
Mean Ages:   76.764 ±6.837
Count (F/M): 1034(504/530)

Demographics for CN TRAIN PATIENTS
Mean Ages:   74.295 ±6.373
Count (F/M): 183(94/89)

-------------------------------

Demographics for CN VALIDATION IMAGES
Mean Ages:   76.097 ±6.397
Count (F/M): 361(196/165)

Demographics for CN VALIDATION PATIENTS
Mean Ages:   73.839 ±5.823
Count (F/M): 62(30/32)

-------------------------------

Demographics for CN TEST IMAGES
Mean Ages:   76.239 ±6.445
Count (F/M): 306(177/129)

Demographics for CN TEST PATIENTS
Mean Ages:   74.700 ±7.333
Count (F/M): 60(34/26)

-------------------------------

Demographics for sMCI ALL IMAGES
Mean Ages:   74.596 ±7.587
Count (F/M): 1635(708/927)

Demographics for sMCI ALL PATIENTS
Mean Ages:   73.400 ±7.486
Count (F/M): 29

In [62]:
# Load the demographics export and display it to inspect its columns.
# The path is absolute and specific to the author's machine.
demographics_reference = pd.read_csv(
    filepath_or_buffer='/Users/olath/Downloads/Download_collection_4_08_2025.csv',
)
demographics_reference

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I1619403,941_S_7106,MCI,F,72,sc,MRI,Accelerated Sagittal MPRAGE,Original,9/09/2022,DCM,12/12/2024
1,I1591321,941_S_7087,CN,M,67,sc,MRI,Accelerated Sagittal MPRAGE,Original,6/15/2022,DCM,12/11/2024
2,I1600180,941_S_7085,MCI,F,70,sc,MRI,Accelerated Sagittal MPRAGE,Original,7/01/2022,DCM,12/13/2024
3,I1588331,941_S_7074,CN,M,71,sc,MRI,Accelerated Sagittal MPRAGE,Original,5/03/2022,DCM,12/13/2024
4,I10283169,941_S_7074,CN,M,72,4_init,MRI,Accelerated Sagittal MPRAGE (MSV21),Original,9/18/2023,DCM,12/10/2024
...,...,...,...,...,...,...,...,...,...,...,...,...
18118,I55276,002_S_0295,CN,M,86,m12,MRI,MP-RAGE REPEAT,Original,5/25/2007,DCM,12/13/2024
18119,I238627,002_S_0295,CN,M,90,v06,MRI,MPRAGE,Original,6/02/2011,DCM,12/11/2024
18120,I13721,002_S_0295,CN,M,85,sc,MRI,MP-RAGE REPEAT,Original,4/18/2006,DCM,12/12/2024
18121,I114209,002_S_0295,CN,M,87,m24,MRI,MP-RAGE REPEAT,Original,7/23/2008,DCM,12/13/2024
