# Data preparation

In this notebook, each file from the `/data/raw` folder will be processed using the following steps:

1. Dataframe general overview.
2. Column processing: drop, rename, order.
3. Missing data treatment.

Then, if possible, datasets will be merged and saved in the `/data/processed` folder.  

Lastly, a dataset card with all relevant information will be created for each dataset.

## 0. Imports

In [1]:
import os
import re
import sys

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

pd.options.display.max_columns = None

In [2]:
# Gets src path 
src_path = os.path.dirname(os.getcwd())

# Adds src_path if it doesn't exist in sys.path (to access utils)
if os.path.exists(src_path) and src_path not in sys.path:
    sys.path.append(src_path)
    
from utils.functions import classify_by_cardinality

## 1. Data cleaning

### Dataset 1: alzheimers_disease_data

In [3]:
# General overview

df_ad = pd.read_csv('../data/raw/alzheimers_disease_data.csv')
df_ad.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [4]:
# Column processing

# Column selection
df_ad.drop(columns = ['DoctorInCharge'], inplace = True)

In [5]:
# Transform categoric-numeric content to categorid-string
ethnicity = ['Caucasian', 'African American', 'Asian', 'Other']
education = ['None', 'High School', 'Bachelors', 'Higher']

df_ad['Ethnicity'] = df_ad['Ethnicity'].apply(lambda x: ethnicity[x])
df_ad['EducationLevel'] = df_ad['EducationLevel'].apply(lambda x: education[x])

In [6]:
df_ad

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryAlzheimers,CardiovascularDisease,Diabetes,Depression,HeadInjury,Hypertension,SystolicBP,DiastolicBP,CholesterolTotal,CholesterolLDL,CholesterolHDL,CholesterolTriglycerides,MMSE,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,4751,73,0,Caucasian,Bachelors,22.927749,0,13.297218,6.327112,1.347214,9.025679,0,0,1,1,0,0,142,72,242.366840,56.150897,33.682563,162.189143,21.463532,6.518877,0,0,1.725883,0,0,0,1,0,0
1,4752,89,0,Caucasian,,26.827681,0,4.542524,7.619885,0.518767,7.151293,0,0,0,0,0,0,115,64,231.162595,193.407996,79.028477,294.630909,20.613267,7.118696,0,0,2.592424,0,0,0,0,1,0
2,4753,73,0,Other,High School,17.795882,0,19.555085,7.844988,1.826335,9.673574,1,0,0,0,0,0,99,116,284.181858,153.322762,69.772292,83.638324,7.356249,5.895077,0,0,7.119548,0,1,0,1,0,0
3,4754,74,1,Caucasian,High School,33.800817,1,12.209266,8.428001,7.435604,8.392554,0,0,0,0,0,0,118,115,159.582240,65.366637,68.457491,277.577358,13.991127,8.965106,0,1,6.481226,0,0,0,0,0,0
4,4755,89,0,Caucasian,,20.716974,0,18.454356,6.310461,0.795498,5.597238,0,0,0,0,0,0,94,117,237.602184,92.869700,56.874305,291.198780,13.517609,6.045039,0,0,0.014691,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,Caucasian,High School,39.121757,0,1.561126,4.049964,6.555306,7.535540,0,0,0,0,0,0,122,101,280.476824,94.870490,60.943092,234.520123,1.201190,0.238667,0,0,4.492838,1,0,0,0,0,1
2145,6896,75,0,Caucasian,Bachelors,17.857903,0,18.767261,1.360667,2.904662,8.555256,0,0,0,0,0,0,152,106,186.384436,95.410700,93.649735,367.986877,6.458060,8.687480,0,1,9.204952,0,0,0,0,0,1
2146,6897,77,0,Caucasian,High School,15.476479,0,4.594670,9.886002,8.120025,5.769464,0,0,0,0,0,0,115,118,237.024558,156.267294,99.678209,294.802338,17.011003,1.972137,0,0,5.036334,0,0,0,0,0,1
2147,6898,78,1,Other,High School,15.299911,0,8.674505,6.354282,1.263427,8.322874,0,1,0,0,0,0,103,96,242.197192,52.482961,81.281111,145.253746,4.030491,5.173891,0,0,3.785399,0,0,0,0,1,1


In [7]:
# Column rename
add_ = lambda s: re.sub(r'(?<=[a-z])([A-Z])', r'_\1', s)
df_ad.columns = [add_(col) for col in df_ad.columns]

new_col_ad = {
    'Cardiovascular_Disease': 'CVD',
    'Diagnosis': 'DX'
}

# df_ad.rename(columns = new_col_ad, inplace = True)

In [8]:
# Save clean dataset

# df_ad.to_csv('../data/processed/main_data.csv', index = False)

### Dataset 2: blood_marker

In [9]:
# General overview

df_biomarker = pd.read_excel('../data/raw/blood_marker.xlsx')
df_biomarker.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113 entries, 0 to 112
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Name number                  113 non-null    object 
 1   sex                          113 non-null    object 
 2   age                          113 non-null    int64  
 3   Height/meter                 113 non-null    float64
 4   weight                       113 non-null    float64
 5   BMI                          113 non-null    float64
 6   education years              113 non-null    int64  
 7   smoking 1=yes 0=no           113 non-null    int64  
 8   Drinking 1=yes 0=no          113 non-null    int64  
 9   Hypertension 1=yes 0=no      113 non-null    int64  
 10  Coronary disease 1=yes 0=no  113 non-null    int64  
 11  Diabetes 1=yes 0=no          113 non-null    int64  
 12  MMSE Score                   113 non-null    int64  
 13  MoCA Score          

In [10]:
# Column processing

# Column selection
df_biomarker.drop(columns = ['Height/meter', 'weight'], inplace = True)

# Change 'sex' column type to binary
df_biomarker['sex'] = df_biomarker['sex'].apply(lambda x: 1 if x == 'female' else 0)

# Insert column for main diagnosis
df_biomarker.insert(12, 'DX', df_biomarker['Name number'].apply(lambda x: re.sub('[0-9]', '', x).strip()) ,allow_duplicates = True)

# Rename columns to match main_data
new_col_biomarker = {
    'Name number': 'Patient_ID',
    'sex': 'Gender',
    'age': 'Age',
    'education years': 'Education_yrs',
    'smoking 1=yes 0=no': 'Smoking',
    'Drinking 1=yes 0=no': 'Drinking',
    'Hypertension 1=yes 0=no': 'Hypertension',
    'Coronary disease 1=yes 0=no': 'CVD',
    'Diabetes 1=yes 0=no': 'Diabetes',
    'MMSE Score': 'MMSE',
    'MoCA Score': 'MOCA',
    'Plasma GFAP': 'Plasma_GFAP',
    'Plasma NfL': 'Plasma_NfL',
    'Plasma p-tau181': 'Plasma_ptau181'
}
df_biomarker.rename(columns = new_col_biomarker, inplace = True)

# Transform Patient_ID to identify source dataset
df_biomarker['Patient_ID'] = df_biomarker['Patient_ID'].apply(lambda x: f'bm_{x}')

In [11]:
df_biomarker

Unnamed: 0,Patient_ID,Gender,Age,BMI,Education_yrs,Smoking,Drinking,Hypertension,CVD,Diabetes,MMSE,MOCA,DX,Plasma_GFAP,Plasma_NfL,Plasma_ptau181
0,bm_CU1,1,71,22.656250,10,0,0,1,0,1,28,23,CU,187.788983,44.382631,3.530901
1,bm_CU2,0,61,22.093170,14,1,1,1,0,0,30,30,CU,129.526091,13.127498,2.684318
2,bm_CU3,0,55,25.734393,10,1,1,0,0,0,29,27,CU,57.363792,10.554058,2.670783
3,bm_CU4,1,53,19.879103,10,0,0,0,0,0,30,28,CU,88.835118,16.894295,1.310089
4,bm_CU5,0,74,25.711662,7,1,0,1,0,1,30,27,CU,160.402572,25.697172,3.562334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,bm_AD11,1,77,19.531250,8,0,0,0,0,1,12,10,AD,158.630000,58.310000,2.830000
109,bm_AD12,1,75,24.444444,12,0,0,0,0,0,24,20,AD,180.050000,23.010000,2.210000
110,bm_AD13,0,81,22.491349,16,1,1,0,0,0,22,15,AD,295.310000,69.180000,3.490000
111,bm_AD14,0,90,20.399714,16,1,1,1,0,1,22,18,AD,377.460000,54.290000,3.040000


In [12]:
# Save clean dataset

df_biomarker.to_csv('../data/processed/biomarker_data.csv', index = False)

### Dataset 3: baseline_data

In [13]:
# General overview

df_bl = pd.read_csv('../data/raw/baseline_data.csv', index_col = 0)
df_bl.info()

<class 'pandas.core.frame.DataFrame'>
Index: 767 entries, 1 to 3129
Data columns (total 57 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   RID                 767 non-null    int64  
 1   VISCODE             767 non-null    object 
 2   time                767 non-null    int64  
 3   BLPLASMA            767 non-null    float64
 4   DXCHANGE            767 non-null    int64  
 5   GROUP_abeta         767 non-null    object 
 6   DX                  767 non-null    int64  
 7   AGE                 767 non-null    float64
 8   PTGENDER            767 non-null    int64  
 9   PTEDUCAT            767 non-null    int64  
 10  APOE4               767 non-null    int64  
 11  GROUP               767 non-null    object 
 12  DX_bl               767 non-null    object 
 13  DXX                 767 non-null    object 
 14  EXAMDATE.key        767 non-null    object 
 15  PLASMAPTAU181       767 non-null    float64
 16  FDG         

In [14]:
# Column processing

# Column selection
keep_col = ['RID', 'AGE', 'PTGENDER', 'PTEDUCAT', 'smoking', 'CVD', 'DM2', 'Current.Depression', 'Hypertension', 'stroke', 'MMSE', 'MOCA', 'APOE4', 'GROUP', 'DXX', 'PLASMAPTAU181']
df_bl = df_bl[keep_col]

# Transform GROUP codification
group_dict = {
    'A': 'CU-',
    'B': 'CU+',
    'C': 'MCI+',
    'DF': 'AD',
    'E': 'MCI-'
}
df_bl['GROUP'] = df_bl['GROUP'].apply(lambda x: group_dict[x])

# Update DXX codification
df_bl['DXX'] = df_bl['DXX'].apply(lambda x: 'CU' if x == 'CN' else x)

# Rename columns to match main_data
df_bl.columns = [col.lower().capitalize() if col not in ['RID', 'CVD', 'MMSE', 'MOCA', 'APOE4'] else col for col in df_bl.columns]
new_col_bl = {
    'Ptgender': 'Gender',
    'Pteducat': 'Education_yr',
    'Dm2': 'Diabetes',
    'Current.depression': 'Depression',
    'Dxx': 'DX',
    'Plasmaptau181': 'Plasma_ptau181'
}
df_bl.rename(columns = new_col_bl, inplace = True)

# Sort values by 'RID'
df_bl = df_bl.sort_values(by = 'RID')

In [15]:
df_bl

Unnamed: 0,RID,Age,Gender,Education_yr,Smoking,CVD,Diabetes,Depression,Hypertension,Stroke,MMSE,MOCA,APOE4,Group,DX,Plasma_ptau181
1,2002,64.8,0,16,1.0,1.0,0.0,0.0,0.0,0.0,28,28.0,0,MCI-,MCI,6.777
7,2007,83.4,1,20,0.0,0.0,1.0,0.0,1.0,0.0,29,23.0,0,MCI+,MCI,37.897
12,2010,62.9,1,20,,,,,,,30,27.0,1,MCI+,MCI,23.263
17,2018,76.4,1,18,0.0,1.0,0.0,0.0,1.0,0.0,29,26.0,0,MCI-,MCI,10.252
21,2022,66.0,0,18,0.0,1.0,0.0,0.0,1.0,0.0,29,25.0,1,MCI+,MCI,16.576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116,5289,59.7,1,16,,,,,,,29,27.0,1,CU-,CU,8.672
3120,5290,67.0,1,12,0.0,1.0,0.0,1.0,1.0,0.0,29,25.0,1,CU-,CU,18.583
3123,5292,74.3,1,13,0.0,0.0,0.0,1.0,0.0,0.0,30,29.0,0,CU-,CU,17.408
3127,5295,75.5,1,15,,,,,,,29,27.0,1,CU-,CU,10.932


### Dataset 4: ADNIMERGE

In [16]:
# General overview

df_adni = pd.read_csv('../data/raw/ADNIMERGE.csv', low_memory = False)
print(df_adni.info())
df_adni.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14878 entries, 0 to 14877
Columns: 113 entries, RID to update_stamp
dtypes: float64(87), int64(5), object(21)
memory usage: 12.8+ MB
None


Unnamed: 0,RID,PTID,VISCODE,SITE,COLPROT,ORIGPROT,EXAMDATE,DX_bl,AGE,PTGENDER,PTEDUCAT,PTETHCAT,PTRACCAT,PTMARRY,APOE4,FDG,PIB,AV45,ABETA,TAU,PTAU,CDRSB,ADAS11,ADAS13,ADASQ4,MMSE,RAVLT_immediate,RAVLT_learning,RAVLT_forgetting,RAVLT_perc_forgetting,LDELTOTAL,DIGITSCOR,TRABSCOR,FAQ,MOCA,EcogPtMem,EcogPtLang,EcogPtVisspat,EcogPtPlan,EcogPtOrgan,EcogPtDivatt,EcogPtTotal,EcogSPMem,EcogSPLang,EcogSPVisspat,EcogSPPlan,EcogSPOrgan,EcogSPDivatt,EcogSPTotal,FLDSTRENG,FSVERSION,IMAGEUID,Ventricles,Hippocampus,WholeBrain,Entorhinal,Fusiform,MidTemp,ICV,DX,mPACCdigit,mPACCtrailsB,EXAMDATE_bl,CDRSB_bl,ADAS11_bl,ADAS13_bl,ADASQ4_bl,MMSE_bl,RAVLT_immediate_bl,RAVLT_learning_bl,RAVLT_forgetting_bl,RAVLT_perc_forgetting_bl,LDELTOTAL_BL,DIGITSCOR_bl,TRABSCOR_bl,FAQ_bl,mPACCdigit_bl,mPACCtrailsB_bl,FLDSTRENG_bl,FSVERSION_bl,Ventricles_bl,Hippocampus_bl,WholeBrain_bl,Entorhinal_bl,Fusiform_bl,MidTemp_bl,ICV_bl,MOCA_bl,EcogPtMem_bl,EcogPtLang_bl,EcogPtVisspat_bl,EcogPtPlan_bl,EcogPtOrgan_bl,EcogPtDivatt_bl,EcogPtTotal_bl,EcogSPMem_bl,EcogSPLang_bl,EcogSPVisspat_bl,EcogSPPlan_bl,EcogSPOrgan_bl,EcogSPDivatt_bl,EcogSPTotal_bl,ABETA_bl,TAU_bl,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp
0,2,011_S_0002,bl,11,ADNI1,ADNI1,2005-09-08,CN,74.3,Male,16,Not Hisp/Latino,White,Married,0.0,1.36665,,,,,,0.0,10.67,18.67,5.0,28.0,44.0,4.0,6.0,54.5455,10.0,34.0,112.0,0.0,,,,,,,,,,,,,,,,,Cross-Sectional FreeSurfer (FreeSurfer Version...,35475.0,118233.0,8336.0,1229740.0,4177.0,16559.0,27936.0,1984660.0,CN,-4.41296,-4.23701,2005-09-08,0.0,10.67,18.67,5.0,28,44.0,4.0,6.0,54.5455,10.0,34.0,112.0,0.0,-4.41296,-4.23701,,Cross-Sectional FreeSurfer (FreeSurfer Version...,118233.0,8336.0,1229740.0,4177.0,16559.0,27936.0,1984660.0,,,,,,,,,,,,,,,,,,,1.36665,,,0.0,0.0,0,0.0,2020-01-09 04:20:17.0
1,3,011_S_0003,bl,11,ADNI1,ADNI1,2005-09-12,AD,81.3,Male,18,Not Hisp/Latino,White,Married,1.0,1.08355,,,741.5,239.7,22.83,4.5,22.0,31.0,8.0,20.0,22.0,1.0,4.0,100.0,2.0,25.0,148.0,10.0,,,,,,,,,,,,,,,,,Cross-Sectional FreeSurfer (FreeSurfer Version...,32237.0,84599.0,5319.0,1129830.0,1791.0,15506.0,18422.0,1920690.0,Dementia,-16.6283,-16.236,2005-09-12,4.5,22.0,31.0,8.0,20,22.0,1.0,4.0,100.0,2.0,25.0,148.0,10.0,-16.6283,-16.236,,Cross-Sectional FreeSurfer (FreeSurfer Version...,84599.0,5319.0,1129830.0,1791.0,15506.0,18422.0,1920690.0,,,,,,,,,,,,,,,,741.5,239.7,22.83,1.08355,,,0.0,0.0,0,0.0,2020-01-09 04:20:17.0
2,3,011_S_0003,m06,11,ADNI1,ADNI1,2006-03-13,AD,81.3,Male,18,Not Hisp/Latino,White,Married,1.0,1.05803,,,,,,6.0,19.0,30.0,10.0,24.0,19.0,2.0,6.0,100.0,,19.0,135.0,12.0,,,,,,,,,,,,,,,,,Cross-Sectional FreeSurfer (FreeSurfer Version...,31863.0,88580.0,5446.0,1100060.0,2427.0,14400.0,16972.0,1906430.0,Dementia,-15.0969,-13.4965,2005-09-12,4.5,22.0,31.0,8.0,20,22.0,1.0,4.0,100.0,2.0,25.0,148.0,10.0,-16.6283,-16.236,,Cross-Sectional FreeSurfer (FreeSurfer Version...,84599.0,5319.0,1129830.0,1791.0,15506.0,18422.0,1920690.0,,,,,,,,,,,,,,,,741.5,239.7,22.83,1.08355,,,0.498289,5.96721,6,6.0,2020-03-25 15:43:58.0
3,3,011_S_0003,m12,11,ADNI1,ADNI1,2006-09-12,AD,81.3,Male,18,Not Hisp/Latino,White,Married,1.0,1.0969,,,601.4,251.7,24.18,3.5,24.0,35.0,10.0,17.0,31.0,2.0,7.0,100.0,0.0,21.0,126.0,17.0,,,,,,,,,,,,,,,,,Cross-Sectional FreeSurfer (FreeSurfer Version...,35576.0,90099.0,5157.0,1095640.0,1596.0,14617.0,17330.0,1903820.0,Dementia,-21.4635,-20.2944,2005-09-12,4.5,22.0,31.0,8.0,20,22.0,1.0,4.0,100.0,2.0,25.0,148.0,10.0,-16.6283,-16.236,,Cross-Sectional FreeSurfer (FreeSurfer Version...,84599.0,5319.0,1129830.0,1791.0,15506.0,18422.0,1920690.0,,,,,,,,,,,,,,,,741.5,239.7,22.83,1.08355,,,0.999316,11.9672,12,12.0,2020-01-09 04:20:17.0
4,3,011_S_0003,m24,11,ADNI1,ADNI1,2007-09-12,AD,81.3,Male,18,Not Hisp/Latino,White,Married,1.0,1.03258,,,,,,8.0,25.67,37.67,10.0,19.0,23.0,1.0,5.0,100.0,0.0,16.0,275.0,14.0,,,,,,,,,,,,,,,,,Cross-Sectional FreeSurfer (FreeSurfer Version...,88252.0,97420.0,5139.0,1088560.0,1175.0,14033.0,16398.0,1903420.0,Dementia,-20.1366,-20.3461,2005-09-12,4.5,22.0,31.0,8.0,20,22.0,1.0,4.0,100.0,2.0,25.0,148.0,10.0,-16.6283,-16.236,,Cross-Sectional FreeSurfer (FreeSurfer Version...,84599.0,5319.0,1129830.0,1791.0,15506.0,18422.0,1920690.0,,,,,,,,,,,,,,,,741.5,239.7,22.83,1.08355,,,1.99863,23.9344,24,24.0,2020-01-09 04:20:17.0


In [17]:
# Column processing

# Column selection filtered by rows present in df_bl
df_col = df_adni[['RID', 'PTRACCAT']]
df_filtered = df_col.loc[df_col['RID'].isin(df_bl['RID'])]

# Drop duplicates and sort by 'RID'
df_filtered = df_filtered.drop_duplicates().sort_values(by = 'RID')

# Merge with df_bl to add ethnicity info
df_merged = df_bl.merge(df_filtered, on = 'RID', how = 'inner')

# Transform PTRACCAT to match main_data codification
ethnicity_dict = {
    'White': 'Caucasian',
    'Black': 'African American',
    'Asian': 'Asian'
}
df_merged['PTRACCAT'] = df_merged['PTRACCAT'].apply(lambda x: ethnicity_dict[x] if (x in ethnicity_dict.keys()) else 'Other')

# Rearrange columns to match main_data structure
df_adni_merged = df_merged[['RID', 'Age', 'Gender', 'PTRACCAT', 'Education_yr', 'Smoking', 'CVD', 'Diabetes',
       'Depression', 'Hypertension', 'Stroke', 'MMSE', 'MOCA', 'APOE4',
       'DX', 'Group', 'Plasma_ptau181']]

# Final column rename to match main_data
df_adni_merged.rename(columns = {'RID': 'Patient_ID', 'PTRACCAT': 'Ethnicity'}, inplace = True)

# Transform Patient_ID to identify source dataset
df_adni_merged['Patient_ID'] = df_adni_merged['Patient_ID'].apply(lambda x: f'adni_{x}')

In [18]:
df_adni_merged

Unnamed: 0,Patient_ID,Age,Gender,Ethnicity,Education_yr,Smoking,CVD,Diabetes,Depression,Hypertension,Stroke,MMSE,MOCA,APOE4,DX,Group,Plasma_ptau181
0,adni_2002,64.8,0,Caucasian,16,1.0,1.0,0.0,0.0,0.0,0.0,28,28.0,0,MCI,MCI-,6.777
1,adni_2007,83.4,1,Caucasian,20,0.0,0.0,1.0,0.0,1.0,0.0,29,23.0,0,MCI,MCI+,37.897
2,adni_2010,62.9,1,Other,20,,,,,,,30,27.0,1,MCI,MCI+,23.263
3,adni_2018,76.4,1,Caucasian,18,0.0,1.0,0.0,0.0,1.0,0.0,29,26.0,0,MCI,MCI-,10.252
4,adni_2022,66.0,0,Other,18,0.0,1.0,0.0,0.0,1.0,0.0,29,25.0,1,MCI,MCI+,16.576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,adni_5289,59.7,1,Caucasian,16,,,,,,,29,27.0,1,CU,CU-,8.672
763,adni_5290,67.0,1,Caucasian,12,0.0,1.0,0.0,1.0,1.0,0.0,29,25.0,1,CU,CU-,18.583
764,adni_5292,74.3,1,Caucasian,13,0.0,0.0,0.0,1.0,0.0,0.0,30,29.0,0,CU,CU-,17.408
765,adni_5295,75.5,1,Caucasian,15,,,,,,,29,27.0,1,CU,CU-,10.932


In [19]:
# Save clean dataset

df_adni_merged.to_csv('../data/processed/adni_data.csv', index = False)

## 2. Dataset Cards

#### 1. Dataset main_data

* **Dataset**: main_data.csv
* **Description**: This dataset contains health information for 2,149 patients. It includes demographic details, lifestyle factors, medical history, clinical measurements, cognitive and functional assessments, symptoms, and a diagnosis of Alzheimer's Disease.
* **Time frame**:  
* **Source**: [Alzheimer's Disease Dataset](https://www.kaggle.com/datasets/rabieelkharoua/alzheimers-disease-dataset)

</br>

| Column | Description | Variable type | Relevance | Notes |
|--------|-------------|---------------|-----------|-------|
| | | | | |
| | | | | |
| | | | | |
| | | | | |

#### 2. Dataset biomarker_data

* **Dataset**: biomarker_data.csv
* **Description**: This dataset contains information for 113 patients, each identified with alphanumeric IDs. It includes relevant diagnosis, demographic details, lifestyle factors, cognitive assessments and blood marker measurements.
* **Time frame**: unknown / 2024
* **Source**: [Blood marker data](https://figshare.com/articles/dataset/Blood_marker_data_XLSX/26316985?file=47733910) 

</br>

| Column         | Description                                | Variable type | Relevance | Notes |
|----------------|--------------------------------------------|---------------|-----------|-------|
| Patient_ID     | ID, 'bm_' and a number sequence            | Categorical   | -         | - |
| Gender         | Gender of the patient                      | Binary        | 3         | 0 = male, 1 = female |
| Age            | Age of the patient                         | Numeric       | 1         | - |
| BMI            | Body Mass Index of the patient             | Numeric       | 1         | - |
| Education_yr   | Number of years of formal education        | Numeric       | 3         | - |
| Smoking        | Smoking status                             | Binary        | 3         | 0 = no, 1 = yes |
| Drinking       | Alcohol consumption status                 | Binary        | 3         | 0 = no, 1 = yes |
| Hypertension   | Diagnosed hypertension                     | Binary        | 2         | 0 = no, 1 = yes |
| CVD            | Diagnosed cardiovascular (coronary) disease| Binary        | 2         | 0 = no, 1 = yes |
| Diabetes       | Diagnosed diabetes                         | Binary        | 2         | 0 = no, 1 = yes |
| MMSE           | Mini-Mental State Examination Score        | Numeric       | 0         | Range 0-30 |
| MOCA           | Montreal Cognitive Assessment Score        | Numeric       | 0         | Range 0-30 |
| DX             | Neurological classification of the patient | Categorical   | 0         | CU = Cognitively Unimpaired, MCI = Mild Cognitive Impairment, AD = Alzheimer's Disease|
| Plasma_GFAP    | GFAP plasma quantification (pg/mL)         | Numeric       | 1         | - |
| Plasma_NfL     | NfL plasma quantification (pg/mL)          | Numeric       | 1         | - |
| Plasma_ptau181 | P-tau181 plasma quantification (pg/mL)     | Numeric       | 0         | - |

In [20]:
classify_by_cardinality(df_biomarker)

Unnamed: 0,Cardinality,% Cardinality,Type,Suggested Type,Possible Index
Patient_ID,113,100.0,object,Categorical (id),True
Gender,2,1.769912,int64,Binary,False
Age,38,33.628319,int64,Numeric (continuous),False
BMI,108,95.575221,float64,Numeric (continuous),False
Education_yrs,11,9.734513,int64,Numeric (discrete),False
Smoking,2,1.769912,int64,Binary,False
Drinking,2,1.769912,int64,Binary,False
Hypertension,2,1.769912,int64,Binary,False
CVD,2,1.769912,int64,Binary,False
Diabetes,2,1.769912,int64,Binary,False


#### 3. Dataset adni_data

* **Dataset**: adni_data.csv
* **Description**: This dataset contains health information for 767 patients, each identified with alphanumeric IDs. The dataset includes demographic details, lifestyle factors, medical history, clinical measurements, cognitive assessments, and neurological classification.
* **Time frame**: 2010-2020
* **Source**: [Plasma p-tau181 Level Predicts Neurodegeneration and Progression to Alzheimer's Dementia: A Longitudinal Study](https://figshare.com/articles/dataset/Data_Sheet_1_Plasma_p-tau181_Level_Predicts_Neurodegeneration_and_Progression_to_Alzheimer_s_Dementia_A_Longitudinal_Study_ZIP/16576709?file=30681404)

</br>

| Column         | Description                                | Variable type | Relevance | Notes |
|----------------|--------------------------------------------|---------------|-----------|-------|
| Patient_ID     | ID, 'adni_' and a number sequence          | Categorical   | -         | - |
| Age            | Age of the patient                         | Numeric       | 1         | - |
| Gender         | Gender of the patient                      | Binary        | 3         | 0 = male, 1 = female |
| Ethnicity      | Ethnicity of the patient                   | Categorical   | 3         | Caucasian, African American, Asian and other |
| Education_yr   | Number of years of formal education        | Numeric       | 3         | - |
| Smoking        | Smoking status                             | Binary        | 3         | 0 = no, 1 = yes |
| CVD            | Diagnosed cardiovascular (coronary) disease| Binary        | 2         | 0 = no, 1 = yes |
| Diabetes       | Diagnosed diabetes                         | Binary        | 2         | 0 = no, 1 = yes |
| Depression     | Current depression                         | Binary        | 2         | 0 = no, 1 = yes |
| Hypertension   | Diagnosed hypertension                     | Binary        | 2         | 0 = no, 1 = yes |
| Stroke         | History of a stroke                        | Binary        | 2         | 0 = no, 1 = yes |
| MMSE           | Mini-Mental State Examination Score        | Numeric       | 0         | Range 0-30 |
| MOCA           | Montreal Cognitive Assessment Score        | Numeric       | 0         | Range 0-30 |
| APOE4          | Carrier of APOE4 gen                       | Categorical   | 2         | 0 = no, 1 = HT, 2 = HZ |
| Group          | DX with specified $\beta$-amiloid (+ / -)  | Categorical   | 1         | - |
| DX             | Neurological classification of the patient | Categorical   | 0         | CU = Cognitively Unimpaired, MCI = Mild Cognitive Impairment, AD = Alzheimer's Disease|
| Plasma_ptau181 | P-tau181 plasma quantification (pg/mL)     | Numeric       | 0         | - |

In [21]:
# Timeframe specification
df = df_adni[['RID', 'EXAMDATE']]
df = df.loc[df['RID'].isin(df_bl['RID'])].sort_values(by = 'RID')
df['EXAMDATE'] = pd.to_datetime(df['EXAMDATE'])
df_unique = df.drop_duplicates(subset = 'RID', keep = 'first')
df_unique = df_unique.sort_values(by = 'EXAMDATE')
print('Beginning timeframe:', df_unique['EXAMDATE'].iloc[0])
print('End timeframe:', df_unique['EXAMDATE'].iloc[-1])

Beginning timeframe: 2010-07-06 00:00:00
End timeframe: 2020-08-04 00:00:00


In [22]:
classify_by_cardinality(df_adni_merged)

Unnamed: 0,Cardinality,% Cardinality,Type,Suggested Type,Possible Index
Patient_ID,767,100.0,object,Categorical (id),True
Age,266,34.680574,float64,Numeric (continuous),False
Gender,2,0.260756,int64,Binary,False
Ethnicity,4,0.521512,object,Categorical,False
Education_yr,13,1.694915,int64,Numeric (discrete),False
Smoking,2,0.260756,float64,Binary,False
CVD,2,0.260756,float64,Binary,False
Diabetes,2,0.260756,float64,Binary,False
Depression,2,0.260756,float64,Binary,False
Hypertension,2,0.260756,float64,Binary,False
