# Main Notebook

In [12]:
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [15]:
# Parent of the current working directory. NOTE(review): despite the name,
# this is presumably the folder that *contains* the `src` package rather
# than `src` itself -- confirm against the project layout.
src_path = os.path.dirname(os.getcwd())

# Register that folder on sys.path once, and only when it actually exists,
# so that the project utilities can be imported just below.
if src_path not in sys.path and os.path.exists(src_path):
    sys.path.append(src_path)

from src.utils import data_tools as dt

In [3]:
# Read the three processed CSV files into DataFrames. The paths are relative
# to the current working directory, and the files are read in the order
# main -> biomarker -> ADNI.
df_main, df_bm, df_adni = (
    pd.read_csv(csv_path)
    for csv_path in (
        './src/data/processed/main_data.csv',
        './src/data/processed/biomarker_data.csv',
        './src/data/processed/adni_data.csv',
    )
)

## General overview

In [4]:
# Column names, dtypes and non-null counts of the main dataset
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Patient_ID                   2149 non-null   object 
 1   Age                          2149 non-null   int64  
 2   Gender                       2149 non-null   int64  
 3   Ethnicity                    2149 non-null   object 
 4   Education_lv                 1703 non-null   object 
 5   BMI                          2149 non-null   float64
 6   Smoking                      2149 non-null   int64  
 7   Alcohol_Consumption          2149 non-null   float64
 8   Physical_Activity            2149 non-null   float64
 9   Diet_Quality                 2149 non-null   float64
 10  Sleep_Quality                2149 non-null   float64
 11  Family_History_Alzheimers    2149 non-null   int64  
 12  CVD                          2149 non-null   int64  
 13  Diabetes          

In [5]:
# `Education_lv` comes back from the CSV with null entries (1703 non-null out
# of 2149 rows in the overview above). Those nulls are relabelled with the
# string 'None' -- presumably the "no education" category that the CSV reader
# parsed as a missing value; confirm against the raw file.
# NOTE(review): any genuinely missing entry is relabelled 'None' as well.
df_main.loc[df_main['Education_lv'].isna(), 'Education_lv'] = 'None'

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the main dataset
df_main.describe()

Unnamed: 0,Age,Gender,BMI,Smoking,Alcohol_Consumption,Physical_Activity,Diet_Quality,Sleep_Quality,Family_History_Alzheimers,CVD,...,Functional_Assessment,Memory_Complaints,Behavioral_Problems,ADL,Confusion,Disorientation,Personality_Changes,Difficulty_Completing_Tasks,Forgetfulness,DX
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,74.908795,0.506282,27.655697,0.288506,10.039442,4.920202,4.993138,7.051081,0.25221,0.144253,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,8.990221,0.500077,7.217438,0.453173,5.75791,2.857191,2.909055,1.763573,0.434382,0.351428,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,60.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,4.002629,0.0,0.0,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,67.0,0.0,21.611408,0.0,5.13981,2.570626,2.458455,5.482997,0.0,0.0,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,75.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,7.115646,0.0,0.0,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,83.0,1.0,33.869778,1.0,15.157931,7.427899,7.558625,8.562521,1.0,0.0,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,90.0,1.0,39.992767,1.0,19.989293,9.987429,9.998346,9.99984,1.0,1.0,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Column names, dtypes and non-null counts of the biomarker dataset
df_bm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113 entries, 0 to 112
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Patient_ID      113 non-null    object 
 1   Gender          113 non-null    int64  
 2   Age             113 non-null    int64  
 3   BMI             113 non-null    float64
 4   Education_yrs   113 non-null    int64  
 5   Smoking         113 non-null    int64  
 6   Drinking        113 non-null    int64  
 7   Hypertension    113 non-null    int64  
 8   CVD             113 non-null    int64  
 9   Diabetes        113 non-null    int64  
 10  MMSE            113 non-null    int64  
 11  MOCA            113 non-null    int64  
 12  DX              113 non-null    object 
 13  Plasma_GFAP     113 non-null    float64
 14  Plasma_NfL      113 non-null    float64
 15  Plasma_ptau181  113 non-null    float64
dtypes: float64(4), int64(10), object(2)
memory usage: 14.3+ KB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the biomarker dataset
df_bm.describe()

Unnamed: 0,Gender,Age,BMI,Education_yrs,Smoking,Drinking,Hypertension,CVD,Diabetes,MMSE,MOCA,Plasma_GFAP,Plasma_NfL,Plasma_ptau181
count,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0,113.0
mean,0.575221,66.460177,23.632015,9.955752,0.247788,0.19469,0.504425,0.088496,0.230088,25.637168,22.557522,177.090683,30.855488,2.729697
std,0.496511,10.603264,3.309191,3.616405,0.433651,0.397726,0.502208,0.285279,0.422764,6.053426,6.834704,108.827835,23.089488,1.947522
min,0.0,47.0,15.464138,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,27.646618,4.885645,0.329745
25%,0.0,58.0,21.453287,8.0,0.0,0.0,0.0,0.0,0.0,25.0,20.0,104.172007,16.514134,1.53
50%,1.0,67.0,23.4375,10.0,0.0,0.0,1.0,0.0,0.0,28.0,25.0,158.918462,22.731267,2.162603
75%,1.0,74.0,25.721032,12.0,0.0,0.0,1.0,0.0,0.0,29.0,28.0,210.718853,36.809848,3.530901
max,1.0,92.0,32.466181,16.0,1.0,1.0,1.0,1.0,1.0,30.0,30.0,755.55374,143.351322,14.65048


In [9]:
# Column names, dtypes and non-null counts of the ADNI dataset
df_adni.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 767 entries, 0 to 766
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Patient_ID      767 non-null    object 
 1   Age             767 non-null    float64
 2   Gender          767 non-null    int64  
 3   Ethnicity       767 non-null    object 
 4   Education_yrs   767 non-null    int64  
 5   Smoking         645 non-null    float64
 6   CVD             645 non-null    float64
 7   Diabetes        645 non-null    float64
 8   Depression      645 non-null    float64
 9   Hypertension    645 non-null    float64
 10  Stroke          645 non-null    float64
 11  MMSE            767 non-null    int64  
 12  MOCA            757 non-null    float64
 13  APOE4           767 non-null    int64  
 14  DX              767 non-null    object 
 15  Group           767 non-null    object 
 16  Plasma_ptau181  767 non-null    float64
dtypes: float64(9), int64(4), object(4)


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the ADNI dataset; counts below 767 reflect missing values
df_adni.describe()

Unnamed: 0,Age,Gender,Education_yrs,Smoking,CVD,Diabetes,Depression,Hypertension,Stroke,MMSE,MOCA,APOE4,Plasma_ptau181
count,767.0,767.0,767.0,645.0,645.0,645.0,645.0,645.0,645.0,767.0,757.0,767.0,767.0
mean,72.305346,0.477184,16.314211,0.017054,0.67907,0.12093,0.277519,0.541085,0.046512,27.757497,23.463672,0.535854,16.863807
std,7.116713,0.499805,2.604795,0.129574,0.467196,0.326299,0.448122,0.498696,0.210754,2.44279,3.990772,0.653135,8.863563
min,55.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,4.0,0.0,0.833
25%,67.4,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,21.0,0.0,10.5465
50%,72.3,0.0,16.0,0.0,1.0,0.0,0.0,1.0,0.0,29.0,24.0,0.0,15.182
75%,77.2,1.0,18.0,0.0,1.0,0.0,1.0,1.0,0.0,30.0,26.0,1.0,22.2835
max,91.4,1.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,30.0,30.0,2.0,48.854


## Categorical variables

## Numerical variables