# Notebook 2

Categorical data analysis


#### Table of contents:
* Imports
* General overview
* Categorical variables

## Imports

In [2]:
import os
import re
import sys

import numpy as np
import matplotlib.pyplot as plt
# import plotly
import pandas as pd
import seaborn as sns

pd.options.display.max_columns = None

In [3]:
# Parent directory of the current working directory
src_path = os.path.dirname(os.getcwd())

# Register it on sys.path (needed to import utils) only when the directory
# exists and is not already listed, so re-running the cell adds no duplicates
if src_path not in sys.path and os.path.exists(src_path):
    sys.path.append(src_path)

import utils.functions as fn

In [4]:
# Read the three processed CSV files (main, biomarker, ADNI) into DataFrames,
# in that order
df_main, df_bm, df_adni = map(
    pd.read_csv,
    (
        '../data/processed/main_data.csv',
        '../data/processed/biomarker_data.csv',
        '../data/processed/adni_data.csv',
    ),
)

## General overview

In [5]:
# Print the column names, non-null counts and dtypes of the main dataset
df_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 34 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Patient_ID                   2149 non-null   object 
 1   Age                          2149 non-null   int64  
 2   Gender                       2149 non-null   int64  
 3   Ethnicity                    2149 non-null   object 
 4   Education_lv                 1703 non-null   object 
 5   BMI                          2149 non-null   float64
 6   Smoking                      2149 non-null   int64  
 7   Alcohol_Consumption          2149 non-null   float64
 8   Physical_Activity            2149 non-null   float64
 9   Diet_Quality                 2149 non-null   float64
 10  Sleep_Quality                2149 non-null   float64
 11  Family_History_Alzheimers    2149 non-null   int64  
 12  CVD                          2149 non-null   int64  
 13  Diabetes          

In [6]:
# The 'None' education level shows up as missing after loading: info() above
# reports only 1703 non-null values out of 2149 for 'Education_lv'
# (presumably because read_csv parses the string 'None' as NA by default —
# confirm against the raw CSV). Restore it as an explicit 'None' label.

df_main['Education_lv'] = df_main['Education_lv'].fillna('None')

In [28]:
# Find the columns the datasets share (pairwise and across all three) to see
# which variables can be compared between them

main_cols, bm_cols, adni_cols = df_main.columns, df_bm.columns, df_adni.columns

main_bm = main_cols.intersection(bm_cols).to_list()
main_adni = main_cols.intersection(adni_cols).to_list()
bm_adni = bm_cols.intersection(adni_cols).to_list()
all_inters = main_cols.intersection(bm_cols.intersection(adni_cols)).to_list()

# One line per comparison: caption followed by the list of shared columns
for caption, shared_cols in (
    ('Main + biomarkers:', main_bm),
    ('Main + adni:', main_adni),
    ('Biomarkers + adni:', bm_adni),
    ('All:', all_inters),
):
    print(caption, shared_cols)

Main + biomarkers: ['Patient_ID', 'Age', 'Gender', 'BMI', 'Smoking', 'CVD', 'Diabetes', 'Hypertension', 'MMSE', 'DX']
Main + adni: ['Patient_ID', 'Age', 'Gender', 'Ethnicity', 'Smoking', 'CVD', 'Diabetes', 'Depression', 'Hypertension', 'MMSE', 'DX']
Biomarkers + adni: ['Patient_ID', 'Gender', 'Age', 'Education_yrs', 'Smoking', 'Hypertension', 'CVD', 'Diabetes', 'MMSE', 'MOCA', 'DX', 'Plasma_ptau181']
All: ['Patient_ID', 'Age', 'Gender', 'Smoking', 'CVD', 'Diabetes', 'Hypertension', 'MMSE', 'DX']


In [18]:
# df_combined = pd.concat([df_main, df_bm, df_adni], axis = 0, ignore_index = True)

In [8]:
# Column names grouped by theme; the lists cover columns from all three
# datasets, so not every name exists in every DataFrame.

# General / demographic information
col_gen = [
    'Patient_ID',
    'Age',
    'Gender',
    'Ethnicity',
    'Education_lv',
    'Education_yrs',
]

# Lifestyle factors
col_lifestyle = [
    'BMI',
    'Smoking',
    'Drinking',
    'Alcohol_Consumption',
    'Physical_Activity',
    'Diet_Quality',
    'Sleep_Quality',
]

# Clinical history
col_clinical_history = [
    'APOE4',
    'Family_History_Alzheimers',
    'CVD',
    'Hypertension',
    'Diabetes',
    'Depression',
    'Stroke',
    'Head_Injury',
]

# Clinical measurements
col_clinical_measurements = [
    'Systolic_BP',
    'Diastolic_BP',
    'Cholesterol_Total',
    'Cholesterol_LDL',
    'Cholesterol_HDL',
    'Cholesterol_Triglycerides',
    'Plasma_GFAP',
    'Plasma_NfL',
    'Plasma_ptau181',
]

# Assessment scores
col_as = [
    'MMSE',
    'MOCA',
    'Functional_Assessment',
    'ADL',
]

## Categorical variables

In [None]:
# Display labels for each categorical variable, keyed by variable name.
# NOTE(review): these keys are CamelCase (e.g. 'EducationLevel',
# 'CardiovascularDisease'), while the df_main columns shown by info() above
# are snake_case (e.g. 'Education_lv', 'CVD') — confirm how keys are matched
# to columns where this mapping is used.
custom_labels = {
    # Variables with their own label sets
    'Gender': ['Male', 'Female'],
    'Ethnicity': ['Caucasian', 'African American', 'Asian', 'Other'],
    'EducationLevel': ['None', 'High School', "Bachelor's", 'Higher'],
    # Binary variables all use No/Yes; each key gets its own list object
    **{
        binary_var: ['No', 'Yes']
        for binary_var in (
            'Smoking',
            'FamilyHistoryAlzheimers',
            'CardiovascularDisease',
            'Diabetes',
            'Depression',
            'HeadInjury',
            'Hypertension',
            'MemoryComplaints',
            'BehavioralProblems',
            'Confusion',
            'Disorientation',
            'PersonalityChanges',
            'DifficultyCompletingTasks',
            'Forgetfulness',
        )
    },
}