In [2]:
# import packages
import pandas as pd
import numpy as np

from sklearn.utils import resample

from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# get the data
data= pd.read_csv('alzheimers_disease_data.csv')

In [16]:
# EDA
data.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,6.045039,0,0,0.014691,0,0,1,1,0,0


In [18]:
data.shape

(2149, 33)

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>

RangeIndex: 2149 entries, 0 to 2148

Data columns (total 33 columns):

 #   Column                     Non-Null Count  Dtype  

---  ------                     --------------  -----  

 0   Age                        2149 non-null   int64  

 1   Gender                     2149 non-null   int64  

 2   Ethnicity                  2149 non-null   int64  

 3   EducationLevel             2149 non-null   int64  

 4   BMI                        2149 non-null   float64

 5   Smoking                    2149 non-null   int64  

 6   AlcoholConsumption         2149 non-null   float64

 7   PhysicalActivity           2149 non-null   float64

 8   DietQuality                2149 non-null   float64

 9   SleepQuality               2149 non-null   float64

 10  FamilyHistoryAlzheimers    2149 non-null   int64  

 11  CardiovascularDisease      2149 non-null   int64  

 12  Diabetes                   2149 non-null   int64  

 13  Depression                 214

In [22]:
data.isna().sum()

Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfulness                0
Diagnosi

In [14]:
# drop -> PatientID, DoctorInCharge
data.drop(columns=['DoctorInCharge', 'PatientID' ], inplace=True)

In [24]:
data['Diagnosis'].value_counts()

Diagnosis
0    1389
1     760
Name: count, dtype: int64

In [26]:
# imbalanced dataset, so upsample the minority
majority_data= data[ data['Diagnosis'] == 0 ]
minority_data= data[ data['Diagnosis'] == 1 ]

In [28]:
majority_data.shape, minority_data.shape

((1389, 33), (760, 33))

In [32]:
# Upsample minority data
upsampled_minority_data = resample(minority_data, replace=True, n_samples=len(majority_data), random_state=42)

# Reset the index of the upsampled minority data
upsampled_minority_data = upsampled_minority_data.reset_index(drop=True)

# Reset index for majority data too (if necessary)
majority_data = majority_data.reset_index(drop=True)

# Combine the datasets into balanced_data
balanced_data = pd.concat([majority_data, upsampled_minority_data]).reset_index(drop=True)

In [34]:
balanced_data= balanced_data.sample(frac=1)

In [36]:
balanced_data.shape

(2778, 33)

In [40]:
balanced_data['Diagnosis'].value_counts()

Diagnosis
0    1389
1    1389
Name: count, dtype: int64

In [42]:
# balanced_data := EDA
balanced_data.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
278,83,1,0,1,35.953739,0,12.005345,4.647637,6.727057,6.290966,...,2.199299,0,0,5.076804,0,0,1,1,1,0
252,80,0,1,3,17.095404,1,14.99059,3.120002,7.48736,8.402307,...,7.81956,0,1,8.248209,0,0,0,0,1,0
1514,87,0,0,0,32.12192,1,7.317791,3.246542,2.756334,9.78096,...,4.099852,1,0,7.751521,0,0,0,0,0,1
2365,70,0,0,1,16.585036,0,3.206609,7.465421,4.098699,5.364626,...,8.748155,1,0,0.849728,0,0,0,0,1,1
1398,76,1,2,0,33.829364,0,19.826313,9.399463,4.794106,4.976046,...,4.801436,0,0,2.718907,0,0,0,1,1,1


In [44]:
balanced_data.shape

(2778, 33)

In [46]:
balanced_data.info()

<class 'pandas.core.frame.DataFrame'>

Index: 2778 entries, 278 to 159

Data columns (total 33 columns):

 #   Column                     Non-Null Count  Dtype  

---  ------                     --------------  -----  

 0   Age                        2778 non-null   int64  

 1   Gender                     2778 non-null   int64  

 2   Ethnicity                  2778 non-null   int64  

 3   EducationLevel             2778 non-null   int64  

 4   BMI                        2778 non-null   float64

 5   Smoking                    2778 non-null   int64  

 6   AlcoholConsumption         2778 non-null   float64

 7   PhysicalActivity           2778 non-null   float64

 8   DietQuality                2778 non-null   float64

 9   SleepQuality               2778 non-null   float64

 10  FamilyHistoryAlzheimers    2778 non-null   int64  

 11  CardiovascularDisease      2778 non-null   int64  

 12  Diabetes                   2778 non-null   int64  

 13  Depression                 2778 no

In [48]:
balanced_data.isna().sum()

Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfulness                0
Diagnosi

In [70]:
# feature and labels

X= balanced_data.drop('Diagnosis', axis=1)
Y= balanced_data['Diagnosis']

In [72]:
# train test split
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size= 0.2)

In [74]:
# segegrate the features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

In [76]:
numeric_features

Index(['Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes',
       'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness'],
      dtype='object')

In [78]:
# numeric transformer := power transformer
numeric_transformer= Pipeline([
    ('power_transformer', PowerTransformer())
])

In [80]:
# creating the preprocessor
preprocessor= ColumnTransformer([
    ('numeric_transformer', numeric_transformer, numeric_features)
])

In [82]:
# model
model= LogisticRegression()

In [104]:
# pipeline
pipe= Pipeline([
    ('preprocessor', preprocessor),
    ('PCA', PCA(n_components=29)),
    ('model', model)
])

In [106]:
# fit
pipe.fit(X_train, y_train)

In [108]:
# predict
y_preds= pipe.predict(X_test)

In [110]:
# evaluate

In [112]:
accuracy_score(y_test, y_preds)

0.8129496402877698

In [114]:
precision_score(y_test, y_preds)

0.8195488721804511

In [116]:
recall_score(y_test, y_preds)

0.7956204379562044

In [118]:
f1_score(y_test, y_preds)

0.8074074074074075

In [102]:
# find the best params for PCA
max_acc= 0
best_comps= 0

for i in range(1, 33):

    pipe= Pipeline([
        ('preprocessor', preprocessor),
        ('PCA', PCA(n_components=i)),
        ('model', model)
    ])

    pipe.fit(X_train, y_train)
    
    y_preds= pipe.predict(X_test)
    
    acc= accuracy_score(y_test, y_preds)

    if acc > max_acc:
        max_acc= acc
        best_comps= i

    print(f"{i} comps := {acc}")

print(f"\nBest comps: {best_comps}, Accuracy: {max_acc}")

1 comps := 0.6205035971223022

2 comps := 0.6384892086330936

3 comps := 0.6672661870503597

4 comps := 0.6366906474820144

5 comps := 0.6312949640287769

6 comps := 0.670863309352518

7 comps := 0.6636690647482014

8 comps := 0.6816546762589928

9 comps := 0.6402877697841727

10 comps := 0.6528776978417267

11 comps := 0.6546762589928058

12 comps := 0.6654676258992805

13 comps := 0.7032374100719424

14 comps := 0.6546762589928058

15 comps := 0.6870503597122302

16 comps := 0.710431654676259

17 comps := 0.6834532374100719

18 comps := 0.737410071942446

19 comps := 0.7338129496402878

20 comps := 0.7248201438848921

21 comps := 0.7517985611510791

22 comps := 0.7464028776978417

23 comps := 0.7410071942446043

24 comps := 0.75

25 comps := 0.7769784172661871

26 comps := 0.7967625899280576

27 comps := 0.7985611510791367

28 comps := 0.789568345323741

29 comps := 0.8129496402877698

30 comps := 0.802158273381295

31 comps := 0.8093525179856115

32 comps := 0.8039568345323741





