## About the Dataset:

<p align="justify">
This dataset contains extensive health information for 2,149 patients, each uniquely identified with IDs ranging from 4751 to 6900. The dataset includes demographic details, lifestyle factors, medical history, clinical measurements, cognitive and functional assessments, symptoms, and a diagnosis of Alzheimer's Disease. The data is ideal for researchers and data scientists looking to explore factors associated with Alzheimer's, develop predictive models, and conduct statistical analyses.
</p>

https://www.kaggle.com/datasets/rabieelkharoua/alzheimers-disease-dataset/data

In [207]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


## Data Reading and Processing

1.   Item da lista
2.   Item da lista



In [208]:
data = pd.read_csv("alzheimers_disease_data 2.csv", low_memory=False)

In [209]:
# Apply one-hot encoding on Ethnicity to avoid ordinal encoding

ethnicity_dummies = pd.get_dummies(data["Ethnicity"], prefix="Ethnicityi_").astype(int)
data.drop(["Ethnicity"], axis=1, inplace=True) 
data = pd.concat([data, ethnicity_dummies], axis=1)
data




Unnamed: 0,PatientID,Age,Gender,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge,Ethnicityi__0,Ethnicityi__1,Ethnicityi__2,Ethnicityi__3
0,4751,73,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,0,0,1,0,0,XXXConfid,1,0,0,0
1,4752,89,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,0,0,0,1,0,XXXConfid,1,0,0,0
2,4753,73,0,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,1,0,1,0,0,XXXConfid,0,0,0,1
3,4754,74,1,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,0,0,0,0,0,XXXConfid,1,0,0,0
4,4755,89,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,0,1,1,0,0,XXXConfid,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,1,39.121757,0,1.561126,4.049964,6.555306,7.535540,...,0,0,0,0,1,XXXConfid,1,0,0,0
2145,6896,75,0,2,17.857903,0,18.767261,1.360667,2.904662,8.555256,...,0,0,0,0,1,XXXConfid,1,0,0,0
2146,6897,77,0,1,15.476479,0,4.594670,9.886002,8.120025,5.769464,...,0,0,0,0,1,XXXConfid,1,0,0,0
2147,6898,78,1,1,15.299911,0,8.674505,6.354282,1.263427,8.322874,...,0,0,0,1,1,XXXConfid,0,0,0,1


In [210]:
data.isnull().sum()
data

Unnamed: 0,PatientID,Age,Gender,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge,Ethnicityi__0,Ethnicityi__1,Ethnicityi__2,Ethnicityi__3
0,4751,73,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,0,0,1,0,0,XXXConfid,1,0,0,0
1,4752,89,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,0,0,0,1,0,XXXConfid,1,0,0,0
2,4753,73,0,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,1,0,1,0,0,XXXConfid,0,0,0,1
3,4754,74,1,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,0,0,0,0,0,XXXConfid,1,0,0,0
4,4755,89,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,0,1,1,0,0,XXXConfid,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,1,39.121757,0,1.561126,4.049964,6.555306,7.535540,...,0,0,0,0,1,XXXConfid,1,0,0,0
2145,6896,75,0,2,17.857903,0,18.767261,1.360667,2.904662,8.555256,...,0,0,0,0,1,XXXConfid,1,0,0,0
2146,6897,77,0,1,15.476479,0,4.594670,9.886002,8.120025,5.769464,...,0,0,0,0,1,XXXConfid,1,0,0,0
2147,6898,78,1,1,15.299911,0,8.674505,6.354282,1.263427,8.322874,...,0,0,0,1,1,XXXConfid,0,0,0,1


In [211]:
data.drop(["PatientID","DoctorInCharge"], axis=1, inplace=True)
data

Unnamed: 0,Age,Gender,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,FamilyHistoryAlzheimers,...,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,Ethnicityi__0,Ethnicityi__1,Ethnicityi__2,Ethnicityi__3
0,73,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,0,...,0,0,0,1,0,0,1,0,0,0
1,89,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,0,...,0,0,0,0,1,0,1,0,0,0
2,73,0,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,1,...,0,1,0,1,0,0,0,0,0,1
3,74,1,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,0,...,0,0,0,0,0,0,1,0,0,0
4,89,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,0,...,0,0,1,1,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,61,0,1,39.121757,0,1.561126,4.049964,6.555306,7.535540,0,...,1,0,0,0,0,1,1,0,0,0
2145,75,0,2,17.857903,0,18.767261,1.360667,2.904662,8.555256,0,...,0,0,0,0,0,1,1,0,0,0
2146,77,0,1,15.476479,0,4.594670,9.886002,8.120025,5.769464,0,...,0,0,0,0,0,1,1,0,0,0
2147,78,1,1,15.299911,0,8.674505,6.354282,1.263427,8.322874,0,...,0,0,0,0,1,1,0,0,0,1


In [212]:
StandardScaler = StandardScaler()
columns_to_scale = ['Age', 'EducationLevel', 'BMI', 'AlcoholConsumption',
                   'PhysicalActivity',	'DietQuality',	'SleepQuality',
                   'SystolicBP',	'DiastolicBP',	'CholesterolTotal',	'CholesterolLDL',
                   'CholesterolHDL',	'CholesterolTriglycerides',	'MMSE', 'FunctionalAssessment',
                   'ADL']
data[columns_to_scale] = StandardScaler.fit_transform(data[columns_to_scale])
data.columns

Index(['Age', 'Gender', 'EducationLevel', 'BMI', 'Smoking',
       'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes',
       'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP',
       'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion',
       'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
       'Forgetfulness', 'Diagnosis', 'Ethnicityi__0', 'Ethnicityi__1',
       'Ethnicityi__2', 'Ethnicityi__3'],
      dtype='object')

In [214]:
X = data.drop(["Diagnosis"], axis=1).values.astype("float32")
y = data["Diagnosis"].values.astype("float32")

print(f'X shape: {X.shape}, y shape: {y.shape}')

X shape: (2149, 35), y shape: (2149,)


In [None]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)