In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

In [2]:
file_path = './alzheimer_disease_dataset/alzheimers_disease_data.csv'
data = pd.read_csv(file_path)

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

A base de dados não apresentou nenhum atributo com valor nulo e a única coluna apresentada como do tipo objeto, a 'DoctorInCharge', não precisa ser utilizada para a classificação.

In [4]:
data.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [5]:
print(data.columns.tolist())

['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI', 'Smoking', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'FamilyHistoryAlzheimers', 'CardiovascularDisease', 'Diabetes', 'Depression', 'HeadInjury', 'Hypertension', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'MemoryComplaints', 'BehavioralProblems', 'ADL', 'Confusion', 'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks', 'Forgetfulness', 'Diagnosis', 'DoctorInCharge']


Temos uma base de dados com 35 colunas, sendo estas listadas acima. 

In [6]:
data.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,5825.0,74.908795,0.506282,0.697534,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,620.507185,8.990221,0.500077,0.996128,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,4751.0,60.0,0.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,5288.0,67.0,0.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,5825.0,75.0,1.0,0.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,6362.0,83.0,1.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,6899.0,90.0,1.0,3.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


# Preparando os dados

In [7]:
# removendo colunas desnecessárias
data_cleaned = data.drop(columns=['PatientID', 'DoctorInCharge'])
data_cleaned.head()

Unnamed: 0,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,SleepQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
0,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,9.025679,...,6.518877,0,0,1.725883,0,0,0,1,0,0
1,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,7.151293,...,7.118696,0,0,2.592424,0,0,0,0,1,0
2,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,9.673574,...,5.895077,0,0,7.119548,0,1,0,1,0,0
3,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,8.392554,...,8.965106,0,1,6.481226,0,0,0,0,0,0
4,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,5.597238,...,6.045039,0,0,0.014691,0,0,1,1,0,0


In [8]:
# separando recursos e alvo
X = data_cleaned.drop(columns=['Diagnosis']) # recursos -> todas as colunas menos a Diagnosis
y = data_cleaned['Diagnosis'] # alvo -> a coluna Diagnosis

# dividindo os dados em conjuntos de treinamento e de teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# padronizando os dados
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train);
X_test_scaled = scaler.transform(X_test);

# Descobrindo as melhores features para o treinamento do modelo

In [9]:
# selecionando as K melhores características
k = 10
selector = SelectKBest(score_func=chi2, k=k)
X_new = selector.fit_transform(X, y)

# exibindo as colunas selecionadas
selected_features = X.columns[selector.get_support()]
print(f"As {k} melhores características são:")
print(selected_features)

As 10 melhores características são:
Index(['BMI', 'SleepQuality', 'CholesterolLDL', 'CholesterolHDL',
       'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment',
       'MemoryComplaints', 'BehavioralProblems', 'ADL'],
      dtype='object')


# Treinando os modelos e testando o desempenho

## Treinando modelo com SVM

In [10]:
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_scaled, y_train)

In [11]:
# predição no conjunto de testes
y_pred = svm_model.predict(X_test_scaled)

# avaliando o desempenho do modelo treinado
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Relatório de Classificação no Modelo SVM:\n", report)
print("Matriz de Confusão no modelo SVM:\n", conf_matrix)

Relatório de Classificação no Modelo SVM:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86       277
           1       0.78      0.70      0.74       153

    accuracy                           0.82       430
   macro avg       0.81      0.79      0.80       430
weighted avg       0.82      0.82      0.82       430

Matriz de Confusão no modelo SVM:
 [[246  31]
 [ 46 107]]


K bests

In [12]:
# dividindo os dados em conjuntos de treinamento e de teste
X_best_train, X_best_test, y_best_train, y_best_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

# padronizando os dados
scaler = StandardScaler()
X_best_train_scaled = scaler.fit_transform(X_best_train);
X_best_test_scaled = scaler.transform(X_best_test);

# Treinando com o SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_best_train_scaled, y_train)

# predição no conjunto de testes
y_best_pred = svm_model.predict(X_best_test_scaled)

# avaliando o desempenho do modelo treinado
report = classification_report(y_best_test, y_best_pred)
conf_matrix = confusion_matrix(y_best_test, y_best_pred)

print("Relatório de Classificação no Modelo SVM com as Melhores Features:\n", report)
print("Matriz de Confusão no modelo SVM com as Melhores Features:\n", conf_matrix)


Relatório de Classificação no Modelo SVM com as Melhores Features:
               precision    recall  f1-score   support

           0       0.85      0.89      0.87       277
           1       0.78      0.73      0.75       153

    accuracy                           0.83       430
   macro avg       0.82      0.81      0.81       430
weighted avg       0.83      0.83      0.83       430

Matriz de Confusão no modelo SVM com as Melhores Features:
 [[246  31]
 [ 42 111]]


## Treinando modelo com Random Forest

In [13]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [14]:
# predição no conjunto de dados
y_pred_rf = rf_model.predict(X_test_scaled)

# avaliando o desempenho do modelo treinado
report = classification_report(y_test, y_pred_rf)
confusion_matrix_rf = confusion_matrix(y_test, y_pred_rf)

print("Relatório de Classificação no Modelo Random Forest:\n", report)
print("Matriz de Confusão no Modelo Random Forest:\n", conf_matrix)

Relatório de Classificação no Modelo Random Forest:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94       277
           1       0.96      0.82      0.89       153

    accuracy                           0.93       430
   macro avg       0.94      0.90      0.92       430
weighted avg       0.93      0.93      0.92       430

Matriz de Confusão no Modelo Random Forest:
 [[246  31]
 [ 42 111]]


K bests

In [15]:
# Treinando com o Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_best_train_scaled, y_train)


# predição no conjunto de dados
y_best_pred_rf = rf_model.predict(X_best_test_scaled)

# avaliando o desempenho do modelo treinado
report = classification_report(y_best_test, y_best_pred_rf)
confusion_matrix_rf = confusion_matrix(y_best_test, y_best_pred_rf)

print("Relatório de Classificação no Modelo Random Forest com as Melhores Features:\n", report)
print("Matriz de Confusão no Modelo Random Forest com as Melhores Features:\n", conf_matrix)

Relatório de Classificação no Modelo Random Forest com as Melhores Features:
               precision    recall  f1-score   support

           0       0.94      0.98      0.96       277
           1       0.96      0.88      0.92       153

    accuracy                           0.95       430
   macro avg       0.95      0.93      0.94       430
weighted avg       0.95      0.95      0.95       430

Matriz de Confusão no Modelo Random Forest com as Melhores Features:
 [[246  31]
 [ 42 111]]


## Treinando modelo com KNN

In [16]:
# Excluindo a coluna 'PatientID' e 'DoctorInCharge' se existirem, pois não são úteis para a predição
data = data.drop(columns=['PatientID', 'DoctorInCharge'], errors='ignore')

# Codificando variáveis categóricas
data = pd.get_dummies(data, drop_first=True)

In [17]:
X =  data.drop('Diagnosis', axis=1)
y = data['Diagnosis']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [20]:
k = 11
knn = KNeighborsClassifier(n_neighbors=k)

In [21]:
knn.fit(X_train, y_train)

In [22]:
y_pred_knn = knn.predict(X_test)

In [26]:
accuracy = accuracy_score(y_test, y_pred)
print("Acurácia do KNN: ", accuracy)

Relatório de Classificação no Modelo Random Forest com as Melhores Features:
               precision    recall  f1-score   support

           0       0.75      0.92      0.83       277
           1       0.76      0.44      0.56       153

    accuracy                           0.75       430
   macro avg       0.75      0.68      0.69       430
weighted avg       0.75      0.75      0.73       430

Matriz de Confusão no Modelo Random Forest com as Melhores Features:
 [[255  22]
 [ 85  68]]


K-bests

In [29]:
knn.fit(X_best_train, y_best_train)
y_pred = knn.predict(X_best_test)
accuracy_best = accuracy_score(y_best_test, y_best_pred)
print("Acurácia do KNN com as Kbest: ", accuracy_best)

Acurácia do KNN com as Kbest:  0.8302325581395349
