In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

In [2]:
# Path of the raw dataset, relative to this notebook's working directory.
DATA_PATH = '../alzheimers_disease_data.csv'

# Load the whole CSV into the working DataFrame used by the following cells.
df = pd.read_csv(DATA_PATH)

In [3]:
# Show the loaded frame; pandas renders a truncated preview (first/last rows,
# middle columns elided) rather than the full table.
df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,3.785399,0,0,0,0,1,1,XXXConfid


In [4]:
# Remove PatientID and DoctorInCharge so they do not end up in the feature
# matrix that later cells build from every remaining non-label column.
# errors='ignore' keeps this cell idempotent: it reassigns `df` from itself,
# so re-running it without reloading the CSV would otherwise raise KeyError
# for the columns that are already gone.
df = df.drop(columns=['PatientID', 'DoctorInCharge'], errors='ignore')

In [5]:
# Tally NaN entries per column, then print the resulting Series.
missing_per_column = df.isna().sum()
print("Missing values:\n", missing_per_column)

Missing values:
 Age                          0
Gender                       0
Ethnicity                    0
EducationLevel               0
BMI                          0
Smoking                      0
AlcoholConsumption           0
PhysicalActivity             0
DietQuality                  0
SleepQuality                 0
FamilyHistoryAlzheimers      0
CardiovascularDisease        0
Diabetes                     0
Depression                   0
HeadInjury                   0
Hypertension                 0
SystolicBP                   0
DiastolicBP                  0
CholesterolTotal             0
CholesterolLDL               0
CholesterolHDL               0
CholesterolTriglycerides     0
MMSE                         0
FunctionalAssessment         0
MemoryComplaints             0
BehavioralProblems           0
ADL                          0
Confusion                    0
Disorientation               0
PersonalityChanges           0
DifficultyCompletingTasks    0
Forgetfulness         

In [24]:
# Feature matrix = every column except the label; label vector = 'Diagnosis'
X = df.drop(columns='Diagnosis')
y = df['Diagnosis']

# 80/20 hold-out split with a fixed seed, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise features: the scaler is fitted on the training split only and
# the same fitted statistics are then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Re-display the working frame (when the notebook is run top to bottom,
# PatientID and DoctorInCharge have already been dropped at this point).
df

In [8]:
# TRAINING

# Feature matrix = every column except the label; label vector = 'Diagnosis'
X = df.drop(columns='Diagnosis')
y = df['Diagnosis']

# 80/20 hold-out split with a fixed seed, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardise features: fit on the training split only, reuse on the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Model Selection and Training
# class_weight='balanced' is used to handle potential class imbalance
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight='balanced',
    random_state=42,
)
model.fit(X_train_scaled, y_train)

## Model Evaluation
# Hard labels and per-class probabilities for the held-out rows
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

print("\nModel Evaluation:")
print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)
print("\nConfusion Matrix:\n", test_confusion)

## Feature Importance
# One row per input column, ranked from most to least important
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': model.feature_importances_}
)
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

print("\nTop 10 Important Features:")
print(feature_importance.head(10))

## Save the model and scaler
# Each fitted object goes to its own pickle file in the working directory
for artifact_path, artifact in (('alzheimer_model.pkl', model), ('scaler.pkl', scaler)):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

print("\nModel and scaler saved as 'alzheimer_model.pkl' and 'scaler.pkl'")


Model Evaluation:
Accuracy: 0.9395348837209302

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95       278
           1       0.94      0.89      0.91       152

    accuracy                           0.94       430
   macro avg       0.94      0.93      0.93       430
weighted avg       0.94      0.94      0.94       430


Confusion Matrix:
 [[269   9]
 [ 17 135]]

Top 10 Important Features:
                     Feature  Importance
23      FunctionalAssessment    0.194282
26                       ADL    0.172172
22                      MMSE    0.136011
24          MemoryComplaints    0.097228
25        BehavioralProblems    0.050276
8                DietQuality    0.027288
20            CholesterolHDL    0.027000
21  CholesterolTriglycerides    0.026943
4                        BMI    0.026604
9               SleepQuality    0.026432

Model and scaler saved as 'alzheimer_model.pkl' and 'scaler.pkl'


In [None]:
# TESTING FUNCTION

def predict_with_probability(new_data, clf=None, feature_scaler=None):
    """Classify one patient record and return the probability of class 1.

    Parameters
    ----------
    new_data : dict
        Mapping of feature name -> value for a single patient. Key order does
        not matter when the scaler exposes ``feature_names_in_``; keys the
        scaler was not fitted on are ignored in that case.
    clf : fitted classifier, optional
        Object exposing ``predict``, ``predict_proba`` and ``classes_``.
        Defaults to the notebook-level ``model``.
    feature_scaler : fitted transformer, optional
        Object exposing ``transform`` (and, ideally, ``feature_names_in_``).
        Defaults to the notebook-level ``scaler``.

    Returns
    -------
    tuple
        ``(prediction, probability)``: the predicted class label for the
        record and the predicted probability of class ``1`` (Alzheimer's).

    Raises
    ------
    ValueError
        If ``new_data`` lacks a feature the scaler was fitted on, or if the
        classifier has no class labelled ``1``.
    """
    # Fall back to the objects created by the training cell.
    if clf is None:
        clf = model
    if feature_scaler is None:
        feature_scaler = scaler

    # Single-row frame; columns initially follow the dict's insertion order.
    new_data_df = pd.DataFrame([new_data])

    # Align columns with the layout seen at fit time. Without this, a dict
    # written in a different key order is scaled (and then classified) with
    # the wrong per-column statistics, or rejected by scikit-learn.
    expected = getattr(feature_scaler, 'feature_names_in_', None)
    if expected is not None:
        expected = [str(name) for name in expected]
        missing = [name for name in expected if name not in new_data_df.columns]
        if missing:
            raise ValueError(f"Missing feature(s) in new_data: {missing}")
        new_data_df = new_data_df[expected]

    new_data_scaled = feature_scaler.transform(new_data_df)

    # Make prediction and get probabilities
    prediction = clf.predict(new_data_scaled)[0]

    # predict_proba columns follow clf.classes_, so look up where class 1
    # sits instead of assuming it is the second column.
    positive_idx = list(clf.classes_).index(1)
    probability = clf.predict_proba(new_data_scaled)[0][positive_idx]

    return prediction, probability

In [22]:
# USAGE OF THE TESTING FUNCTION
# Sample patient records (feature name -> value) to pass to
# predict_with_probability.

# Labelled NEGATIVE - sample "from dihseek" (original note: "dari dihseek";
# presumably a record generated with DeepSeek - TODO confirm the source)
input_data_dih_seek = {
    'Age': 75,
    'Gender': 1,
    'Ethnicity': 2,
    'EducationLevel': 1,
    'BMI': 26.5,
    'Smoking': 0,
    'AlcoholConsumption': 10.2,
    'PhysicalActivity': 5.8,
    'DietQuality': 6.3,
    'SleepQuality': 7.5,
    'FamilyHistoryAlzheimers': 0,
    'CardiovascularDisease': 0,
    'Diabetes': 0,
    'Depression': 0,
    'HeadInjury': 0,
    'Hypertension': 1,
    'SystolicBP': 135,
    'DiastolicBP': 85,
    'CholesterolTotal': 220.5,
    'CholesterolLDL': 120.3,
    'CholesterolHDL': 55.2,
    'CholesterolTriglycerides': 180.4,
    'MMSE': 24.5,
    'FunctionalAssessment': 7.2,
    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
    'ADL': 2.1,
    'Confusion': 0,
    'Disorientation': 0,
    'PersonalityChanges': 0,
    'DifficultyCompletingTasks': 1,
    'Forgetfulness': 1
}

# Labelled POSITIVE - sample taken from the CSV (original note: "dari CSV").
# The variable name suffix matches the record's Age value (61).
input_data_61 = {
    'Age': 61,
    'Gender': 0,
    'Ethnicity': 0,
    'EducationLevel': 0,
    'BMI': 19.09,
    'Smoking': 0, 
    'AlcoholConsumption': 3.732,
    'PhysicalActivity': 0.37,
    'DietQuality': 4.125,
    'SleepQuality': 5.11,
    'FamilyHistoryAlzheimers': 0, 
    'CardiovascularDisease': 0, 
    'Diabetes': 1, 
    'Depression': 0, 
    'HeadInjury': 0, 
    'Hypertension': 0, 
    'SystolicBP': 171,
    'DiastolicBP': 115,
    'CholesterolTotal': 202.303,
    'CholesterolLDL': 128.555,
    'CholesterolHDL': 67.64,
    'CholesterolTriglycerides': 119.334,
    'MMSE': 4.95,
    'FunctionalAssessment': 3.18,
    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
    'ADL': 4.42,
    'Confusion': 0, 
    'Disorientation': 1, 
    'PersonalityChanges': 0, 
    'DifficultyCompletingTasks': 1, 
    'Forgetfulness': 1,
}

# Labelled POSITIVE - sample taken from the CSV (original note: "dari CSV").
# The variable name suffix matches the record's Age value (75).
input_data_75 = {
    'Age': 75,
    'Gender': 0,
    'Ethnicity': 0,
    'EducationLevel': 0,
    'BMI': 18.776,
    'Smoking': 0, 
    'AlcoholConsumption': 13.723,
    'PhysicalActivity': 4.469,
    'DietQuality': 8.341,
    'SleepQuality': 4.213,
    'FamilyHistoryAlzheimers': 0, 
    'CardiovascularDisease': 0, 
    'Diabetes': 0, 
    'Depression': 0, 
    'HeadInjury': 0, 
    'Hypertension': 0, 
    'SystolicBP': 117,
    'DiastolicBP': 63,
    'CholesterolTotal': 151.383,
    'CholesterolLDL': 69.623,
    'CholesterolHDL': 77.346,
    'CholesterolTriglycerides': 210.5,
    'MMSE': 10.139,
    'FunctionalAssessment': 3.401,
    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
    'ADL': 4.517,
    'Confusion': 1, 
    'Disorientation': 0, 
    'PersonalityChanges': 0, 
    'DifficultyCompletingTasks': 0, 
    'Forgetfulness': 1,
}

# Labelled NEGATIVE - sample taken from the CSV (original note: "dari CSV").
# The variable name suffix matches the record's Age value (78).
input_data_78 = {
    'Age': 78,
    'Gender': 0,
    'Ethnicity': 0,
    'EducationLevel': 2,
    'BMI': 22.463,
    'Smoking': 1, 
    'AlcoholConsumption': 19.300,
    'PhysicalActivity': 3.834,
    'DietQuality': 8.279,
    'SleepQuality': 8.312,
    'FamilyHistoryAlzheimers': 0, 
    'CardiovascularDisease': 0, 
    'Diabetes': 1, 
    'Depression': 0, 
    'HeadInjury': 0, 
    'Hypertension': 1, 
    'SystolicBP': 165,
    'DiastolicBP': 97,
    'CholesterolTotal': 254.586,
    'CholesterolLDL': 132.960,
    'CholesterolHDL': 39.009,
    'CholesterolTriglycerides': 344.448,
    'MMSE': 21.205,
    'FunctionalAssessment': 5.568,
    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
    'ADL': 5.467,
    'Confusion': 0, 
    'Disorientation': 1, 
    'PersonalityChanges': 0, 
    'DifficultyCompletingTasks': 1, 
    'Forgetfulness': 1,
}

# Run one sample record through the trained model and print the predicted
# label together with the class-1 probability formatted as a percentage.
prediction, probability = predict_with_probability(input_data_78) # Swap in whichever sample dict you want to test
print(f"\nExample Prediction: {prediction} (0 = No Alzheimer's, 1 = Alzheimer's)")
print(f"Probability of Alzheimer's: {probability:.2%}")


Example Prediction: 0 (0 = No Alzheimer's, 1 = Alzheimer's)
Probability of Alzheimer's: 5.05%
