In [66]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [21]:
df = pd.read_csv('alzheimers_disease_data.csv')

In [3]:
df

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,6895,61,0,0,1,39.121757,0,1.561126,4.049964,6.555306,...,0,0,4.492838,1,0,0,0,0,1,XXXConfid
2145,6896,75,0,0,2,17.857903,0,18.767261,1.360667,2.904662,...,0,1,9.204952,0,0,0,0,0,1,XXXConfid
2146,6897,77,0,0,1,15.476479,0,4.594670,9.886002,8.120025,...,0,0,5.036334,0,0,0,0,0,1,XXXConfid
2147,6898,78,1,3,1,15.299911,0,8.674505,6.354282,1.263427,...,0,0,3.785399,0,0,0,0,1,1,XXXConfid


In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
PatientID,4751,4752,4753,4754,4755
Age,73,89,73,74,89
Gender,0,0,0,1,0
Ethnicity,0,0,3,0,0
EducationLevel,2,0,1,1,0
BMI,22.927749,26.827681,17.795882,33.800817,20.716974
Smoking,0,0,0,1,0
AlcoholConsumption,13.297218,4.542524,19.555085,12.209266,18.454356
PhysicalActivity,6.327112,7.619885,7.844988,8.428001,6.310461
DietQuality,1.347214,0.518767,1.826335,7.435604,0.795498


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PatientID,2149.0,5825.0,620.507185,4751.0,5288.0,5825.0,6362.0,6899.0
Age,2149.0,74.908795,8.990221,60.0,67.0,75.0,83.0,90.0
Gender,2149.0,0.506282,0.500077,0.0,0.0,1.0,1.0,1.0
Ethnicity,2149.0,0.697534,0.996128,0.0,0.0,0.0,1.0,3.0
EducationLevel,2149.0,1.286645,0.904527,0.0,1.0,1.0,2.0,3.0
BMI,2149.0,27.655697,7.217438,15.008851,21.611408,27.823924,33.869778,39.992767
Smoking,2149.0,0.288506,0.453173,0.0,0.0,0.0,1.0,1.0
AlcoholConsumption,2149.0,10.039442,5.75791,0.002003,5.13981,9.934412,15.157931,19.989293
PhysicalActivity,2149.0,4.920202,2.857191,0.003616,2.570626,4.766424,7.427899,9.987429
DietQuality,2149.0,4.993138,2.909055,0.009385,2.458455,5.076087,7.558625,9.998346


In [7]:
sum(df.duplicated())

0

In [64]:
df.DoctorInCharge.value_counts()

DoctorInCharge
XXXConfid    2149
Name: count, dtype: int64

In [8]:
# Training
target_column = 'Diagnosis'  
drop_columns = ['Patient ID', 'Doctor', 'Diagnosis']  
feature_columns = ['Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'ADL']

# Normalize then Standardize
min_max_scaler = MinMaxScaler()
df[feature_columns] = min_max_scaler.fit_transform(df[feature_columns])

standard_scaler = StandardScaler()
df[feature_columns] = standard_scaler.fit_transform(df[feature_columns])

# Save scalers
joblib.dump(min_max_scaler, 'min_max_scaler.pkl')
joblib.dump(standard_scaler, 'standard_scaler.pkl')

# 🧪 Split the data
X = df[feature_columns]
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ⚙️ Train the XGBoost model
model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)
model.fit(X_train, y_train)

# 🧠 Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# 💾 Save the model
joblib.dump(model, 'xgboost_model.pkl')

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       277
           1       0.81      0.58      0.67       153

    accuracy                           0.80       430
   macro avg       0.81      0.75      0.77       430
weighted avg       0.80      0.80      0.79       430



['xgboost_model.pkl']

In [53]:
# 🧩 Columns
target_column = 'Diagnosis'
drop_columns = ['Patient ID', 'Doctor', 'Diagnosis']

numerical_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality',
    'SleepQuality', 'SystolicBP', 'DiastolicBP', 'CholesterolTotal',
    'CholesterolLDL', 'CholesterolHDL', 'CholesterolTriglycerides',
    'MMSE', 'FunctionalAssessment', 'ADL'
]

boolean_columns = [
    'Smoking', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
    'Diabetes', 'Depression', 'HeadInjury', 'Hypertension',
    'MemoryComplaints', 'BehavioralProblems', 'Confusion',
    'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
    'Forgetfulness'
]

categorical_columns = ['Gender', 'Ethnicity', 'EducationLevel']

# ✅ Encode Booleans
df[boolean_columns] = df[boolean_columns].replace({'No': 0, 'Yes': 1})

# ✅ Label Encode Categoricals
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
joblib.dump(label_encoders, 'label_encoders.pkl')

# ✅ Combine features
feature_columns = numerical_columns + boolean_columns + categorical_columns

# 🔢 Normalize then Standardize
min_max_scaler = MinMaxScaler()
df[numerical_columns] = min_max_scaler.fit_transform(df[numerical_columns])

standard_scaler = StandardScaler()
df[numerical_columns] = standard_scaler.fit_transform(df[numerical_columns])

# 💾 Save scalers
joblib.dump(min_max_scaler, 'min_max_scaler_v2.pkl')
joblib.dump(standard_scaler, 'standard_scaler_v2.pkl')

# 🧪 Split
X = df[feature_columns]
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ⚙️ Train
model = XGBClassifier(n_estimators=120, learning_rate=0.05, max_depth=8, use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# 📊 Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# 💾 Save new model
joblib.dump(model, 'xgboost_model_v2.pkl')

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.79      0.94      0.86       277
           1       0.83      0.56      0.67       153

    accuracy                           0.80       430
   macro avg       0.81      0.75      0.76       430
weighted avg       0.81      0.80      0.79       430



['xgboost_model_v2.pkl']

In [54]:
input_data = {
    'Age': 75,
    'BMI': 18.776,
    'AlcoholConsumption': 13.723,
    'PhysicalActivity': 4.469,
    'DietQuality': 8.341,
    'SleepQuality': 4.213,
    'SystolicBP': 117,
    'DiastolicBP': 63,
    'CholesterolTotal': 151.383,
    'CholesterolLDL': 69.623,
    'CholesterolHDL': 77.346,
    'CholesterolTriglycerides': 210.5,
    'MMSE': 10.139,
    'FunctionalAssessment': 3.401,
    'ADL': 4.517,

    'Smoking': 0, 
    'FamilyHistoryAlzheimers': 0, 
    'CardiovascularDisease': 0, 
    'Diabetes': 0, 
    'Depression': 0, 
    'HeadInjury': 0, 
    'Hypertension': 0, 
    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
    'Confusion': 1, 
    'Disorientation': 0, 
    'PersonalityChanges': 0, 
    'DifficultyCompletingTasks': 0, 
    'Forgetfulness': 1,

    'Gender': 0,
    'Ethnicity': 0,
    'EducationLevel': 0
}

# Convert to DataFrame
df_input = pd.DataFrame([input_data])

# Columns based on your training pipeline
numerical_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity',
    'DietQuality', 'SleepQuality', 'SystolicBP', 'DiastolicBP',
    'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
    'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'ADL'
]

categorical_columns = [
    'Smoking', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
    'Diabetes', 'Depression', 'HeadInjury', 'Hypertension',
    'MemoryComplaints', 'BehavioralProblems', 'Confusion',
    'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
    'Forgetfulness', 'Gender', 'Ethnicity', 'EducationLevel'
]
model = joblib.load('xgboost_model_v2.pkl')
min_max_scaler = joblib.load('min_max_scaler_v2.pkl')
standard_scaler = joblib.load('standard_scaler_v2.pkl')
label_encoders = joblib.load('label_encoders.pkl')

df_num_scaled = min_max_scaler.transform(df_input[numerical_columns])
df_num_scaled = standard_scaler.transform(df_num_scaled)

# Step 2: Encode categorical columns if necessary (skipped here since you already use 0/1)
# If categorical variables were label encoded during training, do this:
# for col in categorical_columns:
#     le = label_encoders[col]
#     df_input[col] = le.transform(df_input[col])

# Step 3: Combine preprocessed features
X_processed = np.hstack([df_num_scaled, df_input[categorical_columns].values])

# Step 4: Predict
pred = model.predict(X_processed)[0]
proba = model.predict_proba(X_processed)[0]

# Output
print(f"Predicted class: {pred} (0 = No Alzheimer’s, 1 = Diagnosed)")
print(f"Probability of class 0: {proba[0]:.4f}")
print(f"Probability of class 1: {proba[1]:.4f}")

Predicted class: 0 (0 = No Alzheimer’s, 1 = Diagnosed)
Probability of class 0: 0.5679
Probability of class 1: 0.4321




In [58]:
# TRAINING + BEHAVIOR + MEMORY COMPLAINTS
target_column = 'Diagnosis'  
drop_columns = ['PatientID', 'DoctorInCharge', 'Diagnosis']  

# 🔧 Feature columns (updated)
feature_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
    'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
    'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'ADL',
    'MemoryComplaints', 'BehavioralProblems'  # ✅ Added
]

# 🔁 Encode booleans (assuming 'Yes'/'No' strings)
df['MemoryComplaints'] = df['MemoryComplaints'].map({'No': 0, 'Yes': 1})
df['BehavioralProblems'] = df['BehavioralProblems'].map({'No': 0, 'Yes': 1})

# ⚖️ Normalize and standardize
min_max_scaler = MinMaxScaler()
df[feature_columns] = min_max_scaler.fit_transform(df[feature_columns])

standard_scaler = StandardScaler()
df[feature_columns] = standard_scaler.fit_transform(df[feature_columns])

# 💾 Save the scalers
joblib.dump(min_max_scaler, 'min_max_scaler_v3.pkl') 
joblib.dump(standard_scaler, 'standard_scaler_v3.pkl') 

# 🧪 Split the data
X = df[feature_columns]
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ⚙️ Train the model
model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)
model.fit(X_train, y_train)

# 🧠 Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# 💾 Save the model
joblib.dump(model, 'xgboost_model_v3.pkl')

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       277
           1       0.81      0.58      0.67       153

    accuracy                           0.80       430
   macro avg       0.81      0.75      0.77       430
weighted avg       0.80      0.80      0.79       430



  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


['xgboost_model_v3.pkl']

In [74]:
# Load model and scalers
model = joblib.load('xgboost_model_v3.pkl')
min_max_scaler = joblib.load('min_max_scaler_v3.pkl')
standard_scaler = joblib.load('standard_scaler_v3.pkl')

# Define a sample input
input_data = {
    'Age': 71,
    'BMI': 16.608,
    'AlcoholConsumption': 4.867,
    'PhysicalActivity': 6.606,
    'DietQuality': 2.4,
    'SleepQuality': 8.85,
    'SystolicBP': 94,
    'DiastolicBP': 65,
    'CholesterolTotal': 158.964,
    'CholesterolLDL': 62.788,
    'CholesterolHDL': 70.804,
    'CholesterolTriglycerides': 272.666,
    'MMSE': 24.719,
    'FunctionalAssessment': 9.8104,
    'ADL': 0.85,

    'MemoryComplaints': 0,
    'BehavioralProblems': 0,
}

# Convert to DataFrame
input_df = pd.DataFrame([input_data])

# Define the correct column order
feature_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
    'SystolicBP', 'DiastolicBP', 'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
    'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'ADL',
    'MemoryComplaints', 'BehavioralProblems'
]

# Apply scaling
input_scaled = min_max_scaler.transform(input_df[feature_columns])
input_scaled = standard_scaler.transform(input_scaled)

# Predict
prediction = model.predict(input_scaled)
print("Predicted Diagnosis:", prediction[0])

probability = model.predict_proba(input_scaled)
print("Prediction Probabilities:", probability)

Predicted Diagnosis: 0
Prediction Probabilities: [[0.70603704 0.29396293]]


