In [57]:
import pandas as pd
import numpy as np
import os

In [58]:
training_df = pd.read_csv('./Datasets/training_dataset.csv')

In [59]:
training_df.columns

Index(['Unnamed: 0', 'Country', 'Age', 'Gender', 'Education Level', 'BMI',
       'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption',
       'Diabetes', 'Hypertension', 'Cholesterol Level',
       'Family History of Alzheimer’s', 'Cognitive Test Score',
       'Depression Level', 'Sleep Quality', 'Dietary Habits',
       'Air Pollution Exposure', 'Employment Status', 'Marital Status',
       'Genetic Risk Factor (APOE-ε4 allele)', 'Social Engagement Level',
       'Income Level', 'Stress Levels', 'Urban vs Rural Living',
       'Alzheimer's Diagnosis'],
      dtype='object')

In [60]:
# Set pandas display options to show full content of DataFrame without truncation
pd.set_option('display.max_rows', None)  # Show all rows (optional, for large datasets, adjust as needed)
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust width to prevent truncation
pd.set_option('display.max_colwidth', None)  # Show full content in each column

In [61]:
training_df.head()

Unnamed: 0.1,Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer's Diagnosis
0,3550,USA,59,Male,16,30.1,Medium,Current,Regularly,Yes,No,Normal,Yes,34,Medium,Average,Unhealthy,High,Retired,Widowed,No,Low,High,High,Rural,No
1,2903,USA,57,Male,7,32.5,Medium,Never,Regularly,No,No,Normal,No,60,Low,Poor,Healthy,High,Unemployed,Married,No,Medium,High,Low,Urban,No
2,263,USA,77,Male,16,27.1,Medium,Former,Occasionally,No,No,Normal,No,62,Low,Poor,Healthy,High,Employed,Single,No,Low,Medium,Medium,Urban,Yes
3,2631,USA,94,Female,4,27.7,Low,Current,Occasionally,No,No,High,No,95,Low,Average,Average,Medium,Retired,Married,No,Low,Medium,High,Rural,Yes
4,634,USA,63,Female,15,22.4,Medium,Current,Never,No,No,Normal,No,42,Low,Poor,Unhealthy,Low,Employed,Married,No,Low,Medium,Low,Urban,No


In [62]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Unnamed: 0                            1000 non-null   int64  
 1   Country                               1000 non-null   object 
 2   Age                                   1000 non-null   int64  
 3   Gender                                1000 non-null   object 
 4   Education Level                       1000 non-null   int64  
 5   BMI                                   1000 non-null   float64
 6   Physical Activity Level               1000 non-null   object 
 7   Smoking Status                        1000 non-null   object 
 8   Alcohol Consumption                   1000 non-null   object 
 9   Diabetes                              1000 non-null   object 
 10  Hypertension                          1000 non-null   object 
 11  Cholesterol Level 

In [63]:
training_df = training_df.drop(columns=['Unnamed: 0'])

In [64]:
# drop country column
training_df = training_df.drop(columns=['Country']) 

In [65]:
for column in training_df.columns:
    print(f"Unique values in '{column}':")
    print(training_df[column].unique())
    print("-" * 50)  

Unique values in 'Age':
[59 57 77 94 63 79 74 54 61 60 93 62 84 53 83 72 73 55 92 76 78 91 71 52
 51 90 50 70 80 58 82 68 56 67 87 81 69 85 75 64 88 89 86 65 66]
--------------------------------------------------
Unique values in 'Gender':
['Male' 'Female']
--------------------------------------------------
Unique values in 'Education Level':
[16  7  4 15  5 11  2  9 13 12 18  0 10 17  3  8  1  6 14 19]
--------------------------------------------------
Unique values in 'BMI':
[30.1 32.5 27.1 27.7 22.4 30.7 26.9 31.4 29.  34.4 25.3 24.8 21.5 31.7
 27.3 25.8 30.  33.2 30.9 34.9 22.3 28.2 31.8 21.1 20.4 33.  19.5 34.
 26.1 33.9 23.6 33.8 29.2 32.3 20.3 34.8 20.5 31.6 26.3 25.6 30.8 24.6
 21.2 20.  18.7 21.  23.5 32.1 21.6 33.4 22.2 25.2 18.9 26.2 22.5 28.8
 23.7 21.8 24.4 27.6 20.1 30.6 28.1 19.1 19.3 26.6 22.9 30.5 35.  33.5
 20.6 30.3 27.9 22.8 34.1 29.4 33.3 21.3 19.4 28.4 24.2 22.  28.3 34.7
 27.4 31.5 32.7 19.2 23.2 25.9 26.7 20.2 26.8 20.9 22.7 27.2 27.8 23.4
 30.2 23.3 21.4 31.2 3

In [66]:
# Mapping for ordinal variables
ordinal_mappings = {
    'Physical Activity Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Alcohol Consumption': {'Never': 0, 'Occasionally': 1, 'Regularly': 2},
    'Depression Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Sleep Quality': {'Poor': 0, 'Average': 1, 'Good': 2},
    'Dietary Habits': {'Unhealthy': 0, 'Average': 1, 'Healthy': 2},
    'Air Pollution Exposure': {'Low': 0, 'Medium': 1, 'High': 2},
    'Social Engagement Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Income Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Stress Levels': {'Low': 0, 'Medium': 1, 'High': 2},
    'Cholesterol Level': {'Normal': 0, 'High': 1},
   }

# Apply Label Encoding to all ordinal variables
for column, mapping in ordinal_mappings.items():
    training_df[column] = training_df[column].map(mapping)

# Label Encoding for binary columns (Yes/No) - Family History and Alzheimer's Diagnosis
binary_columns = ["Family History of Alzheimer’s", "Alzheimer's Diagnosis", 'Diabetes', 'Hypertension', 'Genetic Risk Factor (APOE-ε4 allele)']
training_df[binary_columns] = training_df[binary_columns].apply(lambda x: x.map({'Yes': 1, 'No': 0}))

# One-Hot Encode categorical columns with no natural order (e.g., Gender, Urban vs Rural)
columns_to_one_hot_encode = ['Gender', 'Urban vs Rural Living', 'Smoking Status', 'Marital Status', 'Employment Status', ]
training_df= pd.get_dummies(training_df, columns=columns_to_one_hot_encode, drop_first=True)



In [67]:
training_df.head()

Unnamed: 0,Age,Education Level,BMI,Physical Activity Level,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Alzheimer's Diagnosis,Gender_Male,Urban vs Rural Living_Urban,Smoking Status_Former,Smoking Status_Never,Marital Status_Single,Marital Status_Widowed,Employment Status_Retired,Employment Status_Unemployed
0,59,16,30.1,1,2,1,0,0,1,34,1,1,0,2,0,0,2,2,0,True,False,False,False,False,True,True,False
1,57,7,32.5,1,2,0,0,0,0,60,0,0,2,2,0,1,2,0,0,True,True,False,True,False,False,False,True
2,77,16,27.1,1,1,0,0,0,0,62,0,0,2,2,0,0,1,1,1,True,True,True,False,True,False,False,False
3,94,4,27.7,0,1,0,0,1,0,95,0,1,1,1,0,0,1,2,1,False,False,False,False,False,False,True,False
4,63,15,22.4,1,0,0,0,0,0,42,0,0,0,0,0,0,1,0,0,False,True,False,False,False,False,False,False


In [68]:
# Define the features (X) and target (y)
X = training_df.drop(columns=["Alzheimer's Diagnosis"])  # Features
y = training_df["Alzheimer's Diagnosis"]

In [69]:
from sklearn.preprocessing import StandardScaler

# Apply Feature Scaling to numerical columns (Age, BMI, Cognitive Test Score)
numerical_columns = ['Age', 'BMI', 'Cognitive Test Score']
scaler = StandardScaler()

X[numerical_columns] = scaler.fit_transform(X[numerical_columns])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
X.head()

Unnamed: 0,Age,Education Level,BMI,Physical Activity Level,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Gender_Male,Urban vs Rural Living_Urban,Smoking Status_Former,Smoking Status_Never,Marital Status_Single,Marital Status_Widowed,Employment Status_Retired,Employment Status_Unemployed
0,-0.982084,16,0.69086,1,2,1,0,0,1,-1.507817,1,1,0,2,0,0,2,2,True,False,False,False,False,True,True,False
1,-1.134476,7,1.193868,1,2,0,0,0,0,-0.193066,0,0,2,2,0,1,2,0,True,True,False,True,False,False,False,True
2,0.389435,16,0.0621,1,1,0,0,0,0,-0.091931,0,0,2,2,0,0,1,1,True,True,True,False,True,False,False,False
3,1.68476,4,0.187852,0,1,0,0,1,0,1.576791,0,1,1,1,0,0,1,2,False,False,False,False,False,False,True,False
4,-0.677302,15,-0.922956,1,0,0,0,0,0,-1.103278,0,0,0,0,0,0,1,0,False,True,False,False,False,False,False,False


In [75]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Define the models to compare
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine (SVM)': SVC(random_state=42, probability=True),
    'XGBoost': XGBClassifier(random_state=42)
}

# Define the metrics to evaluate
scoring_metrics = {
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1-Score': make_scorer(f1_score),
    'ROC-AUC': make_scorer(roc_auc_score, needs_proba=True)
}

# Store the results
results = {}

# Apply cross-validation and compute each metric
for name, model in models.items():
    model_results = {}
    
    for metric_name, metric_scorer in scoring_metrics.items():
        cv_scores = cross_val_score(model, X, y, cv=5, scoring=metric_scorer)  # 5-fold cross-validation
        model_results[metric_name] = cv_scores.mean()  # Store the average score for each metric
    
    results[name] = model_results

# Print the results
print("Model Comparison (Cross-Validation Metrics):")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, score in metrics.items():
        print(f"  {metric_name}: {score:.4f}")



Model Comparison (Cross-Validation Metrics):

Logistic Regression:
  Precision: 0.6402
  Recall: 0.5451
  F1-Score: 0.5882
  ROC-AUC: 0.7768

Random Forest:
  Precision: 0.6765
  Recall: 0.5449
  F1-Score: 0.6012
  ROC-AUC: 0.7684

Support Vector Machine (SVM):
  Precision: 0.6261
  Recall: 0.4628
  F1-Score: 0.5303
  ROC-AUC: 0.7439

XGBoost:
  Precision: 0.6080
  Recall: 0.5680
  F1-Score: 0.5844
  ROC-AUC: 0.7349


In [77]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Max depth of the trees
    'min_samples_split': [2, 5, 10],  # Min samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Min samples required at leaf nodes
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Setup GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model on training data
grid_search.fit(X, y)

# Get the best parameters from GridSearchCV
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Evaluate the best model on the test set
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluate the model
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# ROC-AUC
roc_auc = roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot

NameError: name 'X_test' is not defined