In [79]:
import pandas as pd
import numpy as np
import os

In [80]:
training_df = pd.read_csv('./Datasets/alzheimers_prediction_dataset_usa.csv')

In [81]:
training_df.columns

Index(['Unnamed: 0', 'Country', 'Age', 'Gender', 'Education Level', 'BMI',
       'Physical Activity Level', 'Smoking Status', 'Alcohol Consumption',
       'Diabetes', 'Hypertension', 'Cholesterol Level',
       'Family History of Alzheimer’s', 'Cognitive Test Score',
       'Depression Level', 'Sleep Quality', 'Dietary Habits',
       'Air Pollution Exposure', 'Employment Status', 'Marital Status',
       'Genetic Risk Factor (APOE-ε4 allele)', 'Social Engagement Level',
       'Income Level', 'Stress Levels', 'Urban vs Rural Living',
       'Alzheimer's Diagnosis'],
      dtype='object')

In [82]:
# Set pandas display options to show full content of DataFrame without truncation
pd.set_option('display.max_rows', None)  # Show all rows (optional, for large datasets, adjust as needed)
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', None)  # Adjust width to prevent truncation
pd.set_option('display.max_colwidth', None)  # Show full content in each column

In [83]:
training_df.head()

Unnamed: 0.1,Unnamed: 0,Country,Age,Gender,Education Level,BMI,Physical Activity Level,Smoking Status,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Employment Status,Marital Status,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Urban vs Rural Living,Alzheimer's Diagnosis
0,1,USA,61,Female,13,33.1,Medium,Former,Never,No,Yes,High,No,51,High,Good,Healthy,High,Employed,Married,No,High,Low,Low,Urban,No
1,2,USA,55,Male,16,29.9,Medium,Former,Occasionally,No,No,High,No,48,Medium,Poor,Healthy,Low,Retired,Widowed,No,Low,Medium,Low,Rural,No
2,3,USA,72,Male,2,31.5,Low,Former,Regularly,No,No,Normal,No,75,High,Poor,Average,High,Employed,Widowed,No,High,High,Medium,Urban,No
3,4,USA,57,Female,0,30.7,Low,Current,Regularly,No,Yes,High,Yes,58,High,Good,Unhealthy,Low,Employed,Single,No,High,Medium,Medium,Rural,No
4,5,USA,69,Female,12,19.7,High,Former,Regularly,No,No,High,No,83,Medium,Good,Average,Medium,Retired,Married,No,Low,Medium,Low,Urban,No


In [84]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3616 entries, 0 to 3615
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Unnamed: 0                            3616 non-null   int64  
 1   Country                               3616 non-null   object 
 2   Age                                   3616 non-null   int64  
 3   Gender                                3616 non-null   object 
 4   Education Level                       3616 non-null   int64  
 5   BMI                                   3616 non-null   float64
 6   Physical Activity Level               3616 non-null   object 
 7   Smoking Status                        3616 non-null   object 
 8   Alcohol Consumption                   3616 non-null   object 
 9   Diabetes                              3616 non-null   object 
 10  Hypertension                          3616 non-null   object 
 11  Cholesterol Level

In [85]:
training_df = training_df.drop(columns=['Unnamed: 0'])

In [86]:
# drop country column
training_df = training_df.drop(columns=['Country']) 

In [87]:
for column in training_df.columns:
    print(f"Unique values in '{column}':")
    print(training_df[column].unique())
    print("-" * 50)  

Unique values in 'Age':
[61 55 72 57 69 65 93 90 89 74 86 64 59 79 56 84 51 87 66 71 80 67 58 63
 85 54 53 75 73 82 91 83 94 92 52 76 81 50 77 88 68 78 60 70 62]
--------------------------------------------------
Unique values in 'Gender':
['Female' 'Male']
--------------------------------------------------
Unique values in 'Education Level':
[13 16  2  0 12 10 15 17  1  9  7  8 19  5  6  3 14 18 11  4]
--------------------------------------------------
Unique values in 'BMI':
[33.1 29.9 31.5 30.7 19.7 19.5 23.6 29.4 32.1 33.4 34.2 24.4 22.3 23.1
 21.3 20.8 27.5 31.  34.1 19.2 18.8 27.6 20.5 21.8 19.1 27.9 26.6 20.2
 22.4 24.5 25.9 33.6 23.9 23.7 20.1 24.3 29.1 30.4 31.9 30.6 27.3 29.5
 30.8 24.2 23.4 25.6 20.6 25.2 23.5 32.4 27.  32.7 29.3 20.  28.  33.9
 34.7 20.7 34.8 24.8 30.1 28.2 23.3 19.3 31.4 28.9 24.1 31.2 23.2 29.
 31.8 28.5 27.1 19.9 33.5 34.9 22.6 32.3 19.6 28.4 21.7 31.3 22.8 18.6
 22.  30.  29.2 24.6 32.  32.8 26.9 33.  25.8 19.  28.3 33.8 18.9 19.8
 26.3 30.3 26.1 28.1 2

In [88]:
# Mapping for ordinal variables
ordinal_mappings = {
    'Physical Activity Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Alcohol Consumption': {'Never': 0, 'Occasionally': 1, 'Regularly': 2},
    'Depression Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Sleep Quality': {'Poor': 0, 'Average': 1, 'Good': 2},
    'Dietary Habits': {'Unhealthy': 0, 'Average': 1, 'Healthy': 2},
    'Air Pollution Exposure': {'Low': 0, 'Medium': 1, 'High': 2},
    'Social Engagement Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Income Level': {'Low': 0, 'Medium': 1, 'High': 2},
    'Stress Levels': {'Low': 0, 'Medium': 1, 'High': 2},
    'Cholesterol Level': {'Normal': 0, 'High': 1},
   }

# Apply Label Encoding to all ordinal variables
for column, mapping in ordinal_mappings.items():
    training_df[column] = training_df[column].map(mapping)

# Label Encoding for binary columns (Yes/No) - Family History and Alzheimer's Diagnosis
binary_columns = ["Family History of Alzheimer’s", "Alzheimer's Diagnosis", 'Diabetes', 'Hypertension', 'Genetic Risk Factor (APOE-ε4 allele)']
training_df[binary_columns] = training_df[binary_columns].apply(lambda x: x.map({'Yes': 1, 'No': 0}))

# One-Hot Encode categorical columns with no natural order (e.g., Gender, Urban vs Rural)
columns_to_one_hot_encode = ['Gender', 'Urban vs Rural Living', 'Smoking Status', 'Marital Status', 'Employment Status', ]
training_df= pd.get_dummies(training_df, columns=columns_to_one_hot_encode, drop_first=True)



In [89]:
training_df.head()

Unnamed: 0,Age,Education Level,BMI,Physical Activity Level,Alcohol Consumption,Diabetes,Hypertension,Cholesterol Level,Family History of Alzheimer’s,Cognitive Test Score,Depression Level,Sleep Quality,Dietary Habits,Air Pollution Exposure,Genetic Risk Factor (APOE-ε4 allele),Social Engagement Level,Income Level,Stress Levels,Alzheimer's Diagnosis,Gender_Male,Urban vs Rural Living_Urban,Smoking Status_Former,Smoking Status_Never,Marital Status_Single,Marital Status_Widowed,Employment Status_Retired,Employment Status_Unemployed
0,61,13,33.1,1,0,0,1,1,0,51,2,2,2,2,0,2,0,0,0,False,True,True,False,False,False,False,False
1,55,16,29.9,1,1,0,0,1,0,48,1,0,2,0,0,0,1,0,0,True,False,True,False,False,True,True,False
2,72,2,31.5,0,2,0,0,0,0,75,2,0,1,2,0,2,2,1,0,True,True,True,False,False,True,False,False
3,57,0,30.7,0,2,0,1,1,1,58,2,2,0,0,0,2,1,1,0,False,False,False,False,True,False,False,False
4,69,12,19.7,2,2,0,0,1,0,83,1,2,1,1,0,0,1,0,0,False,True,True,False,False,False,True,False


In [90]:
# Define the features (X) and target (y)
X = training_df.drop(columns=["Alzheimer's Diagnosis"])  # Features
y = training_df["Alzheimer's Diagnosis"]

In [91]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Assuming you've done the train-test split already:
# X_train, X_test, y_train, y_test

# Scale the features (Important for SVM and other distance-based models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models to compare
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine (SVM)': SVC(random_state=42, probability=True),
    'XGBoost': XGBClassifier(random_state=42)
}

# Define the metrics to evaluate
scoring_metrics = {
    'Precision': make_scorer(precision_score),
    'Recall': make_scorer(recall_score),
    'F1-Score': make_scorer(f1_score),
    'ROC-AUC': make_scorer(roc_auc_score, needs_proba=True)
}

# Store the results
results = {}

# Apply cross-validation and compute each metric
for name, model in models.items():
    model_results = {}
    
    for metric_name, metric_scorer in scoring_metrics.items():
        if metric_name == 'ROC-AUC':  # ROC-AUC needs the probability predictions
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring=metric_scorer, n_jobs=-1)
        else:
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring=metric_scorer, n_jobs=-1)
        model_results[metric_name] = cv_scores.mean()  # Store the average score for each metric
    
    results[name] = model_results

# Print the results
print("Model Comparison (Cross-Validation Metrics):")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, score in metrics.items():
        print(f"  {metric_name}: {score:.4f}")



Model Comparison (Cross-Validation Metrics):

Logistic Regression:
  Precision: 0.6506
  Recall: 0.5632
  F1-Score: 0.6033
  ROC-AUC: 0.7810

Random Forest:
  Precision: 0.6403
  Recall: 0.5525
  F1-Score: 0.5929
  ROC-AUC: 0.7767

Support Vector Machine (SVM):
  Precision: 0.6376
  Recall: 0.5606
  F1-Score: 0.5960
  ROC-AUC: 0.7652

XGBoost:
  Precision: 0.5970
  Recall: 0.5676
  F1-Score: 0.5817
  ROC-AUC: 0.7464


In [92]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Assuming you've done the train-test split already:
# X_train, X_test, y_train, y_test

# Define the Random Forest model
rf = RandomForestClassifier(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees
    'max_depth': [None, 10, 20, 30],  # Max depth of the trees
    'min_samples_split': [2, 5, 10],  # Min samples required to split a node
    'min_samples_leaf': [1, 2, 4],    # Min samples required at leaf nodes
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Setup GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from GridSearchCV
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Get the best model after hyperparameter tuning
best_rf = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_rf.predict(X_test)

# Print the classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Calculate and print the accuracy score on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# Calculate and print the ROC-AUC score on the test set
roc_auc = roc_auc_score(y_test, best_rf.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.4s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.8s
[CV] END bootstrap=True, max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; tot

In [93]:
# Define the Logistic Regression model
log_reg = LogisticRegression(random_state=42)

# Define the hyperparameters to tune
param_grid = {
    'C': [0.1, 1, 10, 100],  # Inverse of regularization strength
    'penalty': ['l1', 'l2'],  # Type of regularization
    'solver': ['liblinear', 'saga'],  # Solver for optimization
    'max_iter': [100, 200, 300]  # Maximum number of iterations
}

# Setup GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(estimator=log_reg, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters from GridSearchCV
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Get the best model after hyperparameter tuning
best_log_reg = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred = best_log_reg.predict(X_test)

# Print the classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Calculate and print the accuracy score on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# Calculate and print the ROC-AUC score on the test set
roc_auc = roc_auc_score(y_test, best_log_reg.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV] END ..C=0.1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=0.1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=0.1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END .......C=0.1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END .......C=0.1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END .......C=0.1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END ..C=0.1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=0.1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=0.1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s




[CV] END ..C=0.1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=0.1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .......C=0.1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END .......C=0.1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END .......C=0.1, max_iter=100, penalty=l2, solver=saga; total time=   0.1s
[CV] END .......C=0.1, max_iter=100, penalty=l2, solver=saga; total time=   0.1s
[CV] END ..C=0.1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.1s




[CV] END .......C=0.1, max_iter=100, penalty=l2, solver=saga; total time=   0.2s
[CV] END .......C=0.1, max_iter=100, penalty=l2, solver=saga; total time=   0.2s
[CV] END .......C=0.1, max_iter=100, penalty=l2, solver=saga; total time=   0.2s
[CV] END ..C=0.1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=0.1, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=0.1, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=0.1, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=0.1, max_iter=200, penalty=l2, solver=liblinear; total time=   0.1s




[CV] END .......C=0.1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .......C=0.1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .......C=0.1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .......C=0.1, max_iter=200, penalty=l2, solver=saga; total time=   0.3s
[CV] END .......C=0.1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .......C=0.1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END ..C=0.1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END .......C=0.1, max_iter=200, penalty=l2, solver=saga; total time=   0.3s
[CV] END ..C=0.1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ..C=0.1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END .......C=0.1, max_iter=200, penalty=l2, solver=saga; total time=   0.3s
[CV] END ..C=0.1, max_iter=3



[CV] END .......C=0.1, max_iter=300, penalty=l1, solver=saga; total time=   0.5s
[CV] END .......C=0.1, max_iter=300, penalty=l1, solver=saga; total time=   0.5s
[CV] END .......C=0.1, max_iter=300, penalty=l1, solver=saga; total time=   0.5s
[CV] END .......C=0.1, max_iter=300, penalty=l1, solver=saga; total time=   0.5s
[CV] END .......C=0.1, max_iter=300, penalty=l1, solver=saga; total time=   0.5s
[CV] END .......C=0.1, max_iter=300, penalty=l2, solver=saga; total time=   0.4s
[CV] END .......C=0.1, max_iter=300, penalty=l2, solver=saga; total time=   0.4s




[CV] END .......C=0.1, max_iter=300, penalty=l2, solver=saga; total time=   0.6s
[CV] END ....C=1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .......C=0.1, max_iter=300, penalty=l2, solver=saga; total time=   0.4s
[CV] END ....C=1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .........C=1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .......C=0.1, max_iter=300, penalty=l2, solver=saga; total time=   0.4s




[CV] END .........C=1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END .........C=1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .........C=1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END ....C=1, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .........C=1, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END .........C=1, max_iter=100, penalty=l2, solver=saga; total time=   0.1s
[CV] END .........C=1, max_iter=100, penalty=l2, solver=saga; total time=   0.1s
[CV] END .........C=1, max_iter=100, penalty=l2, solver=saga; total time=   0.2s
[CV] END .........C=1, max_iter=100, penalty=l2, solver=saga; total time=   0.1s
[CV] END ....C=1, max_iter=200, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END .........C=1, max_iter=100, penalty=l2, solver=saga; total time=   0.2s
[CV] END ....C=1, max_iter=2



[CV] END .........C=1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .........C=1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .........C=1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .........C=1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END .........C=1, max_iter=200, penalty=l1, solver=saga; total time=   0.4s




[CV] END .........C=1, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END ....C=1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END .........C=1, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END .........C=1, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END ....C=1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END ....C=1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END .........C=1, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END .........C=1, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END ....C=1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END ....C=1, max_iter=300, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END ....C=1, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ....C=1, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ....C=1, max_iter=3



[CV] END .........C=1, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END .........C=1, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END .........C=1, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END .........C=1, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END .........C=1, max_iter=300, penalty=l2, solver=saga; total time=   0.4s
[CV] END .........C=1, max_iter=300, penalty=l2, solver=saga; total time=   0.5s
[CV] END .........C=1, max_iter=300, penalty=l2, solver=saga; total time=   0.5s
[CV] END ...C=10, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END .........C=1, max_iter=300, penalty=l1, solver=saga; total time=   0.8s
[CV] END ...C=10, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=1



[CV] END ...C=10, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .........C=1, max_iter=300, penalty=l2, solver=saga; total time=   0.4s
[CV] END ........C=10, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END ...C=10, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=100, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ........C=10, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END ...C=10, max_iter=100, penalty=l2, solver=liblinear; total time=   0.1s




[CV] END ........C=10, max_iter=100, penalty=l1, solver=saga; total time=   0.3s
[CV] END ........C=10, max_iter=100, penalty=l1, solver=saga; total time=   0.3s
[CV] END ........C=10, max_iter=100, penalty=l2, solver=saga; total time=   0.3s
[CV] END ........C=10, max_iter=100, penalty=l2, solver=saga; total time=   0.3s
[CV] END ........C=10, max_iter=100, penalty=l2, solver=saga; total time=   0.3s
[CV] END ........C=10, max_iter=100, penalty=l2, solver=saga; total time=   0.3s
[CV] END ........C=10, max_iter=100, penalty=l2, solver=saga; total time=   0.3s




[CV] END ...C=10, max_iter=200, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END ...C=10, max_iter=200, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=200, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END ...C=10, max_iter=200, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ...C=10, max_iter=200, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END ...C=10, max_iter=200, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=200, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ...C=10, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ........C=10, max_iter=200, penalty=l1, solver=saga; total time=   0.5s
[CV] END ........C=10, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END ........C=10, max_i



[CV] END ........C=10, max_iter=200, penalty=l1, solver=saga; total time=   0.5s
[CV] END ........C=10, max_iter=200, penalty=l1, solver=saga; total time=   0.4s
[CV] END ........C=10, max_iter=200, penalty=l2, solver=saga; total time=   0.3s
[CV] END ........C=10, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END ........C=10, max_iter=200, penalty=l2, solver=saga; total time=   0.4s




[CV] END ...C=10, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=300, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END ........C=10, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END ........C=10, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END ...C=10, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ...C=10, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ...C=10, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s




[CV] END ........C=10, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END ........C=10, max_iter=300, penalty=l1, solver=saga; total time=   0.5s
[CV] END ........C=10, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END ........C=10, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END ........C=10, max_iter=300, penalty=l2, solver=saga; total time=   0.4s
[CV] END ........C=10, max_iter=300, penalty=l2, solver=saga; total time=   0.4s
[CV] END ........C=10, max_iter=300, penalty=l2, solver=saga; total time=   0.4s
[CV] END ........C=10, max_iter=300, penalty=l1, solver=saga; total time=   0.7s
[CV] END ..C=100, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=100, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=1



[CV] END ..C=100, max_iter=100, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END .......C=100, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END .......C=100, max_iter=100, penalty=l1, solver=saga; total time=   0.2s
[CV] END ..C=100, max_iter=100, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END .......C=100, max_iter=100, penalty=l2, solver=saga; total time=   0.1s
[CV] END .......C=100, max_iter=100, penalty=l2, solver=saga; total time=   0.1s
[CV] END .......C=100, max_iter=100, penalty=l2, solver=saga; total time=   0.2s
[CV] END .......C=100, max_iter=100, penalty=l2, solver=saga; total time=   0.2s




[CV] END ..C=100, max_iter=200, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END .......C=100, max_iter=100, penalty=l2, solver=saga; total time=   0.2s
[CV] END ..C=100, max_iter=200, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=200, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=200, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=100, max_iter=200, penalty=l1, solver=liblinear; total time=   0.3s
[CV] END ..C=100, max_iter=200, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ..C=100, max_iter=200, penalty=l1, solver=liblinear; total time=   0.4s
[CV] END ..C=100, max_iter=200, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ..C=100, max_iter=200, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ..C=100, max_iter=200, penalty=l2, solver=liblinear; total time=   0.1s




[CV] END .......C=100, max_iter=200, penalty=l1, solver=saga; total time=   0.6s
[CV] END .......C=100, max_iter=200, penalty=l1, solver=saga; total time=   0.6s
[CV] END .......C=100, max_iter=200, penalty=l1, solver=saga; total time=   0.7s
[CV] END .......C=100, max_iter=200, penalty=l1, solver=saga; total time=   0.7s
[CV] END .......C=100, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END .......C=100, max_iter=200, penalty=l1, solver=saga; total time=   0.8s
[CV] END .......C=100, max_iter=200, penalty=l2, solver=saga; total time=   0.5s




[CV] END .......C=100, max_iter=200, penalty=l2, solver=saga; total time=   0.4s
[CV] END .......C=100, max_iter=200, penalty=l2, solver=saga; total time=   0.3s
[CV] END ..C=100, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END ..C=100, max_iter=300, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END .......C=100, max_iter=200, penalty=l2, solver=saga; total time=   0.3s
[CV] END ..C=100, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=100, max_iter=300, penalty=l2, solver=liblinear; total time=   0.0s
[CV] END ..C=100, max_iter=300, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ..C=100, max_iter=300, penalty=l2, solver=liblinear; total time=   0.1s
[CV] END ..C=100, max_iter=3



[CV] END .......C=100, max_iter=300, penalty=l1, solver=saga; total time=   0.6s
[CV] END .......C=100, max_iter=300, penalty=l1, solver=saga; total time=   0.7s
[CV] END .......C=100, max_iter=300, penalty=l1, solver=saga; total time=   0.8s
[CV] END .......C=100, max_iter=300, penalty=l2, solver=saga; total time=   0.6s
[CV] END .......C=100, max_iter=300, penalty=l1, solver=saga; total time=   0.8s
[CV] END .......C=100, max_iter=300, penalty=l2, solver=saga; total time=   0.7s
[CV] END .......C=100, max_iter=300, penalty=l1, solver=saga; total time=   0.8s
[CV] END .......C=100, max_iter=300, penalty=l2, solver=saga; total time=   0.6s
[CV] END .......C=100, max_iter=300, penalty=l2, solver=saga; total time=   0.3s
[CV] END .......C=100, max_iter=300, penalty=l2, solver=saga; total time=   0.3s
Best Hyperparameters: {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}

Classification Report:
               precision    recall  f1-score   support

           0       0



In [94]:
from sklearn.ensemble import VotingClassifier
  
log_reg = LogisticRegression(random_state=42)
rf = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42)
  
voting_clf = VotingClassifier(estimators=[('log_reg', log_reg), ('rf', rf), ('xgb', xgb)], voting='hard')
voting_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [95]:
from sklearn.preprocessing import PolynomialFeatures



# Create polynomial features (degree=2, interaction only)
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)  # We transform the test set using the same poly features

# Now, let's train the Logistic Regression model on the polynomial features
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_poly, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test_poly)

# Evaluate the model
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# ROC-AUC score
roc_auc = roc_auc_score(y_test, log_reg.predict_proba(X_test_poly)[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.76      0.74       443
           1       0.58      0.53      0.55       281

    accuracy                           0.67       724
   macro avg       0.65      0.64      0.64       724
weighted avg       0.66      0.67      0.67       724


Test Set Accuracy: 0.6685

ROC-AUC: 0.7354


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [96]:




# Create polynomial features (degree=2, interaction only)
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)  # We transform the test set using the same poly features

# Scale the features (important for convergence in Logistic Regression)
scaler = StandardScaler()
X_train_poly_scaled = scaler.fit_transform(X_train_poly)
X_test_poly_scaled = scaler.transform(X_test_poly)

# Train the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000, solver='saga')  # Increased max_iter and changed solver to 'saga'
log_reg.fit(X_train_poly_scaled, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test_poly_scaled)

# Evaluate the model
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# ROC-AUC score
roc_auc = roc_auc_score(y_test, log_reg.predict_proba(X_test_poly_scaled)[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.78      0.75       443
           1       0.60      0.51      0.55       281

    accuracy                           0.68       724
   macro avg       0.66      0.64      0.65       724
weighted avg       0.67      0.68      0.67       724


Test Set Accuracy: 0.6754

ROC-AUC: 0.7138


In [97]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import ParameterSampler, KFold
from sklearn.metrics import recall_score
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

# Assuming X_train, y_train are already defined, where:
# X_train is the feature matrix (pandas DataFrame or numpy array)
# y_train is the target variable (pandas Series or numpy array)

# Define parameter distribution for GradientBoostingClassifier
param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'subsample': [0.5, 0.75, 1.0]
}

# Generate random combinations of parameters
n_iter = 50  # Number of random configurations to try
param_sampler = list(ParameterSampler(param_dist, n_iter=n_iter, random_state=42))

# Initialize variables
best_params = None
best_score = -np.inf

# KFold Cross-validation setup
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Start custom search with tqdm for progress bar
print("Starting Randomized Search with tqdm...")
for params in tqdm(param_sampler, desc="Randomized Search Progress"):
    model = GradientBoostingClassifier(random_state=42, **params)
    scores = []

    # Cross-validation loop
    for train_idx, val_idx in kf.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Train the model
        model.fit(X_train_fold, y_train_fold)

        # Predict on validation set and calculate recall
        y_pred = model.predict(X_val_fold)
        recall = recall_score(y_val_fold, y_pred, average='macro')  # Macro-averaged recall
        scores.append(recall)

    # Compute mean recall for this parameter set
    mean_recall = np.mean(scores)

    # Update best parameters if current mean recall is better
    if mean_recall > best_score:
        best_score = mean_recall
        best_params = params

# Output the best parameters and score
print("Best Parameters:", best_params)
print("Best Recall (macro):", best_score)

Starting Randomized Search with tqdm...


Randomized Search Progress:   0%|          | 0/50 [00:00<?, ?it/s]

Best Parameters: {'subsample': 1.0, 'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 3, 'max_depth': 3, 'learning_rate': 0.05}
Best Recall (macro): 0.7149399869821046


In [98]:
# Import necessary libraries
from imblearn.over_sampling import SMOTE


# Assuming you have already split your data into X_train, X_test, y_train, and y_test

# Apply SMOTE to the training data to balance the classes
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Initialize the Gradient Boosting Classifier
model = GradientBoostingClassifier(random_state=42)

# Train the model on the resampled data
model.fit(X_train_smote, y_train_smote)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")

# ROC-AUC score
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"\nROC-AUC: {roc_auc:.4f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.72      0.74       443
           1       0.59      0.64      0.62       281

    accuracy                           0.69       724
   macro avg       0.68      0.68      0.68       724
weighted avg       0.70      0.69      0.69       724


Test Set Accuracy: 0.6906

ROC-AUC: 0.7527
