In [1]:
import os
import torch
import random
import numpy as np
import pandas as pd 
import pickle

from sklearn.metrics import make_scorer
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score, matthews_corrcoef, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from ltn_imp.automation.knowledge_base import KnowledgeBase

In [2]:
# Suppress every warning raised from this point on in the session.
import warnings
warnings.filterwarnings('ignore')

# Find the Best Hyperparameters

In [3]:
# `seed` feeds the global RNGs here (and the CV splitters further down);
# `state` is the random_state handed to the scikit-learn estimators.
seed, state = 42, 123

# Seed every global random number generator this notebook relies on.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(seed)

# Restrict cuDNN to deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [4]:
# Read the imputed table; the first CSV column is used as the index.
dataset = pd.read_csv("datasets/pima_indians_imputed.csv", index_col=0)

# All columns but the last go to X, the last column goes to y.
X, y = dataset.iloc[:, :-1], dataset.iloc[:, -1]

In [None]:
# Set to True to rerun the grid search guarded by this flag and rewrite the
# pickled models; with False that search is skipped entirely.
initilize_models = False

In [5]:
if initilize_models:
    # Candidate estimators, keyed by the name used for the grid lookup and the
    # output file name.
    models = {
        'DecisionTree': DecisionTreeClassifier(random_state=state),
        'GradientBoosting': GradientBoostingClassifier(random_state=state),
        'MultiLayerPerceptron': MLPClassifier(random_state=state, max_iter=2000),
        'LogisticRegression': LogisticRegression(random_state=state, max_iter=1000),
        'RandomForest': RandomForestClassifier(random_state=state),
        'KNearestNeighbor': KNeighborsClassifier()
    }

    # Hyperparameter grid searched for each estimator (same keys as `models`).
    param_grids = {
        'DecisionTree': {
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [5, 10, 20],
            'min_samples_leaf': [5, 10]
        },
        'GradientBoosting': {
            'n_estimators': np.linspace(50, 250, 5).astype(int),
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5]
        },
        'MultiLayerPerceptron': {
            'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (30, 30, 30), (50, 30, 20), (100, 50, 25)],
            'activation': ['relu', 'tanh'],
            'solver': ['adam'],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
            'learning_rate': ['constant', 'adaptive']
        },
        'LogisticRegression': {
            'C': [0.01, 0.1, 1, 10],
            'penalty': ['l2'],
            'solver': ['lbfgs']
        },
        'RandomForest': {
            'n_estimators': np.linspace(50, 250, 5).astype(int),
            #'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth': [None, 5, 10, 15, 20],
            'min_samples_split': [5, 10, 20],
            'min_samples_leaf': [5, 10]
        },
        'KNearestNeighbor': {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        }
    }

    # Recall is the only metric used, both to rank grid candidates and to
    # compare outer folds.
    scorer = make_scorer(recall_score)

    # Outer folds provide held-out data; inner folds drive the grid search.
    outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

    # Create the output directory before the search starts, so a missing
    # folder cannot discard the results when the first model is pickled.
    os.makedirs("models", exist_ok=True)

    # Grid-search every estimator and pickle its best-scoring variant.
    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")

        best_recall = 0
        best_model = None

        for train_idx, test_idx in outer_cv.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

            # With a single scorer, `refit` only needs to be truthy: the best
            # candidate by that scorer is refitted on the whole training fold.
            clf = GridSearchCV(
                estimator=model,
                param_grid=param_grids[model_name],
                scoring=scorer,
                refit=True,
                cv=inner_cv,
                n_jobs=-1,
            )
            clf.fit(X_train, y_train)

            # NOTE(review): the estimator kept is the one from the outer fold
            # with the highest held-out recall, so the printed score is the
            # maximum over folds, not an average across them.
            current_recall = recall_score(y_test, clf.predict(X_test))
            if current_recall > best_recall:
                best_recall = current_recall
                best_model = clf.best_estimator_

        # best_model stays None when no fold reached a recall above 0.
        if best_model is not None:
            with open(f'models/{model_name}_best_model.pkl', 'wb') as f:
                pickle.dump(best_model, f)
            print(f"Best model for {model_name} saved with recall score of {best_recall:.4f}")
            print()

Evaluating DecisionTree...
Best model for DecisionTree saved with recall score of 0.8148

Evaluating GradientBoosting...
Best model for GradientBoosting saved with recall score of 0.7692

Evaluating MultiLayerPerceptron...
Best model for MultiLayerPerceptron saved with recall score of 0.7037

Evaluating LogisticRegression...
Best model for LogisticRegression saved with recall score of 0.8077

Evaluating RandomForest...
Best model for RandomForest saved with recall score of 0.7037

Evaluating KNearestNeighbor...
Best model for KNearestNeighbor saved with recall score of 0.7778



# Train and Evaluate Models

In [6]:
from sklearn.model_selection import train_test_split

seed = 42

# Re-read the complete table as floats. Despite the name, `test_data` holds
# every row, not just a test subset.
test_data = pd.read_csv('datasets/pima_indians_imputed.csv', index_col=0).astype(float)
y = test_data.iloc[:, -1]

# 50/50 split, stratified on the last column.
train_frame, test_frame = train_test_split(
    test_data, test_size=0.5, random_state=seed, stratify=y
)

# Persist both halves, last column included.
train_frame.to_csv('datasets/train.csv')
test_frame.to_csv('datasets/test.csv')

# Separate the last column from the rest for each half.
x_train, y_train = train_frame.iloc[:, :-1], train_frame.iloc[:, -1]
x_test, y_test = test_frame.iloc[:, :-1], test_frame.iloc[:, -1]

In [7]:
# Collect an unfitted copy of every pickled model stored in "models".
# NOTE(review): a later cell rebuilds `models` from scratch (and also picks up
# .pth files), so this list appears to be superseded before it is used.
models = []

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `models` identical across runs and machines.
for file_name in sorted(os.listdir("models")):
    if not file_name.endswith('.pkl'):
        continue

    file_path = os.path.join("models", file_name)

    # NOTE: pickle.load can execute arbitrary code; only load files that were
    # produced locally by this project.
    with open(file_path, 'rb') as file:
        trained_model = pickle.load(file)

    # Keep only the class and hyperparameters of the loaded model.
    model_class = trained_model.__class__
    model_params = trained_model.get_params()

    # Fresh instance with the same hyperparameters and no fitted state.
    new_model = model_class(**model_params)
    models.append(new_model)

In [8]:
def predict(model, x):
    """Return hard 0/1 predictions of `model` for the inputs `x`.

    Parameters
    ----------
    model : callable or object with a ``predict`` method
        A callable model (e.g. a ``torch.nn.Module``) is invoked as
        ``model(x)``; any other object is queried through ``model.predict(x)``.
    x : torch.Tensor or array-like
        Inputs; converted to a float32 tensor when needed.

    Returns
    -------
    torch.Tensor
        Float tensor holding 1.0 where the model output exceeds 0.5 and 0.0
        elsewhere, with the same shape as the model output.

    Notes
    -----
    If the model has an ``eval`` method it is called first and the model is
    left in that mode. Errors raised by the model propagate unchanged instead
    of being swallowed.
    """
    # Switch to evaluation mode only when the model actually offers it.
    set_eval = getattr(model, "eval", None)
    if callable(set_eval):
        set_eval()

    with torch.no_grad():  # inference only, no gradient tracking needed
        # Make sure the input is a float32 tensor.
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x, dtype=torch.float32)
        elif x.dtype != torch.float32:
            x = x.float()

        # Dispatch explicitly rather than via try/except, so a genuine failure
        # inside the forward pass is not hidden behind a fallback call.
        if callable(model):
            probs = model(x)
        else:
            probs = torch.as_tensor(model.predict(x))

        # Binary decision threshold at 0.5.
        preds = (probs > 0.5).float()
    return preds

def compute_metrics(model, data_loader):
    """Score `model` on every batch of `data_loader`.

    Parameters
    ----------
    model : object accepted by ``predict``
        Model to evaluate.
    data_loader : iterable of (data, labels) pairs
        Each element is converted to a float32 tensor when needed.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, balanced_accuracy, mcc)`` computed
        over all batches together.

    Raises
    ------
    ValueError
        If a batch yields a different number of predictions than labels.

    Notes
    -----
    If the model has a ``train`` method it is called before returning, so such
    a model is left in training mode.
    """
    all_true_labels = []
    all_predicted_labels = []

    with torch.no_grad():  # evaluation only
        for data, labels in data_loader:
            # Bring inputs and labels to float32 tensors.
            if not isinstance(data, torch.Tensor):
                data = torch.tensor(data, dtype=torch.float32)
            elif data.dtype != torch.float32:
                data = data.float()

            if not isinstance(labels, torch.Tensor):
                labels = torch.tensor(labels, dtype=torch.float32)
            elif labels.dtype != torch.float32:
                labels = labels.float()

            preds = predict(model, data)

            # Flatten with reshape(-1) rather than squeeze(): squeeze() turns a
            # one-sample batch into a 0-d tensor, which cannot be fed to
            # list.extend.
            predicted_labels = preds.reshape(-1)
            true_labels = labels.reshape(-1)

            if predicted_labels.shape != true_labels.shape:
                raise ValueError(
                    f"batch produced {predicted_labels.numel()} predictions "
                    f"for {true_labels.numel()} labels"
                )

            # Accumulate across batches so the metrics cover the whole loader.
            all_true_labels.extend(true_labels.cpu().numpy())
            all_predicted_labels.extend(predicted_labels.cpu().numpy())

    true_labels = np.array(all_true_labels)
    predicted_labels = np.array(all_predicted_labels)

    accuracy = accuracy_score(true_labels, predicted_labels)
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    precision = precision_score(true_labels, predicted_labels, zero_division=0)
    recall = recall_score(true_labels, predicted_labels, zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, zero_division=0)
    balanced_accuracy = balanced_accuracy_score(true_labels, predicted_labels)
    mcc = matthews_corrcoef(true_labels, predicted_labels)

    # predict() put the model in eval mode; switch back to training mode when
    # the model supports it.
    set_train = getattr(model, "train", None)
    if callable(set_train):
        set_train()

    return accuracy, precision, recall, f1, balanced_accuracy, mcc

In [9]:
# Build the knowledge base from its YAML configuration; later cells read its
# `loaders` and `test_loaders` attributes.
kb = KnowledgeBase("medical_config.yaml")

In [10]:
# Gather every model stored in "models": unfitted copies of the pickled
# estimators (.pkl) plus the serialized torch models (.pth).
models = []

# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `models` identical across runs and machines.
for file_name in sorted(os.listdir("models")):
    file_path = os.path.join("models", file_name)

    if file_name.endswith('.pkl'):
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # were produced locally by this project.
        with open(file_path, 'rb') as file:
            trained_model = pickle.load(file)

        # Keep only the class and hyperparameters of the loaded model.
        model_class = trained_model.__class__
        model_params = trained_model.get_params()

        # Fresh instance with the same hyperparameters and no fitted state.
        new_model = model_class(**model_params)
        models.append(new_model)

    elif file_name.endswith('.pth'):
        # The .pth files are presumably whole pickled modules rather than
        # state dicts, so full unpickling must be requested explicitly:
        # PyTorch 2.6 made weights_only=True the default. Same trust caveat
        # as for pickle.load above.
        models.append(torch.load(file_path, weights_only=False))


In [11]:
# Table behind the knowledge base's first loader; used below to fit the models.
train_table = kb.loaders[0].loader.dataset.data

feature_columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]

X = train_table[feature_columns]
Y = train_table['Outcome']

In [12]:
# One row per model (indexed by class name), one column per metric.
metric_names = ["Accuracy", "Precision", "Recall", "F1 Score", "Balanced Accuracy", "MCC"]
scores = pd.DataFrame([], columns=metric_names)

for model in models:
    # Only objects exposing `fit` are trained here; anything else is scored as loaded.
    if hasattr(model, "fit"):
        model.fit(X, Y)

    # compute_metrics returns its values in the same order as `metric_names`.
    scores.loc[type(model).__name__] = list(compute_metrics(model, kb.test_loaders[0]))

In [13]:
# Show the metrics rounded to three decimals, highest recall first.
scores.round(3).sort_values(by="Recall", ascending=False)

Unnamed: 0,Accuracy,Precision,Recall,F1 Score,Balanced Accuracy,MCC
Sequential,0.763,0.619,0.836,0.711,0.78,0.535
GradientBoostingClassifier,0.719,0.587,0.657,0.62,0.704,0.399
RandomForestClassifier,0.745,0.634,0.634,0.634,0.719,0.438
DecisionTreeClassifier,0.75,0.646,0.627,0.636,0.721,0.446
KNeighborsClassifier,0.721,0.599,0.612,0.605,0.696,0.39
LogisticRegression,0.747,0.653,0.59,0.62,0.711,0.433
MLPClassifier,0.708,0.62,0.425,0.504,0.643,0.319
