In [1]:
# import section
import pandas as pd
import pickle
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, roc_auc_score
from pyfume import Clustering, AntecedentEstimator, FireStrengthCalculator, ConsequentEstimator, SugenoFISBuilder, SugenoFISTester

In [2]:
# Restore the decision threshold that an earlier step pickled to disk
with open('model_training/optimal_decision_threshold.pkl', 'rb') as threshold_file:
    optimal_decision_threshold = pickle.load(threshold_file)

In [3]:
# load best hyperparameters from pkl file into pandas dataframe

best_hyperparameters = pd.read_pickle('model_training/best_hyperparameters.pkl')

# Extract the first row from the 'params' column
params_dict = best_hyperparameters['params'].iloc[0]

# Fail fast with a clear message: every later cell depends on the variables
# created below, so continuing without them would only surface as an unrelated
# NameError (or silently reuse stale values left in the kernel).
if not isinstance(params_dict, dict):
    raise TypeError("Error: The 'params' column does not contain a dictionary.")

# Expose each hyperparameter as a notebook-level variable named after its key.
# globals() is used explicitly: writing through locals() only works at module
# scope and would silently do nothing if this code were moved into a function.
globals().update(params_dict)

# Print the variables to verify
for key, value in params_dict.items():
    print(f"{key}: {value}")

# Base folder for .pkl files
base_folder = 'feature_selection'

# Construct the full path (dataframe_name is one of the hyperparameters set above)
pkl_file_path = f'{base_folder}/{dataframe_name}.pkl'

# Load the dataframe and assign it to train_df
train_df = pd.read_pickle(pkl_file_path)

# Print to verify
print(f"Loaded dataframe from: {pkl_file_path}")

sample_size: 500
random_state: 2024
nr_clus: 7
mf_shape: sigmoid
merge_threshold: 1.0
m: 2.0
dataframe_name: train_df_all_drop_features_10_PRS
covariance_type: full
consequent_method: local_LSE
clustering_method: gk
Loaded dataframe from: feature_selection/train_df_all_drop_features_10_PRS.pkl


In [4]:
# Function to run a single experiment with given train/test data and parameters
def run_single_experiment(
    x_train,
    y_train,
    x_test,
    y_test,
    variable_names,
    nr_clus=10,
    clustering_method='gmm',
    mf_shape='gauss2',
    consequent_method="global_LSE",
    merge_threshold=1.0,
    m=1.5,
    covariance_type='spherical',
    optimal_decision_threshold=optimal_decision_threshold,
):
    """Fit a zero-order Sugeno fuzzy model on one split and score it on the other.

    Parameters
    ----------
    x_train, y_train : training inputs and labels handed to the pyFUME estimators.
    x_test, y_test : held-out inputs and labels used for scoring.
    variable_names : feature names passed to the firing-strength calculator,
        the model builder and the tester.
    nr_clus : number of clusters requested from the clusterer.
    clustering_method : name of the clustering method to run.
    mf_shape, merge_threshold : forwarded to ``AntecedentEstimator.determineMF``.
    consequent_method : forwarded to ``ConsequentEstimator.zero_order``.
    m : fuzziness coefficient; only forwarded for 'fcm', 'fcm_binary',
        'fst-pso' and 'gk'.
    covariance_type : only forwarded when ``clustering_method`` is 'gmm'.
    optimal_decision_threshold : scores strictly above this value are labelled 1.

    Returns
    -------
    tuple
        ``(accuracy, auc)`` computed on the test split.
    """
    clusterer = Clustering.Clusterer(x_train=x_train, y_train=y_train, nr_clus=nr_clus)

    # Forward only the extra keyword that the selected clustering method takes
    if clustering_method == 'gmm':
        method_kwargs = {'covariance_type': covariance_type}
    elif clustering_method in ('fcm', 'fcm_binary', 'fst-pso', 'gk'):
        method_kwargs = {'m': m}
    else:
        method_kwargs = {}
    _, partition_matrix, _ = clusterer.cluster(method=clustering_method, **method_kwargs)

    # Membership functions derived from the partition matrix
    antecedent_estimator = AntecedentEstimator(x_train=x_train, partition_matrix=partition_matrix)
    antecedent_parameters = antecedent_estimator.determineMF(
        mf_shape=mf_shape,
        merge_threshold=merge_threshold,
    )

    # Firing strengths of the rules on the training data
    strength_calculator = FireStrengthCalculator(
        antecedent_parameters=antecedent_parameters,
        nr_clus=nr_clus,
        variable_names=variable_names,
    )
    firing_strengths = strength_calculator.calculate_fire_strength(data=x_train)

    # Zero-order consequents
    consequent_estimator = ConsequentEstimator(
        x_train=x_train,
        y_train=y_train,
        firing_strengths=firing_strengths,
    )
    consequent_parameters = consequent_estimator.zero_order(method=consequent_method)

    # Assemble the Takagi-Sugeno fuzzy inference system
    model = SugenoFISBuilder(
        antecedent_sets=antecedent_parameters,
        consequent_parameters=consequent_parameters,
        variable_names=variable_names,
        model_order='zero',
        extreme_values=None,
        save_simpful_code=False,
    ).get_model()

    # Score the held-out split; the first element of predict() holds the model outputs
    tester = SugenoFISTester(
        model=model,
        test_data=x_test,
        variable_names=variable_names,
        golden_standard=y_test,
    )
    y_pred_proba = tester.predict()[0]

    # Binarise the scores with the supplied threshold
    y_pred = (y_pred_proba > optimal_decision_threshold).astype(int)

    # Accuracy uses the hard labels, AUC the raw scores
    return accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_pred_proba)

# Function to run 10-fold cross-validation
def run_cross_validation(df, variable_names, sample_size=2000, nr_clus=10, clustering_method='gmm', mf_shape='gauss2', consequent_method="global_LSE", merge_threshold=1.0, m=1.5, covariance_type='spherical', random_state=2024, optimal_decision_threshold=optimal_decision_threshold):
    """Run 10-fold cross-validation of the fuzzy model on a sample of ``df``.

    Parameters
    ----------
    df : DataFrame containing the feature columns and a 'target' column.
    variable_names : names of the feature columns to use.
    sample_size : number of rows to draw before splitting. Values larger than
        ``len(df)`` are capped at ``len(df)``; ``None`` uses every row.
    nr_clus, clustering_method, mf_shape, consequent_method, merge_threshold,
    m, covariance_type, optimal_decision_threshold :
        forwarded unchanged to ``run_single_experiment`` for every fold.
    random_state : seed used for both the row sampling and the fold shuffling.

    Returns
    -------
    tuple
        ``(accuracies, aucs, mean_accuracy, mean_auc, std_accuracy, std_auc)``
        where the first two are per-fold lists and the standard deviations are
        computed with ``np.std`` defaults.
    """
    # Sample the dataframe if needed. The request is capped at the number of
    # available rows because DataFrame.sample raises when n exceeds it, and
    # None is mapped to "all rows" because sample(n=None) would draw one row.
    n_rows = len(df) if sample_size is None else min(sample_size, len(df))
    df = df.sample(n=n_rows, random_state=random_state)

    # Separate features and target
    X = df[variable_names]  # Use only the variables defined (excluding 'target')
    y = df['target']

    # Prepare for cross-validation
    kf = KFold(n_splits=10, shuffle=True, random_state=random_state)

    # Lists to store results
    accuracies = []
    aucs = []

    # Run cross-validation
    for train_index, test_index in kf.split(X):
        x_train, x_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Convert x_train, x_test to NumPy arrays for clustering and further processing
        x_train_np = x_train.to_numpy()
        x_test_np = x_test.to_numpy()

        # Run a single experiment on this fold. The settings are passed by
        # keyword so a change in parameter order cannot silently mix them up.
        accuracy, auc = run_single_experiment(
            x_train_np,
            y_train,
            x_test_np,
            y_test,
            variable_names,
            nr_clus=nr_clus,
            clustering_method=clustering_method,
            mf_shape=mf_shape,
            consequent_method=consequent_method,
            merge_threshold=merge_threshold,
            m=m,
            covariance_type=covariance_type,
            optimal_decision_threshold=optimal_decision_threshold,
        )

        # Append the results
        accuracies.append(accuracy)
        aucs.append(auc)

    # Calculate statistics
    mean_accuracy = np.mean(accuracies)
    mean_auc = np.mean(aucs)
    std_accuracy = np.std(accuracies)
    std_auc = np.std(aucs)

    return accuracies, aucs, mean_accuracy, mean_auc, std_accuracy, std_auc


# Run the cross-validation

# Select the feature columns by name instead of by position, so the 'target'
# column is never passed to the model as an input, wherever it sits in train_df
variable_names = train_df.columns.drop('target')

# Run 10-fold cross-validation
accuracies, aucs, mean_accuracy, mean_auc, std_accuracy, std_auc = run_cross_validation(
    train_df,
    variable_names,
    sample_size=sample_size,
    nr_clus=nr_clus,
    clustering_method=clustering_method,
    mf_shape=mf_shape,
    consequent_method=consequent_method,
    merge_threshold=merge_threshold,
    m=m,
    covariance_type=covariance_type,
    random_state=random_state
)

# Print the results
print(f"Accuracies: {accuracies}")
print(f"AUCs: {aucs}")
print(f"Mean Accuracy: {mean_accuracy}")
print(f"Mean AUC: {mean_auc}")
print(f"Standard Deviation of Accuracy: {std_accuracy}")
print(f"Standard Deviation of AUC: {std_auc}")

Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed

In [5]:
# export results for later analysis

# Collect the per-fold values and their summary statistics under report labels
cv_results_dict = dict([
    ("Accuracies", accuracies),
    ("AUCs", aucs),
    ("Mean Accuracy", mean_accuracy),
    ("Mean AUC", mean_auc),
    ("Standard Deviation of Accuracy", std_accuracy),
    ("Standard Deviation of AUC", std_auc),
])

# Pickle the collected results to disk
with open('model_training/cross_validation_results.pkl', 'wb') as results_file:
    pickle.dump(cv_results_dict, results_file)

print("Results saved to 'model_training/cross_validation_results.pkl'")

Results saved to 'model_training/cross_validation_results.pkl'
