In [1]:
# import
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from pyfume import Clustering
from pyfume import AntecedentEstimator
from pyfume import FireStrengthCalculator
from pyfume import SugenoFISBuilder
from pyfume import ConsequentEstimator
from pyfume import SugenoFISTester
from AUK import AUK
import pickle

In [2]:
# Read the pickled hyperparameter-search results and show the parameter set
# stored in the first row of the 'params' column.
# NOTE: unpickling can execute arbitrary code; only load this file if it comes
# from a trusted source.
best_hyperparameters = pd.read_pickle('model_training/best_hyperparameters.pkl')

best_hyperparameters['params'].iat[0]

{'sample_size': 500,
 'random_state': 2024,
 'nr_clus': 7,
 'mf_shape': 'sigmoid',
 'merge_threshold': 1.0,
 'm': 2.0,
 'dataframe_name': 'train_df_all_drop_features_10_PRS',
 'covariance_type': 'full',
 'consequent_method': 'local_LSE',
 'clustering_method': 'gk'}

In [3]:
# Take the parameter set stored in the first row of the 'params' column.
params_dict = best_hyperparameters['params'].iloc[0]

# Fail fast when the entry is not a dictionary: merely printing a message and
# carrying on would let the following cells run with missing (NameError) or,
# worse, stale variables left over from an earlier kernel session.
if not isinstance(params_dict, dict):
    raise TypeError("Error: The 'params' column does not contain a dictionary.")

# Expose every hyperparameter as a notebook-level variable named after its key
# (sample_size, nr_clus, mf_shape, ...). globals() is used explicitly because
# writes to the mapping returned by locals() are only guaranteed to take effect
# at module scope, so this cell keeps working if it is ever moved into a function.
globals().update(params_dict)

# Print the variables to verify
for key, value in params_dict.items():
    print(f"{key}: {value}")

sample_size: 500
random_state: 2024
nr_clus: 7
mf_shape: sigmoid
merge_threshold: 1.0
m: 2.0
dataframe_name: train_df_all_drop_features_10_PRS
covariance_type: full
consequent_method: local_LSE
clustering_method: gk


In [4]:
# Directory that holds the pickled feature-selection dataframes
base_folder = 'feature_selection'

# The file is named after the dataframe chosen by the hyperparameter search
pkl_file_path = '/'.join([base_folder, f'{dataframe_name}.pkl'])

# Read the pickled training dataframe
# NOTE: unpickling can execute arbitrary code; only load trusted files.
train_df = pd.read_pickle(pkl_file_path)

# Report which file was read
print("Loaded dataframe from: " + pkl_file_path)

Loaded dataframe from: feature_selection/train_df_all_drop_features_10_PRS.pkl


In [5]:
# Draw a reproducible random subsample of the training dataframe
df = train_df.sample(n=sample_size, random_state=random_state)

# Separate the predictors from the 'target' column
X = df.drop(columns=['target'])
y = df['target']

# Feature names for the later pyFUME steps. They are taken from X itself so
# they always match the columns of the feature matrix, wherever 'target' sits
# in the dataframe (slicing df.columns[:-1] is only right when 'target' is the
# last column).
variable_names = X.columns

# Perform an 80/20 train-test split, seeded for reproducibility
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

# Convert to NumPy arrays for the pyFUME components used in the next cells
x_train = x_train.to_numpy()
x_test = x_test.to_numpy()
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [6]:
# Build the clusterer on the training split
cl = Clustering.Clusterer(x_train=x_train, y_train=y_train, nr_clus=nr_clus)

# Pick the extra keyword arguments that go with the selected clustering method:
#   - fcm / fcm_binary / fst-pso / gk receive the fuzziness coefficient m
#   - gmm receives the covariance type
#   - any other method is run without extra arguments
if clustering_method in ('fcm', 'fcm_binary', 'fst-pso', 'gk'):
    cluster_kwargs = {'m': m}
elif clustering_method == 'gmm':
    cluster_kwargs = {'covariance_type': covariance_type}
else:
    cluster_kwargs = {}

# Run the clustering once; the third returned value is not used
cluster_centers, partition_matrix, _ = cl.cluster(method=clustering_method, **cluster_kwargs)

In [7]:
# Fit membership functions to the training data using the partition matrix
# produced by the clustering step.
# NOTE(review): the recorded run logged repeated "Failed to fit sigmoidal
# membership function, falling back to Gaussian." messages, so the requested
# mf_shape may not be the shape that ends up in the model - confirm.
ae = AntecedentEstimator(
    x_train=x_train,
    partition_matrix=partition_matrix,
)
antecedent_parameters = ae.determineMF(
    mf_shape=mf_shape,
    merge_threshold=merge_threshold,
)

Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed to fit sigmoidal membership function, falling back to Gaussian.
Failed

In [8]:
# Compute the firing strengths on the training data from the estimated
# antecedent parameters
fsc = FireStrengthCalculator(
    antecedent_parameters=antecedent_parameters,
    nr_clus=nr_clus,
    variable_names=variable_names,
)
firing_strengths = fsc.calculate_fire_strength(data=x_train)

Index(['p26202', 'p26204', 'p26206', 'p26210', 'p26212', 'p26214', 'p26216',
       'p26218', 'p26220', 'p26223', 'p26225', 'p26227', 'p26229', 'p26232',
       'p26234', 'p26238', 'p26240', 'p26242', 'p26244', 'p26246', 'p26248',
       'p26250', 'p26252', 'p26254', 'p26258', 'p26260', 'p26265', 'p26267',
       'p26269', 'p26273', 'p26275', 'p26278', 'p26283', 'p26285', 'p26287',
       'p26289', 'p30190_i0', 'p2814_i0_Category_C', 'p23061_i0_Category_C',
       'p2946_i0_Category_E', 'p738_i0_Category_D', 'p2877_i0_Category_E',
       'p1940_i0_Category_D', 'p5496_i0_Category_B', 'p3090_i0_Category_D',
       'p5001_i0_Category_E'],
      dtype='object')


In [9]:
# Estimate the zero-order consequent parameters with the configured method
ce = ConsequentEstimator(
    x_train=x_train,
    y_train=y_train,
    firing_strengths=firing_strengths,
)
consequent_parameters = ce.zero_order(method=consequent_method)

# Assemble the zero-order Sugeno FIS from the antecedent sets and consequents;
# no extreme values are supplied and the generated Simpful code is not saved
builder_options = dict(
    antecedent_sets=antecedent_parameters,
    consequent_parameters=consequent_parameters,
    variable_names=variable_names,
    model_order='zero',
    extreme_values=None,
    save_simpful_code=False,
)
simpbuilder = SugenoFISBuilder(**builder_options)
model = simpbuilder.get_model()

 * Detected 7 rules / clusters
 * Detected Sugeno model type


In [10]:
# Run the fitted model on the held-out split
test = SugenoFISTester(
    model=model,
    test_data=x_test,
    variable_names=variable_names,
    golden_standard=y_test,
)

# predict() returns a sequence; its first element is used as the predicted
# scores in the following cells
prediction_output = test.predict()
y_pred_proba = prediction_output[0]

In [11]:
# Kappa as a function of the decision threshold, computed by the AUK helper
# from the held-out scores and labels
auk_calculator = AUK(probabilities=y_pred_proba, labels=y_test)
kappa_curve = auk_calculator.kappa_curve()
thresholds = auk_calculator.probabilities_set

# The two sequences may differ in length; cut both to the shorter one so they
# can be stored side by side (this only truncates, nothing is padded).
# NOTE(review): truncating from the end assumes any surplus entries sit at the
# tail of the longer sequence; if AUK adds points at the start of the curve the
# kappa values and thresholds would be shifted against each other - confirm
# against the AUK implementation.
min_length = min(map(len, (kappa_curve, thresholds)))
kappa_curve, thresholds = kappa_curve[:min_length], thresholds[:min_length]

# Tabulate the curve and write it to disk
kappa_data = pd.DataFrame({'Threshold': thresholds, 'Kappa': kappa_curve})
output_csv_file = 'model_training/kappa_curve_values.csv'
kappa_data.to_csv(output_csv_file, index=False)

print(f"Kappa curve values exported to {output_csv_file}")

# The optimal threshold is the one at the first occurrence of the highest kappa.
# NOTE(review): the threshold is selected on y_test, the same split that is
# used for evaluation in a later cell.
optimal_threshold_index = kappa_curve.index(max(kappa_curve))
optimal_threshold = thresholds[optimal_threshold_index]
print(f"Optimal threshold: {optimal_threshold}")

Kappa curve values exported to model_training/kappa_curve_values.csv
Optimal threshold: 0.5309535935696522


In [12]:
# AUK helper bound to the held-out scores and labels
auk_calculator = AUK(probabilities=y_pred_proba, labels=y_test)

# Grid of 50 evenly spaced candidate thresholds covering [0, 1]
evenly_spaced_thresholds = np.linspace(0, 1, num=50)


def _kappa_at(threshold):
    """Return kappa for the labels obtained by cutting y_pred_proba at threshold."""
    predictions = (y_pred_proba >= threshold).astype(int)
    tp, tn, fp, fn = auk_calculator.confusion_matrix(predictions)
    return auk_calculator.calculate_kappa(tp, tn, fp, fn)


# One kappa value per grid threshold, in grid order
kappa_values = [_kappa_at(threshold) for threshold in evenly_spaced_thresholds]

# Tabulate the curve and write it to disk
kappa_data = pd.DataFrame({'Threshold': evenly_spaced_thresholds, 'Kappa': kappa_values})
output_csv_file = 'model_training/evenly_spaced_kappa_curve_values.csv'
kappa_data.to_csv(output_csv_file, index=False)

print(f"Kappa curve values exported to {output_csv_file}")

# The optimal grid threshold is the one at the first occurrence of the highest
# kappa.
# NOTE(review): the recorded run reports 0.0 here, i.e. no grid point scored a
# higher kappa than the very first threshold - worth a sanity check.
optimal_threshold_index = kappa_values.index(max(kappa_values))
optimal_threshold_even = evenly_spaced_thresholds[optimal_threshold_index]
print(f"Optimal threshold: {optimal_threshold_even}")

Kappa curve values exported to model_training/evenly_spaced_kappa_curve_values.csv
Optimal threshold: 0.0


In [13]:
# Turn the predicted scores into hard 0/1 labels using the threshold picked
# from the kappa curve
y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)

# Accuracy depends on the chosen threshold.
# NOTE(review): optimal_threshold was selected on y_test as well, so this
# accuracy is likely optimistic - consider tuning on a separate validation split.
accuracy_optimal = accuracy_score(y_test, y_pred_optimal)

# AUC is computed from the raw scores, not from the thresholded labels, so it
# does not depend on the threshold at all. The variable name is kept for any
# later use; only the printed label is corrected.
auc_optimal = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy at optimal threshold: {accuracy_optimal}")
print(f"AUC (threshold-independent): {auc_optimal}")

Accuracy at optimal threshold: 0.58
AUC at optimal threshold: 0.5716579686872741


In [14]:
# Persist the kappa-optimal decision threshold as a pickle file so it can be
# reloaded later
with open('model_training/optimal_decision_threshold.pkl', 'wb') as f:
    f.write(pickle.dumps(optimal_threshold))