In [4]:
# import section

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score
from AUK import AUK

In [5]:
# Block 0: Load data
#
# Reads the saved hyperparameter table, takes the 'params' dict of its first
# row, and loads the train/test dataframes that dict points to.
# Every failure (wrong 'params' type, missing 'dataframe_name', missing file)
# raises instead of printing, so later cells can never run against undefined
# or stale `train_data` / `test_data` left over from an earlier kernel state.

# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load .pkl files produced by this project.
best_hyperparameters = pd.read_pickle('model_training/best_hyperparameters.pkl')

# Extract the first row from the 'params' column
# (presumably the best-ranked configuration -- TODO confirm the table is sorted).
params_dict = best_hyperparameters['params'].iloc[0]

if not isinstance(params_dict, dict):
    raise TypeError(
        "The 'params' column does not contain a dictionary "
        f"(got {type(params_dict).__name__})."
    )

# Expose every entry of params_dict as a notebook-level variable.
# NOTE(review): this can silently overwrite any existing global whose name
# matches a key; prefer reading values from params_dict directly.
for key, value in params_dict.items():
    globals()[key] = value

if 'dataframe_name' not in params_dict:
    raise KeyError("'dataframe_name' not found in params_dict.")

dataframe_name = params_dict['dataframe_name']

# Folder holding the feature-selection .pkl files
base_folder = 'feature_selection'

# The test frame shares the train frame's name with the first 'train'
# replaced by 'test'.
test_dataframe_name = dataframe_name.replace('train', 'test', 1)
if test_dataframe_name == dataframe_name:
    # Without this guard the "test" path would equal the train path and the
    # model would be evaluated on its own training file.
    raise ValueError(
        f"Cannot derive a test dataframe name from {dataframe_name!r}: "
        "it does not contain 'train'."
    )

train_pkl_file_path = f'{base_folder}/{dataframe_name}.pkl'
test_pkl_file_path = f'{base_folder}/{test_dataframe_name}.pkl'

# Load train and test dataframes; a missing file raises FileNotFoundError here.
train_data = pd.read_pickle(train_pkl_file_path)
test_data = pd.read_pickle(test_pkl_file_path)

# Print to verify
print(f"Loaded train dataframe from: {train_pkl_file_path}")
print(f"Loaded test dataframe from: {test_pkl_file_path}")

Loaded train dataframe from: feature_selection/train_df_all_features_10_RFE.pkl
Loaded test dataframe from: feature_selection/test_df_all_features_10_RFE.pkl


In [6]:
# Block 1: sample the data
#
# Removes the non-feature columns from the test frame, draws a fixed-seed
# sample of the training frame, and splits both into features and target.

# Drop extra columns that are not needed for the model.
# errors='ignore' keeps this cell re-runnable: because `test_data` is
# reassigned, a second run would otherwise raise KeyError on the columns
# that the first run already removed.
test_data = test_data.drop(
    columns=['eid', 'p130894', 'p130895', 'p53_i0'],
    errors='ignore',
)

# Sample the training data to 500 rows (seeded for reproducibility).
# Capped at the frame length, because sample(n=...) without replacement
# raises ValueError when n exceeds the number of available rows.
train_sampled = train_data.sample(n=min(500, len(train_data)), random_state=81)

# Separate features and target
X_train = train_sampled.drop(columns=['target'])
y_train = train_sampled['target']
X_test = test_data.drop(columns=['target'])
y_test = test_data['target']

In [7]:
# Block 2: Hyperparameter tuning

# Candidate values for each logistic-regression hyperparameter (from the table)
param_grid = dict(
    penalty=['l1', 'l2'],
    solver=['saga'],
    C=[0.001, 0.005, 0.01, 0.02, 0.1, 0.5, 1, 10, 100],
    max_iter=[100, 500, 1000, 2000, 3000, 5000],
)

# Base estimator; the search overrides the hyperparameters listed above
logistic_model = LogisticRegression(random_state=81)

# Randomized search: 100 sampled candidates, each scored by ROC AUC with
# 5-fold cross-validation, seeded, with n_jobs=-1 for parallel fitting
random_search = RandomizedSearchCV(
    estimator=logistic_model,
    param_distributions=param_grid,
    n_iter=100,
    scoring='roc_auc',
    cv=5,
    random_state=81,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Keep the best estimator found by the search and report its settings
best_model = random_search.best_estimator_
print("Best hyperparameters:", random_search.best_params_)

Best hyperparameters: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 500, 'C': 0.5}


In [8]:
# Block 3: Model prediction on test data

# Per-class probabilities; column 1 is taken as the positive class
# (assumes a binary target whose positive label is the second class -- TODO confirm)
class_probabilities = best_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Hard class labels for the threshold-based metrics
y_pred = best_model.predict(X_test)

In [9]:
# Block 4: Calculate metrics
#
# Each metric is computed and printed immediately, in the order:
# AUC, accuracy, confusion matrix, precision, recall, AUK.

# AUC is computed from the positive-class probabilities
auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print("AUC:", auc)

# Accuracy of the hard predictions
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Confusion matrix of the hard predictions
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Precision of the hard predictions
precision = precision_score(y_true=y_test, y_pred=y_pred)
print("Precision:", precision)

# Recall of the hard predictions
recall = recall_score(y_true=y_test, y_pred=y_pred)
print("Recall:", recall)

# AUK is given a plain list of labels rather than the pandas Series,
# to avoid index issues inside the AUK helper
y_test_list = y_test.tolist()
auk_class = AUK(probabilities=y_pred_proba, labels=y_test_list)
auk = auk_class.calculate_auk()
print("AUK:", auk)

AUC: 0.5190354923441604
Accuracy: 0.52
Confusion Matrix:
 [[148 110]
 [130 112]]
Precision: 0.5045045045045045
Recall: 0.4628099173553719
AUK: 0.019023681987061475
