In [1]:
import warnings

# Suppress every warning for the rest of the session to keep the cell output clean.
# This filter is installed before the imports below, so warnings emitted while
# importing those libraries are silenced as well.
# NOTE(review): a blanket "ignore" also hides deprecation notices from the libraries
# used later in the notebook — consider narrowing it while debugging.
warnings.filterwarnings("ignore")

# Data manipulation (numpy, pandas) and visualization (matplotlib, seaborn)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Seed NumPy's global random number generator for reproducibility
np.random.seed(42)

# scikit-learn: data splitting, feature scaling, the classifier and the CV splitter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# scikit-learn: evaluation metrics and feature-selection tools
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.feature_selection import mutual_info_classif


In [2]:
def appendLabelToDf(dataFrame, columnToLabelBy, newColumnName, determinedThreshold=50):
    """
    Append a binary label column derived by thresholding an existing column.

    Parameters:
        dataFrame (DataFrame): Frame to label. It is modified in place: the new
            column is added to (or overwritten on) this very object.
        columnToLabelBy (str): Name of the column whose values are thresholded.
        newColumnName (str): Name of the binary label column to create.
        determinedThreshold (float, optional): Cut-off value. Rows whose value is
            strictly greater than the threshold are labelled 1, all others 0.
            Default is 50, the value that was previously hard-coded.

    Returns:
        tuple: (dataFrame, determinedThreshold) - the same frame object with the
        label column added, and the threshold that was applied.
    """
    # Strict comparison: a value equal to the threshold is labelled 0
    dataFrame[newColumnName] = (dataFrame[columnToLabelBy] > determinedThreshold).astype(int)
    return dataFrame, determinedThreshold


def backwardEliminationLR(estimator, X_train, y_train, n_features_to_select):
    """
    Run Recursive Feature Elimination (RFE) around the given estimator.

    Parameters:
        estimator: Estimator handed to RFE.
        X_train (DataFrame): Training features; its column names are used to
            report the surviving features.
        y_train: Training labels.
        n_features_to_select (int): Number of features RFE should keep.

    Returns:
        tuple: (selectedFeatures, RFEscore) - the names of the kept features and
        the RFE selector's score on the training data.
    """
    # One feature is removed per elimination round (step=1)
    rfe = RFE(estimator, n_features_to_select=n_features_to_select, step=1).fit(X_train, y_train)

    # Kept feature names first, then the score measured on the training data itself
    keptFeatureNames = rfe.get_feature_names_out(X_train.columns)
    trainingScore = rfe.score(X_train, y_train)
    return keptFeatureNames, trainingScore


def plotEvaluation(y_pred, y_truth, setName):
    """
    Draw, save and show the confusion matrix of a set of predictions.

    Parameters:
        y_pred: Predicted class labels.
        y_truth: Ground-truth class labels.
        setName (str): Name of the evaluated set; used in the plot title and in
            the output file name ('figure-<setName>.png').
    """
    classNames = ['Class 0', 'Class 1']

    # Rows of the matrix are the actual classes, columns the predicted ones
    confusion = confusion_matrix(y_truth, y_pred)

    # Render the matrix as an annotated heatmap on a single small axes
    plt.figure(figsize=(4, 3))
    plt.subplot(1, 1, 1)
    sns.heatmap(
        confusion,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classNames,
        yticklabels=classNames,
    )
    plt.xlabel('Predicted Class')
    plt.ylabel('Actual Class')
    plt.title(f'Confusion Matrix for the {setName} Set')
    plt.tight_layout()

    # Write a high-resolution copy to disk before displaying the figure
    plt.savefig(f'figure-{setName}.png', dpi=1000)
    plt.show()


In [7]:
# Name of the Excel workbook that holds the data frame
dataFrameFileName = "dataFrameHF.xlsx"

# Load the workbook into a pandas DataFrame
dataFrameHF = pd.read_excel(io=dataFrameFileName)


In [9]:
# Share (in percent) of rows whose "Prognosis Label" entry is non-zero
prognosisLabels = dataFrameHF["Prognosis Label"]
percentage_non_zero = 100 * (np.count_nonzero(prognosisLabels) / len(prognosisLabels))

# Show the percentage as the cell output
percentage_non_zero


48.275862068965516

In [24]:
def MIScoreBasedFeatureSelection(dataFrameHF, correlationIter, mutualInfoThreshold, test_size, testSizeCorrelationIter, random_state, isKfold, intersection = 0):
    """
    Perform Mutual Information (MI) score-based feature selection on the given dataset.

    All statistics (variance filter and MI scores) are computed on the training
    split only; the test split is merely reduced to the same columns.

    Parameters:
        dataFrameHF (DataFrame or tuple): When ``isKfold`` is False, the full data
            frame containing the "Prognosis Label" and "Patient Name" columns.
            When ``isKfold`` is True, an already split tuple
            (X_train, X_test, y_train, y_test).
        correlationIter (int): Number of random re-splits of the training data over
            which the MI scores are averaged.
        mutualInfoThreshold (float): A feature is kept when its MI score is strictly
            greater than this value.
        test_size (float): Proportion of the dataset to include in the test split.
            Only used when ``isKfold`` is False.
        testSizeCorrelationIter (float): Proportion of the training data held out in
            each MI re-split iteration.
        random_state (int): Passed to the hold-out split. Only used when ``isKfold``
            is False; the MI estimator itself always runs with random_state=42 and
            the re-splits are seeded with the iteration index.
        isKfold (bool): True when the caller supplies the splits itself (e.g. from a
            k-fold / leave-one-out loop).
        intersection (bool, optional): When truthy, keep only the features that pass
            the threshold both on the averaged MI scores and on the MI scores of the
            whole training split. Default is 0.

    Returns:
        tuple: ((X_train, X_test, y_train, y_test), selectedFeatures) where the
        feature frames are restricted to the selected columns. ``selectedFeatures``
        is a list when ``intersection`` is truthy and a pandas Index otherwise; in
        both cases it follows the column order of the training frame.
    """

    #### Train-test splitting ####
    if not isKfold:
        # Separate the target variable and feature variables
        datasetY = dataFrameHF[["Prognosis Label"]]
        datasetX = dataFrameHF.drop(["Prognosis Label", "Patient Name"], axis=1)
        # Ordered (non-shuffled) hold-out split: the last rows form the test set
        X_train, X_test, y_train, y_test = train_test_split(datasetX, datasetY, test_size=test_size, random_state=random_state, shuffle=False)
    else:
        # Use the provided dataset splits for k-fold cross-validation
        (X_train, X_test, y_train, y_test) = dataFrameHF

    ##################################################

    # Bounds for the |variance / maximum| ratio of a feature
    min_threshold = 10**-5
    max_threshold = 10**3

    # Ratio of each feature's variance to its maximum value, on the training data
    variance_per_feature = X_train.var(axis=0)
    max_per_feature = X_train.max(axis=0)
    standardVar = np.abs(variance_per_feature / max_per_feature)
    # Keep the features whose ratio lies inside the bounds (NaN ratios compare
    # False on both sides and are therefore dropped)
    featuresThresholded = X_train.columns[(standardVar >= min_threshold) & (standardVar <= max_threshold)].tolist()
    X_train, X_test = X_train[featuresThresholded], X_test[featuresThresholded]

    ##################################################

    ######################### FEATURE ELIMINATION-SELECTION ########################
    # MI scores computed once on the whole training split
    mutual_info_scoresofAllSamples = mutual_info_classif(X_train, y_train, random_state=42)
    allSamplesCorrelateFeatures = X_train.columns[np.where(mutual_info_scoresofAllSamples > mutualInfoThreshold)[0]]

    # MI scores computed on repeated random sub-splits of the training data
    MIscoresList = list()
    for corrIter in range(correlationIter):
        # The iteration index seeds the shuffle, so every iteration sees a different
        # but reproducible subset; the held-out part is not used
        X_trainCorrelationIter, _, y_trainCorrIter, _ = train_test_split(X_train, y_train, test_size=testSizeCorrelationIter, random_state=corrIter, shuffle=True)

        mutual_info_scores = mutual_info_classif(X_trainCorrelationIter, y_trainCorrIter, random_state=42)
        MIscoresList.append(mutual_info_scores)

    # Average the MI scores over the iterations and threshold the mean
    MIscoresArr = np.array(MIscoresList)
    MIscoresMeanArr = np.mean(MIscoresArr, axis=0)
    indices = np.where(MIscoresMeanArr > mutualInfoThreshold)[0]
    featureNamesThresholded = X_train.columns[indices]

    # Determine final selected features based on intersection if specified
    if intersection:
        # Filter in column order instead of materialising a set intersection:
        # iterating a set of strings yields a hash-dependent order, which would make
        # the column order of the returned frames vary between interpreter sessions
        allSamplesFeatureSet = set(allSamplesCorrelateFeatures)
        selectedFeatures = [name for name in featureNamesThresholded if name in allSamplesFeatureSet]
    else:
        selectedFeatures = featureNamesThresholded

    # Select the final features for training and test sets
    X_train, X_test = X_train[selectedFeatures], X_test[selectedFeatures]

    return (X_train, X_test, y_train, y_test), selectedFeatures


In [29]:
def modelling(datasetSplits, penalty, C, solver):
    """
    Fit a logistic regression classifier on robust-scaled data and report its accuracy.

    Parameters:
        datasetSplits (tuple): (X_train, X_test, y_train, y_test).
        penalty (str): Regularization type passed to LogisticRegression ('l1', 'l2', ...).
        C (float): Inverse of regularization strength; smaller values mean stronger
            regularization.
        solver (str): Optimization algorithm passed to LogisticRegression
            ('liblinear', 'saga', ...).

    Returns:
        tuple: ((trainingAcc, testAcc), classifier) - both accuracies rounded to two
        decimals, followed by the fitted classifier.
    """
    trainFeatures, testFeatures, trainLabels, testLabels = datasetSplits

    # The scaler learns its statistics from the training features only
    robustScaler = RobustScaler()
    trainScaled = robustScaler.fit_transform(trainFeatures)

    print("The classifier model is Logistic Regression:")
    classifier = LogisticRegression(C=C, max_iter=100, penalty=penalty, solver=solver)
    classifier.fit(trainScaled, trainLabels)

    # Accuracy on the data the model was fitted on
    trainingAcc = accuracy_score(trainLabels, classifier.predict(trainScaled))
    roundedTrainingAcc = round(trainingAcc, 2)
    print("Training Accuracy ->", roundedTrainingAcc)

    # The test features are transformed with the scaler fitted on the training data
    testScaled = robustScaler.transform(testFeatures)
    testAcc = accuracy_score(testLabels, classifier.predict(testScaled))
    roundedTestAcc = round(testAcc, 2)

    print("Test Accuracy ->", roundedTestAcc)
    print("-")

    return (roundedTrainingAcc, roundedTestAcc), classifier


Hold-out method: Logistic Regression

In [None]:
# Set the mutual information threshold and number of iterations for correlation analysis
MIThreshold = 0.105
commoncorrIter = 15

# Print the mutual information threshold for reference
print(f"MI: {MIThreshold}")

# Perform feature selection based on mutual information scores.
# The keyword names must match the function signature, and the function returns a
# 2-tuple ((X_train, X_test, y_train, y_test), selectedFeatures), so the splits are
# unpacked from the nested tuple.
(X_train, X_test, y_train, y_test), selectedFeaturesMI = MIScoreBasedFeatureSelection(
    dataFrameHF,
    correlationIter=int(commoncorrIter),
    mutualInfoThreshold=MIThreshold,
    test_size=0.35,
    testSizeCorrelationIter=0.2,
    random_state=42,
    isKfold=False,
    intersection=True
)

# Prepare the dataset splits for modeling
datasetSplits = (X_train, X_test, y_train, y_test)

# Define parameters for Lasso regularization
CLasso = 1.2
penaltyLasso = "l1"

# Train a logistic regression model using Lasso regularization; only the fitted
# classifier (second element of the return value) is needed as the RFE estimator
estimator = modelling(datasetSplits=datasetSplits, penalty=penaltyLasso, C=CLasso, solver="liblinear")[1]

# Number of features to select via backward elimination
n_features_to_select = 5

# Perform backward elimination if the number of selected features exceeds the desired number
if len(selectedFeaturesMI) > n_features_to_select:
    selectedFeatures, RFEscore = backwardEliminationLR(estimator, X_train, y_train, n_features_to_select)
    selectedFeatures = list(selectedFeatures)
else:
    selectedFeatures = list(selectedFeaturesMI)

# Define parameters for final logistic regression model with different regularization
penaltyModelling = "l2"
CModelling = 0.2

# Print the final model parameters for reference
print(f"C: {CModelling}, penalty: {penaltyModelling}, n_features_to_select: {n_features_to_select}!")

# Train the final logistic regression model using the selected features
(trainingAcc, testAcc), estimator = modelling(
    datasetSplits=(X_train[selectedFeatures], X_test[selectedFeatures], y_train, y_test),
    penalty=penaltyModelling,
    C=CModelling,
    solver="liblinear"
)

# Print the training and test accuracies of the final model
print(f"train accuracy: {trainingAcc}, test accuracy: {testAcc}")


K-fold cross-validation with K = 29 (leave-one-out cross-validation)

In [None]:
# Define the number of splits for K-Fold cross-validation
# NOTE(review): the section title calls this leave-one-out, which holds only if the
# data frame has exactly 29 rows — confirm against the data.
k = 29
kf = KFold(n_splits=k, shuffle=False)  # Folds are taken in row order; no shuffling, hence no random_state

# Prepare the dataset by separating the features and the target label
datasetY = dataFrameHF[["Prognosis Label"]]
datasetX = dataFrameHF.drop(["Prognosis Label", "Patient Name"], axis=1)

# Initialize a dictionary to store performance metrics
performanceDict = dict()
# Running means of the per-fold accuracies
trainAcc, testAcc = 0, 0

# Perform K-Fold cross-validation
for train_index, test_index in kf.split(datasetX):
    X_train, X_test = datasetX.iloc[train_index, :], datasetX.iloc[test_index, :]
    y_train, y_test = datasetY.iloc[train_index, :], datasetY.iloc[test_index, :]

    ############################## MI-Based Thresholding ##############################################################
    # Keyword names must match the signature of MIScoreBasedFeatureSelection
    (X_train_, X_test_, y_train_, y_test_), selectedFeaturesCorr = MIScoreBasedFeatureSelection(
        (X_train, X_test, y_train, y_test),
        correlationIter=int(commoncorrIter),
        mutualInfoThreshold=MIThreshold,
        test_size=0.35,
        testSizeCorrelationIter=0.2,
        random_state=42,
        isKfold=True,
        intersection=True
    )
    ############################## Feature Selection using LASSO - Backward Elimination ################################
    (trainaccuracy_voting, testaccuracy_voting), estimator = modelling(
        datasetSplits=(X_train_, X_test_, y_train_, y_test_),
        penalty=penaltyLasso,
        C=CLasso,
        solver="liblinear"
    )

    # Perform backward elimination if the number of selected features exceeds the desired number
    if len(selectedFeaturesCorr) > n_features_to_select:
        selectedFeatures, RFEscore = backwardEliminationLR(estimator, X_train_, y_train_, n_features_to_select)
        selectedFeatures = list(selectedFeatures)
    else:
        selectedFeatures = list(selectedFeaturesCorr)

    ############################## Ultimate Modelling and Testing ####################################################
    # The per-fold results get their own names: unpacking them into `testAcc` would
    # overwrite the accumulator on every fold and discard all previous folds
    (foldTrainAcc, foldTestAcc) = modelling(
        datasetSplits=(X_train_[selectedFeatures], X_test_[selectedFeatures], y_train_, y_test_),
        penalty=penaltyModelling,
        C=CModelling,
        solver="liblinear"
    )[0]

    # Each fold contributes 1/k of its accuracy to the mean
    trainingAccVoting, testAccVoting = foldTrainAcc / k, foldTestAcc / k

    # Accumulate the training and test accuracies over the K-Fold splits
    trainAcc += trainingAccVoting
    testAcc += testAccVoting

# Print the final training and test accuracies after K-Fold cross-validation
print(f"train accuracy: {trainAcc}, test accuracy: {testAcc}")

Box plots illustrating the distributions of the features used in the modeling process

In [None]:
# Target label and the selected feature columns
datasetY = dataFrameHF[["Prognosis Label"]]
datasetX = dataFrameHF.drop(columns=["Prognosis Label", "Patient Name"])[selectedFeatures]

# Features and target side by side in one frame for plotting
dataFrameCombined = pd.concat([datasetX, datasetY], axis="columns")

# Plot settings shared by every figure
classPalette = {0: "green", 1: "red"}
classTickLabels = ['5-year mortality < 50%', '5-year mortality > 50%']

# One boxplot figure per selected feature, grouped by the prognosis label
for plotIndex, featureName in enumerate(datasetX.columns):
    plt.figure(figsize=(8, 5))

    # Distribution of the current feature within each class
    sns.boxplot(x='Prognosis Label', y=featureName, data=dataFrameCombined, palette=classPalette)

    # Replace the numeric class ticks by descriptive labels
    plt.xticks(ticks=[0, 1], labels=classTickLabels, fontweight='bold')

    # Axis labels and title
    plt.xlabel('Target', fontweight='bold')
    plt.ylabel(f"{featureName} Values", fontweight='bold')
    plt.title(f'Boxplot of {featureName} by Prognostic Label', fontweight='bold')

    # Bold y-axis tick labels
    plt.yticks(fontweight='bold')

    plt.tight_layout()

    # File names are numbered from 1, in the order of the selected features
    plt.savefig(f'Feature-{plotIndex + 1}_distribution.png', dpi=1000)

    plt.show()
