In [None]:
from model import LogRegCCD
from dataset import generate_synth_dataset
import numpy as np
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
)
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Load both datasets and bring their targets to a 0/1 label stored in a
# column named 'class'.
breast_df = pd.read_csv('breast.csv').rename(columns={'Class': 'class'})
prostate_df = pd.read_csv('prostate.csv')

# Breast labels are strings; Series.map turns any label missing from the
# dict into NaN.
breast_df['class'] = breast_df['class'].map(
    {'non-relapse': 0, 'relapse': 1}
)

# Prostate labels are coded 1/2: recode 1 -> 0 first, then 2 -> 1, so the
# freshly written 1s are never touched by the first replacement.
prostate_df['class'] = (
    prostate_df['class']
    .replace(1, 0)
    .replace(2, 1)
)

LogRegCCD

In [None]:
# Evaluate LogRegCCD on each dataset: fit on the training split, then, for
# every metric, call validate() on the validation split before scoring the
# test split.
datasets = [breast_df, prostate_df]
dataset_names = ["Breast", "Prostate"]
lambdas = np.logspace(-4, 2, 10)  # 10 log-spaced values from 1e-4 to 1e2
logreg_ccd = LogRegCCD(lambdas)

metrics = ['roc_auc', 'balanced_accuracy', 'recall', 'f_measure', 'precision']
metric_values = {metric: [] for metric in metrics}

# Metrics scored on hard predictions through LogRegCCD.compute_measure,
# listed in the order they are evaluated.
label_based_measures = ('balanced_accuracy', 'recall', 'f_measure', 'precision')

for df, name in zip(datasets, dataset_names):
    X = df.drop(columns=['class']).values
    y = df['class'].values

    # 60 / 20 / 20 train / validation / test split with fixed seeds.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.4, random_state=42
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    logreg_ccd.fit(X_train, y_train)

    # ROC AUC: validate() with its default measure, then score the predicted
    # probabilities with scikit-learn.
    # NOTE(review): presumably validate() selects the lambda used by the
    # following predict calls; confirm against model.LogRegCCD.
    logreg_ccd.validate(X_valid, y_valid)
    y_pred_proba = logreg_ccd.predict_proba(X_test)
    metric_values['roc_auc'].append(roc_auc_score(y_test, y_pred_proba))

    # Remaining metrics: re-validate for the measure, predict labels, score.
    for measure in label_based_measures:
        logreg_ccd.validate(X_valid, y_valid, measure=measure)
        y_pred = logreg_ccd.predict(X_test)
        score = logreg_ccd.compute_measure(y_test, y_pred, measure)
        metric_values[measure].append(score)



In [None]:
# One bar chart per dataset with the LogRegCCD test-set metrics.
for position, (df, name) in enumerate(zip(datasets, dataset_names)):
    fig, ax = plt.subplots(figsize=(10, 6))

    # metric_values[metric] holds one entry per dataset, in dataset order.
    dataset_metric_values = [metric_values[metric][position] for metric in metrics]
    index = np.arange(len(metrics))

    ax.bar(
        index,
        dataset_metric_values,
        color=['skyblue', 'lightgreen', 'salmon', 'pink', 'purple'],
    )

    ax.set(xlabel='Metric', ylabel='Metric Value', title=f'{name} Dataset Metrics')
    ax.set_xticks(index)
    ax.set_xticklabels(metrics)
    ax.set_ylim(0, 1.1)  # leave headroom above the maximum score of 1.0

    plt.tight_layout()
    plt.show()

Logistic Regression

In [None]:
# Baseline: unpenalised scikit-learn LogisticRegression evaluated on the same
# splits (same test_size values and seeds) as the LogRegCCD cell.
metrics1 = ['roc_auc', 'balanced_accuracy', 'recall', 'f_measure', 'precision']
metric_values1 = {metric: [] for metric in metrics1}

lr = LogisticRegression(penalty=None)

for df, name in zip(datasets, dataset_names):
    X = df.drop(columns=['class']).values
    y = df['class'].values

    # The validation part is produced by the split but not used in this cell.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=0.4, random_state=42
    )
    X_valid, X_test, y_valid, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )

    lr.fit(X_train, y_train)

    # Second predict_proba column = second entry of lr.classes_ (label 1 when
    # the targets are coded 0/1); threshold it at 0.5 for hard predictions.
    y_pred_proba = lr.predict_proba(X_test)[:, 1]
    y_pred = (y_pred_proba >= 0.5).astype(int)

    scores = {
        'roc_auc': roc_auc_score(y_test, y_pred_proba),
        'balanced_accuracy': balanced_accuracy_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f_measure': f1_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
    }
    for metric_name in metrics1:
        metric_values1[metric_name].append(scores[metric_name])

In [None]:

# One bar chart per dataset with the scikit-learn LogisticRegression
# test-set metrics.
for position, (df, name) in enumerate(zip(datasets, dataset_names)):
    fig, ax = plt.subplots(figsize=(10, 6))

    # metric_values1[metric] holds one entry per dataset, in dataset order, so
    # the loop position is the index (no name lookup needed).
    dataset_metric_values = [metric_values1[metric][position] for metric in metrics1]

    index = np.arange(len(metrics1))

    ax.bar(index, dataset_metric_values, color=['skyblue', 'lightgreen', 'salmon', 'pink', 'purple'])

    ax.set_xlabel('Metric')
    ax.set_ylabel('Metric Value')
    ax.set_title(f'{name} Dataset Metrics (Logistic Regression)')
    ax.set_xticks(index)
    # Tick labels come from metrics1, the same list the bars were built from,
    # so labels and bars cannot drift apart.
    ax.set_xticklabels(metrics1)
    ax.set_ylim(0, 1.1)  # leave headroom above the maximum score of 1.0

    plt.tight_layout()
    plt.show()