In [None]:
from model import LogRegCCD
from dataset import generate_synth_dataset
import numpy as np
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    recall_score,
    precision_score,
    f1_score,
    balanced_accuracy_score,
)
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
# --- Experiment configuration ---
# One dataset parameter is swept at a time; the others stay at their baseline.
n_values = [50, 100, 500, 1000, 1500, 2000]
p_values = [0.1, 0.3, 0.5, 0.7, 0.9, 0.95]
d_values = [5, 10, 50, 100, 500, 1000]
g_values = [0, 0.1, 0.3, 0.4, 0.5, 0.7]
lambdas = np.logspace(-4, 2, 10)

# Baseline generator arguments used for every parameter that is not being swept.
default_params = {"p": 0.5, "n": 500, "d": 10, "g": 0.5}
param_grid = {"n": n_values, "p": p_values, "d": d_values, "g": g_values}
SPLIT_SEED = 42

# NOTE(review): this only makes the run reproducible if generate_synth_dataset
# and LogRegCCD draw from NumPy's global RNG -- confirm against their source.
np.random.seed(SPLIT_SEED)

# Each entry becomes a tuple: (swept values, AUC scores, balanced-accuracy scores).
results_lr = {}
results_ccd = {}

for param, values in param_grid.items():
    auc_lr_list, bal_lr_list = [], []
    auc_ccd_list, bal_ccd_list = [], []

    # Register the (mutable) score lists once; they are filled in place below,
    # so partial results remain available even if a later iteration fails.
    results_lr[param] = (values, auc_lr_list, bal_lr_list)
    results_ccd[param] = (values, auc_ccd_list, bal_ccd_list)

    for val in values:
        # Override only the swept parameter; everything else stays at baseline.
        data = generate_synth_dataset(**{**default_params, param: val})

        X, y = data.drop(columns=['Y']).values, data['Y'].values

        # 60 / 20 / 20 train / validation / test split. Stratifying keeps both
        # classes in every split, which matters for small n and for imbalanced
        # data: roc_auc_score raises if y_true contains a single class.
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=0.4, random_state=SPLIT_SEED, stratify=y
        )
        X_valid, X_test, y_valid, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, random_state=SPLIT_SEED, stratify=y_temp
        )

        # --- Logistic Regression (Baseline, no penalty) ---

        lr = LogisticRegression(penalty=None)
        lr.fit(X_train, y_train)

        y_pred_proba_lr = lr.predict_proba(X_test)[:, 1]
        y_pred_lr = (y_pred_proba_lr >= 0.5).astype(int)  # fixed 0.5 decision threshold

        auc_lr = roc_auc_score(y_test, y_pred_proba_lr)
        bal_lr = balanced_accuracy_score(y_test, y_pred_lr)

        auc_lr_list.append(auc_lr)
        bal_lr_list.append(bal_lr)

        # --- LogRegCCD ---

        logreg_ccd = LogRegCCD(lambdas)
        logreg_ccd.fit(X_train, y_train, alpha=0.1)

        # NOTE(review): validate() presumably selects the best lambda on the
        # validation split for the given measure -- confirm in model.py.
        logreg_ccd.validate(X_valid, y_valid)
        y_pred_proba_ccd = logreg_ccd.predict_proba(X_test)
        auc_ccd = roc_auc_score(y_test, y_pred_proba_ccd)
        auc_ccd_list.append(auc_ccd)

        # Re-validate with balanced accuracy before producing hard predictions.
        logreg_ccd.validate(X_valid, y_valid, measure='balanced_accuracy')
        y_pred = logreg_ccd.predict(X_test)
        bal_ccd = logreg_ccd.compute_measure(y_test, y_pred, "balanced_accuracy")
        bal_ccd_list.append(bal_ccd)

In [None]:
def plot_metric_comparison(param_name, sweep_values, lr_scores, ccd_scores, metric_name, ylabel):
    """Draw one figure comparing both models on a single metric.

    Args:
        param_name: Name of the swept dataset parameter (used as x-axis label).
        sweep_values: Values the parameter took during the sweep.
        lr_scores: Scores of the LogisticRegression baseline, one per value.
        ccd_scores: Scores of LogRegCCD, one per value.
        metric_name: Metric name used in the figure title.
        ylabel: Label for the y-axis.
    """
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(sweep_values, lr_scores, marker='o', linestyle='-', label="Logistic Regression", color='blue')
    ax.plot(sweep_values, ccd_scores, marker='o', linestyle='-', label="LogRegCCD", color='red')
    ax.set_xlabel(param_name)
    ax.set_ylabel(ylabel)
    ax.set_title(f"{metric_name} for different values of {param_name}")
    ax.legend()
    ax.grid(True)
    plt.show()


# Plot every sweep stored in the result dictionaries. Relying on the variables
# leaked from the experiment loop would only show the last swept parameter.
for swept_param, lr_entry in results_lr.items():
    ccd_entry = results_ccd.get(swept_param)
    if not lr_entry or not ccd_entry:
        # No results were recorded for this parameter; nothing to draw.
        continue

    # Each entry is (swept values, AUC scores, balanced-accuracy scores).
    swept_values, lr_auc_scores, lr_bal_scores = lr_entry
    _, ccd_auc_scores, ccd_bal_scores = ccd_entry

    plot_metric_comparison(
        swept_param, swept_values, lr_auc_scores, ccd_auc_scores,
        metric_name="AUC", ylabel="AUC Score",
    )
    plot_metric_comparison(
        swept_param, swept_values, lr_bal_scores, ccd_bal_scores,
        metric_name="Balanced Accuracy", ylabel="Balanced Accuracy",
    )