In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from src.data.data_loader import OpenMLDataLoader
from src.data.dataset_interface import DataInterface
from src.eda.eda import print_basic_info
from src.log_reg_ccd import LogRegCCD
from src.utils import plot_lasso_path, evaluate_model
import src.measures as measure

In [None]:
# TODO: insert dataset name
# Placeholder: replace 'DATASET' before running. The value is passed to
# OpenMLDataLoader and DataInterface below and also names the results CSV
# written at the end of the notebook.
DATASET_NAME = 'DATASET'

In [None]:
# Build the OpenML loader for the configured dataset (version pinned to 1)
# and hand it to the project's DataInterface.
data_loader = OpenMLDataLoader(version=1, dataset_name=DATASET_NAME)
data_interface = DataInterface(
    dataset_name=DATASET_NAME,
    data_loader=data_loader,
)

# Run preprocessing, then print basic info about `data_interface.data`.
data_interface.preprocess_data()
print_basic_info(data_interface.data)

In [None]:
# Split the dataset and fetch the resulting partitions as a mapping.
data_interface.split_data()
data = data_interface.get_data()

# Unpack each partition via `.values` (presumably pandas objects -> NumPy
# arrays; confirm against DataInterface.get_data).
X_train = data['train_data'].values
y_train = data['train_labels'].values

X_valid = data['val_data'].values
y_valid = data['val_labels'].values

X_test = data['test_data'].values
y_test = data['test_labels'].values

In [None]:
# Fit the project's LogRegCCD model on the training split.
# NOTE(review): lam_max / lam_count presumably define the lambda grid of the
# regularization path -- confirm against LogRegCCD.fit.
ccd = LogRegCCD(verbose=False)
ccd.fit(
    X_train,
    y_train,
    lam_max=0.5,
    lam_count=100,
)

# Accuracy of the fitted model on the data it was trained on.
y_pred = ccd.predict(X_train)
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred)
print(f"Train accuracy: {accuracy:.4f}")

In [None]:
# Draw the lasso path of the fitted model using LogRegCCD's own method.
# NOTE(review): the standalone `plot_lasso_path` imported from src.utils is
# not called anywhere in the visible cells -- confirm whether it is still needed.
ccd.plot_lasso_path()

In [None]:
# Show the coefficient vector currently stored as `best_beta` on the model.
# NOTE(review): a later cell slices `best_beta[1:]` to align with sklearn's
# `coef_`, so the first entry is presumably the intercept.
print(ccd.best_beta)

In [None]:
# Bar chart of every entry of the selected LogRegCCD coefficient vector.
# NOTE(review): the comparison cell at the end of the notebook slices
# `best_beta[1:]` to align with sklearn's `coef_`, so index 0 here is
# presumably the intercept.
ccd_coefs = ccd.best_beta
n_features = len(ccd_coefs)

indices = np.arange(n_features)
bar_width = 0.35

plt.figure(figsize=(14, 8))
# Only one series is drawn, so keep each bar centred on its integer index
# (a `+ bar_width` shift is only meaningful for side-by-side grouped bars).
plt.bar(indices, ccd_coefs, bar_width)
plt.xlabel('Coefficient Index')
plt.ylabel('Coefficient Value')
plt.title('LogRegCCD Coefficients')
plt.show()

## Validation

In [None]:
# Plot and validate the model on the validation split using the AUC-ROC
# measure; each call receives its own fresh AUCROC instance.
# NOTE(review): `validate` presumably selects `best_beta` based on the
# validation measure, so the predict/print below should stay after it -- confirm.
ccd.plot(X_valid, y_valid, measure=measure.AUCROC())
ccd.validate(X_valid, y_valid, measure=measure.AUCROC())
# Accuracy on the validation split after the validate() call.
y_pred = ccd.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)

print(f"Validation Accuracy: {accuracy:.4f}")
print(f"Best Beta: {ccd.best_beta}")

In [None]:
# Count how many feature coefficients the L1 penalty shrank to exactly zero.
# The comparison cell later in this notebook aligns `best_beta[1:]` with
# sklearn's `coef_`, i.e. `best_beta[0]` is the intercept. The intercept is
# not a feature, so it is excluded here; otherwise a zero intercept would be
# miscounted as an eliminated feature.
feature_coefs = ccd.best_beta[1:]
nonzero_elements = np.count_nonzero(feature_coefs)
total_features = len(feature_coefs)
reduced_features = total_features - nonzero_elements

print(f"Number of reduced features by regularization: {reduced_features}")

## Comparison with LogisticRegression

In [None]:
# Baseline: scikit-learn logistic regression with the penalty disabled,
# fitted on the same training split as LogRegCCD.
lr = LogisticRegression(max_iter=1000, penalty=None)
lr.fit(X_train, y_train)

# Accuracy of the baseline on its own training data.
y_pred = lr.predict(X_train)
accuracy = accuracy_score(y_true=y_train, y_pred=y_pred)
print(f"Train Accuracy: {accuracy:.4f}")

In [None]:
# Show the fitted coefficients of the scikit-learn baseline (`coef_` is 2-D;
# the comparison cell below uses its first row).
print(lr.coef_)

In [None]:
# Accuracy of the scikit-learn baseline on the validation split.
y_pred = lr.predict(X_valid)
accuracy = accuracy_score(y_true=y_valid, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

In [None]:
# Evaluate both models on the held-out test split with the project helper.
ccd_metrics = evaluate_model(ccd, X_test, y_test, "LogRegCCD")
lr_metrics = evaluate_model(lr, X_test, y_test, "LogisticRegression")

# Single source of truth for the reported metrics: the same keys index both
# result dicts, so the table rows cannot drift out of sync.
metric_names = ['ROC AUC', 'PR AUC', 'F1 Score', 'Balanced Accuracy']
results = pd.DataFrame({
    'Metric': metric_names,
    'LogRegCCD': [ccd_metrics[name] for name in metric_names],
    'LogisticRegression': [lr_metrics[name] for name in metric_names],
})

# Make sure the output directory exists; `to_csv` does not create missing
# parent directories and would otherwise fail on a fresh checkout.
results_dir = Path('./results')
results_dir.mkdir(parents=True, exist_ok=True)
results.to_csv(results_dir / f'{DATASET_NAME}.csv', index=False)

print("\nPerformance Comparison:")
display(results)

In [None]:
# Side-by-side bar chart of the per-feature coefficients of both models.
# `lr.coef_` is 2-D, so take its first row; drop the first entry of
# `best_beta` so that both vectors have one value per feature.
lr_coefs = lr.coef_[0]
ccd_coefs = ccd.best_beta[1:]
n_features = len(lr_coefs)
indices = np.arange(n_features)
bar_width = 0.35

plt.figure(figsize=(14, 8))
# Second series is shifted by one bar width so the two bars of each feature
# sit next to each other instead of overlapping.
plt.bar(indices, lr_coefs, bar_width, label='Logistic Regression Coefficients')
plt.bar(indices + bar_width, ccd_coefs, bar_width, label='LogRegCCD Coefficients')

plt.xlabel('Feature Index')
plt.ylabel('Coefficient Value')
plt.title('LogisticRegression vs LogRegCCD Coefficients')
plt.legend()
# Render explicitly, matching the earlier coefficient plot, so the cell does
# not echo the Legend object's repr as its output.
plt.show()
