In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
from model import LogRegCCD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, recall_score, f1_score, roc_auc_score, balanced_accuracy_score,precision_score
from collections import defaultdict

In [2]:
def sparse_matrix_to_df(filename):
    """Load a sparse ``index:value`` text matrix into a dense DataFrame.

    Each line of the file is one sample written as whitespace-separated
    ``column:value`` tokens; a feature that is not listed on a line is 0 for
    that sample.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per line, labelled 0..n-1 in file order, and one float column
        per feature index that occurs in the file, sorted ascending. Entries
        not present in the file are 0.

    Raises
    ------
    ValueError
        If a token is not of the form ``<int>:<float>``.
    """
    rows = []
    with open(filename, 'r') as f:
        for line in f:
            entries = {}
            for item in line.split():
                col_idx, value = item.split(':')
                entries[int(col_idx)] = float(value)
            # Append even when the line has no tokens: a sample without any
            # non-zero feature must still occupy its row, otherwise the matrix
            # no longer lines up with a label file that has one label per line.
            rows.append(entries)

    # Blank lines at the very end of the file are padding, not samples.
    while rows and not rows[-1]:
        rows.pop()

    # A list of per-line dicts yields exactly one row per line, in file order.
    # (Going through a dict-of-dicts with from_dict(orient='index') skips
    # feature-less lines and does not guarantee that rows keep file order.)
    df = pd.DataFrame(rows).fillna(0)

    # Sort the feature columns so the layout is deterministic instead of
    # depending on the order in which features happen to appear in the file.
    return df.sort_index(axis=1)

In [3]:
# Feature matrices: one row per sample, one column per feature id found in the file.
dexter_train = sparse_matrix_to_df('DEXTER/dexter_train.data')
dexter_valid = sparse_matrix_to_df('DEXTER/dexter_valid.data')

# Each file is parsed on its own, so a frame only contains the feature ids that
# occur in its own split; the two frames can therefore differ in width and in
# column order. Project the validation frame onto the training columns so that
# column j is the same feature in both. Ids missing from the validation file
# become 0; ids never seen in training are dropped (no coefficient is fitted
# for them anyway).
dexter_valid = dexter_valid.reindex(columns=dexter_train.columns, fill_value=0.0)

# NOTE(review): this encodes +1 as class 0 and -1 as class 1, so the original
# -1 class is the "positive" class for recall / precision / F1 below.
# Confirm this inversion is intended.
mapping = {1: 0, -1: 1}


def _load_labels(path, label_map):
    """Read one label per line from `path` and encode it through `label_map`.

    Raises ValueError if the file contains a label that `label_map` does not
    cover (Series.map would otherwise turn it into NaN silently).
    """
    # squeeze('columns') keeps a Series even when the file has a single row.
    raw = pd.read_csv(path, header=None).squeeze('columns')
    encoded = raw.map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"{path}: unexpected label values {sorted(set(raw[unmapped]))}; "
            f"expected only {sorted(label_map)}"
        )
    return encoded.to_numpy()


dexter_train_y = _load_labels('DEXTER/dexter_train.labels', mapping)
dexter_valid_y = _load_labels('DEXTER/dexter_valid.labels', mapping)

# Labels are matched to rows by position, so the counts have to agree.
if len(dexter_train) != len(dexter_train_y):
    raise ValueError(
        f"train: {len(dexter_train)} samples but {len(dexter_train_y)} labels"
    )
if len(dexter_valid) != len(dexter_valid_y):
    raise ValueError(
        f"valid: {len(dexter_valid)} samples but {len(dexter_valid_y)} labels"
    )

In [None]:
# Grid of 10 evenly spaced lambda values running from 10 down to 0.01
# (presumably the regularisation strengths LogRegCCD iterates over — confirm
# in model.py).
lambdas = np.linspace(start=10, stop=0.01, num=10)

# Build the model on that grid and fit it on the training split.
model = LogRegCCD(lambdas)
model.fit(dexter_train, dexter_train_y)

# Run validation on the held-out split with the model's default measure.
model.validate(dexter_valid, dexter_valid_y)

In [None]:
# Print the lambda stored on the model as `best_lambda_` (presumably selected by
# the preceding validate() call — confirm in model.py), then draw the model's
# plot for the "roc_auc" measure on the validation split, followed by its
# coefficient plot.
print(model.best_lambda_)
model.plot(dexter_valid, dexter_valid_y, measure="roc_auc")
model.plot_coefficients()

In [None]:
# Validate and plot on the validation split using measure="balanced_accuracy".
# validate() is called first, presumably so the plots reflect the lambda
# selected under this measure — confirm in model.py.
model.validate(dexter_valid, dexter_valid_y, measure="balanced_accuracy")
model.plot(dexter_valid, dexter_valid_y, measure="balanced_accuracy")
model.plot_coefficients()

In [None]:
# Validate and plot on the validation split using measure="recall".
# validate() is called first, presumably so the plots reflect the lambda
# selected under this measure — confirm in model.py.
model.validate(dexter_valid, dexter_valid_y, measure="recall")
model.plot(dexter_valid, dexter_valid_y, measure="recall")
model.plot_coefficients()

In [None]:
# Validate and plot on the validation split using measure="precision".
# validate() is called first, presumably so the plots reflect the lambda
# selected under this measure — confirm in model.py.
model.validate(dexter_valid, dexter_valid_y, measure="precision")
model.plot(dexter_valid, dexter_valid_y, measure="precision")
model.plot_coefficients()

In [None]:
# Validate and plot on the validation split using measure="f_measure".
# validate() is called first, presumably so the plots reflect the lambda
# selected under this measure — confirm in model.py.
model.validate(dexter_valid, dexter_valid_y, measure="f_measure")
model.plot(dexter_valid, dexter_valid_y, measure="f_measure")
model.plot_coefficients()

In [None]:
# Baseline for comparison: scikit-learn logistic regression with the penalty
# switched off, fitted on the same training split. fit() returns the estimator
# itself, so construction and fitting can be chained.
modelLR = LogisticRegression(penalty=None).fit(dexter_train, dexter_train_y)

# Leave the fitted estimator as the cell's displayed value.
modelLR

In [None]:
# The notebook's import cell binds `plt` to the top-level `matplotlib` package
# (`import matplotlib as plt`), which does not provide the pyplot plotting
# functions (bar, xlabel, show, ...). Bind the pyplot interface here so this
# cell can actually draw.
import matplotlib.pyplot as plt

# Hard class predictions for the threshold-based metrics, and the predicted
# probability of class 1 for ROC-AUC.
y_pred = modelLR.predict(dexter_valid)
y_prob = modelLR.predict_proba(dexter_valid)[:, 1]

# Validation metrics of the unpenalised scikit-learn baseline.
balanced_acc = balanced_accuracy_score(dexter_valid_y, y_pred)
recall = recall_score(dexter_valid_y, y_pred, average='binary')
f1 = f1_score(dexter_valid_y, y_pred, average='binary')
roc_auc = roc_auc_score(dexter_valid_y, y_prob)
precision = precision_score(dexter_valid_y, y_pred, average='binary')

metrics = ['Balanced Accuracy', 'Recall', 'F1-Score', 'Roc-Auc', 'Precision']
values = [balanced_acc, recall, f1, roc_auc, precision]

# One bar per metric on a fixed 0..1 scale so the scores are directly comparable.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(metrics, values, color=['skyblue', 'lightgreen', 'salmon', 'pink', 'purple'])
ax.set_xlabel('Metrics')
ax.set_ylabel('Score')
ax.set_title('Model Performance Metrics')
ax.set_ylim(0, 1)
# 21 points give ticks every 0.05 (0.00, 0.05, ..., 1.00); 22 points would
# space them 1/21 apart and land on awkward values.
ax.set_yticks(np.linspace(0, 1, 21))

plt.show()