In [138]:
import numpy as np
import pandas as pd
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt

def generate_synth_dataset(p: float, n: int, d: int, g: float, random_state=None) -> pd.DataFrame:
    """Generate a synthetic binary-classification dataset.

    Labels are drawn as Bernoulli(p). Features are d-dimensional Gaussian with
    covariance S[i, j] = g ** |i - j|. Class 0 has mean zero and class 1 has
    mean (1, 1/2, ..., 1/d).

    Args:
        p (float): class prior probability P(Y = 1)
        n (int): number of observations
        d (int): number of features (dimensions)
        g (float): correlation between neighbouring features
        random_state (int, optional): seed for a private
            ``np.random.RandomState``. When None (default) numpy's global
            random state is used, so ``np.random.seed()`` still controls
            the draw.
    Returns:
        DataFrame: n rows with feature columns f1..fd and the label column 'Y'
    """
    # The np.random module exposes the same sampling functions as a
    # RandomState instance, so either can be used through one name.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    Y = rng.binomial(1, p, size=n)

    positions = np.arange(d)
    S = np.power(float(g), np.abs(positions[:, None] - positions[None, :]))

    mean_1 = 1.0 / np.arange(1, d + 1)

    # One vectorised draw of zero-mean noise for all rows; the class-1 mean is
    # added afterwards, which is equivalent to sampling each row from its own
    # class-conditional Gaussian. The result is always (n, d), also for n == 1
    # or d == 1.
    noise = rng.multivariate_normal(np.zeros(d), S, size=n)
    X = noise + Y[:, None] * mean_1

    feature_names = [f'f{i+1}' for i in range(d)]
    dataset = pd.DataFrame(X, columns=feature_names)
    dataset['Y'] = Y

    return dataset


In [139]:
from sklearn.linear_model import LogisticRegression

# scikit-learn LogisticRegression created with its default settings.
# NOTE(review): `ban` is not referenced by any other cell shown here — confirm
# whether it is still needed.
ban = LogisticRegression()

In [140]:
SEED = 42  # fixed seed so the sampled dataset is identical on every Restart & Run All
p = 0.5   # probability of class = 1
n = 200   # number of observations
d = 10    # number of features
g = 0.5   # correlation between features

# The generator is called without its own seed, so it draws from numpy's
# global random state; seed that state right before the call.
np.random.seed(SEED)
dataset = generate_synth_dataset(p, n, d, g)
dataset.head()

         f1        f2        f3        f4        f5        f6        f7  \
0  1.076153  0.620775  0.407769  1.385197  0.300893  0.514473 -0.617612   
1 -2.671350 -1.562934 -1.138147 -0.080313 -0.041001 -0.180056  0.448981   
2 -1.312945 -0.609487 -0.864128  0.566632  0.592246  0.072353 -0.197144   
3  1.375741  1.408673  1.609643  1.116756  1.338585  0.186409  0.710947   
4 -0.193549 -0.863002  0.518264  0.372438 -0.862466 -0.836251  0.272078   

         f8        f9       f10  Y  
0 -0.253329  2.501031  0.852009  1  
1  0.556238  0.655444  0.219245  0  
2  0.264734 -0.534188  0.289655  1  
3  0.283617  0.961569  0.107822  1  
4  0.995409  1.038346  1.609608  0  


In [141]:
from sklearn.model_selection import train_test_split

# Every column except the label is a feature, so this keeps working when the
# dataset is generated with a different number of features.
feature_cols = [col for col in dataset.columns if col != 'Y']
y = dataset.Y
X = dataset[feature_cols]

# 60 / 20 / 20 split: hold out 20 % for testing first, then take 25 % of the
# remaining 80 % (0.25 x 0.8 = 0.2) for validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.25, random_state=1)


In [142]:
# Sanity check on the first training row: with an all-zero weight vector the
# linear predictor must come out as 0.
xi = X_train.iloc[0].to_numpy()
β = np.zeros(len(X_train.columns))
print(xi, β, sep="\n")
xi @ β


[ 1.87953147  1.68376044  0.11371619  0.23965426  0.03101174  2.03830738
 -0.43884787  0.84075436  0.12560052  1.13674559]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


np.float64(0.0)

In [143]:
# Training labels as a plain NumPy array.
np.asarray(y_train)

array([1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0])

In [144]:
from sklearn.metrics import roc_auc_score, average_precision_score, recall_score, precision_score, f1_score, balanced_accuracy_score

class LogRegCCD:
    """L1-penalised logistic regression fitted by cyclic coordinate descent (CCD).

    ``fit`` trains one model per value in ``lambdas``; ``validate`` picks the
    value that scores best on held-out data; ``predict_proba`` then uses that
    selected model.

    Attributes:
        lambdas: penalty strengths, in the order they are fitted.
        coef_path_ (list of ndarray): fitted coefficients, one entry per lambda.
        intercept_path_ (list of float): fitted intercepts, one entry per lambda.
        best_lambda_, best_coef_, best_intercept_: the model selected by
            ``validate``; None until ``validate`` has been called.
    """

    # Measures computed from hard 0/1 predictions (probability >= 0.5).
    _LABEL_MEASURES = ('recall', 'precision', 'f_measure', 'balanced_accuracy')
    # Measures computed from the predicted probabilities themselves.
    _PROBA_MEASURES = ('roc_auc', 'sensitivity_precision_auc')

    def __init__(self, lambdas):
        self.lambdas = lambdas
        self.coef_path_ = []
        self.intercept_path_ = []
        self.best_lambda_ = None
        self.best_coef_ = None
        self.best_intercept_ = None

    @staticmethod
    def _sigmoid(z):
        """Logistic function 1 / (1 + exp(-z)), written with tanh so that large |z| cannot overflow."""
        return 0.5 * (1.0 + np.tanh(0.5 * z))

    def fit(self, X_train, y_train, alpha=1.0, tol=1e-8, max_iter=100):
        """Train logistic regression with CCD, once for every lambda.

        For each lambda the objective

            -(1/N) * log-likelihood + (lambda * alpha) * sum_j |b_j|

        is minimised; the intercept is not penalised. Each coordinate step
        minimises a quadratic upper bound of the loss (the logistic curvature
        p * (1 - p) never exceeds 1/4), so a step can never increase the
        objective. Every lambda starts from all-zero coefficients. Calling
        ``fit`` again replaces the stored paths and clears any earlier
        ``validate`` selection.

        Args:
            X_train: array-like of shape (n_samples, n_features).
            y_train: array-like of n_samples labels in {0, 1}.
            alpha (float): multiplier applied to every lambda to form the L1
                threshold. NOTE(review): if this is meant as an elastic-net
                mixing weight, the ridge part is not implemented — confirm.
            tol (float): a lambda is considered converged when neither any
                coefficient nor the intercept moved by ``tol`` or more during
                one full sweep.
            max_iter (int): maximum number of sweeps per lambda.

        Raises:
            ValueError: if X_train is not 2-D, has no rows, or its row count
                differs from the number of labels.
        """
        X = np.asarray(X_train, dtype=float)
        y = np.asarray(y_train, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError("X_train must be 2-dimensional (n_samples, n_features)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X_train has no rows")
        if y.shape[0] != n_samples:
            raise ValueError("X_train and y_train have different numbers of rows")

        # Per-feature curvature bound (1 / (4N)) * sum_i x_ij^2. The floor keeps
        # the division below finite for an all-zero column.
        curvature = np.maximum(0.25 * np.mean(X ** 2, axis=0), 1e-10)

        coef_path = []
        intercept_path = []
        for lam in self.lambdas:
            threshold = lam * alpha
            coef = np.zeros(n_features)
            intercept = 0.0

            for _ in range(max_iter):
                coef_old = coef.copy()
                intercept_old = intercept

                # Linear predictor, rebuilt once per sweep and then kept in
                # sync incrementally after every single-coordinate change.
                eta = intercept + X @ coef

                for j in range(n_features):
                    x_j = X[:, j]
                    # (1/N) * sum_i x_ij (y_i - p_i), using up-to-date probabilities
                    grad_j = np.dot(x_j, y - self._sigmoid(eta)) / n_samples
                    new_value = self.soft_threshold(
                        curvature[j] * coef[j] + grad_j, threshold
                    ) / curvature[j]
                    delta = new_value - coef[j]
                    if delta != 0.0:
                        coef[j] = new_value
                        eta += delta * x_j

                # Unpenalised intercept: same bounded-curvature step (1 / 0.25 = 4).
                step = 4.0 * np.mean(y - self._sigmoid(eta))
                intercept += step

                coef_change = np.abs(coef - coef_old).max(initial=0.0)
                if coef_change < tol and abs(intercept - intercept_old) < tol:
                    break

            coef_path.append(coef.copy())
            intercept_path.append(float(intercept))

        # Replace (never extend) the stored paths so they always line up with
        # self.lambdas, and drop any selection made on a previous fit.
        self.coef_path_ = coef_path
        self.intercept_path_ = intercept_path
        self.best_lambda_ = None
        self.best_coef_ = None
        self.best_intercept_ = None

    def _score(self, y_true, probas, measure):
        """Score one vector of predicted probabilities with the requested measure."""
        if measure in self._LABEL_MEASURES:
            predictions = (probas >= 0.5).astype(int)
            return self.compute_measure(y_true, predictions, measure)
        if measure == 'roc_auc':
            return roc_auc_score(y_true, probas)
        if measure == 'sensitivity_precision_auc':
            return average_precision_score(y_true, probas)
        raise ValueError(f"Unknown measure: {measure}")

    def _path_scores(self, X_valid, y_valid, measure):
        """Return the score of every fitted model, in the order of the fitted path."""
        # Reject a bad measure name before doing any work.
        if measure not in self._LABEL_MEASURES + self._PROBA_MEASURES:
            raise ValueError(f"Unknown measure: {measure}")
        if len(self.coef_path_) == 0:
            raise RuntimeError("No fitted models: call fit() first")
        scores = []
        for coef, intercept in zip(self.coef_path_, self.intercept_path_):
            probas = self._sigmoid(intercept + X_valid @ coef)
            scores.append(self._score(y_valid, probas, measure))
        return scores

    def validate(self, X_valid, y_valid, measure='roc_auc'):
        """Select the lambda whose model scores highest on the validation data.

        Sets ``best_lambda_``, ``best_coef_`` and ``best_intercept_``. On ties
        the lambda that comes first in ``lambdas`` wins.

        Args:
            X_valid: validation features, shape (n_samples, n_features).
            y_valid: validation labels.
            measure (str): one of 'recall', 'precision', 'f_measure',
                'balanced_accuracy', 'roc_auc', 'sensitivity_precision_auc'.

        Raises:
            ValueError: if ``measure`` is unknown, or no lambda produced a
                comparable (non-NaN) score.
            RuntimeError: if ``fit`` has not been called.
        """
        scores = self._path_scores(X_valid, y_valid, measure)

        best_index = None
        best_score = -np.inf
        for index, score in enumerate(scores):
            if score > best_score:
                best_index = index
                best_score = score
        if best_index is None:
            raise ValueError(f"No valid {measure} score was produced for any lambda")

        self.best_lambda_ = list(self.lambdas)[best_index]
        self.best_coef_ = self.coef_path_[best_index]
        self.best_intercept_ = self.intercept_path_[best_index]

    def predict_proba(self, X_test):
        """Return P(y = 1 | x) for each row of X_test using the model chosen by ``validate``.

        Raises:
            RuntimeError: if no model has been selected yet.
        """
        if self.best_coef_ is None or self.best_intercept_ is None:
            raise RuntimeError(
                "No model selected: call fit() and validate() before predict_proba()"
            )
        return self._sigmoid(self.best_intercept_ + X_test @ self.best_coef_)

    def plot(self, X_valid, y_valid, measure='roc_auc'):
        """Plot performance measure vs lambda"""
        scores = self._path_scores(X_valid, y_valid, measure)

        fig, ax = plt.subplots(figsize=(8, 5))
        ax.plot(self.lambdas, scores, marker='o', linestyle='-')
        ax.set_xscale("log")
        ax.set_xlabel("Lambda")
        ax.set_ylabel(measure)
        ax.set_title(f"Performance Measure ({measure}) vs Lambda")
        ax.grid(True)
        plt.show()

    def plot_coefficients(self):
        """Plot coefficient paths vs lambda"""
        if len(self.coef_path_) == 0:
            raise RuntimeError("No fitted models: call fit() first")
        # Rows become features, columns become lambdas.
        coef_paths = np.array(self.coef_path_).T

        fig, ax = plt.subplots(figsize=(10, 6))
        for coef in coef_paths:
            ax.plot(self.lambdas, coef, linestyle='-', marker='.')
        ax.set_xscale("log")
        ax.set_xlabel("Lambda")
        ax.set_ylabel("Coefficient Value")
        ax.set_title("Coefficient Paths vs Lambda")
        ax.grid(True)
        plt.show()

    def soft_threshold(self, z, gamma):
        """Soft-thresholding operator: shrink z towards zero by gamma, returning 0 when |z| <= gamma."""
        if z > gamma:
            return z - gamma
        elif z < -gamma:
            return z + gamma
        else:
            return 0.0

    def compute_measure(self, y_true, y_pred, measure):
        """Compute a classification measure from hard 0/1 predictions.

        Raises:
            ValueError: if ``measure`` is not one of 'recall', 'precision',
                'f_measure', 'balanced_accuracy'.
        """
        if measure == "recall":
            return recall_score(y_true, y_pred)
        elif measure == "precision":
            return precision_score(y_true, y_pred)
        elif measure == "f_measure":
            return f1_score(y_true, y_pred)
        elif measure == "balanced_accuracy":
            return balanced_accuracy_score(y_true, y_pred)
        else:
            raise ValueError(f"Unknown measure: {measure}")

In [145]:
# 50 evenly spaced lambda values, from 0.1 down to 0.003
lambdas = np.linspace(0.1, 0.003, 50)
model = LogRegCCD(lambdas=lambdas)
model.fit(X_train, y_train)
# Validate and select the best lambda (validate() defaults to measure='roc_auc')
model.validate(X_val, y_val)
# Validation precision for every lambda, then the coefficient paths.
# NOTE(review): the saved output of this cell is
# "TypeError: 'float' object is not callable" with no traceback, so the failing
# call cannot be identified from here — possibly a callable name rebound to a
# float in the live kernel; re-run from a fresh kernel to confirm.
model.plot(X_val, y_val, measure="precision")
model.plot_coefficients()

TypeError: 'float' object is not callable