In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, confusion_matrix

def evaluate_model(X, y, scenario_name):
    """
    Train and score a logistic-regression classifier for one feature scenario.

    The data is split 70/30 with a fixed seed, standardized using statistics
    learned from the training part only, fitted with ``LogisticRegression``
    and evaluated on the held-out part. A text summary is printed and the
    same numbers are returned so that scenarios can be compared in code.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix (DataFrame or ndarray).
    y : array-like of shape (n_samples,)
        Binary target. Assumed to be coded 0/1 with 1 as the positive class
        (the scikit-learn default for precision/recall) -- confirm against
        the data source.
    scenario_name : str
        Label printed in the report header.

    Returns
    -------
    dict
        Keys: ``scenario``, ``n_test``, ``accuracy``, ``auc``, ``precision``,
        ``recall``, ``confusion_matrix``, ``tpr``, ``fnr``, ``fpr``, ``tnr``.
        ``auc`` is NaN when the test split contains a single class.
    """
    # Split into 70% training and 30% testing (seeded for reproducibility)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

    # Fit the scaler on the training part only so no test statistics leak in
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Fit logistic regression model (increased max_iter to ensure convergence)
    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)

    # Obtain hard predictions and the probability of the positive class
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calculate evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # ROC AUC is undefined when only one class is present in y_test;
    # report NaN instead of letting roc_auc_score raise.
    if np.unique(y_test).size > 1:
        auc = roc_auc_score(y_test, y_prob)
    else:
        auc = float("nan")

    # Pin the label order to the classes seen in training. Without `labels`
    # the matrix shrinks to 1x1 when a class is missing from both y_test and
    # y_pred, and the four-way unpacking below would fail.
    cm = confusion_matrix(y_test, y_pred, labels=model.classes_)

    # Extract TN, FP, FN, TP from the confusion matrix
    tn, fp, fn, tp = cm.ravel()

    # Rates; a zero denominator (no positives / no negatives) yields 0
    tpr = tp / (tp + fn) if (tp + fn) else 0
    fnr = fn / (tp + fn) if (tp + fn) else 0
    fpr = fp / (tn + fp) if (tn + fp) else 0
    tnr = tn / (tn + fp) if (tn + fp) else 0

    # Print results
    print(f"Scenario: {scenario_name}")
    print(f"Classification test set: [{len(y_test)}] observations, accuracy = {accuracy:.4f}, AUC = {auc:.4f}")
    print(f"Precision = {precision:.6f}, Recall = {recall:.6f}")
    print(cm)
    print(f"TPR = {tpr:.4f}, FNR = {fnr:.4f}, FPR = {fpr:.4f}, TNR = {tnr:.4f}\n")

    # Hand the numbers back so callers can tabulate or compare scenarios
    return {
        "scenario": scenario_name,
        "n_test": len(y_test),
        "accuracy": accuracy,
        "auc": auc,
        "precision": precision,
        "recall": recall,
        "confusion_matrix": cm,
        "tpr": tpr,
        "fnr": fnr,
        "fpr": fpr,
        "tnr": tnr,
    }

In [6]:
def main(data_path="VWXYZ.xlsx"):
    """
    Compare logistic-regression performance across four feature scenarios.

    Reads the workbook, takes columns 'V', 'W', 'X', 'Y', 'Z' as predictors
    and 'Binary' as the target, then calls ``evaluate_model`` on:

    1. the original predictors,
    2. degree-2 polynomial features (originals, squares and interactions),
    3. log1p-transformed predictors (these replace the originals),
    4. the polynomial and log1p features concatenated.

    Parameters
    ----------
    data_path : str or path-like, optional
        Location of the Excel workbook. Defaults to "VWXYZ.xlsx" in the
        current working directory.

    Raises
    ------
    ValueError
        If any predictor value is <= -1, where log1p is undefined.
    """
    # Step 1: Read in the data
    data = pd.read_excel(data_path)

    # Extract predictors and target variable;
    # assumes that columns are labeled 'V', 'W', 'X', 'Y', 'Z' and 'Binary'.
    X_orig = data[['V', 'W', 'X', 'Y', 'Z']]
    y = data['Binary']

    # Scenario 1: Original data only
    X1 = X_orig.copy()
    evaluate_model(X1, y, "Original Data")

    # Scenario 2: Polynomial expansion (degree 2). With include_bias=False the
    # output keeps the original columns and adds squares and pairwise products.
    poly = PolynomialFeatures(degree=2, include_bias=False)
    X_poly = poly.fit_transform(X_orig)
    evaluate_model(X_poly, y, "Polynomial Features (degree 2)")

    # Scenario 3: Log-transformed predictors only.
    # log1p handles zeros, but yields -inf at -1 and NaN below it; fail early
    # with a clear message rather than deep inside the estimator.
    if (X_orig.to_numpy() <= -1).any():
        raise ValueError(
            "log1p requires every predictor value to be greater than -1"
        )
    X_log = np.log1p(X_orig)
    evaluate_model(X_log, y, "Log Features")

    # Scenario 4: Polynomial and log features side by side.
    # X_poly and X_log are reused: evaluate_model works on copies, and both
    # transforms are deterministic, so recomputing them would change nothing.
    X_both = np.concatenate((X_poly, X_log), axis=1)
    evaluate_model(X_both, y, "Polynomial and Log Features")

In [7]:
# Run every feature scenario; each one prints its own metrics report.
main()

Scenario: Original Data
Classification test set: [30000] observations, accuracy = 0.8449, AUC = 0.9254
Precision = 0.847410, Recall = 0.850387
[[12273  2354]
 [ 2300 13073]]
TPR = 0.8504, FNR = 0.1496, FPR = 0.1609, TNR = 0.8391

Scenario: Polynomial Features (degree 2)
Classification test set: [30000] observations, accuracy = 0.8468, AUC = 0.9260
Precision = 0.850426, Recall = 0.850647
[[12327  2300]
 [ 2296 13077]]
TPR = 0.8506, FNR = 0.1494, FPR = 0.1572, TNR = 0.8428

Scenario: Log Features
Classification test set: [30000] observations, accuracy = 0.8442, AUC = 0.9236
Precision = 0.869730, Recall = 0.818643
[[12742  1885]
 [ 2788 12585]]
TPR = 0.8186, FNR = 0.1814, FPR = 0.1289, TNR = 0.8711

Scenario: Polynomial and Log Features
Classification test set: [30000] observations, accuracy = 0.8460, AUC = 0.9260
Precision = 0.848748, Recall = 0.851233
[[12295  2332]
 [ 2287 13086]]
TPR = 0.8512, FNR = 0.1488, FPR = 0.1594, TNR = 0.8406

