In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

In [37]:
# Load the merged dataset from the Excel file in the working directory.
# NOTE(review): the head() preview lists an 'Unnamed: 0' column, which looks
# like a row index written out when the file was saved -- confirm, and if so
# consider reading with index_col=0 so it is not treated as a feature.
df = pd.read_excel('merged_df.xlsx')

In [38]:
# Preview the first rows to check the column names and value ranges after loading.
df.head()

Unnamed: 0,TSH,FTI,TT4,T3,query hypothyroid,on thyroxine,sex,pregnant,T4U,psych,goitre,on antithyroid medication,thyroid surgery,query hyperthyroid,query on thyroxine,label
0,1.895,95.3,158.2,3.955556,0.1,0.0,0.0,0.2,1.662,0.0,0.0,0.0,0.0,0.3,0.0,1
1,2.795882,102.923077,166.666667,2.994118,0.0,0.277778,0.166667,0.166667,1.606923,0.055556,0.0,0.055556,0.0,0.055556,0.055556,1
2,1.436667,127.714286,178.714286,2.7,0.142857,0.571429,0.0,0.0,1.4,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1.533636,118.0,169.5,2.79,0.0,0.333333,0.1,0.166667,1.439167,0.0,0.0,0.0,0.0,0.083333,0.0,1
4,1.192619,125.761905,150.047619,2.36,0.047619,0.142857,0.238095,0.0,1.193333,0.238095,0.0,0.0,0.047619,0.047619,0.0,1


In [39]:
# Features are every column except the target 'label'.
# The head() preview lists an 'Unnamed: 0' column that looks like a saved row
# index rather than a measurement; drop it if present so it cannot act as a
# feature (errors='ignore' makes this a no-op when the column does not exist).
X = df.drop(columns=['label']).drop(columns=['Unnamed: 0'], errors='ignore')
y = df['label']

In [40]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [41]:
def build_basis(X):
    """Return the basis matrix ``[1, |X|]`` for a 2-D feature table.

    A leading column of ones (the bias term) is stacked in front of the
    element-wise absolute values of ``X``.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape``, laid out as (n_samples, n_features)

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, 1 + n_features).
    """
    bias = np.ones((X.shape[0], 1))
    return np.hstack((bias, np.abs(X)))

In [42]:
# Expand both splits into the [1, |x|] basis (train first, then test).
Phi_train, Phi_test = build_basis(X_train), build_basis(X_test)

In [44]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import time

# ------------------------------------------------------
# 1. Load / preprocess dataset
# ------------------------------------------------------

# df should be your final processed DataFrame with features + target
# Example: df has 15 features + 1 target = 16 columns

# Replace missing values with 0 (paper's behavior).
# fillna returns a new DataFrame, so the previously loaded frame object is not
# modified in place and no explicit copy is needed.
df = df.fillna(0)

# Features are every column except the target 'label'.
# The head() preview lists an 'Unnamed: 0' column that looks like a saved row
# index rather than a measurement; drop it if present so it cannot act as a
# feature (errors='ignore' makes this a no-op when the column does not exist).
X = (
    df.drop(columns=['label'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
    .values
)                                      # shape (n_samples, n_features)
y = df['label'].values                # shape (n_samples,)

# ------------------------------------------------------
# 2. Train/test split
# ------------------------------------------------------
# 80/20 split, stratified on the label so both parts keep the same class
# proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# ------------------------------------------------------
# 3. Build basis matrix Φ = [1, |x|]
# ------------------------------------------------------

def build_basis(X):
    """Return the basis matrix ``[1, |X|]`` for a 2-D feature array.

    A leading column of ones (the bias term) is stacked in front of the
    element-wise absolute values of ``X``.

    Parameters
    ----------
    X : 2-D array-like exposing ``.shape``, laid out as (n_samples, n_features)

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samples, 1 + n_features).
    """
    bias = np.ones((X.shape[0], 1))
    return np.hstack((bias, np.abs(X)))

# Map each split through the same basis expansion (train first, then test).
Phi_train, Phi_test = build_basis(X_train), build_basis(X_test)

# ------------------------------------------------------
# 4. Train BLS using whole-batch pseudoinverse
# ------------------------------------------------------

# w = pinv(Phi_train) @ y_train  (minimum-norm least-squares solution).
# In exact arithmetic this equals (Phi_train^T Phi_train)^+ Phi_train^T y, but
# applying the pseudoinverse to Phi_train directly avoids forming
# Phi_train^T Phi_train, whose condition number is the square of Phi_train's.
# perf_counter() is the high-resolution clock intended for timing short
# intervals; only the difference t1 - t0 is meaningful.
t0 = time.perf_counter()
w = np.linalg.pinv(Phi_train) @ y_train
t1 = time.perf_counter()

# ------------------------------------------------------
# 5. Evaluate on train and test sets
# ------------------------------------------------------
def _report_split(split_name, y_true, y_score, threshold=0.5):
    """Threshold continuous scores, print the metric report, return the results.

    Parameters
    ----------
    split_name : str
        Text placed in the banner line, e.g. "Train Set".
    y_true : array-like
        Ground-truth labels for the split.
    y_score : numpy.ndarray
        Continuous model outputs (``Phi @ w``) for the same rows.
    threshold : float, default 0.5
        Scores greater than or equal to this value are predicted as class 1,
        all others as class 0.

    Returns
    -------
    tuple
        ``(y_hat, accuracy, precision, recall, f1, confusion)`` where
        ``y_hat`` holds the thresholded 0/1 predictions.
    """
    y_hat = (y_score >= threshold).astype(int)

    accuracy = accuracy_score(y_true, y_hat)
    precision = precision_score(y_true, y_hat, zero_division=0)
    recall = recall_score(y_true, y_hat, zero_division=0)
    f1_value = f1_score(y_true, y_hat, zero_division=0)
    confusion = confusion_matrix(y_true, y_hat)

    print(f"========= BLS Results ({split_name}) =========")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1_value:.4f}")
    print("\nConfusion Matrix:")
    print(confusion)
    print("\nClassification Report:")
    # NOTE(review): the class names below are kept unchanged, but the feature
    # columns (TSH, T3, TT4, ...) look thyroid-related -- confirm that
    # 'Benign'/'Malignant' really describe what labels 0/1 mean in this data.
    print(classification_report(y_true, y_hat, target_names=['Benign', 'Malignant'], zero_division=0))

    return y_hat, accuracy, precision, recall, f1_value, confusion


# Training-set report first (shows how well the fit matches the data it saw).
y_pred_train_cont = Phi_train @ w
y_pred_train, acc, prec, rec, f1, cm = _report_split("Train Set", y_train, y_pred_train_cont)

# Held-out report; acc/prec/rec/f1/cm end up holding the test-set values.
y_pred_cont = Phi_test @ w                # continuous predictions
y_pred, acc, prec, rec, f1, cm = _report_split("Test Set", y_test, y_pred_cont)

print(f"BLS training time: {t1 - t0:.8f} seconds")


Accuracy:  0.9250
Precision: 0.9381
Recall:    0.9681
F1 Score:  0.9529

Confusion Matrix:
[[20  6]
 [ 3 91]]

Classification Report:
              precision    recall  f1-score   support

      Benign       0.87      0.77      0.82        26
   Malignant       0.94      0.97      0.95        94

    accuracy                           0.93       120
   macro avg       0.90      0.87      0.88       120
weighted avg       0.92      0.93      0.92       120

Accuracy:  0.8667
Precision: 0.9167
Recall:    0.9167
F1 Score:  0.9167

Confusion Matrix:
[[ 4  2]
 [ 2 22]]

Classification Report:
              precision    recall  f1-score   support

      Benign       0.67      0.67      0.67         6
   Malignant       0.92      0.92      0.92        24

    accuracy                           0.87        30
   macro avg       0.79      0.79      0.79        30
weighted avg       0.87      0.87      0.87        30

BLS training time: 0.00127006 seconds
