Import Libraries and Load Data

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from scipy.stats import randint, uniform
from collections import Counter
import warnings

# Suppress warnings for cleaner output.
# NOTE: no category or module is given, so this filter ignores every warning
# category (deprecation and convergence warnings included), not just cosmetic ones.
warnings.filterwarnings('ignore')

Load the dataset and perform initial preprocessing: drop the non-relevant columns, remove rows with missing values, and separate the features from the target variable.

In [2]:
import pandas as pd

# Name of the column that holds the label to predict
TARGET_COLUMN = 'Salmonella Typhimurium'

# Load the dataset, drop the non-relevant columns, then discard every row that
# still contains a missing value. The frame is rebuilt from the CSV on each run,
# so re-executing this cell always yields the same `df`.
df = (
    pd.read_csv('aggregated_plants.csv')
    .drop(columns=['Plant_Species', 'Reference', 'BitInterpretations'])
    .dropna()
)

# Separating features and target
features = df.drop(columns=TARGET_COLUMN)
target = df[TARGET_COLUMN]

# Spot-check that the features look binary. Only the first five columns are
# inspected here; this is a visual check, not an assertion over every column.
# `pd.Series.value_counts` replaces the deprecated top-level `pd.value_counts`.
print("Feature value counts (first 5 features):")
print(features.iloc[:, :5].apply(pd.Series.value_counts))

# Assign features and target to X and y
X = features
y = target

print(f"Total dataset size: {X.shape[0]} samples")


Feature value counts (first 5 features):
   Bit_1  Bit_2  Bit_4  Bit_8  Bit_9
0    106    170    170    170    170
1     65      1      1      1      1
Total dataset size: 171 samples


In [169]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, matthews_corrcoef,
    f1_score, make_scorer, confusion_matrix, roc_auc_score, classification_report
)
import numpy as np
import pandas as pd

# Define custom scorers
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) TN / (TN + FP) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[0, 1]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FP), or 0.0 when ``y_true`` contains no negative samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_negative = y_true == 0
    n_actual_negative = np.count_nonzero(actual_negative)  # TN + FP
    if n_actual_negative == 0:
        return 0.0

    tn = np.count_nonzero(actual_negative & (y_pred == 0))
    return tn / n_actual_negative

def npv_score(y_true, y_pred):
    """Return the negative predictive value TN / (TN + FN) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[1, 0]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FN), or 0.0 when nothing was predicted as negative.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    predicted_negative = y_pred == 0
    n_predicted_negative = np.count_nonzero(predicted_negative)  # TN + FN
    if n_predicted_negative == 0:
        return 0.0

    tn = np.count_nonzero(predicted_negative & (y_true == 0))
    return tn / n_predicted_negative

# Metrics collected for every fold. 'roc_auc' is requested by its scikit-learn
# name; all other entries wrap a metric function with make_scorer. Insertion
# order is the order in which the metrics are printed below.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    roc_auc='roc_auc',
    sensitivity=make_scorer(recall_score, zero_division=0),
    specificity=make_scorer(specificity_score),
    precision=make_scorer(precision_score, zero_division=0),
    npv=make_scorer(npv_score),
    mcc=make_scorer(matthews_corrcoef),
    f1=make_scorer(f1_score, zero_division=0),
)

# L1-penalised logistic regression fitted with the saga solver
estimator = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=0.85,
    max_iter=5000,
    random_state=0,
)

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Cross-validate on the entire dataset, keeping the training-fold scores too
cv_results = cross_validate(
    estimator=estimator,
    X=X,
    y=y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

# Report mean and standard deviation across folds for each metric
print("\nCross-validation metrics for Logistic Regression on all features:")
for metric_name in scoring:
    train_scores = cv_results['train_' + metric_name]
    test_scores = cv_results['test_' + metric_name]
    print(f"  {metric_name}:")
    print(f"    Training: {np.mean(train_scores):.4f} (+/- {np.std(train_scores):.4f})")
    print(f"    Testing: {np.mean(test_scores):.4f} (+/- {np.std(test_scores):.4f})")



Cross-validation metrics for Logistic Regression on all features:
  accuracy:
    Training: 0.8597 (+/- 0.0132)
    Testing: 0.7896 (+/- 0.0726)
  roc_auc:
    Training: 0.9412 (+/- 0.0072)
    Testing: 0.8173 (+/- 0.0577)
  sensitivity:
    Training: 0.8305 (+/- 0.0210)
    Testing: 0.7438 (+/- 0.1442)
  specificity:
    Training: 0.8899 (+/- 0.0069)
    Testing: 0.8338 (+/- 0.1357)
  precision:
    Training: 0.8864 (+/- 0.0087)
    Testing: 0.8458 (+/- 0.1135)
  npv:
    Training: 0.8355 (+/- 0.0177)
    Testing: 0.7745 (+/- 0.0899)
  mcc:
    Training: 0.7211 (+/- 0.0256)
    Testing: 0.5982 (+/- 0.1434)
  f1:
    Training: 0.8575 (+/- 0.0146)
    Testing: 0.7766 (+/- 0.0843)


In [46]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, matthews_corrcoef,
    f1_score, make_scorer, confusion_matrix, roc_auc_score, classification_report
)
import numpy as np
import pandas as pd

# Define custom scorers
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) TN / (TN + FP) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[0, 1]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FP), or 0.0 when ``y_true`` contains no negative samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_negative = y_true == 0
    n_actual_negative = np.count_nonzero(actual_negative)  # TN + FP
    if n_actual_negative == 0:
        return 0.0

    tn = np.count_nonzero(actual_negative & (y_pred == 0))
    return tn / n_actual_negative

def npv_score(y_true, y_pred):
    """Return the negative predictive value TN / (TN + FN) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[1, 0]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FN), or 0.0 when nothing was predicted as negative.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    predicted_negative = y_pred == 0
    n_predicted_negative = np.count_nonzero(predicted_negative)  # TN + FN
    if n_predicted_negative == 0:
        return 0.0

    tn = np.count_nonzero(predicted_negative & (y_true == 0))
    return tn / n_predicted_negative

# Metrics collected for every fold. 'roc_auc' is requested by its scikit-learn
# name; all other entries wrap a metric function with make_scorer. Insertion
# order is the order in which the metrics are printed below.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    roc_auc='roc_auc',
    sensitivity=make_scorer(recall_score, zero_division=0),
    specificity=make_scorer(specificity_score),
    precision=make_scorer(precision_score, zero_division=0),
    npv=make_scorer(npv_score),
    mcc=make_scorer(matthews_corrcoef),
    f1=make_scorer(f1_score, zero_division=0),
)

# Shallow gradient-boosted trees: 50 estimators of depth 2, fixed seed.
# NOTE(review): `use_label_encoder` looks deprecated in recent xgboost
# releases - confirm against the installed version before removing it.
estimator = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    max_depth=2,
    n_estimators=50,
    random_state=0,
)

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Cross-validate on the entire dataset, keeping the training-fold scores too
cv_results = cross_validate(
    estimator=estimator,
    X=X,
    y=y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

# Report mean and standard deviation across folds for each metric
print("\nCross-validation metrics for XGBoost on all features:")
for metric_name in scoring:
    train_scores = cv_results['train_' + metric_name]
    test_scores = cv_results['test_' + metric_name]
    print(f"  {metric_name}:")
    print(f"    Training: {np.mean(train_scores):.4f} (+/- {np.std(train_scores):.4f})")
    print(f"    Testing: {np.mean(test_scores):.4f} (+/- {np.std(test_scores):.4f})")



Cross-validation metrics for XGBoost on all features:
  accuracy:
    Training: 0.8933 (+/- 0.0151)
    Testing: 0.7839 (+/- 0.0277)
  roc_auc:
    Training: 0.9685 (+/- 0.0012)
    Testing: 0.8153 (+/- 0.0381)
  sensitivity:
    Training: 0.8966 (+/- 0.0385)
    Testing: 0.7562 (+/- 0.0974)
  specificity:
    Training: 0.8899 (+/- 0.0239)
    Testing: 0.8096 (+/- 0.1198)
  precision:
    Training: 0.8947 (+/- 0.0193)
    Testing: 0.8209 (+/- 0.0717)
  npv:
    Training: 0.8944 (+/- 0.0335)
    Testing: 0.7730 (+/- 0.0464)
  mcc:
    Training: 0.7878 (+/- 0.0295)
    Testing: 0.5795 (+/- 0.0509)
  f1:
    Training: 0.8949 (+/- 0.0169)
    Testing: 0.7787 (+/- 0.0333)


In [66]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, matthews_corrcoef,
    f1_score, make_scorer, confusion_matrix, roc_auc_score, classification_report
)
import numpy as np
import pandas as pd

# Define custom scorers
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) TN / (TN + FP) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[0, 1]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FP), or 0.0 when ``y_true`` contains no negative samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_negative = y_true == 0
    n_actual_negative = np.count_nonzero(actual_negative)  # TN + FP
    if n_actual_negative == 0:
        return 0.0

    tn = np.count_nonzero(actual_negative & (y_pred == 0))
    return tn / n_actual_negative

def npv_score(y_true, y_pred):
    """Return the negative predictive value TN / (TN + FN) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[1, 0]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FN), or 0.0 when nothing was predicted as negative.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    predicted_negative = y_pred == 0
    n_predicted_negative = np.count_nonzero(predicted_negative)  # TN + FN
    if n_predicted_negative == 0:
        return 0.0

    tn = np.count_nonzero(predicted_negative & (y_true == 0))
    return tn / n_predicted_negative

# Metrics collected for every fold. 'roc_auc' is requested by its scikit-learn
# name; all other entries wrap a metric function with make_scorer. Insertion
# order is the order in which the metrics are printed below.
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'roc_auc': 'roc_auc',
    'sensitivity': make_scorer(recall_score, zero_division=0),
    'specificity': make_scorer(specificity_score),
    'precision': make_scorer(precision_score, zero_division=0),
    'npv': make_scorer(npv_score),
    'mcc': make_scorer(matthews_corrcoef),
    'f1': make_scorer(f1_score, zero_division=0)
}

# L1-penalised linear SVM with regularization parameter C.
# penalty='l1' combined with loss='squared_hinge' is only supported with
# dual=False. It is set explicitly so the cell also runs on scikit-learn
# releases whose default is dual=True, which reject this combination.
estimator = LinearSVC(
    penalty='l1',
    loss='squared_hinge',
    dual=False,
    C=0.5,
    max_iter=5000,
    random_state=0,
)

# Set up the stratified k-fold cross-validation (shuffled, fixed seed)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Perform cross-validation on the entire dataset, keeping training-fold scores
cv_results = cross_validate(
    estimator, X, y,
    cv=cv, scoring=scoring, n_jobs=-1, return_train_score=True
)

# Output the mean and standard deviation across folds for each metric
print("\nCross-validation metrics for Linear SVM on all features:")
for metric_name in scoring:
    train_scores = cv_results['train_' + metric_name]
    test_scores = cv_results['test_' + metric_name]
    print(f"  {metric_name}:")
    print(f"    Training: {np.mean(train_scores):.4f} (+/- {np.std(train_scores):.4f})")
    print(f"    Testing: {np.mean(test_scores):.4f} (+/- {np.std(test_scores):.4f})")



Cross-validation metrics for Linear SVM on all features:
  accuracy:
    Training: 0.9299 (+/- 0.0169)
    Testing: 0.7313 (+/- 0.0366)
  roc_auc:
    Training: 0.9809 (+/- 0.0051)
    Testing: 0.7795 (+/- 0.0439)
  sensitivity:
    Training: 0.9255 (+/- 0.0342)
    Testing: 0.6987 (+/- 0.1666)
  specificity:
    Training: 0.9345 (+/- 0.0225)
    Testing: 0.7625 (+/- 0.1738)
  precision:
    Training: 0.9367 (+/- 0.0192)
    Testing: 0.7893 (+/- 0.1036)
  npv:
    Training: 0.9249 (+/- 0.0331)
    Testing: 0.7309 (+/- 0.0698)
  mcc:
    Training: 0.8608 (+/- 0.0340)
    Testing: 0.4893 (+/- 0.0675)
  f1:
    Training: 0.9305 (+/- 0.0170)
    Testing: 0.7166 (+/- 0.0674)


In [142]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, matthews_corrcoef,
    f1_score, make_scorer, confusion_matrix, roc_auc_score, classification_report
)
import numpy as np
import pandas as pd

# Define custom scorers
def specificity_score(y_true, y_pred):
    """Return the specificity (true-negative rate) TN / (TN + FP) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[0, 1]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FP), or 0.0 when ``y_true`` contains no negative samples.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    actual_negative = y_true == 0
    n_actual_negative = np.count_nonzero(actual_negative)  # TN + FP
    if n_actual_negative == 0:
        return 0.0

    tn = np.count_nonzero(actual_negative & (y_pred == 0))
    return tn / n_actual_negative

def npv_score(y_true, y_pred):
    """Return the negative predictive value TN / (TN + FN) for 0/1 labels.

    The counts are taken directly from the label arrays instead of indexing a
    confusion matrix, because a fold in which only one class occurs yields a
    1x1 matrix and indexing ``cm[1, 0]`` on it raises ``IndexError``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; class 0 is treated as the negative class.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    float
        TN / (TN + FN), or 0.0 when nothing was predicted as negative.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    predicted_negative = y_pred == 0
    n_predicted_negative = np.count_nonzero(predicted_negative)  # TN + FN
    if n_predicted_negative == 0:
        return 0.0

    tn = np.count_nonzero(predicted_negative & (y_true == 0))
    return tn / n_predicted_negative

# Metrics collected for every fold. 'roc_auc' is requested by its scikit-learn
# name; all other entries wrap a metric function with make_scorer. Insertion
# order is the order in which the metrics are printed below.
scoring = dict(
    accuracy=make_scorer(accuracy_score),
    roc_auc='roc_auc',
    sensitivity=make_scorer(recall_score, zero_division=0),
    specificity=make_scorer(specificity_score),
    precision=make_scorer(precision_score, zero_division=0),
    npv=make_scorer(npv_score),
    mcc=make_scorer(matthews_corrcoef),
    f1=make_scorer(f1_score, zero_division=0),
)

# Random forest with 150 trees capped at depth 6 and a fixed seed. Only these
# settings are passed; every other hyperparameter (min_samples_split,
# min_samples_leaf, max_features, bootstrap, oob_score, class_weight,
# ccp_alpha, max_samples, ...) is left at its scikit-learn default.
estimator = RandomForestClassifier(n_estimators=150, max_depth=6, random_state=0)

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Cross-validate on the entire dataset, keeping the training-fold scores too
cv_results = cross_validate(
    estimator=estimator,
    X=X,
    y=y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    return_train_score=True,
)

# Report mean and standard deviation across folds for each metric
print("\nCross-validation metrics for Random Forest on all features:")
for metric_name in scoring:
    train_scores = cv_results['train_' + metric_name]
    test_scores = cv_results['test_' + metric_name]
    print(f"  {metric_name}:")
    print(f"    Training: {np.mean(train_scores):.4f} (+/- {np.std(train_scores):.4f})")
    print(f"    Testing: {np.mean(test_scores):.4f} (+/- {np.std(test_scores):.4f})")



Cross-validation metrics for Random Forest on all features:
  accuracy:
    Training: 0.8875 (+/- 0.0148)
    Testing: 0.7776 (+/- 0.0580)
  roc_auc:
    Training: 0.9715 (+/- 0.0042)
    Testing: 0.8356 (+/- 0.0502)
  sensitivity:
    Training: 0.8679 (+/- 0.0429)
    Testing: 0.7092 (+/- 0.1271)
  specificity:
    Training: 0.9077 (+/- 0.0176)
    Testing: 0.8456 (+/- 0.1021)
  precision:
    Training: 0.9075 (+/- 0.0123)
    Testing: 0.8409 (+/- 0.0993)
  npv:
    Training: 0.8711 (+/- 0.0367)
    Testing: 0.7507 (+/- 0.0773)
  mcc:
    Training: 0.7771 (+/- 0.0287)
    Testing: 0.5725 (+/- 0.1193)
  f1:
    Training: 0.8865 (+/- 0.0178)
    Testing: 0.7594 (+/- 0.0740)
