# Baseline Classifiers for BF & ESM
## 10-Fold Cross-Validation of Protein Sequence Classifiers Using ESM & BF Features.

In [None]:
# Load the positive (MP) and negative (Non-MP) BF feature tables and tag each
# row with its class label (1 = MP, 0 = Non-MP).
# NOTE(review): an earlier revision dropped the first two columns of each table
# (`iloc[:, 2:]`); confirm the "selected" CSVs contain feature columns only.
Posdata = pd.read_csv(
    '/content/drive/MyDrive/MP_Prediction_MB/MPFit/BF/BF_MP_selected.csv'
).assign(label=1)
Negdata = pd.read_csv(
    '/content/drive/MyDrive/MP_Prediction_MB/MPFit/BF/BF_Non_MP_selected.csv'
).assign(label=0)

# Stack positives on top of negatives and renumber the rows 0..n-1.
combined_data = pd.concat([Posdata, Negdata], ignore_index=True)

# Every column except 'label' is treated as a feature.
all_labels = combined_data['label'].to_numpy()
all_sequences = combined_data.drop(columns='label').to_numpy()

print(f"Total samples: {len(combined_data)}")
print(f"Positive samples (MP): {len(Posdata)}")
print(f"Negative samples (Non-MP): {len(Negdata)}")
print(f"Feature shape: {all_sequences.shape}")

In [None]:
# Expose the feature matrix and label vector under the short names X / y.
X, y = all_sequences, all_labels

In [None]:
import pandas as pd

# Load the TF-IDF k-mer feature table.
df = pd.read_csv(
    '/content/drive/MyDrive/MP_Prediction_MB/MPFit/Tfidf/tfidf_kmer1_features.csv'
)

# Everything from the fourth column onwards is used as a feature.
feature_columns = df.columns[3:]
X = df[feature_columns].to_numpy()

# Encode the string class labels as integers: MP -> 1, Non-MP -> 0.
y = df['label'].map({'MP': 1, 'Non-MP': 0}).to_numpy()

# Feature table with the encoded label appended as the 'label' column.
processed_data = pd.DataFrame(X, columns=feature_columns).assign(label=y)

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # GBDT imported
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier # KNN imported


# data_combined_train = data_combined_train.drop(columns=['seq'])
# X = data_combined_train.drop('label', axis=1)
# y = data_combined_train['label']


# scaler = MinMaxScaler()
# X_scaled = scaler.fit_transform(X)

# Out-of-fold evaluation needs a fold splitter and a way to copy an unfitted
# estimator; imported here so the cell stays self-contained.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Number of cross-validation folds used for the reported metrics.
N_SPLITS = 10

# Baseline models as (display name, unfitted estimator) pairs.
# SVC is built with probability=True so that predict_proba is available.
classifiers = [
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("SVM", SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=500, random_state=42)),
    ("MLP", MLPClassifier(hidden_layer_sizes=(200, 45),
                         activation='relu',
                         solver='adam',
                         batch_size=128,
                         max_iter=200,
                         random_state=42)),
    ("Naive Bayes", GaussianNB()),
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("K-Nearest Neighbors", KNeighborsClassifier(n_neighbors=5)),
    ("Gradient Boosting", GradientBoostingClassifier(random_state=42)),
]

# One dict of metrics per classifier, in the order of `classifiers`.
results = []

# Fold slicing below indexes rows by integer position, which needs arrays.
X_cv = np.asarray(X)
y_cv = np.asarray(y)

# Build the folds once so every classifier is scored on identical splits.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
folds = list(cv.split(X_cv, y_cv))

print("--- Starting Classifier Evaluation ---")
for name, clf in classifiers:
    # Collect out-of-fold predictions: each sample is predicted by a model
    # that never saw it during fitting, so the metrics are not inflated by
    # scoring on the training rows.
    y_pred = np.empty_like(y_cv)
    y_pred_proba = np.empty(len(y_cv), dtype=float)
    for train_idx, test_idx in folds:
        fold_clf = clone(clf)  # fresh, unfitted copy with the same parameters
        fold_clf.fit(X_cv[train_idx], y_cv[train_idx])
        y_pred[test_idx] = fold_clf.predict(X_cv[test_idx])
        # Column 1 holds the probability of the positive class (label 1).
        y_pred_proba[test_idx] = fold_clf.predict_proba(X_cv[test_idx])[:, 1]

    # Performance metrics over the pooled out-of-fold predictions.
    accuracy = accuracy_score(y_cv, y_pred)
    precision = precision_score(y_cv, y_pred, zero_division=0)
    recall = recall_score(y_cv, y_pred, zero_division=0)
    f1 = f1_score(y_cv, y_pred, zero_division=0)
    auroc = roc_auc_score(y_cv, y_pred_proba)
    auprc = average_precision_score(y_cv, y_pred_proba)

    # Fit the listed estimator itself on the full dataset so that code which
    # later iterates over `classifiers` receives fitted models.
    clf.fit(X, y)

    # Store the results
    results.append({
        'Classifier': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1': f1,
        'AUROC': auroc,
        'AUPRC': auprc
    })

    print(f"{name}:")
    print("-" * 50)
    print(f"Accuracy: {accuracy:.3f}")
    print(f"Precision: {precision:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"F1: {f1:.3f}")
    print(f"AUROC: {auroc:.3f}")
    print(f"AUPRC: {auprc:.3f}")
    print("=" * 80 + "\n")

In [None]:
# Per-class views of the external dataframe (rows labelled 'MP' / 'Non-MP').
df_ext11 = df_ext1[df_ext1['label']=='MP']
df_ext12 = df_ext1[df_ext1['label']=='Non-MP']

from sklearn.metrics import confusion_matrix, classification_report

# Evaluate every classifier on the external test set exactly once.
# This cell never calls fit(): the estimators in `classifiers` must already
# have been fitted before it runs.
trained_models = {}    # display name -> estimator
results_external = []  # one metrics dict per classifier

for name, clf in classifiers:
    trained_models[name] = clf
    y_pred_test = clf.predict(X_ext)
    # Column 1 holds the probability of the positive class (label 1).
    y_pred_proba_test = clf.predict_proba(X_ext)[:, 1]

    accuracy_test = accuracy_score(y_ext, y_pred_test)
    precision_test = precision_score(y_ext, y_pred_test, zero_division=0)
    recall_test = recall_score(y_ext, y_pred_test, zero_division=0)
    f1_test = f1_score(y_ext, y_pred_test, zero_division=0)
    auroc_test = roc_auc_score(y_ext, y_pred_proba_test)
    auprc_test = average_precision_score(y_ext, y_pred_proba_test)

    # Calculate confusion matrix
    cm = confusion_matrix(y_ext, y_pred_test)

    # For binary classification, extract TN, FP, FN, TP
    # (row-major order of the 2x2 matrix).
    tn, fp, fn, tp = cm.ravel()

    results_external.append({
        'Classifier': name,
        'Accuracy_Test': accuracy_test,
        'Precision_Test': precision_test,
        'Recall_Test': recall_test,
        'F1_Test': f1_test,
        'AUROC_Test': auroc_test,
        'AUPRC_Test': auprc_test,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'TP': tp
    })

    print(f"{name} - External Test Results:")
    print("-" * 50)
    print(f"Accuracy: {accuracy_test:.3f}")
    print(f"Precision: {precision_test:.3f}")
    print(f"Recall: {recall_test:.3f}")
    print(f"F1: {f1_test:.3f}")
    print(f"AUROC: {auroc_test:.3f}")
    print(f"AUPRC: {auprc_test:.3f}")
    print("\nConfusion Matrix:")
    print(f"True Negatives: {tn}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"True Positives: {tp}")
    print("\nConfusion Matrix (array form):")
    print(cm)
    print("=" * 80 + "\n")

# One summary table for all classifiers, printed and saved a single time.
results_external_df = pd.DataFrame(results_external)
print("\nSummary of all classifiers on external test dataset:")
print(results_external_df.to_string(index=False))

results_external_df.to_csv('classifier_results_external_test.csv', index=False)
print("\nExternal test results saved to 'classifier_results_external_test.csv'")