# Baseline Classifiers for TF–IDF Features
## 10-Fold Cross-Validation of Protein Sequence Classifiers Using 2-mer TF–IDF Features

In [None]:
import random
from collections import Counter
from itertools import product

import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, roc_curve, precision_recall_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.activations import swish
from tensorflow.keras.callbacks import  History
from tensorflow.keras.layers import Input
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.utils import plot_model

In [None]:
# Shuffled 10-fold splitter; the fixed seed makes fold membership reproducible.
kf = KFold(shuffle=True, n_splits=10, random_state=42)

# (display name, estimator) pairs. Every fold fits and scores them in this
# order, and the display name is what ends up in the results table.
classifiers = [
    (
        "Decision Tree",
        DecisionTreeClassifier(random_state=42),
    ),
    (
        "SVM",
        SVC(
            C=10,
            kernel='rbf',
            gamma='auto',
            probability=True,
            random_state=42,
        ),
    ),
    (
        "Random Forest",
        RandomForestClassifier(random_state=42, n_estimators=500),
    ),
    (
        "MLP",
        MLPClassifier(
            hidden_layer_sizes=(200, 45),
            solver='adam',
            activation='relu',
            max_iter=200,
            batch_size=128,
            random_state=42,
        ),
    ),
    (
        "Naive Bayes",
        GaussianNB(),
    ),
    (
        "Logistic Regression",
        LogisticRegression(random_state=42, max_iter=1000),
    ),
    (
        "K-Nearest Neighbors",
        KNeighborsClassifier(n_neighbors=5),
    ),
    (
        "Gradient Boosting",
        GradientBoostingClassifier(random_state=42),
    ),
]

def _extract_kmers(seq, kmer_length):
    """Return every overlapping window of ``kmer_length`` characters in ``seq``.

    A sequence shorter than ``kmer_length`` yields an empty list.
    """
    return [seq[i:i + kmer_length] for i in range(len(seq) - kmer_length + 1)]


def _tfidf_matrix(sequences, kmer_length, feature_names, kmer_to_idx, idf_vector):
    """Build the L2-normalised TF-IDF matrix for ``sequences``.

    Rows follow the order of ``sequences`` and columns follow
    ``feature_names``. K-mers that are not in ``kmer_to_idx`` are ignored.
    A row whose raw TF-IDF vector is all zeros is left as zeros rather than
    divided by a zero norm.
    """
    matrix = np.zeros((len(sequences), len(feature_names)))
    for row, seq in enumerate(sequences):
        tf_counts = Counter(
            kmer for kmer in _extract_kmers(seq, kmer_length) if kmer in kmer_to_idx
        )
        tf_vector = np.array([tf_counts.get(name, 0) for name in feature_names])
        raw_tfidf = tf_vector * idf_vector
        norm = np.linalg.norm(raw_tfidf, ord=2)
        matrix[row] = raw_tfidf / norm if norm > 0 else raw_tfidf
    return matrix


def compute_tfidf_for_fold(train_sequences, test_sequences, kmer_length=2, predefined_aa=tuple('ACDEFGHIKLMNPQRSTUVWY')):
    """Compute k-mer TF-IDF features for one cross-validation fold.

    The vocabulary is every possible k-mer over ``predefined_aa`` (441
    features for the default 21-letter alphabet and ``kmer_length=2``).
    Document frequencies, and therefore the IDF weights, are estimated from
    ``train_sequences`` only and then applied to both splits, so no
    information from the test split enters the weighting.

    IDF is ``log((N_train + 1) / (df + 1))``. A k-mer that occurs in every
    training sequence therefore gets weight 0 and contributes nothing to
    the features.

    Args:
        train_sequences: Sequence strings used to fit the IDF weights.
        test_sequences: Sequence strings transformed with those weights.
        kmer_length: Length of each k-mer; must be at least 1.
        predefined_aa: Iterable of single-character symbols that defines
            the alphabet. It is only read, never modified.

    Returns:
        A tuple ``(tfidf_train, tfidf_test, feature_names)``. The two
        matrices are float arrays of shape ``(n_sequences, n_features)``
        with L2-normalised rows (all-zero rows stay zero), and
        ``feature_names`` lists the k-mer for each column.

    Raises:
        ValueError: If ``kmer_length`` is smaller than 1.
    """
    if kmer_length < 1:
        raise ValueError(f"kmer_length must be >= 1, got {kmer_length}")

    feature_names = [''.join(km) for km in product(predefined_aa, repeat=kmer_length)]
    kmer_to_idx = {kmer: idx for idx, kmer in enumerate(feature_names)}

    # Document frequency: number of training sequences containing each k-mer
    # at least once.
    df_vector = np.zeros(len(feature_names))
    for seq in train_sequences:
        # The keys view intersects directly with a set, so the vocabulary
        # does not have to be rebuilt as a set for every sequence.
        for kmer in set(_extract_kmers(seq, kmer_length)) & kmer_to_idx.keys():
            df_vector[kmer_to_idx[kmer]] += 1

    idf_vector = np.log((len(train_sequences) + 1) / (df_vector + 1))

    tfidf_train = _tfidf_matrix(train_sequences, kmer_length, feature_names, kmer_to_idx, idf_vector)
    tfidf_test = _tfidf_matrix(test_sequences, kmer_length, feature_names, kmer_to_idx, idf_vector)

    return tfidf_train, tfidf_test, feature_names

# 21-letter alphabet (the 20 standard residue codes plus 'U') from which the
# k-mer vocabulary is built.
my_amino_acids = list('ACDEFGHIKLMNPQRSTUVWY')

# Each call returns (ids, sequences) for one class.
sequenceMP_ids, sequencesMP = read_sequences_from_csv(
    "/content/drive/MyDrive/MP_Prediction_MB/MPFit/MP_clean.csv"
)
sequenceNonMP_ids, sequencesNonMP = read_sequences_from_csv(
    "/content/drive/MyDrive/MP_Prediction_MB/MPFit/Non_MP_clean.csv"
)

# MP sequences come first and get label 1; Non-MP sequences follow with
# label 0. Arrays are used so the fold index arrays can slice them directly.
all_sequences = np.array(sequencesMP + sequencesNonMP)
all_labels = np.array([1] * len(sequencesMP) + [0] * len(sequencesNonMP))

# One record per (fold, classifier) pair, filled in by the loop below.
results = []

for fold, (train_idx, test_idx) in enumerate(kf.split(all_sequences)):
    print(f"\n=== Fold {fold + 1}/10 ===")

    X_train_fold = all_sequences[train_idx]
    X_test_fold = all_sequences[test_idx]
    y_train_fold = all_labels[train_idx]
    y_test_fold = all_labels[test_idx]

    print(f"Training samples: {len(X_train_fold)}, Test samples: {len(X_test_fold)}")

    # Features are recomputed for every fold from that fold's own split.
    X_train_tfidf, X_test_tfidf, feature_names = compute_tfidf_for_fold(
        X_train_fold,
        X_test_fold,
        kmer_length=2,
        predefined_aa=my_amino_acids,
    )

    print(f"TF-IDF features computed. Training shape: {X_train_tfidf.shape}, Test shape: {X_test_tfidf.shape}")

    for clf_name, clf in classifiers:
        clf.fit(X_train_tfidf, y_train_fold)

        # Hard labels feed the threshold metrics; column 1 of predict_proba
        # is the positive-class score used by the ranking metrics.
        y_pred = clf.predict(X_test_tfidf)
        y_pred_proba = clf.predict_proba(X_test_tfidf)[:, 1]

        # Threshold-based metrics
        accuracy = accuracy_score(y_test_fold, y_pred)
        precision = precision_score(y_test_fold, y_pred, average='binary')
        recall = recall_score(y_test_fold, y_pred, average='binary')
        f1 = f1_score(y_test_fold, y_pred, average='binary')

        # Ranking-based metrics (AUPR is reported as average precision)
        auc = roc_auc_score(y_test_fold, y_pred_proba)
        aupr = average_precision_score(y_test_fold, y_pred_proba)

        results.append(
            dict(
                fold=fold + 1,
                classifier=clf_name,
                accuracy=accuracy,
                precision=precision,
                recall=recall,
                f1_score=f1,
                auc=auc,
                aupr=aupr,
            )
        )

        print(f"{clf_name}: Acc = {accuracy:.4f}, Prec = {precision:.4f}, Rec = {recall:.4f}, F1 = {f1:.4f}, AUC = {auc:.4f}, AUPR = {aupr:.4f}")

# One row per (fold, classifier) pair.
results_df = pd.DataFrame(results)

# Metric column -> display label, in the order used by the summary table.
metric_labels = {
    'accuracy': 'Accuracy',
    'precision': 'Precision',
    'recall': 'Recall',
    'f1_score': 'F1-Score',
    'auc': 'AUC',
    'aupr': 'AUPR',
}

# Mean and standard deviation across folds for each classifier. groupby
# sorts the classifier names, so rows are alphabetical rather than in the
# order of the `classifiers` list.
mean_results = results_df.groupby('classifier').agg(
    {metric: ['mean', 'std'] for metric in metric_labels}
).round(4)

print("\n" + "="*80)
print("CROSS-VALIDATION RESULTS SUMMARY")
print("="*80)
print(mean_results)

print("\nDetailed results by fold:")
print(results_df)

results_df.to_csv('cross_validation_results_with_auc_aupr.csv', index=False)

# Build the "mean±std" strings by iterating over the values. The index holds
# classifier names, so looking a row up with an integer (`series[i]`) relies
# on a positional fallback that pandas has deprecated.
summary_columns = {'Classifier': mean_results.index}
for metric, label in metric_labels.items():
    means = mean_results[(metric, 'mean')]
    stds = mean_results[(metric, 'std')]
    summary_columns[f"{label} (mean±std)"] = [
        f"{mean:.4f}±{std:.4f}" for mean, std in zip(means, stds)
    ]
summary_table = pd.DataFrame(summary_columns)

print("\n" + "="*100)
print("FORMATTED SUMMARY TABLE")
print("="*100)
print(summary_table.to_string(index=False))