In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import numpy as np

In [7]:
from feature_extraction.CTFeatureExtraction import CTFeatureExtraction
from feature_extraction.QSOFeatureExtraction import QSOFeatureExtraction
from feature_extraction.GTPCFeatureExtraction import GTPCFeatureExtraction
from feature_extraction.GDPCFeatureExtraction import GDPCFeatureExtraction
from feature_extraction.CTDFeatureExtraction import CTDFeatureExtraction
from feature_extraction.CKSAAPFeatureExtraction import CKSAAPFeatureExtraction
from feature_extraction.AAIFeatureExtraction import AAIFeatureExtraction
from feature_extraction.DDEFeatureExtraction import DDEFeatureExtraction
from feature_extraction.DPCFeatureExtraction import DPCFeatureExtraction
from feature_extraction.KAACFeatureExtraction import KAACFeatureExtraction

# Load the dataset
data = pd.read_excel('../data/Final_2Sm_modified_with_sequences.xlsx')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder to the folding_type column and transform it to numeric labels
data['folding_type'] = label_encoder.fit_transform(data['folding_type'])

# Extract labels for model training
labels = data['folding_type'].values

# Initialize the feature extraction objects
kaac_extractor = KAACFeatureExtraction()
dpc_extractor = DPCFeatureExtraction()
dde_extractor = DDEFeatureExtraction()
aai_extractor = AAIFeatureExtraction()
cksaap_extractor = CKSAAPFeatureExtraction()
ctd_extractor = CTDFeatureExtraction()
gdpc_extractor = GDPCFeatureExtraction()
gtpc_extractor = GTPCFeatureExtraction()
qso_extractor = QSOFeatureExtraction()
ct_extractor = CTFeatureExtraction()

# Extract features using feature extracting methods
kaac_features = np.array([kaac_extractor.calculate_kaac_features(seq) for seq in data['sequence']])
dpc_features = np.array([dpc_extractor.calculate_dpc_features(seq) for seq in data['sequence']])
dde_features = np.array([dde_extractor.calculate_dde_features(seq) for seq in data['sequence']])
aai_features = np.array([aai_extractor.calculate_aai_features(seq) for seq in data['sequence']])
cksaap_features = np.array([cksaap_extractor.calculate_cksaap_features(seq) for seq in data['sequence']])
ctd_features = np.array([ctd_extractor.calculate_ctd_features(seq) for seq in data['sequence']])
gdpc_features = np.array([gdpc_extractor.calculate_gdpc_features(seq) for seq in data['sequence']])
gtpc_features = np.array([gtpc_extractor.calculate_gtpc_features(seq) for seq in data['sequence']])
qso_features = np.array([qso_extractor.calculate_qso_features(seq) for seq in data['sequence']])
ct_features = np.array([ct_extractor.calculate_ct_features(seq) for seq in data['sequence']])

# Combine the extracted features
combined_features = np.concatenate((kaac_features, dpc_features, dde_features), axis=1)

In [3]:
combined_features.shape

(141, 821)

In [6]:
from sklearn.model_selection import LeaveOneOut

# SVM with Leave-One-Out Cross-Validation (LOOCV)
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(combined_features):
    X_train, X_test = combined_features[train_index], combined_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

    # Scale the features for the current split
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = SVC(kernel='linear')
    clf.fit(X_train_scaled, y_train)
    y_pred.append(clf.predict(X_test_scaled)[0])
    y_true.append(y_test[0])

In [7]:
from feature_extraction.ClassificationMatrix import ClassificationMatrix

# Calculate and display the confusion matrix
cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

Confusion Matrix: $KAAC
[[70 19]
 [25 27]]

Accuracy (ACC): 0.69
Matthews Correlation Coefficient (MCC): 0.31

Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.79      0.76        89
           1       0.59      0.52      0.55        52

    accuracy                           0.69       141
   macro avg       0.66      0.65      0.66       141
weighted avg       0.68      0.69      0.68       141



# Kernel SVM with hyperparameter tuning & validate with LOOCV

In [10]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Define the parameter grid for grid search
param_grid = {
    'C': [2**i for i in range(-15, 16, 2)],
    'gamma': [2**i for i in range(-15, 4, 2)],
    'kernel': ['rbf']
}

# Create an SVM classifier
clf = SVC()

# Perform grid search with leave-one-out cross-validation
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=loo, n_jobs=-1, verbose=2)
grid_search.fit(scaled_features, labels)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)

# Get the best classifier
best_clf = grid_search.best_estimator_

# Perform leave-one-out cross-validation with the best classifier
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(scaled_features):
    X_train, X_test = scaled_features[train_index], scaled_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    best_clf.fit(X_train, y_train)
    y_pred.append(best_clf.predict(X_test)[0])
    y_true.append(y_test[0])

Fitting 5 folds for each of 160 candidates, totalling 800 fits
Best parameters:  {'C': 2, 'gamma': 0.00048828125, 'kernel': 'rbf'}
Best score:  0.6458128078817734


In [9]:
from feature_extraction.ClassificationMatrix import ClassificationMatrix

# Calculate and display the confusion matrix
cm = ClassificationMatrix(y_true, y_pred, 'KSVM')
cm.evaluate()

Confusion Matrix: $KSVM
[[79 10]
 [30 22]]

Accuracy (ACC): 0.72
Matthews Correlation Coefficient (MCC): 0.36

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.89      0.80        89
           1       0.69      0.42      0.52        52

    accuracy                           0.72       141
   macro avg       0.71      0.66      0.66       141
weighted avg       0.71      0.72      0.70       141



# RandomForest RF with hyperparameter tuning & validate with LOOCV

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, LeaveOneOut

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': np.arange(50, 1001, 10),  # mtree: 50 to 1000 with step size of 10
    'max_features': np.arange(1, 16, 1),      # ntry: 1 to 15 with step size of 1
    'min_samples_split': np.arange(2, 11, 1)  # msplit: 2 to 10 with step size of 1
}

# Create a Random Forest classifier
rf_clf = RandomForestClassifier(random_state=42)

# Perform grid search with leave-one-out cross-validation
grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(scaled_features, labels)

# Get the best classifier
best_rf_clf = grid_search.best_estimator_

# Perform leave-one-out cross-validation with the best classifier
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(scaled_features):
    X_train, X_test = scaled_features[train_index], scaled_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    best_rf_clf.fit(X_train, y_train)
    y_pred.append(best_rf_clf.predict(X_test)[0])
    y_true.append(y_test[0])

Fitting 3 folds for each of 12960 candidates, totalling 38880 fits


In [5]:
from feature_extraction.ClassificationMatrix import ClassificationMatrix

# Calculate and display the confusion matrix
cm = ClassificationMatrix(y_true, y_pred, 'Random Forest')
cm.evaluate()

Confusion Matrix: $KAAC
[[83  6]
 [21 31]]

Accuracy (ACC): 0.81
Matthews Correlation Coefficient (MCC): 0.58

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.93      0.86        89
           1       0.84      0.60      0.70        52

    accuracy                           0.81       141
   macro avg       0.82      0.76      0.78       141
weighted avg       0.81      0.81      0.80       141



In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
# from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from scikeras.wrappers import KerasClassifier

# Split the data into features and labels
X = combined_features
y = labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the ANN model
def create_model(optimizer='adam', activation='relu', dropout_rate=0.5, units=64):
    model = Sequential()
    model.add(Dense(units, activation=activation, input_shape=(X_train_scaled.shape[1],)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units, activation=activation))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Define the hyperparameter grid
param_grid = {
    'optimizer': ['adam', 'rmsprop'],
    'model__activation': ['relu', 'tanh'],
    'model__dropout_rate': [0.3, 0.5],
    'model__units': [32, 64]
}

# Create the KerasClassifier wrapper
model = KerasClassifier(build_fn=create_model, epochs=100, batch_size=32, verbose=0)

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get the best model and hyperparameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best hyperparameters: ", best_params)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test_scaled)
y_pred = np.round(y_pred).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1-score: {:.2f}".format(f1))

  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Best hyperparameters:  {'model__activation': 'relu', 'model__dropout_rate': 0.5, 'model__units': 64, 'optimizer': 'rmsprop'}
Accuracy: 0.72
Precision: 0.71
Recall: 0.71
F1-score: 0.71
