In [4]:
from collections import Counter

class KAACFeatureExtraction:
    """Build KAAC feature vectors: amino acid composition plus sequence length.

    Each feature vector has one relative-frequency entry per letter of
    ``self.amino_acids`` (in that order) followed by the sequence length.
    """

    def __init__(self):
        # One-letter codes of the 20 standard amino acids; the order of this
        # string fixes the column order of the composition features.
        self.amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

    def calculate_kaac_features(self, sequence):
        """
        Calculate the KAAC features for a given protein sequence.

        :param sequence: The protein sequence as a string.
        :return: A list with the AAC frequencies (one per entry of
                 ``self.amino_acids``) followed by ``len(sequence)``.
                 An empty sequence yields all-zero frequencies and length 0.
        """
        # Calculate the amino acid composition (AAC)
        aac_features = self.calculate_aac_features(sequence)

        # Calculate the sequence length (K)
        sequence_length = len(sequence)

        # Combine AAC features with sequence length (length is the last column)
        kaac_features = aac_features + [sequence_length]

        return kaac_features

    def calculate_aac_features(self, sequence):
        """
        Calculate the amino acid composition (AAC) features.

        The denominator is the total number of characters in ``sequence``,
        so characters outside ``self.amino_acids`` (e.g. ``X`` or lowercase
        letters) lower the listed frequencies without getting a column.

        :param sequence: The protein sequence as a string.
        :return: A list of AAC feature values, one per entry of
                 ``self.amino_acids``; all zeros for an empty sequence.
        """
        # Count the occurrences of each character in the sequence
        amino_acid_counts = Counter(sequence)

        # Calculate the total number of residues in the sequence
        total_amino_acids = sum(amino_acid_counts.values())

        # An empty sequence has no composition; return zeros instead of
        # raising ZeroDivisionError so one bad row does not abort a batch.
        if total_amino_acids == 0:
            return [0.0] * len(self.amino_acids)

        # Calculate the normalized frequency of each amino acid
        aac_features = [amino_acid_counts.get(aa, 0) / total_amino_acids for aa in self.amino_acids]

        return aac_features

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, matthews_corrcoef
from sklearn.model_selection import LeaveOneOut
import numpy as np

# Read the spreadsheet holding the sequences and their folding-type labels
data = pd.read_excel('../data/Final_2Sm_modified_with_sequences.xlsx')

# Encode the folding_type classes as integers; the column is overwritten
# with the encoded values, and the fitted encoder is kept for later use.
label_encoder = LabelEncoder()
label_encoder.fit(data['folding_type'])
data['folding_type'] = label_encoder.transform(data['folding_type'])

# Target vector for model training (numeric class labels)
labels = data['folding_type'].values

# Feature matrix: one KAAC vector (AAC frequencies + sequence length) per row
feature_extraction = KAACFeatureExtraction()
features = np.array(list(map(feature_extraction.calculate_kaac_features, data['sequence'])))



# SUPPORT VECTOR MACHINE (SVM) Implementation

In [None]:
# Linear SVM evaluated with Leave-One-Out Cross-Validation (LOOCV):
# every sample is predicted once by a model trained on all the others.
loo = LeaveOneOut()
y_true = []
y_pred = []
for train_index, test_index in loo.split(features):
    X_train = features[train_index]
    y_train = labels[train_index]
    X_test = features[test_index]
    y_test = labels[test_index]

    # A fresh classifier per fold, so nothing carries over between folds
    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    # Each test fold holds exactly one sample, hence the [0]
    y_true.append(y_test[0])
    y_pred.append(clf.predict(X_test)[0])

In [8]:
# Summarize the LOOCV predictions collected in y_true / y_pred.
# ClassificationMatrix is imported from a module of the same name that is not
# shown in this notebook; judging by the recorded output, evaluate() prints
# the confusion matrix, accuracy, MCC and the classification report.
from ClassificationMatrix import ClassificationMatrix

# The third argument ('KAAC') appears in the report header of the recorded output.
cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

Confusion Matrix: $KAAC
[[81  8]
 [22 30]]

Accuracy (ACC): 0.79
Matthews Correlation Coefficient (MCC): 0.53

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.91      0.84        89
           1       0.79      0.58      0.67        52

    accuracy                           0.79       141
   macro avg       0.79      0.74      0.76       141
weighted avg       0.79      0.79      0.78       141


# RANDOM FOREST (RF) Implementation with Hyperparameter Tuning

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter candidates for the Random Forest (3 * 3 * 3 * 3 = 81 combinations)
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator; the fixed seed keeps the search reproducible
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by accuracy with 3-fold cross-validation
grid_search = GridSearchCV(rf, rf_param_grid, scoring='accuracy', cv=3)
grid_search.fit(features, labels)

# Keep the winning combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)


Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best score:  0.7588652482269503


In [None]:
# Random Forest with the grid-search winners, evaluated by LOOCV.
# NOTE(review): best_params was selected on this same data set, so the LOOCV
# scores below are likely optimistic; nested CV would avoid that.
best_rf = RandomForestClassifier(random_state=42, **best_params)
loo = LeaveOneOut()
y_true = []
y_pred = []

for train_index, test_index in loo.split(features):
    X_train = features[train_index]
    y_train = labels[train_index]
    X_test = features[test_index]
    y_test = labels[test_index]

    # The same estimator object is refit from scratch on every fold
    best_rf.fit(X_train, y_train)

    # Each test fold holds exactly one sample, hence the [0]
    y_true.append(y_test[0])
    y_pred.append(best_rf.predict(X_test)[0])

In [32]:
# Evaluate the tuned Random Forest on the LOOCV predictions in y_true / y_pred.
# ClassificationMatrix is imported from a module of the same name that is not
# shown in this notebook; judging by the recorded output, evaluate() prints
# the confusion matrix, accuracy, MCC and the classification report.
from ClassificationMatrix import ClassificationMatrix

# The third argument ('KAAC') appears in the report header of the recorded output.
cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

Confusion Matrix: $KAAC
[[84  5]
 [23 29]]

Accuracy (ACC): 0.80
Matthews Correlation Coefficient (MCC): 0.57

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.94      0.86        89
           1       0.85      0.56      0.67        52

    accuracy                           0.80       141
   macro avg       0.82      0.75      0.77       141
weighted avg       0.81      0.80      0.79       141


In [None]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut, ShuffleSplit
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hyperparameter candidates for the kernel SVM (3 * 2 * 2 = 12 combinations)
svm_param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['rbf', 'poly'],
    gamma=['scale', 'auto'],
)

# Base estimator for the search
svm = SVC(random_state=42)

# Three random 80/20 train/test splits, seeded for reproducibility
cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

# Exhaustive search over the grid, scored by accuracy on those splits
grid_search = GridSearchCV(svm, svm_param_grid, scoring='accuracy', cv=cv)
grid_search.fit(features, labels)

# Keep the winning combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)



In [None]:
# Kernel SVM with the grid-search winners, evaluated by LOOCV.
# NOTE(review): best_params is a name shared by every grid-search cell in this
# notebook, so run the SVM grid search immediately before this cell. The
# parameters were also tuned on this same data, so scores are likely optimistic.
best_svm = SVC(random_state=42, **best_params)
loo = LeaveOneOut()
y_true = []
y_pred = []

for train_index, test_index in loo.split(features):
    X_train = features[train_index]
    y_train = labels[train_index]
    X_test = features[test_index]
    y_test = labels[test_index]

    # The same estimator object is refit from scratch on every fold
    best_svm.fit(X_train, y_train)

    # Each test fold holds exactly one sample, hence the [0]
    y_true.append(y_test[0])
    y_pred.append(best_svm.predict(X_test)[0])

In [None]:
# Evaluate the tuned kernel SVM on the LOOCV predictions in y_true / y_pred.
# ClassificationMatrix is imported from a module of the same name that is not
# shown in this notebook; the identical calls in earlier cells printed the
# confusion matrix, accuracy, MCC and the classification report.
from ClassificationMatrix import ClassificationMatrix

# NOTE(review): the label 'KAAC' is the same one used for the other models'
# reports, so the printed header alone does not identify the classifier.
cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import to_categorical

  np.object,


AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.utils import to_categorical

# Append a trailing channel axis so the feature matrix becomes a 3-D array of
# shape (n_samples, n_features, 1), the layout Conv1D is given below.
features_2d = features[:, :, np.newaxis]

# One-hot encode the integer class labels
labels_categorical = to_categorical(labels)

# Factory for the 1-D CNN; its keyword arguments are what the grid search tunes
def create_cnn_model(filters=32, kernel_size=3, activation='relu', dropout_rate=0.5):
    """Build and compile a small Conv1D classifier.

    Input length and number of output classes are read from the module-level
    ``features_2d`` and ``labels_categorical`` arrays.

    :param filters: Number of convolution filters.
    :param kernel_size: Width of the convolution window.
    :param activation: Activation used by the Conv1D and hidden Dense layers.
    :param dropout_rate: Dropout applied before the output layer.
    :return: A compiled Keras ``Sequential`` model.
    """
    n_steps = features_2d.shape[1]
    n_classes = labels_categorical.shape[1]

    cnn = Sequential([
        Conv1D(filters=filters, kernel_size=kernel_size, activation=activation, input_shape=(n_steps, 1)),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(64, activation=activation),
        Dropout(dropout_rate),
        # Softmax over the classes, matching the categorical cross-entropy loss
        Dense(n_classes, activation='softmax'),
    ])
    cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return cnn

# Wrap the model factory in KerasClassifier so it exposes the scikit-learn
# estimator interface that GridSearchCV needs. Only the factory is passed here;
# epochs and batch_size are supplied through the parameter grid in the next cell.
# NOTE(review): this import path failed in the recorded traceback above
# (np.object removed from NumPy) — confirm compatible TensorFlow/NumPy versions.
model = KerasClassifier(build_fn=create_cnn_model)

In [None]:
# Define the parameter grid for tuning. The first four keys are forwarded to
# create_cnn_model; epochs and batch_size are forwarded to the Keras fit call.
param_grid = {
    'filters': [16, 32, 64],
    'kernel_size': [3, 5],
    'activation': ['relu', 'tanh'],
    'dropout_rate': [0.3, 0.5],
    'epochs': [50, 100],
    'batch_size': [32, 64]
}

# Perform grid search with 3-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='accuracy')

# Fit on the integer class labels, not the one-hot matrix: the 'accuracy'
# scorer compares y against KerasClassifier.predict(), which returns 1-D class
# labels, and scikit-learn rejects a one-hot (multilabel-indicator) y_true
# paired with 1-D predictions. The wrapper one-hot encodes y internally when
# the model's loss is categorical_crossentropy, so training is unchanged.
grid_search.fit(features_2d, labels)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)