In [11]:
from collections import defaultdict

class CTDFeatureExtraction:
    def __init__(self):
        self.property_groups = {
            'hydrophobicity': {'P': 'RKEDQN', 'H': 'GASTPHY', 'N': 'CLVIMFW'},
            'normalized_vdw': {'P': 'GASTPD', 'H': 'NVEQIL', 'N': 'MHKFRYW'},
            'polarity': {'P': 'LIFWCMVY', 'H': 'PATGS', 'N': 'HQRKNED'},
            'polarizability': {'P': 'GASDT', 'H': 'CPNVEQIL', 'N': 'KMHFRYW'},
            'charge': {'P': 'KR', 'H': 'ANCQGHILMFPSTWYV', 'N': 'DE'},
            'solvent_accessibility': {'P': 'ALFCGIVW', 'H': 'RKQEND', 'N': 'MPSTHY'},
            'secondary_structure': {'P': 'EALMQKRH', 'H': 'VIYCWFT', 'N': 'GNPSD'}
        }

    def calculate_ctd_features(self, sequence):
        """
        Calculate the CTD features for a given protein sequence.

        :param sequence: The protein sequence as a string.
        :return: A list of CTD feature values.
        """
        ctd_features = []

        for property_name, property_groups in self.property_groups.items():
            c_features = self._calculate_c_features(sequence, property_groups)
            t_features = self._calculate_t_features(sequence, property_groups)
            d_features = self._calculate_d_features(sequence, property_groups)
            ctd_features.extend(c_features + t_features + d_features)

        return ctd_features

    def _calculate_c_features(self, sequence, property_groups):
        """
        Calculate the composition (C) features.

        :param sequence: The protein sequence as a string.
        :param property_groups: The property groups dictionary.
        :return: A list of C feature values.
        """
        c_features = []
        sequence_length = len(sequence)

        for group in ['P', 'H', 'N']:
            count = sum(1 for aa in sequence if aa in property_groups[group])
            c_features.append(count / sequence_length)

        return c_features

    def _calculate_t_features(self, sequence, property_groups):
        """
        Calculate the transition (T) features.

        :param sequence: The protein sequence as a string.
        :param property_groups: The property groups dictionary.
        :return: A list of T feature values.
        """
        t_features = []
        sequence_length = len(sequence)

        for group_pair in [('N', 'P'), ('H', 'N'), ('P', 'H')]:
            count = 0
            for i in range(sequence_length - 1):
                if sequence[i] in property_groups[group_pair[0]] and sequence[i + 1] in property_groups[group_pair[1]]:
                    count += 1
                elif sequence[i] in property_groups[group_pair[1]] and sequence[i + 1] in property_groups[group_pair[0]]:
                    count += 1
            t_features.append(count / (sequence_length - 1))

        return t_features

    def _calculate_d_features(self, sequence, property_groups):
        """
        Calculate the distribution (D) features.

        :param sequence: The protein sequence as a string.
        :param property_groups: The property groups dictionary.
        :return: A list of D feature values.
        """
        d_features = []
        sequence_length = len(sequence)

        for group in ['P', 'H', 'N']:
            indices = [i for i, aa in enumerate(sequence) if aa in property_groups[group]]
            if indices:
                d_features.append(indices[0] / sequence_length)
                d_features.append((indices[-1] - indices[0] + 1) / sequence_length)
                d_features.append((indices[-1] + 1) / sequence_length)
                d_features.append(len(indices) / sequence_length)
                d_features.append(sum(indices) / (sequence_length * len(indices)))
            else:
                d_features.extend([0] * 5)

        return d_features

# Example usage
# sequence = "ACDEFGHIKLMNPQRSTVWY"
# extractor = CTDFeatureExtraction()
# ctd_features = extractor.calculate_ctd_features(sequence)
# print(ctd_features)

In [12]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, matthews_corrcoef
from sklearn.model_selection import LeaveOneOut
import numpy as np

# Load the dataset
data = pd.read_excel('../data/Final_2Sm_modified_with_sequences.xlsx')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder to the folding_type column and transform it to numeric labels
data['folding_type'] = label_encoder.fit_transform(data['folding_type'])

# Now, when you extract labels for model training:
labels = data['folding_type'].values

# Initialize the FeatureExtraction class
feature_extraction = CTDFeatureExtraction()

# Feature extraction using AAC with length
features = np.array([feature_extraction.calculate_ctd_features(seq) for seq in data['sequence']])

# Support Vector Machine (SVM) Classifier

In [3]:
# SVM with Leave-One-Out Cross-Validation (LOOCV)
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)
    y_pred.append(clf.predict(X_test)[0])
    y_true.append(y_test[0])

In [4]:
# Calculate and display the confusion matrix
from ClassificationMatrix import ClassificationMatrix

cm = ClassificationMatrix(y_true, y_pred, 'KAAC')
cm.evaluate()

Confusion Matrix: $KAAC
[[88  1]
 [52  0]]

Accuracy (ACC): 0.62
Matthews Correlation Coefficient (MCC): -0.06

Classification Report:
              precision    recall  f1-score   support

           0       0.63      0.99      0.77        89
           1       0.00      0.00      0.00        52

    accuracy                           0.62       141
   macro avg       0.31      0.49      0.38       141
weighted avg       0.40      0.62      0.49       141


# Random Forest Classifier with Leave-One-Out Cross-Validation (LOOCV) 

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Grid search with cross validation setup
grid_search = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=3, scoring='accuracy')

# Fit the grid search to find the best parameters
grid_search.fit(features, labels)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)

# Using the best parameters with LOOCV
best_rf = RandomForestClassifier(**best_params, random_state=42)
loo = LeaveOneOut()
y_true, y_pred = [], []

for train_index, test_index in loo.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    best_rf.fit(X_train, y_train)
    y_pred.append(best_rf.predict(X_test)[0])
    y_true.append(y_test[0])

Best parameters:  {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Best score:  0.6453900709219859


In [8]:
# Evaluate the model
# Calculate and display the confusion matrix
from ClassificationMatrix import ClassificationMatrix

cm = ClassificationMatrix(y_true, y_pred, 'CTD')
cm.evaluate()

Confusion Matrix: $CTD
[[80  9]
 [31 21]]

Accuracy (ACC): 0.72
Matthews Correlation Coefficient (MCC): 0.36

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80        89
           1       0.70      0.40      0.51        52

    accuracy                           0.72       141
   macro avg       0.71      0.65      0.66       141
weighted avg       0.71      0.72      0.69       141


# kernel support vector machine (SVM) classifier with Leave-One-Out Cross-Validation (LOOCV)

In [13]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Define the parameter grid
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': [3, 4, 5]  # This is only used by the 'poly' kernel
}

# Initialize the SVM classifier
svm = SVC(random_state=42)

# Grid search with cross validation setup
grid_search_svm = GridSearchCV(estimator=svm, param_grid=svm_param_grid, cv=3, scoring='accuracy')

# Fit the grid search to find the best parameters
grid_search_svm.fit(features, labels)

# Get the best parameters and best score
best_params_svm = grid_search_svm.best_params_
best_score_svm = grid_search_svm.best_score_

print("Best parameters for SVM: ", best_params_svm)
print("Best score for SVM: ", best_score_svm)

# Using the best parameters with LOOCV for SVM
best_svm = SVC(**best_params_svm, random_state=42)
loo = LeaveOneOut()
y_true_svm, y_pred_svm = [], []

for train_index, test_index in loo.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    best_svm.fit(X_train, y_train)
    y_pred_svm.append(best_svm.predict(X_test)[0])
    y_true_svm.append(y_test[0])

Best parameters for SVM:  {'C': 0.1, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Best score for SVM:  0.6312056737588653


In [14]:
# Evaluate the model
# Calculate and display the confusion matrix
from ClassificationMatrix import ClassificationMatrix

cm = ClassificationMatrix(y_true, y_pred, 'CTD')
cm.evaluate()

Confusion Matrix: $CTD
[[80  9]
 [31 21]]

Accuracy (ACC): 0.72
Matthews Correlation Coefficient (MCC): 0.36

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.90      0.80        89
           1       0.70      0.40      0.51        52

    accuracy                           0.72       141
   macro avg       0.71      0.65      0.66       141
weighted avg       0.71      0.72      0.69       141
