In [2]:
from collections import Counter
from itertools import product

class GTPCFeatureExtraction:
    """
    Grouped Tri-Peptide Composition (GTPC) feature extractor.

    Every residue is mapped to one of five physicochemical groups. A sequence is
    then described by the relative frequency of each of the 5**3 = 125 ordered
    group triplets over its overlapping windows of three residues.
    """

    def __init__(self):
        # Residue letters belonging to each group; the dictionary order defines
        # the order of the groups (and therefore of the output features).
        self.amino_acid_groups = {
            'g1': 'FYW',  # Aromatic
            'g2': 'RKH',  # Positively charged
            'g3': 'GAVLMI',  # Aliphatic
            'g4': 'STCPNQ',  # Uncharged
            'g5': 'DE'  # Negatively charged
        }
        self.group_names = list(self.amino_acid_groups.keys())

    def calculate_gtpc_features(self, sequence):
        """
        Calculate the GTPC features for a given protein sequence.

        Each feature is ``count(triplet) / (len(sequence) - 2)``, i.e. the
        fraction of sliding 3-residue windows whose residues fall into that
        ordered triplet of groups. Windows containing a residue that is not in
        any group are not counted for any triplet but still contribute to the
        denominator, so the features may sum to less than 1.

        :param sequence: The protein sequence as a string.
        :return: A list of 125 GTPC feature values, ordered as
            ``itertools.product(self.group_names, repeat=3)``. Sequences shorter
            than three residues contain no complete window and yield all zeros.
        """
        # Residue letter -> group name lookup.
        amino_acid_to_group = {
            amino_acid: group
            for group, amino_acids in self.amino_acid_groups.items()
            for amino_acid in amino_acids
        }

        # Every ordered triplet of group names, e.g. 'g1g1g1', 'g1g1g2', ...
        group_tripeptides = [''.join(combo) for combo in product(self.group_names, repeat=3)]

        # Number of overlapping 3-residue windows in the sequence.
        window_count = len(sequence) - 2
        if window_count <= 0:
            # No complete window: avoid dividing by zero (length 2) or by a
            # negative number (length 0 or 1) and report an all-zero vector.
            return [0.0] * len(group_tripeptides)

        group_tripeptide_counts = dict.fromkeys(group_tripeptides, 0)
        for i in range(window_count):
            # Unknown residues map to '' so the joined key is too short to match
            # any valid triplet and the window is skipped.
            group_tripeptide = ''.join(
                amino_acid_to_group.get(amino_acid, '') for amino_acid in sequence[i:i + 3]
            )
            if group_tripeptide in group_tripeptide_counts:
                group_tripeptide_counts[group_tripeptide] += 1

        return [
            group_tripeptide_counts[group_tripeptide] / window_count
            for group_tripeptide in group_tripeptides
        ]

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, matthews_corrcoef
from sklearn.model_selection import LeaveOneOut
import numpy as np

# Read the annotated sequence table from the Excel workbook
data = pd.read_excel('../data/Final_2Sm_modified_with_sequences.xlsx')

# Encode the folding_type categories as integers, overwriting the column in place
label_encoder = LabelEncoder()
data['folding_type'] = label_encoder.fit_transform(data['folding_type'])

# Target vector used by the classifiers below
labels = data['folding_type'].values

# GTPC feature matrix: one row of group-tripeptide frequencies per sequence
feature_extraction = GTPCFeatureExtraction()
features = np.array(
    [feature_extraction.calculate_gtpc_features(seq) for seq in data['sequence']]
)

In [4]:
# Sanity check on the feature matrix: one row per sequence and one column per
# ordered group triplet (5 residue groups, so 5**3 = 125 columns are expected).
features.shape

(141, 125)

# Support Vector Machine (SVM) Classifier

In [3]:
# Linear SVM evaluated with Leave-One-Out Cross-Validation (LOOCV):
# every sample is held out once and predicted by a model fit on all the others.
loo = LeaveOneOut()
y_true = []
y_pred = []
for train_index, test_index in loo.split(features):
    # Training portion (all samples but one) and the single held-out sample
    X_train, y_train = features[train_index], labels[train_index]
    X_test, y_test = features[test_index], labels[test_index]

    # A fresh classifier per fold so that no state carries over between folds
    clf = SVC(kernel='linear')
    clf.fit(X_train, y_train)

    # The test split holds exactly one sample, hence the [0]
    y_true.append(y_test[0])
    y_pred.append(clf.predict(X_test)[0])

In [4]:
# Summarise the SVM LOOCV predictions (confusion matrix, accuracy, MCC and
# classification report, as shown in the cell output).
from ClassificationMatrix import ClassificationMatrix

# The third argument is the label printed in the report heading. The features
# evaluated here are GTPC, so use the same label as the Random Forest
# evaluation instead of the stale 'KAAC' tag.
cm = ClassificationMatrix(y_true, y_pred, 'GTPC')
cm.evaluate()

Confusion Matrix: $KAAC
[[89  0]
 [52  0]]

Accuracy (ACC): 0.63
Matthews Correlation Coefficient (MCC): 0.00

Classification Report:
              precision    recall  f1-score   support

           0       0.63      1.00      0.77        89
           1       0.00      0.00      0.00        52

    accuracy                           0.63       141
   macro avg       0.32      0.50      0.39       141
weighted avg       0.40      0.63      0.49       141


# RANDOM FOREST (RF) Implementation with Hyperparameter Tuning

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter candidates explored by the grid search (3 * 3 * 3 * 3 = 81 combinations)
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Base estimator; the fixed seed keeps the forest reproducible across runs
rf = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy under 3-fold cross-validation
grid_search = GridSearchCV(rf, rf_param_grid, cv=3, scoring='accuracy')
grid_search.fit(features, labels)

# Winning configuration and its mean cross-validated accuracy
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best parameters: ", best_params)
print("Best score: ", best_score)

# Re-evaluate the selected configuration with Leave-One-Out Cross-Validation.
# NOTE(review): the hyperparameters were chosen on the full dataset, including
# every sample that is later held out here, so this LOOCV estimate may be
# optimistic; nested cross-validation would remove that selection bias.
best_rf = RandomForestClassifier(random_state=42, **best_params)
loo = LeaveOneOut()
y_true = []
y_pred = []

for train_index, test_index in loo.split(features):
    # Training portion (all samples but one) and the single held-out sample
    X_train, y_train = features[train_index], labels[train_index]
    X_test, y_test = features[test_index], labels[test_index]

    # Calling fit again retrains the same estimator object on this fold's data
    best_rf.fit(X_train, y_train)

    # The test split holds exactly one sample, hence the [0]
    y_true.append(y_test[0])
    y_pred.append(best_rf.predict(X_test)[0])

Best parameters:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
Best score:  0.6666666666666666


In [9]:
# Summarise the Random Forest LOOCV predictions collected in y_true / y_pred.
# ClassificationMatrix is a project-local helper; per the recorded cell output,
# evaluate() prints the confusion matrix, accuracy, MCC and classification report.
from ClassificationMatrix import ClassificationMatrix

# 'GTPC' is the label shown in the report heading (the feature set being evaluated)
cm = ClassificationMatrix(y_true, y_pred, 'GTPC')
cm.evaluate()

Confusion Matrix: $GTPC
[[81  8]
 [34 18]]

Accuracy (ACC): 0.70
Matthews Correlation Coefficient (MCC): 0.32

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.91      0.79        89
           1       0.69      0.35      0.46        52

    accuracy                           0.70       141
   macro avg       0.70      0.63      0.63       141
weighted avg       0.70      0.70      0.67       141
