In [1]:
# Core dependencies for the notebook: pandas for tabular data, and from
# scikit-learn the SVM module, the train/test split helper, and the two
# evaluation metrics used when scoring the classifier.
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Reached only if every import above succeeded.
print('Libraries imported successfully')

Libraries imported successfully


In [2]:
# Import the FeatureExtraction class from the feature_extraction package and
# create the single instance whose calculate_* methods are applied to every
# sequence when the feature matrix is built.
from feature_extraction.feature_extraction import FeatureExtraction
feature_extractor = FeatureExtraction()


In [3]:
# Read the Excel workbook into a DataFrame. Downstream cells take the model
# inputs from its 'sequence' column and the labels from its 'folding_type'
# column.
file_path = './data/Final_2Sm_modified_with_sequences.xlsx'  # Update this path
data = pd.read_excel(io=file_path)

In [6]:
# Build one combined feature vector per sequence, assemble the feature matrix
# X and label vector y, and hold out 20% of the rows for testing.
#
# For each sequence the seven descriptor results are joined with `+` in a
# fixed order (AAC, DPC, GDPC, GTPC, CKSAAP, CT, DDE), so a given column index
# refers to the same descriptor component in every row.
# NOTE(review): this assumes each calculate_* method returns a list, so that
# `+` concatenates; confirm against FeatureExtraction.
feature_vectors = []
for sequence in data['sequence']:
    features_aac = feature_extractor.calculate_aac_with_length(sequence)
    features_dpc = feature_extractor.calculate_dpc(sequence)
    features_gdpc = feature_extractor.calculate_gdpc(sequence)
    features_gtpc = feature_extractor.calculate_gtpc(sequence)
    features_cksaap = feature_extractor.calculate_cksaap(sequence)
    features_ct = feature_extractor.calculate_ct(sequence)
    features_dde = feature_extractor.calculate_dde(sequence)
    combined_features = (
        features_aac
        + features_dpc
        + features_gdpc
        + features_gtpc
        + features_cksaap
        + features_ct
        + features_dde
    )
    feature_vectors.append(combined_features)

# pandas pads ragged rows with NaN, which would only surface later as an
# unrelated-looking error inside SVC.fit. Fail here with a clear message.
vector_lengths = {len(vector) for vector in feature_vectors}
if len(vector_lengths) > 1:
    raise ValueError(
        "Feature vectors have inconsistent lengths: "
        f"{sorted(vector_lengths)}"
    )

# Build the (wide) feature matrix once and reuse it for both the preview and
# the model input, instead of constructing the DataFrame twice.
X = pd.DataFrame(feature_vectors)
y = data['folding_type']

print(X.head())

# Splitting the dataset: 80% train / 20% test, seeded for reproducibility.
# NOTE(review): the split is not stratified; with a dataset this small,
# consider stratify=y so both classes keep their proportions in each split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

       0         1         2         3         4         5         6     \
0  0.037736  0.000000  0.018868  0.113208  0.037736  0.075472  0.000000   
1  0.018868  0.000000  0.037736  0.056604  0.018868  0.056604  0.018868   
2  0.076190  0.000000  0.085714  0.114286  0.028571  0.047619  0.038095   
3  0.121495  0.009346  0.074766  0.121495  0.028037  0.046729  0.037383   
4  0.090000  0.030000  0.080000  0.080000  0.010000  0.040000  0.010000   

       7         8         9     ...      3704      3705      3706      3707  \
0  0.037736  0.094340  0.056604  ... -0.179629 -0.254103 -0.359549  4.291649   
1  0.018868  0.150943  0.113208  ... -0.179629 -0.254103 -0.359549 -0.254103   
2  0.038095  0.104762  0.123810  ... -0.179629 -0.254103 -0.359549 -0.254103   
3  0.037383  0.121495  0.074766  ... -0.179629 -0.254103 -0.359549  1.975888   
4  0.040000  0.050000  0.100000  ... -0.179629 -0.254103  1.329696 -0.254103   

       3708      3709      3710      3711      3712      3713  
0 -0

In [7]:
# Fit a linear-kernel support vector classifier on the training split.
# SVC.fit returns the fitted estimator itself, so construction and fitting
# are chained while still binding the trained model to `clf`.
clf = svm.SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the held-out split, then report overall accuracy
# followed by the per-class precision / recall / F1 table.
y_pred = clf.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

test_report = classification_report(y_test, y_pred)
print("Classification Report:\n", test_report)

Accuracy: 0.8275862068965517
Classification Report:
               precision    recall  f1-score   support

          2S       0.86      0.80      0.83        15
         N2S       0.80      0.86      0.83        14

    accuracy                           0.83        29
   macro avg       0.83      0.83      0.83        29
weighted avg       0.83      0.83      0.83        29
