In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
import numpy as np

In [8]:
from feature_extraction.CTFeatureExtraction import CTFeatureExtraction
from feature_extraction.QSOFeatureExtraction import QSOFeatureExtraction
from feature_extraction.GTPCFeatureExtraction import GTPCFeatureExtraction
from feature_extraction.GDPCFeatureExtraction import GDPCFeatureExtraction
from feature_extraction.CTDFeatureExtraction import CTDFeatureExtraction
from feature_extraction.CKSAAPFeatureExtraction import CKSAAPFeatureExtraction
from feature_extraction.AAIFeatureExtraction import AAIFeatureExtraction
from feature_extraction.DDEFeatureExtraction import DDEFeatureExtraction
from feature_extraction.DPCFeatureExtraction import DPCFeatureExtraction
from feature_extraction.KAACFeatureExtraction import KAACFeatureExtraction

# Load the dataset
data = pd.read_excel('../data/Final_2Sm_modified_with_sequences.xlsx')

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder to the folding_type column and transform it to numeric labels
data['folding_type'] = label_encoder.fit_transform(data['folding_type'])

# Extract labels for model training
labels = data['folding_type'].values

# Initialize the feature extraction objects
kaac_extractor = KAACFeatureExtraction()
dpc_extractor = DPCFeatureExtraction()
dde_extractor = DDEFeatureExtraction()
aai_extractor = AAIFeatureExtraction()
cksaap_extractor = CKSAAPFeatureExtraction()
ctd_extractor = CTDFeatureExtraction()
gdpc_extractor = GDPCFeatureExtraction()
gtpc_extractor = GTPCFeatureExtraction()
qso_extractor = QSOFeatureExtraction()
ct_extractor = CTFeatureExtraction()

# Extract features using feature extracting methods
kaac_features = np.array([kaac_extractor.calculate_kaac_features(seq) for seq in data['sequence']])
dpc_features = np.array([dpc_extractor.calculate_dpc_features(seq) for seq in data['sequence']])
dde_features = np.array([dde_extractor.calculate_dde_features(seq) for seq in data['sequence']])
aai_features = np.array([aai_extractor.calculate_aai_features(seq) for seq in data['sequence']])
cksaap_features = np.array([cksaap_extractor.calculate_cksaap_features(seq) for seq in data['sequence']])
ctd_features = np.array([ctd_extractor.calculate_ctd_features(seq) for seq in data['sequence']])
gdpc_features = np.array([gdpc_extractor.calculate_gdpc_features(seq) for seq in data['sequence']])
gtpc_features = np.array([gtpc_extractor.calculate_gtpc_features(seq) for seq in data['sequence']])
qso_features = np.array([qso_extractor.calculate_qso_features(seq) for seq in data['sequence']])
ct_features = np.array([ct_extractor.calculate_ct_features(seq) for seq in data['sequence']])

# Combine the extracted features
combined_features = np.concatenate((kaac_features, dpc_features, dde_features), axis=1)

In [3]:
combined_features.shape

(141, 821)

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm_notebook

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Create a pipeline with scaling and ANN classifier
pipeline = Pipeline([
    ('ann', MLPClassifier(random_state=42))
])

# Define the parameter grid for grid search
param_grid = {
    'ann__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100)],
    'ann__activation': ['relu', 'logistic', 'tanh'],
    'ann__solver': ['adam', 'sgd'],
    'ann__alpha': [0.0001, 0.001, 0.01, 0.1],
    'ann__learning_rate_init': [0.001, 0.01, 0.1],
    'ann__learning_rate': ['constant', 'adaptive', 'invscaling'],
    'ann__max_iter': [500, 1000, 2000],
    'ann__tol': [1e-4, 1e-5, 1e-6]
}

# Create a custom scorer that returns the score and prints progress
def custom_scorer(estimator, X, y):
    score = estimator.score(X, y)
    print(f"Score: {score:.3f}")
    return score

# Perform grid search with 3-fold cross-validation and progress bar
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    scoring=custom_scorer,
    verbose=0
)

# Wrap the grid search with tqdm_notebook
with tqdm_notebook(total=len(grid_search.param_grid), desc="Grid Search") as progress_bar:
    for _ in grid_search.fit(scaled_features, labels):
        progress_bar.update(1)

# Get the best classifier
best_ann_clf = grid_search.best_estimator_

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  with tqdm_notebook(total=len(grid_search.param_grid), desc="Grid Search") as progress_bar:


Grid Search:   0%|          | 0/8 [00:00<?, ?it/s]



TypeError: 'GridSearchCV' object is not iterable

In [None]:
from sklearn.model_selection import LeaveOneOut
from feature_extraction.ClassificationMatrix import ClassificationMatrix

# Perform leave-one-out cross-validation with the best classifier
loo = LeaveOneOut()
y_true, y_pred = [], []
for train_index, test_index in loo.split(combined_features):
    X_train, X_test = combined_features[train_index], combined_features[test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    best_ann_clf.fit(X_train, y_train)
    y_pred.append(best_ann_clf.predict(X_test)[0])
    y_true.append(y_test[0])

# Calculate and display the confusion matrix
cm = ClassificationMatrix(y_true, y_pred, 'ANN Classifier')
cm.evaluate()



In [2]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

# Scale the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(combined_features)

# Define the ANN model
def create_model(units=128, dropout_rate=0.2, optimizer='adam'):
    model = Sequential()
    model.add(Dense(units, activation='relu', input_shape=(scaled_features.shape[1],)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(units//2, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(len(np.unique(labels)), activation='softmax'))
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Create the ANN classifier
ann_classifier = KerasClassifier(build_fn=create_model, epochs=100, batch_size=32, verbose=0)

ModuleNotFoundError: No module named 'tensorflow.keras.wrappers'

In [15]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters
from sklearn.metrics import accuracy_score, matthews_corrcoef, classification_report, confusion_matrix

# Scale the features using StandardScaler
scaler = StandardScaler()
# scaled_features = scaler.fit_transform(combined_features)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)
# Scale the features
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the hyperparameter search space
def build_model(hp):
    inputs = layers.Input(shape=(scaled_features.shape[1],))
    x = layers.Dense(units=hp.Int('units_input', min_value=32, max_value=512, step=32), activation='relu')(inputs)

    for i in range(hp.Int('num_layers', 1, 5)):
        x = layers.Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32), activation='relu')(x)

    outputs = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(inputs=inputs, outputs=outputs)

    model.compile(optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model

# Create a tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=3,
    directory='tuner_results',
    project_name='folding_type_classification'
)

# Perform hyperparameter tuning
tuner.search(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Get the best hyperparameters and build the best model
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.hypermodel.build(best_hps)

# Train the best model
history = best_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)



Reloading Tuner from tuner_results\folding_type_classification\tuner0.json
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.5070 - loss: 0.8496 - val_accuracy: 0.5217 - val_loss: 1.4892
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7634 - loss: 0.4119 - val_accuracy: 0.6087 - val_loss: 0.8897
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.9731 - loss: 0.0905 - val_accuracy: 0.5217 - val_loss: 0.8900
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 1.0000 - loss: 0.0348 - val_accuracy: 0.5652 - val_loss: 1.0074
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 0.0113 - val_accuracy: 0.5652 - val_loss: 1.1716
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 1.0000 - loss: 0.0036 - val_accuracy: 0.5652 - val_loss: 1.3343
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0

In [13]:
# Evaluate the model on the test set
test_loss, test_acc = best_model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc:.4f}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.7586 - loss: 1.0769
Test accuracy: 0.7586


In [14]:
# Evaluate the model on the test set
y_pred_prob = best_model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Matthews Correlation Coefficient:", mcc)
print("Classification Report:")
print(report)

# Calculate and display the confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
Accuracy: 0.7586206896551724
Matthews Correlation Coefficient: 0.545205169866058
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.93      0.80        15
           1       0.89      0.57      0.70        14

    accuracy                           0.76        29
   macro avg       0.79      0.75      0.75        29
weighted avg       0.79      0.76      0.75        29

Confusion Matrix:
[[14  1]
 [ 6  8]]


In [11]:
from tensorflow import keras
from tensorflow.keras import layers
from kerastuner import HyperModel, RandomSearch

# Initialize the scaler
scaler = StandardScaler()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_features, labels, test_size=0.2, random_state=42)

# Scale the features
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define a class to create the ANN model with hyperparameter tuning
class ANNHyperModel(HyperModel):
    def __init__(self, input_shape, num_classes):
        self.input_shape = input_shape
        self.num_classes = num_classes

    def build(self, hp):
        model = keras.Sequential()
        model.add(layers.InputLayer(input_shape=self.input_shape))

        # Tune the number of layers, units, and activation
        for i in range(hp.Int('num_layers', 1, 3)):
            model.add(layers.Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32),
                                   activation=hp.Choice('activation_' + str(i), ['relu', 'tanh', 'sigmoid'])))
            model.add(layers.Dropout(rate=hp.Float('dropout_' + str(i), min_value=0.0, max_value=0.5, step=0.1)))

        model.add(layers.Dense(self.num_classes, activation='softmax'))
        model.compile(optimizer=keras.optimizers.Adam(hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])
        return model

# Instantiate and configure the Keras Tuner
hypermodel = ANNHyperModel(input_shape=(X_train.shape[1],), num_classes=len(np.unique(labels)))
tuner = RandomSearch(
    hypermodel,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    directory='keras_tuner_dir',
    project_name='keras_tuner_demo'
)

# Execute the hyperparameter search
tuner.search(X_train, y_train, epochs=50, validation_split=0.2)

# Retrieve the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the best model on the test data
loss, accuracy = best_model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}, Test loss: {loss}')

Reloading Tuner from keras_tuner_dir\keras_tuner_demo\tuner0.json


  trackable.load_own_variables(weights_store.get(inner_path))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - accuracy: 0.5862 - loss: 0.7209
Test accuracy: 0.5862069129943848, Test loss: 0.7209303379058838
