In [1]:
import sqlite3
import pandas as pd
from propy import PyPro
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import time
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix




In [2]:
import sqlite3
import pandas as pd
con = sqlite3.connect("../unified_CD2.db")
cur = con.cursor()

df = pd.read_sql_query("SELECT * FROM prod_desc", con)
df['AB'] = df['AB'].apply(lambda x: int.from_bytes(x, "little"))
columns_to_drop = ["index", "id", "name", "description", "OX", "source", "valid"]
df = df.drop(columns=columns_to_drop)
df


In [3]:
# split of data to target
X = df.drop(columns=['AB',"seq"])
y = df['AB']


#Normalize the data
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)










In [4]:
# Initial Model Training and Evaluation(4 min)
initial_model = SVC()

# Fit the initial model
initial_model.fit(X_train, y_train)

# Evaluate initial model accuracy on the test set
initial_model_accuracy = accuracy_score(y_test, initial_model.predict(X_test))
print("Initial Model Accuracy:", initial_model_accuracy)

# Generate classification report
y_pred_initial = initial_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred_initial))

# Generate confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_initial))

# Perform cross-validation
cv_score_initial = cross_val_score(initial_model, X_train, y_train, cv=5)


# Print cross-validation scores
print("Cross-Validation Scores:", cv_score_initial)
print("Mean Cross-Validation Accuracy:", np.mean(cv_score_initial))


Initial Model Accuracy: 0.7379929714955096
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.74      0.72      1174
           1       0.77      0.73      0.75      1387

    accuracy                           0.74      2561
   macro avg       0.74      0.74      0.74      2561
weighted avg       0.74      0.74      0.74      2561

Confusion Matrix:
[[ 872  302]
 [ 369 1018]]
Cross-Validation Scores: [0.73486328 0.72167969 0.73535156 0.7578125  0.73486328]
Mean Cross-Validation Accuracy: 0.7369140625


In [4]:
from sklearn.feature_selection import VarianceThreshold

# Remove constant features
constant_filter = VarianceThreshold(threshold=0)
X_train_nonconstant = constant_filter.fit_transform(X_train, y_train)
X_test_nonconstant = constant_filter.transform(X_test)

# Feature selection on non-constant features
selector = SelectKBest(f_classif, k=75)
X_train_selected = selector.fit_transform(X_train_nonconstant, y_train)
X_test_selected = selector.transform(X_test_nonconstant)


In [5]:


# Model Training and Evaluation after Feature Selection (2min)
selected_model = SVC()

# Perform cross-validation
cv_scores = cross_val_score(selected_model, X_train_selected, y_train, cv=5)

# Train the model
selected_model.fit(X_train_selected, y_train)

# Evaluate model accuracy
selected_model_accuracy = accuracy_score(y_test, selected_model.predict(X_test_selected))
print("Model Accuracy after Feature Selection (without cross-validation):", selected_model_accuracy)

# Generate classification report
y_pred_selected = selected_model.predict(X_test_selected)
print("Classification Report:")
print(classification_report(y_test, y_pred_selected))

# Generate confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_selected))

# Print cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Accuracy:", np.mean(cv_scores))


Model Accuracy after Feature Selection (without cross-validation): 0.7516595080046856
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.75      0.73      1174
           1       0.78      0.75      0.77      1387

    accuracy                           0.75      2561
   macro avg       0.75      0.75      0.75      2561
weighted avg       0.75      0.75      0.75      2561

Confusion Matrix:
[[ 879  295]
 [ 341 1046]]
Cross-Validation Scores: [0.73583984 0.73339844 0.73242188 0.75488281 0.73925781]
Mean Cross-Validation Accuracy: 0.73916015625


In [5]:
import numpy as np
from sklearn.decomposition import PCA

# Perform PCA with explained variance threshold of 95%
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_selected)
X_test_pca = pca.transform(X_test_selected)

In [22]:




#Model Training and Evaluation after PCA (10 min)
pca_model = SVC()

# Perform k-fold cross-validation
cv_scores = cross_val_score(pca_model, X_train_pca, y_train, cv=5)


# Fit the model on the entire training set
pca_model.fit(X_train_pca, y_train)

# Evaluate model accuracy on the test set
pca_model_accuracy = accuracy_score(y_test, pca_model.predict(X_test_pca))
print("Model Accuracy after PCA:", pca_model_accuracy)

# Step 1: Performance Metrics
# Generate classification report
y_pred = pca_model.predict(X_test_pca)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Print cross-validation scores
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Print out the number of retained principal components
print("Number of retained principal components:", pca.n_components_)


Model Accuracy after PCA: 0.7126122608356111
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.67      0.68      1174
           1       0.73      0.75      0.74      1387

    accuracy                           0.71      2561
   macro avg       0.71      0.71      0.71      2561
weighted avg       0.71      0.71      0.71      2561

Confusion Matrix:
[[ 791  383]
 [ 353 1034]]
Cross-Validation Scores: [0.70166016 0.70019531 0.703125   0.72900391 0.71142578]
Mean CV Accuracy: 0.70908203125
Number of retained principal components: 8


In [None]:
# Hyperparameter Tuning (grid search) fpr pca data (15 min)
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf','poly', 'sigmoid']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_pca, y_train)

# Get the best hyperparameters and kernel
best_hyperparams = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparams)

# Evaluate best model accuracy
best_model_accuracy = accuracy_score(y_test, grid_search.best_estimator_.predict(X_test_pca))
print("Best Model Accuracy after Hyperparameter Tuning:", best_model_accuracy)

# Generate classification report for best model
y_pred_best = grid_search.best_estimator_.predict(X_test_pca)
print("Classification Report for Best Model:")
print(classification_report(y_test, y_pred_best))

# Generate confusion matrix for best model
print("Confusion Matrix for Best Model:")
print(confusion_matrix(y_test, y_pred_best))


In [None]:
# Hyperparameter Tuning (grid search) for just feature selection (15 min)
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf','poly', 'sigmoid']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)

# Get the best hyperparameters and kernel
best_hyperparams = grid_search.best_params_
print("Best Hyperparameters:", best_hyperparams)

# Evaluate best model accuracy
best_model_accuracy = accuracy_score(y_test, grid_search.best_estimator_.predict(X_test_selected))
print("Best Model Accuracy after Hyperparameter Tuning:", best_model_accuracy)

# Generate classification report for best model
y_pred_best = grid_search.best_estimator_.predict(X_test_selected)
print("Classification Report for Best Model:")
print(classification_report(y_test, y_pred_best))

# Generate confusion matrix for best model
print("Confusion Matrix for Best Model:")
print(confusion_matrix(y_test, y_pred_best))


In [19]:

num_elements = len(X_train_selected)

print("Number of elements in the array:", num_elements)

Number of elements in the array: 10240


In [None]:
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif

# Define the bounds of the search space
pbounds = {'expC': (1, 4), 'expgamma': (-1, 3)}
# Constants outside the objective function
kernel = "poly"

# Define the objective function to optimize
def objective_function(expC, expgamma):
    svm_model = SVC(C=10 **expC, gamma=10**expgamma, kernel=kernel)
    cv_scores = cross_val_score(svm_model, X_train_selected, y_train, cv=5)
    mean_cv_accuracy = np.mean(cv_scores)
    
    # Track optimization progress
    print(f"expC: {expC}, expgamma: {expgamma}, CV Accuracy: {mean_cv_accuracy}")
    
    return mean_cv_accuracy

# Initialize Bayesian optimization search
bayes_search = BayesianOptimization(
    f=objective_function,
    pbounds=pbounds,
    random_state=42  # Setting the random state for reproducibility
)

# Perform Bayesian optimization
bayes_search.maximize(
    init_points=2,  # Number of random points to sample before optimization
    n_iter=10,      # Number of optimization iterations
)

# Get the best hyperparameters
best_hyperparams = bayes_search.max['params']
print("Best Hyperparameters:", best_hyperparams)


|   iter    |  target   |   expC    | expgamma  |
-------------------------------------------------


In [None]:
# Train the final model using the best hyperparameters
best_kernel = 'poly'  # Polynomial kernel
best_svm_model = SVC(C=10 ** best_hyperparams['expC'], gamma=10 ** best_hyperparams['expgamma'], kernel=kernel)
best_svm_model.fit(X_train_selected, y_train)

# Evaluate the final model
cv_scores_final_model = cross_val_score(best_svm_model, X_train_selected, y_train, cv=5)
accuracy_final_model = np.mean(cv_scores_final_model)

# Generate classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Calculate and print accuracy on the test set
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on Test Set:", test_accuracy)

# Print accuracy of the final model after cross-validation
print("Accuracy of Final Model after Cross-Validation:", accuracy_final_model)