In [8]:
import sqlite3
import pandas as pd
from propy import PyPro
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import time
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count


# Load the peptide table from SQLite. The connection is only needed for this
# single query, so it is closed as soon as the frame is in memory instead of
# being left open for the lifetime of the kernel. (The previously created
# cursor was never used and has been dropped.)
con = sqlite3.connect("../unified_CD2.db")
try:
    df = pd.read_sql_query("SELECT * FROM prod_desc", con)
finally:
    con.close()

# The AB values are decoded with int.from_bytes(..., "little"), i.e. they are
# expected to arrive as bytes-like objects — presumably a BLOB column; verify
# against the database schema.
df["AB"] = df["AB"].apply(lambda raw: int.from_bytes(raw, "little"))

# Keep only the sequence and its label for the downstream cells.
df = df[["seq", "AB"]]

# Persist the reduced table so it can be inspected or reused without the DB.
csv_filename = "etidata.csv"
df.to_csv(csv_filename, index=False)

In [9]:
import pandas as pd
from propy import PyPro
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import time
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# Wall-clock timestamp taken before feature extraction begins; the end of this
# cell subtracts it from a later time.time() reading to report elapsed seconds.
start_time = time.time()

# Function to extract all features and handle errors gracefully
def extract_all_features(peptide, num_features=9244):
    """Compute the full descriptor vector for a single peptide sequence.

    The vector is the concatenation, in this order, of the values returned by
    the propy descriptor object (amino-acid, dipeptide and tripeptide
    composition, Moreau-Broto / Moran / Geary autocorrelation, quasi-sequence
    order) followed by four Biopython ``ProteinAnalysis`` properties
    (molecular weight, isoelectric point, instability index, GRAVY).

    Args:
        peptide: Amino-acid sequence as a string.
        num_features: Length of the all-NaN placeholder returned when the
            sequence cannot be processed. The default, 9244, is
            20 + 400 + 8000 + 3 * 240 + 100 + 4 and matches the width of the
            feature table printed later in this notebook (9245 columns
            including ``seq``). NOTE(review): this depends on the propy
            defaults in use — confirm if the library version or lag settings
            change.

    Returns:
        A flat list of feature values, or ``[nan] * num_features`` if any
        descriptor raised (e.g. the "float division by zero" seen for very
        short sequences such as ``KKKR``).
    """
    try:
        pro = PyPro.GetProDes(peptide)

        # Descriptor groups are appended in a fixed order; the column names
        # assigned later rely on exactly this ordering.
        descriptor_groups = (
            pro.GetAAComp,           # Amino acid composition
            pro.GetDPComp,           # Dipeptide composition
            pro.GetTPComp,           # Tripeptide composition
            pro.GetMoreauBrotoAuto,  # Moreau-Broto autocorrelation
            pro.GetMoranAuto,        # Moran autocorrelation
            pro.GetGearyAuto,        # Geary autocorrelation
            pro.GetQSO,              # Quasi-sequence order
        )
        features = []
        for compute_group in descriptor_groups:
            features.extend(compute_group().values())

        # Additional physicochemical properties from Bio.SeqUtils.ProtParam.
        analysed_seq = ProteinAnalysis(peptide)
        features.extend([
            analysed_seq.molecular_weight(),   # Molecular weight
            analysed_seq.isoelectric_point(),  # Isoelectric point (pI)
            analysed_seq.instability_index(),  # Instability index
            analysed_seq.gravy(),              # Hydrophobicity (GRAVY)
        ])

        return features
    except Exception as e:
        # Deliberate best-effort: one bad sequence must not abort the whole
        # (long-running) extraction, so report it and emit a NaN row instead.
        print(f"Error processing sequence {peptide}: {e}")
        return [np.nan] * num_features

# Fan the per-sequence extraction out over one worker process per CPU core.
# imap yields results in input order, so row i of the result still belongs to
# row i of df; tqdm only wraps the iterator to show progress.
num_processes = cpu_count()
with Pool(processes=num_processes) as pool:
    features_list = [
        feature_row
        for feature_row in tqdm(
            pool.imap(extract_all_features, df['seq']),
            total=len(df),
            desc="Extracting features",
        )
    ]

# One column per feature position; columns are still unnamed (integer labels)
# at this point.
df_features = pd.DataFrame(features_list)

# Place the feature columns next to the original sequence/label columns.
df_final = pd.concat((df, df_features), axis="columns")

# Write the combined table to disk.
csv_filename = "peptidefeatures.csv"
df_final.to_csv(csv_filename, index=False)

# Report how long the whole cell took.
total_time = time.time() - start_time
print(f"Total time taken: {total_time} seconds")



Extracting features:  13%|███████▍                                                 | 1681/12801 [06:28<36:00,  5.15it/s]

Error processing sequence KKKR: float division by zero


Extracting features:  14%|███████▉                                                 | 1769/12801 [06:47<32:57,  5.58it/s]

Error processing sequence KKRK: float division by zero


Extracting features:  49%|███████████████████████████▋                             | 6214/12801 [22:45<28:04,  3.91it/s]

Error processing sequence RKKK: float division by zero


Extracting features:  52%|█████████████████████████████▍                           | 6625/12801 [24:11<28:07,  3.66it/s]

Error processing sequence KRRRRRR: float division by zero


Extracting features: 100%|████████████████████████████████████████████████████████| 12801/12801 [46:34<00:00,  4.58it/s]


Total time taken: 2992.001680135727 seconds


In [None]:
import pandas as pd

# Wall-clock timestamp taken at the top of this cell; subtracted from a later
# time.time() reading near the end of the cell to report elapsed seconds.
# Note: `time` is not imported in this cell, so an earlier cell must have
# imported it.
start_time = time.time()

# Function to convert non-numerical values to NaN and handle errors gracefully
def ensure_numerical(df):
    """Return a copy of ``df`` with every column coerced to a numeric dtype.

    Values that cannot be parsed as numbers become NaN
    (``pd.to_numeric(..., errors='coerce')``). The input frame is left
    untouched, so re-running a cell that calls this function is idempotent
    and callers that still hold the original frame do not see it change.

    Args:
        df: DataFrame whose columns should be converted.

    Returns:
        A new DataFrame with the same columns and index, all numeric.
    """
    coerced = df.copy()
    for column in coerced.columns:
        coerced[column] = pd.to_numeric(coerced[column], errors='coerce')
    return coerced

# Coerce the extracted descriptors to numeric values. Only the feature table
# is converted: running the coercion over the combined frame would also hit
# the 'seq' column and replace every sequence string with NaN.
df_features = ensure_numerical(df_features)

# Build one descriptor object from the first sequence purely to read off the
# descriptor keys and group sizes used for column naming. (This object was
# previously referenced without ever being created, which raised NameError.)
# NOTE(review): if the first sequence is one propy cannot process, pick a
# different example sequence here.
example_peptide = df['seq'].iloc[0]
pro_example = PyPro.GetProDes(example_peptide)

# Composition descriptors are named after their residue / k-mer keys; the
# remaining groups are numbered by position within the group.
aa_comp_names = [f"AAComp_{aa}" for aa in pro_example.GetAAComp().keys()]
dp_comp_names = [f"DPComp_{dp}" for dp in pro_example.GetDPComp().keys()]
tp_comp_names = [f"TPComp_{tp}" for tp in pro_example.GetTPComp().keys()]
moreau_broto_names = [f"MoreauBroto_{i}" for i in range(len(pro_example.GetMoreauBrotoAuto().values()))]
moran_names = [f"Moran_{i}" for i in range(len(pro_example.GetMoranAuto().values()))]
geary_names = [f"Geary_{i}" for i in range(len(pro_example.GetGearyAuto().values()))]
qso_names = [f"QSO_{i}" for i in range(len(pro_example.GetQSO().values()))]
physchem_names = ['Molecular_Weight', 'Isoelectric_Point', 'Instability_Index', 'GRAVY']

# Combine all feature names in the same order the features were extracted.
feature_names = (aa_comp_names + dp_comp_names + tp_comp_names + moreau_broto_names +
                 moran_names + geary_names + qso_names + physchem_names)

# Assign column names to the features dataframe (pandas raises if the number
# of names does not match the number of feature columns).
df_features.columns = feature_names

# Concatenate original dataframe with the now-named features.
df_final = pd.concat([df, df_features], axis=1)

# Show the dataframe
print(df_final)

# Calculate the total time taken
total_time = time.time() - start_time
print(f"Total time taken: {total_time} seconds")

# Save the dataframe to a file to avoid running the feature extraction again
df_final.to_csv('df_final.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

# Step 1: Initial Model Training and Evaluation
def initial_model_training_evaluation(X_train, X_test, y_train, y_test):
    """Fit a baseline RBF-kernel SVC and print its accuracy on the test split.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        The fitted ``SVC`` instance.
    """
    baseline = SVC(kernel='rbf', random_state=42)
    baseline.fit(X_train, y_train)
    initial_accuracy = accuracy_score(y_test, baseline.predict(X_test))
    print("Initial Model Accuracy:", initial_accuracy)
    return baseline

# Step 3: Model Training and Evaluation after PCA
def model_training_evaluation_after_pca(X_train, X_test, y_train, y_test, n_components):
    """Project the data with PCA, fit an RBF-kernel SVC, and print test accuracy.

    The PCA is fitted on the training features only and then applied to the
    test features.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.
        n_components: Passed straight through to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(pca, clf)`` of the fitted PCA and the fitted SVC.
    """
    reducer = PCA(n_components=n_components, random_state=42)
    train_reduced = reducer.fit_transform(X_train)
    test_reduced = reducer.transform(X_test)

    svm = SVC(kernel='rbf', random_state=42)
    svm.fit(train_reduced, y_train)

    pca_accuracy = accuracy_score(y_test, svm.predict(test_reduced))
    print("Model Accuracy after PCA:", pca_accuracy)
    return reducer, svm

# Step 5: Model Training and Evaluation after Feature Selection
def model_training_evaluation_after_feature_selection(X_train, X_test, y_train, y_test, k_features):
    """Keep the k best features (ANOVA F-test), fit an RBF-kernel SVC, print accuracy.

    The selector is fitted on the training split only and then applied to the
    test split.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.
        k_features: Passed to ``SelectKBest(f_classif, k=...)``.

    Returns:
        Tuple ``(selector, clf)`` of the fitted selector and the fitted SVC.
    """
    k_best = SelectKBest(f_classif, k=k_features)
    train_subset = k_best.fit_transform(X_train, y_train)
    test_subset = k_best.transform(X_test)

    svm = SVC(kernel='rbf', random_state=42)
    svm.fit(train_subset, y_train)

    fs_accuracy = accuracy_score(y_test, svm.predict(test_subset))
    print("Model Accuracy after Feature Selection:", fs_accuracy)
    return k_best, svm

# Step 7: Hyperparameter Tuning
def hyperparameter_tuning(X_train, y_train):
    """Grid-search C and gamma for an RBF-kernel SVC with 5-fold CV.

    Args:
        X_train, y_train: Training features and labels to search on.

    Returns:
        The ``best_params_`` dict of the fitted ``GridSearchCV`` (also printed).
    """
    search_space = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]}
    base_estimator = SVC(kernel='rbf', random_state=42)
    search = GridSearchCV(base_estimator, search_space, cv=5)
    search.fit(X_train, y_train)

    winning_params = search.best_params_
    print("Best Hyperparameters:", winning_params)
    return winning_params



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Load your dataset into a DataFrame (assuming 'df' contains your data)
# df = pd.read_csv('your_dataset.csv')

# 'AB' is the target column. Only numeric columns are used as features: the
# frames built in this notebook also carry the raw 'seq' string column, which
# MinMaxScaler/SVC cannot consume. Rows with missing feature values (e.g. the
# all-NaN rows emitted for sequences that failed feature extraction) are
# dropped because PCA and SVC reject NaN.
X = df.drop(columns=['AB']).select_dtypes(include='number')
complete_rows = X.notna().all(axis=1)
X = X[complete_rows]
y = df.loc[complete_rows, 'AB']

# Step 1: Initial Model Training and Evaluation
# Split FIRST, then fit the scaler on the training part only. Fitting the
# scaler on the full data set before splitting leaks test-set min/max values
# into training and makes the reported accuracies optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full data set scaled with the training-fitted scaler (name kept for
# backward compatibility with the previous version of this cell).
X_normalized = scaler.transform(X)

# Train the initial SVM model
initial_model = SVC()
initial_model.fit(X_train, y_train)

# Evaluate the initial model
initial_model_accuracy = accuracy_score(y_test, initial_model.predict(X_test))
print("Initial Model Accuracy:", initial_model_accuracy)

# Step 2: PCA
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Step 3: Model Training and Evaluation after PCA
pca_model = SVC()
pca_model.fit(X_train_pca, y_train)

# Evaluate the model after PCA
pca_model_accuracy = accuracy_score(y_test, pca_model.predict(X_test_pca))
print("Model Accuracy after PCA:", pca_model_accuracy)

# Step 4: Feature Selection
# Select the top 10 principal components; PCA(0.95) may keep fewer than 10,
# so k is capped at the number of components actually available.
selector = SelectKBest(f_classif, k=min(10, X_train_pca.shape[1]))
X_train_selected = selector.fit_transform(X_train_pca, y_train)
X_test_selected = selector.transform(X_test_pca)

# Step 5: Model Training and Evaluation after Feature Selection
selected_model = SVC()
selected_model.fit(X_train_selected, y_train)

# Evaluate the model after feature selection
selected_model_accuracy = accuracy_score(y_test, selected_model.predict(X_test_selected))
print("Model Accuracy after Feature Selection:", selected_model_accuracy)

# Step 6: Hyperparameter Tuning
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)

# Step 7: Model Training and Evaluation after Hyperparameter Tuning
# GridSearchCV (refit=True by default) has already refitted the best
# estimator on the whole training set, so no extra fit() call is needed.
best_model = grid_search.best_estimator_

# Evaluate the best model
best_model_accuracy = accuracy_score(y_test, best_model.predict(X_test_selected))
print("Best Model Accuracy after Hyperparameter Tuning:", best_model_accuracy)


                                                 seq  AAComp_A  AAComp_R  \
0                               ACDEFGHIKLMNPQRSTVWY     5.000     5.000   
1  MRTLLVFLLLAIFVAVLIGNVQVEAACKEYWECGAFLFCIEGICVPMIG    10.204     2.041   

   AAComp_N  AAComp_D  AAComp_C  AAComp_E  AAComp_Q  AAComp_G  AAComp_H  ...  \
0     5.000       5.0     5.000     5.000     5.000     5.000       5.0  ...   
1     2.041       0.0     8.163     8.163     2.041     8.163       0.0  ...   

     QSO_94    QSO_95    QSO_96    QSO_97    QSO_98    QSO_99  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1  0.025391  0.023184  0.020166  0.023526  0.019069  0.013956   

   Molecular_Weight  Isoelectric_Point  Instability_Index     GRAVY  
0         2395.7134           6.784552          84.740000 -0.490000  
1         5408.5940           4.493088          27.159184  1.485714  

[2 rows x 9245 columns]
Total time taken: 1.2066779136657715 seconds


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

# Load your dataset into a DataFrame (assuming 'df' contains your data)
# df = pd.read_csv('your_dataset.csv')

# 'AB' is the target column. Only numeric columns are used as features (the
# raw 'seq' string column cannot be scaled), and rows with missing feature
# values are dropped because PCA and SVC reject NaN.
X = df.drop(columns=['AB']).select_dtypes(include='number')
complete_rows = X.notna().all(axis=1)
X = X[complete_rows]
y = df.loc[complete_rows, 'AB']

# Step 2: Split the data into training and testing sets
# The split happens BEFORE scaling so that the scaler only ever sees training
# data; fitting it on the full data set would leak test-set statistics.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the feature data using training-set min/max only
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full data set scaled with the training-fitted scaler (name kept for
# backward compatibility with the previous version of this cell).
X_normalized = scaler.transform(X)

# Step 3: Train the initial SVM model and evaluate its performance
initial_model = SVC()
initial_model.fit(X_train, y_train)
initial_model_accuracy = accuracy_score(y_test, initial_model.predict(X_test))
print("Initial Model Accuracy:", initial_model_accuracy)

# Step 4: Apply PCA
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Step 5: Train and evaluate the model after PCA
pca_model = SVC()
pca_model.fit(X_train_pca, y_train)
pca_model_accuracy = accuracy_score(y_test, pca_model.predict(X_test_pca))
print("Model Accuracy after PCA:", pca_model_accuracy)

# Step 6: Perform feature selection on the data after PCA
# PCA(0.95) may keep fewer than 10 components, so k is capped at the number
# of components actually available.
selector = SelectKBest(f_classif, k=min(10, X_train_pca.shape[1]))
X_train_selected = selector.fit_transform(X_train_pca, y_train)
X_test_selected = selector.transform(X_test_pca)

# Step 7: Train and evaluate the model after feature selection
selected_model = SVC()
selected_model.fit(X_train_selected, y_train)
selected_model_accuracy = accuracy_score(y_test, selected_model.predict(X_test_selected))
print("Model Accuracy after Feature Selection:", selected_model_accuracy)

# Step 8: Hyperparameter tuning
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf']}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)
best_model = grid_search.best_estimator_

# Step 9: Final model evaluation
best_model_accuracy = accuracy_score(y_test, best_model.predict(X_test_selected))
print("Best Model Accuracy after Hyperparameter Tuning:", best_model_accuracy)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score

# Load your dataset into a DataFrame (assuming 'df' contains your data)
# df = pd.read_csv('your_dataset.csv')

# 'AB' is the target column. Only numeric columns are used as features: the
# frames built in this notebook also carry the raw 'seq' string column, which
# SVC and the preprocessing pipeline cannot consume. Rows with missing
# feature values are dropped because PCA and SVC reject NaN.
X = df.drop(columns=['AB']).select_dtypes(include='number')
complete_rows = X.notna().all(axis=1)
X = X[complete_rows]
y = df.loc[complete_rows, 'AB']

# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Define preprocessing steps
# Scaling -> PCA (95% variance) -> top-10 components by ANOVA F-score. All
# three steps are fitted on the training split only (see Step 4).
preprocessor = Pipeline([
    ('scaler', MinMaxScaler()),
    ('pca', PCA(n_components=0.95)),
    ('selector', SelectKBest(f_classif, k=10))
])

# Step 3: Initial Model Training and Evaluation
# Baseline on the raw (unscaled, unreduced) features for comparison.
initial_model = SVC()
initial_model.fit(X_train, y_train)
initial_model_accuracy = accuracy_score(y_test, initial_model.predict(X_test))
print("Initial Model Accuracy:", initial_model_accuracy)

# Step 4: Preprocess the data and evaluate the model
X_train_preprocessed = preprocessor.fit_transform(X_train, y_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Step 5: Model Training and Evaluation after Preprocessing
model = SVC()
model.fit(X_train_preprocessed, y_train)
preprocessed_model_accuracy = accuracy_score(y_test, model.predict(X_test_preprocessed))
print("Model Accuracy after Preprocessing:", preprocessed_model_accuracy)

# Step 6: Hyperparameter Tuning using GridSearchCV
# grid_search.predict uses the best estimator refitted on the training data.
param_grid = {'C': [0.1, 1, 10], 'gamma': [0.1, 0.01, 0.001], 'kernel': ['rbf']}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_preprocessed, y_train)
best_model_accuracy = accuracy_score(y_test, grid_search.predict(X_test_preprocessed))
print("Best Model Accuracy after Hyperparameter Tuning:", best_model_accuracy)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

# Step 1: Initial Model Training and Evaluation
def initial_model_training_evaluation(X_train, X_test, y_train, y_test):
    """Fit a baseline RBF-kernel SVC and print its accuracy on the test split.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.

    Returns:
        The fitted ``SVC`` instance.
    """
    baseline = SVC(kernel='rbf', random_state=42)
    baseline.fit(X_train, y_train)
    initial_accuracy = accuracy_score(y_test, baseline.predict(X_test))
    print("Initial Model Accuracy:", initial_accuracy)
    return baseline

# Step 3: Model Training and Evaluation after PCA
def model_training_evaluation_after_pca(X_train, X_test, y_train, y_test, n_components):
    """Project the data with PCA, fit an RBF-kernel SVC, and print test accuracy.

    The PCA is fitted on the training features only and then applied to the
    test features.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.
        n_components: Passed straight through to ``PCA(n_components=...)``.

    Returns:
        Tuple ``(pca, clf)`` of the fitted PCA and the fitted SVC.
    """
    reducer = PCA(n_components=n_components, random_state=42)
    train_reduced = reducer.fit_transform(X_train)
    test_reduced = reducer.transform(X_test)

    svm = SVC(kernel='rbf', random_state=42)
    svm.fit(train_reduced, y_train)

    pca_accuracy = accuracy_score(y_test, svm.predict(test_reduced))
    print("Model Accuracy after PCA:", pca_accuracy)
    return reducer, svm

# Step 5: Model Training and Evaluation after Feature Selection
def model_training_evaluation_after_feature_selection(X_train, X_test, y_train, y_test, k_features):
    """Keep the k best features (ANOVA F-test), fit an RBF-kernel SVC, print accuracy.

    The selector is fitted on the training split only and then applied to the
    test split.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.
        k_features: Passed to ``SelectKBest(f_classif, k=...)``.

    Returns:
        Tuple ``(selector, clf)`` of the fitted selector and the fitted SVC.
    """
    k_best = SelectKBest(f_classif, k=k_features)
    train_subset = k_best.fit_transform(X_train, y_train)
    test_subset = k_best.transform(X_test)

    svm = SVC(kernel='rbf', random_state=42)
    svm.fit(train_subset, y_train)

    fs_accuracy = accuracy_score(y_test, svm.predict(test_subset))
    print("Model Accuracy after Feature Selection:", fs_accuracy)
    return k_best, svm

# Step 7: Hyperparameter Tuning
def hyperparameter_tuning(X_train, y_train):
    """Grid-search C and gamma for an RBF-kernel SVC with 5-fold CV.

    Args:
        X_train, y_train: Training features and labels to search on.

    Returns:
        The ``best_params_`` dict of the fitted ``GridSearchCV`` (also printed).
    """
    search_space = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1]}
    base_estimator = SVC(kernel='rbf', random_state=42)
    search = GridSearchCV(base_estimator, search_space, cv=5)
    search.fit(X_train, y_train)

    winning_params = search.best_params_
    print("Best Hyperparameters:", winning_params)
    return winning_params

# Load your data into X (features) and y (target).
# The `...` placeholders previously left here made train_test_split fail, so
# the cell now follows the convention of the other modelling cells in this
# notebook: 'AB' in `df` is the target and the remaining numeric columns are
# the features (the raw 'seq' string column is excluded; rows with missing
# feature values are dropped because PCA and SVC reject NaN).
# NOTE(review): confirm `df` is the intended feature table for this cell.
X = df.drop(columns=['AB']).select_dtypes(include='number')
complete_rows = X.notna().all(axis=1)
X = X[complete_rows]
y = df.loc[complete_rows, 'AB']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 1: Initial Model Training and Evaluation
initial_model = initial_model_training_evaluation(X_train, X_test, y_train, y_test)

# Steps 2-3: PCA, then model training and evaluation on the PCA projection.
# The helper does both in one call; it was previously invoked twice with
# identical arguments, which only repeated the training and the printout.
pca, _ = model_training_evaluation_after_pca(X_train, X_test, y_train, y_test, n_components=10)

# Steps 4-5: Feature selection, then model training and evaluation on the
# selected features (single call, for the same reason as above).
selector, _ = model_training_evaluation_after_feature_selection(X_train, X_test, y_train, y_test, k_features=5)

# Step 6: Hyperparameter Tuning
best_params = hyperparameter_tuning(X_train, y_train)

# Step 7: Model Training and Evaluation after Hyperparameter Tuning
clf = SVC(kernel='rbf', **best_params, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
tuned_accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy after Hyperparameter Tuning:", tuned_accuracy)
