In [2]:
import sqlite3
import pandas as pd
from propy import PyPro
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import time
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool, cpu_count

# Read the whole `prod_desc` table into a DataFrame.
# The connection is only needed for this single query, so it is closed as soon
# as the read finishes (even if the query fails) instead of being left open
# for the lifetime of the kernel.
con = sqlite3.connect("../unified_CD2.db")
try:
    df = pd.read_sql_query("SELECT * FROM prod_desc", con)
finally:
    con.close()

# Decode each AB value from bytes into an int (little-endian byte order).
df["AB"] = df["AB"].apply(lambda x: int.from_bytes(x, "little"))

df


Unnamed: 0,index,id,name,AB,description,OX,source,seq,valid,MaxAbsEStateIndex,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,0.0,ADAM_2177,InverPep_ADAM_2177,1,,alien,InverPep.fasta,GLFNVFKGLKTAGKHVAGSLLNQLKCKVSGGC,yes,14.846797,...,0,0,0,0,0,0,0,0,5,0
1,,P85444,PPIA_PENGL,0,Peptidyl-prolyl cis-trans isomerase (Fragment),Penicillium glabrum OX=69773,uniprot_swissprot.fasta,KFADENFQLKH,yes,14.574620,...,0,0,0,0,0,0,0,0,2,0
2,,P85445,ATPD_PENGL,0,"ATP synthase subunit delta, mitochondrial (Fra...",Penicillium glabrum OX=69773 GN=atp16,uniprot_swissprot.fasta,KIANGSGSEQDIAEAKI,yes,14.109537,...,0,0,0,0,0,0,0,0,2,0
3,,P85448,FAR1_LUCCU,0,FMRFamide-1,Lucilia cuprina OX=7375,uniprot_swissprot.fasta,SVQDNFIRF,yes,14.458228,...,0,0,0,0,0,0,0,0,1,0
4,,P85450,FAR3_LUCCU,0,FMRFamide-3,Lucilia cuprina OX=7375,uniprot_swissprot.fasta,SANTKNDFMRF,yes,14.493044,...,1,0,0,0,0,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12796,,A0A286YD83,SIM43_MOUSE,0,Small integral membrane protein 43,Mus musculus OX=10090 GN=Smim43,uniprot_swissprot.fasta,MEWKLNLLLYLALFFFLLFLLFLLLFVVIKQLKNSVANTAGTLQPG...,yes,16.078441,...,1,0,0,0,0,0,0,0,5,0
12797,,A0A2I8B346,AMP1_PARCM,0,Paralithocin 1,Paralithodes camtschaticus OX=6741,uniprot_swissprot.fasta,MGPMKVLLVLLVVMVAAPHIADAWQQPSCSSICDYSCGKSACISYS...,yes,15.339288,...,3,0,0,0,0,0,0,0,5,0
12798,,A0A2L1DGG0,HSTX1_HAESL,0,Peptide HSTX-I,Haemadipsa sylvestris OX=13555,uniprot_swissprot.fasta,MRTLLVFLLLAIFVAVLIGNVQVEAACKEYWECGAFLFCIEGICVPMIG,yes,15.662066,...,2,0,0,0,0,0,0,0,2,0
12799,,A0A348G5W0,TX14A_ODOMO,0,U-poneritoxin(01)-Om4a,Odontomachus monticola OX=613454,uniprot_swissprot.fasta,MKPSSLTLAFLVVFMMAIMYNSVQAEALADADAEAFAEAGVKELFG...,yes,15.476384,...,4,0,0,0,0,0,0,0,7,0


In [None]:
# Feature extraction for a single peptide sequence
def extract_all_features(peptide):
    """Collect propy descriptors and Biopython properties for one sequence.

    Each propy descriptor family is computed in turn and its entries are
    stored under a family prefix (``AAComp_``, ``DPComp_``, ``TPComp_``,
    ``MoreauBroto_``, ``Moran_``, ``Geary_``, ``QSO_``). Four values from
    ``ProteinAnalysis`` are then added under fixed names.

    Args:
        peptide: Sequence handed to ``PyPro.GetProDes`` and
            ``ProteinAnalysis``.

    Returns:
        A dict mapping feature name to value, or ``None`` if any step raised.
        In the failure case the error and the sequence are printed; the
        caller still receives an entry (``None``) for that sequence.
    """
    collected = {}
    try:
        pro_des = PyPro.GetProDes(peptide)

        # (prefix, bound method) pairs, listed in the order their keys are added.
        descriptor_sources = (
            ('AAComp_', pro_des.GetAAComp),
            ('DPComp_', pro_des.GetDPComp),
            ('TPComp_', pro_des.GetTPComp),
            ('MoreauBroto_', pro_des.GetMoreauBrotoAuto),
            ('Moran_', pro_des.GetMoranAuto),
            ('Geary_', pro_des.GetGearyAuto),
            ('QSO_', pro_des.GetQSO),
        )
        for tag, compute in descriptor_sources:
            values = compute()
            if not values:
                # Nothing returned for this family; move on to the next one.
                continue
            for key, value in values.items():
                collected[f'{tag}{key}'] = value

        # Whole-sequence physicochemical properties
        analysis = ProteinAnalysis(peptide)
        collected['MolecularWeight'] = analysis.molecular_weight()
        collected['IsoelectricPoint'] = analysis.isoelectric_point()
        collected['InstabilityIndex'] = analysis.instability_index()
        collected['Gravy'] = analysis.gravy()
    except Exception as e:
        print(f"Error processing sequence {peptide}: {e}")
        # Signal the failure to the caller; partial results are discarded.
        return None

    return collected



# Use multiprocessing for feature extraction
num_processes = cpu_count()

def process_chunk(chunk):
    """Extract features for every sequence in ``chunk`` with a worker pool.

    Args:
        chunk: Sized iterable of sequences (for example a pandas Series).
            Each item is passed to ``extract_all_features``.

    Returns:
        pandas.DataFrame with one row per input item, in input order. Items
        for which ``extract_all_features`` returned ``None`` become rows
        without values (NaN in every feature column). When ``chunk`` is a
        Series, the result carries the same index so that a later
        column-wise ``pd.concat`` with the source frame lines up row by row.
    """
    with Pool(processes=num_processes) as pool:
        features_list = list(tqdm(pool.imap(extract_all_features, chunk), total=len(chunk), desc="Extracting features"))

    # A failed extraction is reported as None. pandas cannot build a proper
    # feature table from a list that mixes dicts and None, so substitute an
    # empty record; its row is filled with NaN during construction.
    records = [features if features is not None else {} for features in features_list]

    # Reuse the caller's row labels instead of a fresh 0..n-1 range.
    index = chunk.index if isinstance(chunk, pd.Series) else None
    return pd.DataFrame(records, index=index)

# Run the extraction over the complete `seq` column
df_features = process_chunk(df['seq'])

# Put the feature columns to the right of the original columns
df_final = pd.concat((df, df_features), axis="columns")

# Write the combined table to disk, leaving out the index column
csv_filename = "peptidefeatures.csv"
df_final.to_csv(path_or_buf=csv_filename, index=False)

df_final

Extracting features:  13%|███████▍                                                 | 1681/12801 [09:45<47:28,  3.90it/s]

Error processing sequence KKKR: float division by zero


Extracting features:  14%|███████▌                                               | 1767/12801 [10:31<2:42:32,  1.13it/s]

Error processing sequence KKRK: float division by zero


Extracting features:  34%|██████████████████▌                                   | 4387/12801 [25:52<12:02:27,  5.15s/it]IOStream.flush timed out


In [1]:
# Load the feature table from CSV.
# `pd.read-csv(...)` is parsed as the subtraction `pd.read - csv(...)` and
# fails; the pandas reader is `pd.read_csv`.
# This cell needs the import cell to have been run first so `pd` is defined.
# NOTE(review): a later cell reads "peptidefeatures_ohne_name.csv" (without
# the trailing "n") — confirm which file name is the intended one.
df = pd.read_csv("peptidefeatures_ohne_namen.csv")
df



# Define feature extraction function
def extract_all_features(peptide):
    """Compute propy descriptors and Biopython properties for one sequence.

    Each propy descriptor family is computed and its entries are stored under
    a family prefix (``AAComp_AminoAcidComposition_``,
    ``DPComp_DipeptideComposition_``, ``TPComp_TripeptideComposition_``,
    ``MoreauBroto_Auto_``, ``Moran_Auto_``, ``Geary_Auto_``, ``QSO_``).
    Four values from ``ProteinAnalysis`` are added under fixed names.

    Args:
        peptide: Sequence passed to ``PyPro.GetProDes`` and
            ``ProteinAnalysis``.

    Returns:
        A dict mapping feature name to value. If any step raises, the error
        is printed together with the sequence and an empty dict is returned.
        This function never raises for a bad sequence.
    """
    features = {}
    try:
        pro = PyPro.GetProDes(peptide)

        # Extracting features
        for prefix, descriptor_func in [('AAComp_AminoAcidComposition_', pro.GetAAComp),
                                        ('DPComp_DipeptideComposition_', pro.GetDPComp),
                                        ('TPComp_TripeptideComposition_', pro.GetTPComp),
                                        ('MoreauBroto_Auto_', pro.GetMoreauBrotoAuto),
                                        ('Moran_Auto_', pro.GetMoranAuto),
                                        ('Geary_Auto_', pro.GetGearyAuto),
                                        ('QSO_', pro.GetQSO)]:
            descriptor = descriptor_func()
            if descriptor:
                features.update({f'{prefix}{k}': v for k, v in descriptor.items()})

        # Additional physicochemical properties
        analysed_seq = ProteinAnalysis(peptide)
        features.update({
            'MolecularWeight': analysed_seq.molecular_weight(),
            'IsoelectricPoint': analysed_seq.isoelectric_point(),
            'InstabilityIndex': analysed_seq.instability_index(),
            'Gravy': analysed_seq.gravy()
        })
    except Exception as e:
        print(f"Error processing sequence {peptide}: {e}")
        # Return an empty record rather than invented placeholder keys.
        # The real feature names come from the descriptor dictionaries and
        # are not known here, so made-up names such as "<prefix>1".."<prefix>20"
        # would only add extra all-NaN columns. When the records are turned
        # into a DataFrame, keys missing from a record are filled with NaN,
        # which gives this sequence NaN in every real feature column.
        return {}

    return features


# Example input: one peptide sequence
peptide_sequence = "MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGTRDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEECLFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILFLPLPV"

# Wrap the sequence in a one-row DataFrame whose only column is 'seq'
single_sequence_df = pd.DataFrame({'seq': [peptide_sequence]})

# Use multiprocessing for feature extraction
num_processes = cpu_count()
# NOTE(review): chunk_size is assigned but not used by process_chunk below —
# confirm whether it was meant to be passed on (e.g. as the imap chunksize).
chunk_size = 1

def process_chunk(chunk):
    """Extract features for every sequence in ``chunk`` with a worker pool.

    Args:
        chunk: Sized iterable of sequences (for example a pandas Series).
            Each item is passed to ``extract_all_features``.

    Returns:
        pandas.DataFrame with one row per input item, in input order. A
        ``None`` result for an item is treated as an empty record, so its row
        holds NaN in every feature column. When ``chunk`` is a Series, the
        result carries the same index so that a later column-wise
        ``pd.concat`` with the source frame lines up row by row.
    """
    with Pool(processes=num_processes) as pool:
        features_list = list(tqdm(pool.imap(extract_all_features, chunk), total=len(chunk), desc="Extracting features"))

    # Tolerate extractors that report a failure as None: pandas cannot build
    # a proper feature table from a list that mixes dicts and None.
    records = [features if features is not None else {} for features in features_list]

    # Reuse the caller's row labels instead of a fresh 0..n-1 range.
    index = chunk.index if isinstance(chunk, pd.Series) else None
    return pd.DataFrame(records, index=index)

# Run the extraction on the one-row 'seq' column
df_features = process_chunk(single_sequence_df['seq'])

# Put the feature columns to the right of the input column
df_final = pd.concat((single_sequence_df, df_features), axis="columns")

# Announce the result, then let the notebook render the frame
print("Resulting DataFrame after feature extraction:")

df_final

NameError: name 'pd' is not defined

In [None]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV

# Load the feature table. Every column except 'AB' is used as a feature;
# 'AB' is the classification target.
df = pd.read_csv("peptidefeatures_ohne_name.csv")
X = df.drop(columns=['AB'])
y = df['AB']

# Step 1: Split the data into training and testing sets.
# The split happens BEFORE scaling so that the test rows cannot influence the
# preprocessing. Fitting the scaler on all rows would leak the test set's
# min/max into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Normalize the data.
# The scaler learns its per-column range from the training rows only and is
# then applied unchanged to the test rows (test values may therefore fall
# slightly outside [0, 1]).
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training-fitted scaler; kept so that any code
# still referring to X_normalized continues to work.
X_normalized = scaler.transform(X)










In [None]:
# Step 3: Baseline — an SVC with default settings trained on X_train
initial_model = SVC().fit(X_train, y_train)

# Score the baseline on the held-out rows
initial_predictions = initial_model.predict(X_test)
initial_model_accuracy = accuracy_score(y_test, initial_predictions)
print("Initial Model Accuracy:", initial_model_accuracy)


In [None]:
# Step 4: PCA
# A fractional n_components asks scikit-learn to keep as many components as
# are needed to explain that share of the variance.
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep)

# Fit the projection on the training rows only, then apply it to the test rows
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [None]:
# Step 5: Same default SVC, now trained on the PCA-projected training rows
pca_model = SVC().fit(X_train_pca, y_train)

# Score it on the PCA-projected test rows
pca_predictions = pca_model.predict(X_test_pca)
pca_model_accuracy = accuracy_score(y_test, pca_predictions)
print("Model Accuracy after PCA:", pca_model_accuracy)


In [None]:
# Step 6: Feature Selection
# Keep the 10 PCA components with the highest ANOVA F-score against the
# target. The number of components produced by PCA depends on the data, so
# the requested count is capped at what is actually available; asking
# SelectKBest for more columns than exist is rejected (or warned about,
# depending on the scikit-learn version).
n_selected = min(10, X_train_pca.shape[1])
selector = SelectKBest(f_classif, k=n_selected)

# Scores are computed from the training rows only; the same columns are then
# taken from the test rows.
X_train_selected = selector.fit_transform(X_train_pca, y_train)
X_test_selected = selector.transform(X_test_pca)


In [None]:
# Step 7: Default SVC trained on the selected components only
selected_model = SVC().fit(X_train_selected, y_train)

# Score it on the matching test columns
selected_predictions = selected_model.predict(X_test_selected)
selected_model_accuracy = accuracy_score(y_test, selected_predictions)
print("Model Accuracy after Feature Selection:", selected_model_accuracy)

In [None]:
# Step 8: Hyperparameter Tuning
# Search over C and gamma for an RBF-kernel SVC using 5-fold cross-validation
# on the training rows.
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}
grid_search = GridSearchCV(SVC(), param_grid, cv=5)
grid_search.fit(X_train_selected, y_train)

# Predicting through the fitted search object uses the best parameter
# combination it found.
tuned_predictions = grid_search.predict(X_test_selected)
best_model_accuracy = accuracy_score(y_test, tuned_predictions)
print("Best Model Accuracy after Hyperparameter Tuning:", best_model_accuracy)
