In [6]:
import sqlite3
import pandas as pd
# Open the SQLite database and keep a cursor handle on the connection.
con = sqlite3.connect("../unified_CD2.db")
cur = con.cursor()

# Pull the AB and seq columns of the prod_desc table into a DataFrame.
df = pd.read_sql_query("SELECT AB, seq FROM prod_desc", con)

# Each AB value is decoded with int.from_bytes, i.e. it is treated as a
# little-endian byte string and replaced by the integer it encodes.
df["AB"] = df["AB"].map(lambda raw: int.from_bytes(raw, "little"))

# Last expression of the cell: display the resulting frame.
df

Unnamed: 0,AB,seq
0,1,GLFNVFKGLKTAGKHVAGSLLNQLKCKVSGGC
1,0,KFADENFQLKH
2,0,KIANGSGSEQDIAEAKI
3,0,SVQDNFIRF
4,0,SANTKNDFMRF
...,...,...
12796,0,MEWKLNLLLYLALFFFLLFLLFLLLFVVIKQLKNSVANTAGTLQPG...
12797,0,MGPMKVLLVLLVVMVAAPHIADAWQQPSCSSICDYSCGKSACISYS...
12798,0,MRTLLVFLLLAIFVAVLIGNVQVEAACKEYWECGAFLFCIEGICVPMIG
12799,0,MKPSSLTLAFLVVFMMAIMYNSVQAEALADADAEAFAEAGVKELFG...


In [8]:
import pandas as pd
from propy import PyPro
from Bio.SeqUtils.ProtParam import ProteinAnalysis
import time

# Record the wall-clock start time (seconds since the epoch, as returned by
# time.time()) so that an elapsed duration can later be computed as
# time.time() - start_time.
start_time = time.time()

# Function to extract all features and return a DataFrame
def extract_all_features(peptide):
    """Compute one row of sequence descriptors for a single peptide.

    The row combines the propy descriptors obtained from ``PyPro.GetProDes``
    (amino-acid, dipeptide and tripeptide composition; Moreau-Broto, Moran
    and Geary autocorrelation; quasi-sequence order) with four values from
    ``Bio.SeqUtils.ProtParam.ProteinAnalysis`` (molecular weight,
    isoelectric point, instability index, GRAVY), in that order.

    Column naming:
      * composition columns carry the key of the descriptor mapping:
        ``AAComp_<key>``, ``DPComp_<key>``, ``TPComp_<key>``;
      * autocorrelation and QSO columns are numbered by position in the
        returned mapping: ``MoreauBroto_<i>``, ``Moran_<i>``, ``Geary_<i>``,
        ``QSO_<i>``;
      * the ProtParam columns are ``Molecular_Weight``, ``Isoelectric_Point``,
        ``Instability_Index`` and ``GRAVY``.

    Parameters
    ----------
    peptide
        The sequence handed to both ``PyPro.GetProDes`` and
        ``ProteinAnalysis`` (a string such as ``"AALLKK"``).

    Returns
    -------
    pandas.DataFrame
        A single-row frame with one column per feature. If any exception is
        raised while computing the features, a message is printed and an
        empty ``DataFrame`` is returned instead; nothing is re-raised.
    """
    try:
        descriptor = PyPro.GetProDes(peptide)

        columns = []
        row = []

        # Composition descriptors: every column is labelled with the key it
        # came from in the mapping returned by propy.
        keyed_groups = (
            ("AAComp", descriptor.GetAAComp()),
            ("DPComp", descriptor.GetDPComp()),
            ("TPComp", descriptor.GetTPComp()),
        )
        for prefix, table in keyed_groups:
            columns.extend(f"{prefix}_{key}" for key in table.keys())
            row.extend(table.values())

        # Autocorrelation and quasi-sequence-order descriptors: columns are
        # numbered by position, the mapping's own keys are not used.
        indexed_groups = (
            ("MoreauBroto", descriptor.GetMoreauBrotoAuto()),
            ("Moran", descriptor.GetMoranAuto()),
            ("Geary", descriptor.GetGearyAuto()),
            ("QSO", descriptor.GetQSO()),
        )
        for prefix, table in indexed_groups:
            columns.extend(f"{prefix}_{position}" for position in range(len(table)))
            row.extend(table.values())

        # Physicochemical properties from Bio.SeqUtils.ProtParam; the dict
        # keeps label and value side by side and preserves insertion order.
        analysis = ProteinAnalysis(peptide)
        physchem = {
            'Molecular_Weight': analysis.molecular_weight(),
            'Isoelectric_Point': analysis.isoelectric_point(),
            'Instability_Index': analysis.instability_index(),
            'GRAVY': analysis.gravy(),
        }
        columns.extend(physchem.keys())
        row.extend(physchem.values())

        # One peptide -> one row.
        return pd.DataFrame([row], columns=columns)

    except ZeroDivisionError:
        # Reported with a fixed message; the caller gets an empty frame.
        print(f"Error processing sequence {peptide}: Division by zero")
        return pd.DataFrame()
    except AttributeError:
        # Reported as an empty sequence; the caller gets an empty frame.
        print(f"Error processing sequence {peptide}: Empty sequence")
        return pd.DataFrame()
    except Exception as e:
        # Anything else: report the exception text and return an empty frame.
        print(f"Error processing sequence {peptide}: {e}")
        return pd.DataFrame()

# NOTE(review): the full-dataset run below is disabled by wrapping it in a
# bare triple-quoted string; the string is evaluated and discarded, so none
# of the statements inside it execute.
#
# Hazard if it is re-enabled as written: extract_all_features returns an
# EMPTY DataFrame for any sequence that raises, so those sequences contribute
# no row to df_features. Because the concat uses ignore_index=True, the
# surviving rows are renumbered 0..k-1 and the final axis=1 concat then pairs
# them with the wrong rows of df (wrong AB labels). Failed sequences need to
# be dropped from df as well, or kept as all-NaN rows, before joining.
#
# Also note that the inner comment says "first ten columns" while the code
# prints the whole of df_final, and that total_time is computed but never
# displayed.
"""

# Example usage
# Apply function to each peptide sequence and concatenate the results
df_features_list = df['seq'].apply(extract_all_features)
df_features = pd.concat(df_features_list.to_list(), ignore_index=True)

# Concatenate original dataframe with features
df_final = pd.concat([df.drop(columns=['seq']), df_features], axis=1)

# Show the first ten columns
print(df_final)

# Calculate the total time taken
total_time = time.time() - start_time
"""

# Smoke test on a single short peptide; as the last expression of the cell,
# the returned one-row feature frame is what gets displayed.
extract_all_features("AALLKK")

Unnamed: 0,AAComp_A,AAComp_R,AAComp_N,AAComp_D,AAComp_C,AAComp_E,AAComp_Q,AAComp_G,AAComp_H,AAComp_I,...,QSO_94,QSO_95,QSO_96,QSO_97,QSO_98,QSO_99,Molecular_Weight,Isoelectric_Point,Instability_Index,GRAVY
0,33.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,642.8309,10.003317,-5.816667,0.566667
