In [14]:
import sqlite3
import pandas as pd
# Open the SQLite database; the connection and a cursor remain available as
# notebook-level globals.
con = sqlite3.connect("../unified_CD2.db")
cur = con.cursor()

# Read the whole prod_desc table, decode the AB column from bytes to an
# integer (little-endian), and keep only the sequence and the decoded AB value.
df = (
    pd.read_sql_query("SELECT * FROM prod_desc", con)
    .assign(AB=lambda table: table["AB"].apply(lambda raw: int.from_bytes(raw, "little")))
    [["seq", "AB"]]
)
df

Unnamed: 0,seq,AB
0,GLFNVFKGLKTAGKHVAGSLLNQLKCKVSGGC,1
1,KFADENFQLKH,0
2,KIANGSGSEQDIAEAKI,0
3,SVQDNFIRF,0
4,SANTKNDFMRF,0
...,...,...
12796,MEWKLNLLLYLALFFFLLFLLFLLLFVVIKQLKNSVANTAGTLQPG...,0
12797,MGPMKVLLVLLVVMVAAPHIADAWQQPSCSSICDYSCGKSACISYS...,0
12798,MRTLLVFLLLAIFVAVLIGNVQVEAACKEYWECGAFLFCIEGICVPMIG,0
12799,MKPSSLTLAFLVVFMMAIMYNSVQAEALADADAEAFAEAGVKELFG...,0


In [13]:
import pandas as pd
from propy import PyPro
from Bio.SeqUtils.ProtParam import ProteinAnalysis



# Function to extract all features
def extract_all_features(peptide):
    """Build one flat list of propy descriptor values for a peptide sequence.

    The values of seven descriptor groups are concatenated in this order:
    amino acid composition, dipeptide composition, tripeptide composition,
    Moreau-Broto autocorrelation, Moran autocorrelation, Geary
    autocorrelation and quasi-sequence order.

    Args:
        peptide: Sequence handed to ``PyPro.GetProDes``.

    Returns:
        A list holding every descriptor value, or ``None`` when ``peptide``
        is falsy or when any exception is raised while computing the
        descriptors (the failure is reported with ``print``).
    """
    # Descriptor getters, listed in the order their values are appended.
    descriptor_getters = (
        "GetAAComp",           # amino acid composition
        "GetDPComp",           # dipeptide composition
        "GetTPComp",           # tripeptide composition
        "GetMoreauBrotoAuto",  # Moreau-Broto autocorrelation descriptors
        "GetMoranAuto",        # Moran autocorrelation descriptors
        "GetGearyAuto",        # Geary autocorrelation descriptors
        "GetQSO",              # quasi-sequence order descriptors
    )

    try:
        # A falsy sequence (e.g. "" or None) has nothing to describe.
        if not peptide:
            return None

        descriptor = PyPro.GetProDes(peptide)

        values = []
        for getter_name in descriptor_getters:
            values.extend(getattr(descriptor, getter_name)().values())
        return values

    except ZeroDivisionError:
        # Handle division by zero gracefully
        print(f"Error processing sequence {peptide}: Division by zero")
    except AttributeError:
        # Handle attribute error gracefully
        print(f"Error processing sequence {peptide}: Empty sequence")
    except Exception as e:
        # Handle other exceptions gracefully
        print(f"Error processing sequence {peptide}: {e}")

    # Reached only after one of the handlers above has reported a failure.
    return None

# Apply function to each peptide sequence; failed extractions come back as None
df['Features'] = df['seq'].apply(extract_all_features)

# Filter out sequences where feature extraction failed
df_filtered = df.dropna()

# Expand features into separate columns. Reuse df_filtered's index: dropna()
# leaves gaps in the row labels, and a fresh 0..n-1 index would make the
# axis=1 concat below pair features with the wrong sequences and add NaN rows.
df_features = pd.DataFrame(df_filtered['Features'].tolist(), index=df_filtered.index)

# Concatenate original dataframe with features (rows align on the shared index)
df_final = pd.concat([df_filtered.drop(columns=['Features']), df_features], axis=1)

df_final


ModuleNotFoundError: No module named 'propy'