In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [8]:
from Bio import SeqIO

fasta_file = "/scratch/project/open-35-8/pimenol1/ProteinTTT/ProteinTTT/data/marts_db/MartsDbEnzymes.fasta"

# How many records to echo as a sanity check. Printing every record floods the
# cell output, so only a short preview is shown.
PREVIEW_RECORDS = 5
# How many leading residues of each previewed sequence to show.
PREVIEW_RESIDUES = 50

# Materialise the parser output so `sequences` stays available after this cell.
sequences = list(SeqIO.parse(fasta_file, "fasta"))

for record in sequences[:PREVIEW_RECORDS]:
    # Only append the ellipsis when the sequence was actually cut short.
    suffix = "..." if len(record.seq) > PREVIEW_RESIDUES else ""
    print(f"ID: {record.id}")
    print(f"Description: {record.description}")
    print(f"Sequence: {record.seq[:PREVIEW_RESIDUES]}{suffix}")
    print()
print(f"Parsed {len(sequences)} records from {fasta_file}")

# Build list of records first, then create DataFrame (efficient & pandas 2.0+ compatible)
records = [
    {
        "id": record.id,
        "sequence": str(record.seq),
        "description": record.description,
    }
    for record in sequences
]

# Explicit columns keep the schema stable even if the FASTA file yields no
# records (otherwise the frame would have no columns at all).
df_pd = pd.DataFrame(records, columns=["id", "sequence", "description"])



ID: marts_E00000|Chloroplast
Description: marts_E00000|Chloroplast monoterpene synthase| species=Hedychium coronarium kingdom=Plantae
Sequence: MSVSLSFAASATFGFRGGLGGFSRPAAAIKQWRCLPRIQCHSAEQSQSPL...

ID: marts_E00001|Xeniaphyllene
Description: marts_E00001|Xeniaphyllene synthase XsTC-1| species=Xenia sp. kingdom=Animalia (Coral)
Sequence: MSEKNVVRIPMKWGRIEREILTQNTIPELVDTNRLISWVKECNLADEALV...

ID: marts_E00002|Diterpene
Description: marts_E00002|Diterpene synthase TPS15| species=Plectranthus barbatus (C. forskohlii) kingdom=Plantae
Sequence: SLACVVALKSWNVHPHKTDKGISFIKKNMFRIDEENLEHMPIGFEVALPS...

ID: marts_E00003|Diterpene
Description: marts_E00003|Diterpene synthase TPS4| species=Plectranthus barbatus (C. forskohlii) kingdom=Plantae
Sequence: MSITINLRVIAFPGHGVQSRQGIFAVMEFPRNKNTFKSSFAVKCSLSTPT...

ID: marts_E00004|Diterpene
Description: marts_E00004|Diterpene synthase TPS1| species=Plectranthus barbatus (C. forskohlii) kingdom=Plantae
Sequence: MGSLSTMNLNHSPMSYSGILPSSSAKAKLLLPGCFSISAWMNNG

In [10]:
# Number of characters in each sequence string. The vectorised `.str.len()`
# accessor replaces `.apply(len)`: it avoids a Python-level call per row and
# propagates a missing sequence as NaN instead of raising TypeError.
df_pd['length'] = df_pd['sequence'].str.len()
df_pd

Unnamed: 0,id,sequence,description,length
0,marts_E00000|Chloroplast,MSVSLSFAASATFGFRGGLGGFSRPAAAIKQWRCLPRIQCHSAEQS...,marts_E00000|Chloroplast monoterpene synthase|...,593
1,marts_E00001|Xeniaphyllene,MSEKNVVRIPMKWGRIEREILTQNTIPELVDTNRLISWVKECNLAD...,marts_E00001|Xeniaphyllene synthase XsTC-1| sp...,409
2,marts_E00002|Diterpene,SLACVVALKSWNVHPHKTDKGISFIKKNMFRIDEENLEHMPIGFEV...,marts_E00002|Diterpene synthase TPS15| species...,637
3,marts_E00003|Diterpene,MSITINLRVIAFPGHGVQSRQGIFAVMEFPRNKNTFKSSFAVKCSL...,marts_E00003|Diterpene synthase TPS4| species=...,587
4,marts_E00004|Diterpene,MGSLSTMNLNHSPMSYSGILPSSSAKAKLLLPGCFSISAWMNNGKN...,marts_E00004|Diterpene synthase TPS1| species=...,786
...,...,...,...,...
1413,marts_E01440|Cubebene,MVRDMNSAGAGEVANAQFPEFPPALFAGRTNEQMMAEINSLKPPKF...,marts_E01440|Cubebene synthase | species=Lopho...,428
1414,marts_E01441|β-elemene/α-selinene,MAAAEAIPAGTSAFSSSTDNEFVKTFRPPLLDSSYPLNIHPKFSSS...,marts_E01441|β-elemene/α-selinene synthase LbM...,387
1415,marts_E01442|Cubebol,MPISSISGSWTLVFKSYVRQLVKSLYTAKCTEVQKQDLSVAQNEPV...,marts_E01442|Cubebol synthase ChTPS6| species=...,400
1416,marts_E01443|Multifuncitonal,MSSQISSCPPTQHSSSEAEKTELIRHTTTFHPSIWGDRFITYTCDN...,marts_E01443|Multifuncitonal TPS NnTPS4| speci...,565


In [12]:
# Accession token of each record ID: keep the text before the first '|', then
# take the second '_'-separated field
# (e.g. 'marts_E00000|Chloroplast' -> 'E00000').
df_pd['num_id'] = (
    df_pd['id']
    .str.split('|').str.get(0)
    .str.split("_").str.get(1)
)

In [35]:
# Write the table to disk as CSV; index=False keeps the row index out of the file.
df_pd.to_csv(
    path_or_buf='/scratch/project/open-35-8/pimenol1/ProteinTTT/ProteinTTT/data/marts_db/summary.csv',
    index=False,
)

In [34]:
# Display the current state of the table.
# NOTE(review): the output stored under this cell lists the columns
# `sequence_length` and `id_str`, with `id` holding the short accession
# (e.g. E00000). No cell visible here creates those columns; the cells above
# add `length` and `num_id`. This cell's execution count (34) also precedes that
# of the CSV export cell above it (35). Presumably the stored output comes from
# cells that were later edited or removed — re-run from a fresh kernel to confirm.
df_pd

Unnamed: 0,id,sequence,description,sequence_length,id_str
0,E00000,MSVSLSFAASATFGFRGGLGGFSRPAAAIKQWRCLPRIQCHSAEQS...,marts_E00000|Chloroplast monoterpene synthase|...,593,marts_E00000|Chloroplast
1,E00001,MSEKNVVRIPMKWGRIEREILTQNTIPELVDTNRLISWVKECNLAD...,marts_E00001|Xeniaphyllene synthase XsTC-1| sp...,409,marts_E00001|Xeniaphyllene
2,E00002,SLACVVALKSWNVHPHKTDKGISFIKKNMFRIDEENLEHMPIGFEV...,marts_E00002|Diterpene synthase TPS15| species...,637,marts_E00002|Diterpene
3,E00003,MSITINLRVIAFPGHGVQSRQGIFAVMEFPRNKNTFKSSFAVKCSL...,marts_E00003|Diterpene synthase TPS4| species=...,587,marts_E00003|Diterpene
4,E00004,MGSLSTMNLNHSPMSYSGILPSSSAKAKLLLPGCFSISAWMNNGKN...,marts_E00004|Diterpene synthase TPS1| species=...,786,marts_E00004|Diterpene
...,...,...,...,...,...
1413,E01440,MVRDMNSAGAGEVANAQFPEFPPALFAGRTNEQMMAEINSLKPPKF...,marts_E01440|Cubebene synthase | species=Lopho...,428,marts_E01440|Cubebene
1414,E01441,MAAAEAIPAGTSAFSSSTDNEFVKTFRPPLLDSSYPLNIHPKFSSS...,marts_E01441|β-elemene/α-selinene synthase LbM...,387,marts_E01441|β-elemene/α-selinene
1415,E01442,MPISSISGSWTLVFKSYVRQLVKSLYTAKCTEVQKQDLSVAQNEPV...,marts_E01442|Cubebol synthase ChTPS6| species=...,400,marts_E01442|Cubebol
1416,E01443,MSSQISSCPPTQHSSSEAEKTELIRHTTTFHPSIWGDRFITYTCDN...,marts_E01443|Multifuncitonal TPS NnTPS4| speci...,565,marts_E01443|Multifuncitonal
