In [2]:
from Bio import SeqIO

def gc_content(sequence):
    """Calculate the GC content of a given sequence."""
    g = sequence.count('G')
    c = sequence.count('C')
    return (g + c) / len(sequence) if len(sequence) > 0 else 0

def main():
    input_file = r"C:\Users\Misha\Desktop\jupyter notebook\genbank\sequence.gb"
    
    records_gc = []
    
    # Чтение записей из файла GenBank
    for record in SeqIO.parse(input_file, "genbank"):
        # Проверяем наличие CDS
        for feature in record.features:
            if feature.type == "CDS" and "translation" in feature.qualifiers:
                translation = feature.qualifiers["translation"][0]
                gc = gc_content(translation)
                records_gc.append((record.id, record.description, gc))
    
    # Сортировка по GC-составу
    records_gc.sort(key=lambda x: x[2])
    
    # Вывод результатов
    for record_id, description, gc in records_gc:
        print(f"{record_id}: {description}, GC = {gc}")

if __name__ == "__main__":
    main()

M87514.1: Brassica oleracea cytochrome b-5 mRNA, complete cds, GC = 0.06716417910447761
KM587010.1: Solanum lycopersicum cultivar Moneymaker phosphatidylinositol phospholipase C 5 (PLC5) mRNA, complete cds, GC = 0.07082630691399663
KM210340.1: Solanum lycopersicum phosphatidylinositol phospholipase C 7 (PLC7) mRNA, complete cds, GC = 0.07912457912457913
U50985.1: Solanum lycopersicum pectin methylesterase PME2.1 mRNA, complete cds, GC = 0.08545454545454545
JF807943.1: Solanum lycopersicum golden 2-like 1 transcription factor (GLK1) mRNA, complete cds, GC = 0.08620689655172414
JN859553.1: Brassica oleracea FAD2-2 gene, complete cds, GC = 0.09114583333333333
JN859552.1: Brassica oleracea FAD2-1 gene, complete cds, GC = 0.09375
JQ231216.1: Brassica oleracea chloroplast ribulose-1,5-bisphosphate carboxylase/oxygenase activase (RCA) mRNA, complete cds; nuclear gene for chloroplast product, GC = 0.1095890410958904
KF462389.1: Solanum lycopersicum cultivar WVA106 enhancer of zeste protein mRN

In [3]:
from Bio import SeqIO

def extract_protein_sequences(genbank_file):
    # Открываем файл GenBank и обрабатываем записи
    with open(genbank_file, "r") as file:
        for record in SeqIO.parse(file, "genbank"):
            # Проходим по всем особенностям записи
            for feature in record.features:
                # Проверяем, является ли особенность кодирующей последовательностью (CDS)
                if feature.type == "CDS":
                    # Получаем информацию о белковой последовательности
                    protein_id = record.id
                    protein_description = record.description
                    coding_location = feature.location
                    protein_sequence = feature.qualifiers.get("translation", [""])[0]
                    
                    # Выводим информацию о белковой последовательности
                    print(f"{protein_id}: {protein_description}")
                    print(f"Coding sequence location = [{coding_location.start}:{coding_location.end}]({coding_location.strand})")
                    print("Translation =")
                    
                    # Форматируем вывод белковой последовательности для удобства чтения
                    for i in range(0, len(protein_sequence), 60):  # Разбиваем на строки по 60 символов
                        print(protein_sequence[i:i+60])
                    print()  # Пустая строка для разделения записей

# Укажите путь к вашему файлу GenBank
genbank_file_path = r"C:\Users\Misha\Desktop\jupyter notebook\genbank\sequence.gb"
extract_protein_sequences(genbank_file_path)

AY571333.1: Brassica oleracea water stress induced protein mRNA, complete cds
Coding sequence location = [59:347](1)
Translation =
MAGIINKIGDALHIGGGNKEDEHKKEEHKKHADEHKSGEHKEGIVDKIKDKIQGGEGHSS
GDHKHDGEKKKKKDKKEKKHHHDGHHSSSSDSDSD

M87514.1: Brassica oleracea cytochrome b-5 mRNA, complete cds
Coding sequence location = [95:500](1)
Translation =
MASEKKVLGFEEVSQHNKTKDCWLIISGKVYDVTPFMDDHPGGDEVLLSSTGKDATNDFE
DVGHSDTARDMMEKYYIGEIDSSTVPATRTYVAPVQPAYNQDKTPEFMIKILQFLVPILI
LGLALVVRQYTKKE

JN859553.1: Brassica oleracea FAD2-2 gene, complete cds
Coding sequence location = [0:1155](1)
Translation =
MGAGGRMQVSPPSSSPETNTLKRVPCETPPFTLGDLKKAIPPHCFKRSIPRSFSYLLFDI
IISSSLYHLSTAYFPLLPHPLPYLAWPLYWACQGCVLTGLWVIAHECGHHAFSDHQLLDD
AVGLVFHSFLLVPYFSWKYSHRRHHSNTGSLERDEVFVPKKKSDVKWYGKYLNNPLGRTV
MLTVQFTLGWPLYLAFNVSGRPYSDGFACHFHPNAPIYNDRERLQIYISDAGVLSVCYGL
YRYAGSRGVASMVCVYGVPLMIVNCFLVLITYLQHTHPSLPHYDSSEWDWLRGALATVDR
DYGILNKVFHNITDTHVAHHLFSTMPHYNAMEATKAIKPILGEYYQFDGTPVVKAMWREA
KECIYVEPDRQGEKKGVFWYNNKL

JN859552.1: Brass