In [2]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.80-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 10.1 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.80


In [150]:
from Bio import SeqIO
import pandas as pd
import re

In [243]:
# One bracketed group. A single nested level is allowed so that a bracketed
# name inside the brackets (e.g. "[[Clostridium] scindens]") is kept whole.
_BRACKET_GROUP = re.compile(r'\[((?:[^\[\]]|\[[^\[\]]*\])*)\]')

# What may follow the organism brackets: end of the description, or the start
# of another title merged into the same header (Ctrl-A or ">").
# NOTE(review): the merged-title separators are assumed from NCBI-style
# deflines - confirm against the files actually passed to fasta2df.
_AFTER_ORGANISM = re.compile(r'\s*(?:\x01|>|$)')


def _split_description(description):
    """Split one FASTA description into ``(annotation, species)``.

    ``species`` is the text inside the organism brackets, or ``None`` when the
    description contains no bracketed group at all. ``annotation`` is the text
    before those brackets with the leading id token and any
    ``'MULTISPECIES: '`` marker removed.
    """
    groups = list(_BRACKET_GROUP.finditer(description))

    # Prefer the first group that closes a title, so bracketed words inside the
    # protein name are not mistaken for the organism. If no group closes a
    # title, fall back to the last group.
    organism = next(
        (g for g in groups if _AFTER_ORGANISM.match(description, g.end())),
        groups[-1] if groups else None,
    )

    if organism is None:
        species = None
        title = description
    else:
        species = organism.group(1)
        title = description[:organism.start()]

    # The first whitespace-separated token of the description is the record id
    # (already stored in its own column), so it is dropped here. Joining the
    # remaining tokens with single spaces also normalises the whitespace.
    annotation = ' '.join(title.split()[1:]).replace('MULTISPECIES: ', '')
    return annotation, species


def fasta2df(path):
    """Read a FASTA file into a DataFrame with one row per record.

    Headers are expected to look like::

        >WP_084217151.1 MULTISPECIES: FAD-binding oxidoreductase [Xenophilus azovorans]

    Parameters
    ----------
    path
        Passed unchanged to ``SeqIO.parse(path, 'fasta')``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``id`` with the columns ``annotation``, ``species`` and
        ``sequence`` (the sequence as a plain string). ``species`` is missing
        for records whose description has no ``[...]`` group. An input without
        records yields an empty frame that still has these columns.
    """
    rows = []
    for record in SeqIO.parse(path, 'fasta'):
        annotation, species = _split_description(record.description)
        rows.append((record.id, annotation, species, str(record.seq)))

    # Build the frame once from the collected rows; passing the column names
    # explicitly keeps the layout stable even when there are no rows.
    columns = ['id', 'annotation', 'species', 'sequence']
    return pd.DataFrame(rows, columns=columns).set_index('id')

In [244]:
path = 'fasta_sample.txt' # BLAST search results exported in FASTA format

In [245]:
# Parse the FASTA file into a DataFrame indexed by sequence id.
df = fasta2df(path)

In [246]:
# Display the parsed table.
df

id,annotation,species,sequence
RYF82892.1,FAD-binding oxidoreductase,Comamonadaceae bacterium,MPPTGHTGFAYPLAQVDWAQVQAELQGLDVITRLPQRRQMSKDFFW...
WP_084217151.1,FAD-binding oxidoreductase,Xenophilus azovorans,MPISIPAAPPTGHAGFAYDLSRVDWQQVQADLQGLDVITRLPQRRQ...
WP_225781474.1,FAD-binding oxidoreductase,Xenophilus sp. Marseille-Q4582,MNEQVIDSAAAGAAIPAMPPTGHAGFAHPLAHVDWAQVQAELQGLD...
VWX61512.1,FAD/FMN-containing dehydrogenase,Burkholderiales bacterium 8X,MSAEPTIARPLPAQVGREAFPYDLDSVDWNAVQADLRGLNVLTRPA...
RSZ38407.1,FAD-binding oxidoreductase,Variovorax sp. 553,MKARCAVNARTIVSAVDWDAVCADLRGLNLVTAPAQRKQLSKDFYW...
WP_240651458.1,FAD-binding oxidoreductase,unclassified Variovorax,MNARTIVSAVDWDAVCADLRGLNLVTAPAQRKQLSKDFYWYSPILT...
WP_097197667.1,FAD-binding oxidoreductase,Variovorax sp. YR752,MNARTIVPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILS...
SDY98529.1,FAD/FMN-containing dehydrogenase,Variovorax sp. YR266,MPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILTAQLAAC...
WP_093055342.1,FAD-binding oxidoreductase,Variovorax sp. YR634,MNARTIVPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILS...
WP_093174372.1,FAD-binding oxidoreductase,Variovorax sp. YR266,MNARTIVPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILT...


In [253]:
# Select the rows whose annotation equals the query string.
que = 'FAD-binding oxidoreductase'
df.loc[df['annotation'].eq(que)]  # exact match

id,annotation,species,sequence
RYF82892.1,FAD-binding oxidoreductase,Comamonadaceae bacterium,MPPTGHTGFAYPLAQVDWAQVQAELQGLDVITRLPQRRQMSKDFFW...
WP_084217151.1,FAD-binding oxidoreductase,Xenophilus azovorans,MPISIPAAPPTGHAGFAYDLSRVDWQQVQADLQGLDVITRLPQRRQ...
WP_225781474.1,FAD-binding oxidoreductase,Xenophilus sp. Marseille-Q4582,MNEQVIDSAAAGAAIPAMPPTGHAGFAHPLAHVDWAQVQAELQGLD...
RSZ38407.1,FAD-binding oxidoreductase,Variovorax sp. 553,MKARCAVNARTIVSAVDWDAVCADLRGLNLVTAPAQRKQLSKDFYW...
WP_240651458.1,FAD-binding oxidoreductase,unclassified Variovorax,MNARTIVSAVDWDAVCADLRGLNLVTAPAQRKQLSKDFYWYSPILT...
WP_097197667.1,FAD-binding oxidoreductase,Variovorax sp. YR752,MNARTIVPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILS...
WP_093055342.1,FAD-binding oxidoreductase,Variovorax sp. YR634,MNARTIVPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILS...
WP_093174372.1,FAD-binding oxidoreductase,Variovorax sp. YR266,MNARTIVPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILT...
WP_093076222.1,FAD-binding oxidoreductase,Variovorax sp. OV084,MNARTIVPAVDWDAVRADLRGLNLITAPAQRKQLSKDFYWYSPILT...
WP_093196004.1,FAD-binding oxidoreductase,Variovorax sp. YR750,MNARTPTPAPAVDWDAVREDLRGLNVITAPGQRKQLSKDFYWYSPI...


In [258]:
# Distinct annotation strings present in the table.
set(df['annotation'])

{'FAD-binding oxidoreductase',
 'FAD-binding protein',
 'FAD/FMN-containing dehydrogenase',
 'putative FAD-linked oxidoreductase'}